In [1]:
rawUserArtistData = sc.textFile("user_artist_data.txt")

In [2]:
 %%bash
. ~/.bashrc
head -100000 user_artist_data.txt > user_artist_data_100k.txt

In [3]:
rawUserArtistData = sc.textFile("user_artist_data_100k.txt")

In [4]:
userStats = rawUserArtistData.map(lambda l: float(l.split(' ')[0])).stats()

In [5]:
userStats

(count: 100000, mean: 1000279.30585, stdev: 162.949305449, max: 1000586.0, min: 1000002.0)

In [6]:
artistStats = rawUserArtistData.map(lambda l: float(l.split(' ')[1])).stats()

In [7]:
artistStats

(count: 100000, mean: 1153704.17701, stdev: 1654073.25666, max: 10787933.0, min: 1.0)

In [8]:
rawArtistData = sc.textFile("artist_data.txt")
rawArtistData.take(10)

[u'1134999\t06Crazy Life',
 u'6821360\tPang Nakarin',
 u'10113088\tTerfel, Bartoli- Mozart: Don',
 u'10151459\tThe Flaming Sidebur',
 u'6826647\tBodenstandig 3000',
 u'10186265\tJota Quest e Ivete Sangalo',
 u'6828986\tToto_XX (1977',
 u'10236364\tU.S Bombs -',
 u'1135000\tartist formaly know as Mat',
 u'10299728\tKassierer - Musik f\xfcr beide Ohren']

In [9]:
def getArtistByID(line):
    try:
        (artistID, name) = line.split('\t',1)
        artistID = int(artistID)
    except:
        return []
    if not name:
        return []
    else:
        return [(artistID, name.strip())]

artistByID = rawArtistData.flatMap(getArtistByID)

In [10]:
artistByID.take(1000)

[(1134999, u'06Crazy Life'),
 (6821360, u'Pang Nakarin'),
 (10113088, u'Terfel, Bartoli- Mozart: Don'),
 (10151459, u'The Flaming Sidebur'),
 (6826647, u'Bodenstandig 3000'),
 (10186265, u'Jota Quest e Ivete Sangalo'),
 (6828986, u'Toto_XX (1977'),
 (10236364, u'U.S Bombs -'),
 (1135000, u'artist formaly know as Mat'),
 (10299728, u'Kassierer - Musik f\xfcr beide Ohren'),
 (10299744, u'Rahzel, RZA'),
 (6864258, u'Jon Richardson'),
 (6878791, u'Young Fresh Fellowslows & the Minus 5'),
 (10299751, u'Ki-ya-Kiss'),
 (6909716, u'Underminded - The Task Of Modern Educator'),
 (10435121, u'Kox-Box'),
 (6918061, u'alexisonfire [wo!]'),
 (1135001, u'dj salinger'),
 (6940391, u"The B52's - Channel Z"),
 (10475396, u'44 Hoes'),
 (10584537, u'orchestral mandeuvres in dark'),
 (10584538, u'Josh Groban (Featuring Angie Stone)'),
 (6945632, u'Savage Garden - Truley, Madly, Deeply'),
 (10584546, u'Nislije'),
 (10584550, u'ONEYA BASSIVITYMIXTAPE'),
 (10584556, u'Grant Green / Osunlade'),
 (10584564, u'J

In [11]:
rawArtistAlias = sc.textFile("artist_alias.txt")

In [12]:
def tokenCheck(line):
    (id1, id2) = line.split('\t',1)
    if not id1:
        return []
    else:
        return [(int(id1), int(id2))]
    
artistAlias = rawArtistAlias.flatMap(tokenCheck).collectAsMap()

In [13]:
from pyspark.mllib.recommendation import *

bArtistAlias = sc.broadcast(artistAlias)

def createRating(line):
    (userID, artistID, count) = line.split(' ')
    finalArtistID = bArtistAlias.value.get(artistID)
    if not finalArtistID:
        finalArtistID = artistID
    return Rating(int(userID), int(finalArtistID), int(count))
    
trainData = rawUserArtistData.map(createRating).cache()

In [14]:
model = ALS.trainImplicit(trainData, rank=10, iterations=5, lambda_=0.01, alpha=1.0)

In [15]:
model.userFeatures().first()

(1000002,
 array('d', [-0.6344444155693054, 0.009747437201440334, -0.23046931624412537, 0.09434975683689117, 0.015139671042561531, -0.7776004672050476, -0.4415060579776764, -0.4445381462574005, 0.3218255341053009, -0.0976143479347229]))

In [16]:
def userFilter(line):
    (user, artist, playcount) = line
    return (int(user) == 1000002)
rawArtistsForUser = rawUserArtistData.map(lambda l: l.split(' ')).filter(userFilter)


In [17]:
def artistToInt(line):
    (user, artist, playcount) = line
    return int(artist)
existingProducts = rawArtistsForUser.map(artistToInt).collect()

In [18]:
rawArtistsForUser.collect()

[[u'1000002', u'1', u'55'],
 [u'1000002', u'1000006', u'33'],
 [u'1000002', u'1000007', u'8'],
 [u'1000002', u'1000009', u'144'],
 [u'1000002', u'1000010', u'314'],
 [u'1000002', u'1000013', u'8'],
 [u'1000002', u'1000014', u'42'],
 [u'1000002', u'1000017', u'69'],
 [u'1000002', u'1000024', u'329'],
 [u'1000002', u'1000025', u'1'],
 [u'1000002', u'1000028', u'17'],
 [u'1000002', u'1000031', u'47'],
 [u'1000002', u'1000033', u'15'],
 [u'1000002', u'1000042', u'1'],
 [u'1000002', u'1000045', u'1'],
 [u'1000002', u'1000054', u'2'],
 [u'1000002', u'1000055', u'25'],
 [u'1000002', u'1000056', u'4'],
 [u'1000002', u'1000059', u'2'],
 [u'1000002', u'1000062', u'71'],
 [u'1000002', u'1000088', u'157'],
 [u'1000002', u'1000099', u'57'],
 [u'1000002', u'1000107', u'4'],
 [u'1000002', u'1000113', u'30'],
 [u'1000002', u'1000123', u'27'],
 [u'1000002', u'1000127', u'53'],
 [u'1000002', u'1000139', u'56'],
 [u'1000002', u'1000140', u'95'],
 [u'1000002', u'1000157', u'2'],
 [u'1000002', u'1000183', 

In [19]:
def filterForArtistID(line):
    (id_, name) = line
    return id_ in existingProducts
print artistByID.filter(filterForArtistID).values().collect()

[u'Kerrang', u'YMC', u'George Duke', u'Firebird', u'Caf\xe9 Del Mar', u'Mallrats', u'Benny Goodman Orchestra', u'Brian Hughes', u'Armand Van Helden', u'Brant Bjork and The Operators', u'Echo & the Bunnymen', u'Joshua Redman', u'Elvis Costello', u'Enigma', u'Eric Clapton', u'Eurythmics', u'The Buddy Rich Big Band', u'Alien Ant Farm', u'Duke Ellington and Johnny Hodg', u'Jeno Jando', u'The Horace Silver Quintet', u'Pimps', u'Benny Goodman & Harry James', u'Steve Cole', u'Oleander-', u'Yellowjackets', u'Skid Row', u'Sublime', u'Nelly Furtado', u'The Stranglers', u'Elastica', u'Eiffel 65', u'Louis Armstrong', u'Eddie Henderson', u'Alice Cooper', u'Annie Lennox', u'The Jimi Hendrix Experience', u'Hothouse Flowers', u'Hole', u'The Hollies', u'Roxy Music', u'Meanwhile, Back In Communist Russia...', u'Mr. Bungle', u'Bill Evans', u'Count Basie', u'ZZ Top', u'Chuck Mangione', u'Bob James', u'Earl Klugh', u'Larry Carlton', u'Brian Culbertson', u'Suede', u'US3 feat Rahsaan & Gerard Presencer', u'J

In [36]:
# function recommendProducts copied from 
# https://spark.apache.org/docs/1.5.1/api/python/_modules/pyspark/mllib/recommendation.html 
def recommendProducts(self, user, num):
        """
        Recommends the top "num" number of products for a given user and returns a list
        of Rating objects sorted by the predicted rating in descending order.
        """
        return list(self.call("recommendProducts", user, num))
recommendations = recommendProducts(model, 1000002, 5)
recommendedProductIDs = map(lambda (userID, productID, rating): productID, recommendations)

In [39]:
recommendations

[Rating(user=1000002, product=1002328, rating=1.5061759640442747),
 Rating(user=1000002, product=2132, rating=1.4054276589494497),
 Rating(user=1000002, product=581, rating=1.3839277749678225),
 Rating(user=1000002, product=1000790, rating=1.3829498349861997),
 Rating(user=1000002, product=1003159, rating=1.3773343404119047)]

In [40]:
recommendedProductIDs

[1002328, 2132, 581, 1000790, 1003159]

In [46]:
def filterForRecommendedIDs(line):
    (artistID, name) = line
    return artistID in recommendedProductIDs
print artistByID.filter(filterForRecommendedIDs).values().collect()

[u'Billy Joel', u'Depeche Mode', u'Garbage', u'Dave Matthews Band', u'The Corrs']


AttributeError: 'function' object has no attribute 'collect'