In [2]:
import sys
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname

from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS

In [3]:
def parseRating(line):
    """
    Parses a rating record in MovieLens format userId::movieId::rating::timestamp .
    """
    fields = line.strip().split("::")
    return long(fields[3]) % 10, (int(fields[0]), int(fields[1]), float(fields[2]))

def parseMovie(line):
    """
    Parses a movie record in MovieLens format movieId::movieTitle .
    """
    fields = line.strip().split("::")
    return int(fields[0]), fields[1]

def loadRatings(ratingsFile):
    """
    Load ratings from file.
    """
    if not isfile(ratingsFile):
        print "File %s does not exist." % ratingsFile
        sys.exit(1)
    f = open(ratingsFile, 'r')
    ratings = filter(lambda r: r[2] > 0, [parseRating(line)[1] for line in f])
    f.close()
    if not ratings:
        print "No ratings provided."
        sys.exit(1)
    else:
        return ratings

def computeRmse(model, data, n):
    """
    Compute RMSE (Root Mean Squared Error).
    """
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \
      .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
      .values()
    return sqrt(predictionsAndRatings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))


In [4]:
# load personal ratings
myRatings = loadRatings("myratings")
myRatingsRDD = sc.parallelize(myRatings, 1)

In [5]:
# ratings is an RDD of (last digit of timestamp, (userId, movieId, rating))
ratings = sc.textFile(join("ratings")).map(parseRating)

# movies is an RDD of (movieId, movieTitle)
movies = dict(sc.textFile(join("movies")).map(parseMovie).collect())

numRatings = ratings.count()
numUsers = ratings.values().map(lambda r: r[0]).distinct().count()
numMovies = ratings.values().map(lambda r: r[1]).distinct().count()

print "Got %d ratings from %d users on %d movies." % (numRatings, numUsers, numMovies)

Got 1000209 ratings from 6040 users on 3706 movies.


In [6]:
# split ratings into train (60%), validation (20%), and test (20%) based on the 
# last digit of the timestamp, add myRatings to train, and cache them

# training, validation, test are all RDDs of (userId, movieId, rating)

numPartitions = 4
training = ratings.filter(lambda x: x[0] < 6) \
  .values() \
  .union(myRatingsRDD) \
  .repartition(numPartitions) \
  .cache()

validation = ratings.filter(lambda x: x[0] >= 6 and x[0] < 8) \
  .values() \
  .repartition(numPartitions) \
  .cache()

test = ratings.filter(lambda x: x[0] >= 8).values().cache()

numTraining = training.count()
numValidation = validation.count()
numTest = test.count()

print "Training: %d, validation: %d, test: %d" % (numTraining, numValidation, numTest)



Training: 602252, validation: 198919, test: 199049


In [8]:
# train models and evaluate them on the validation set

ranks = [8, 12]
lambdas = [0.1, 10.0]
numIters = [10, 20]
bestModel = None
bestValidationRmse = float("inf")
bestRank = 0
bestLambda = -1.0
bestNumIter = -1

for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
    model = ALS.trainImplicit(training, rank, numIter, alpha=0.01)
    validationRmse = computeRmse(model, validation, numValidation)
    print "RMSE (validation) = %f for the model trained with " % validationRmse + \
          "rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, numIter)
    if (validationRmse < bestValidationRmse):
        bestModel = model
        bestValidationRmse = validationRmse
        bestRank = rank
        bestLambda = lmbda
        bestNumIter = numIter

testRmse = computeRmse(bestModel, test, numTest)


RMSE (validation) = 3.553831 for the model trained with rank = 8, lambda = 0.1, and numIter = 10.
RMSE (validation) = 3.550790 for the model trained with rank = 8, lambda = 0.1, and numIter = 20.
RMSE (validation) = 3.553905 for the model trained with rank = 8, lambda = 10.0, and numIter = 10.
RMSE (validation) = 3.550829 for the model trained with rank = 8, lambda = 10.0, and numIter = 20.
RMSE (validation) = 3.543438 for the model trained with rank = 12, lambda = 0.1, and numIter = 10.
RMSE (validation) = 3.541189 for the model trained with rank = 12, lambda = 0.1, and numIter = 20.
RMSE (validation) = 3.544877 for the model trained with rank = 12, lambda = 10.0, and numIter = 10.
RMSE (validation) = 3.540559 for the model trained with rank = 12, lambda = 10.0, and numIter = 20.


In [9]:

# evaluate the best model on the test set
print "The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
  + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse)



The best model was trained with rank = 12 and lambda = 10.0, and numIter = 20, and its RMSE on the test set is 3.538415.


In [10]:
# compare the best model with a naive baseline that always returns the mean rating
meanRating = training.union(validation).map(lambda x: x[2]).mean()
baselineRmse = sqrt(test.map(lambda x: (meanRating - x[2]) ** 2).reduce(add) / numTest)
improvement = (baselineRmse - testRmse) / baselineRmse * 100
print "The best model improves the baseline by %.2f" % (improvement) + "%."



The best model improves the baseline by -217.77%.


In [11]:
# make personalized recommendations

myRatedMovieIds = set([x[1] for x in myRatings])
candidates = sc.parallelize([m for m in movies if m not in myRatedMovieIds])
predictions = bestModel.predictAll(candidates.map(lambda x: (0, x))).collect()
recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:50]

print "Movies recommended for you:"
for i in xrange(len(recommendations)):
    print ("%2d: %s" % (i + 1, movies[recommendations[i][1]])).encode('ascii', 'ignore')


Movies recommended for you:
 1: Jurassic Park (1993)
 2: Forrest Gump (1994)
 3: Groundhog Day (1993)
 4: Terminator 2: Judgment Day (1991)
 5: Matrix, The (1999)
 6: True Lies (1994)
 7: Saving Private Ryan (1998)
 8: Star Wars: Episode I - The Phantom Menace (1999)
 9: Fugitive, The (1993)
10: Total Recall (1990)
11: Braveheart (1995)
12: Speed (1994)
13: Rock, The (1996)
14: Hunt for Red October, The (1990)
15: Star Wars: Episode V - The Empire Strikes Back (1980)
16: American Beauty (1999)
17: Back to the Future (1985)
18: Star Wars: Episode IV - A New Hope (1977)
19: Shakespeare in Love (1998)
20: There's Something About Mary (1998)
21: Face/Off (1997)
22: Sleepless in Seattle (1993)
23: Fifth Element, The (1997)
24: You've Got Mail (1998)
25: Clueless (1995)
26: Titanic (1997)
27: Star Trek: First Contact (1996)
28: Pleasantville (1998)
29: Four Weddings and a Funeral (1994)
30: Back to the Future Part III (1990)
31: My Best Friend's Wedding (1997)
32: Twister (1996)
33: Wedding 

In [None]:

# clean up
sc.stop()