In [1]:
from __future__ import print_function

from pyspark import SparkContext

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname


In [2]:
if __name__ == "__main__":
    #get SparkContext instance
    sc = SparkContext.getOrCreate()
    
    #read input data from Book Crossings dataset
    data = sc.textFile("C:\Users\poonam\Downloads\RecommenderSysDataset\Book-Crossings\BX-CSV-Dump\BookCrossings.txt")

    bookCrossings = data.map(lambda l: l.split('\t'))

    ratings = bookCrossings.map(lambda x: Rating(int(x[0]),\
    int(x[1]), float(x[2])))

    #devide input data into training and test set
    training, test = ratings.randomSplit([0.7,0.3])

    training.cache()
    test.cache()

In [3]:
    training.count()

733769

In [4]:
    test.count()

314805

In [5]:
    #set range of parameters for training the model
    ranks       = [5, 10]
    lambdas     = [0.1, .01]
    numIters    = [5, 10]
    bestModel   = None
    bestValidationRmse = float("inf")
    bestRank    = 0
    bestLambda  = -1.0
    bestNumIter = -1

In [6]:
    #Train the model on training set and calculate RMSE
    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        model = ALS.train(training, rank, numIter, lmbda)
        testdata = training.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
                          
        predictionsAndRatings = training.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = predictionsAndRatings.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        validationRmse = sqrt(MSE)

        print(rank, lmbda, numIter, validationRmse)
        
        if(validationRmse < bestValidationRmse):
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter

5 0.1 5 1.66676025334
5 0.1 10 1.47697443255
5 0.01 5 1.90262962823
5 0.01 10 1.51635047315
10 0.1 5 1.16068729316
10 0.1 10 1.02313997399
10 0.01 5 1.18574534257
10 0.01 10 1.02124515909


In [7]:
print("Rank = ",bestRank,", Lambda = ",bestLambda,", Iterations =  ",bestNumIter,", RMSE = ",bestValidationRmse)

print("RMSE for train:\t\t%.2f" % bestValidationRmse)

Rank =  10 , Lambda =  0.01 , Iterations =   10 , RMSE =  1.02124515909
RMSE for train:		1.02


In [8]:
bestModel = ALS.train(training, bestRank, bestNumIter, bestLambda)

In [9]:
#make predictions on test set and calculate RMSE
predictTestdata = test.map(lambda p: (p[0], p[1]))
testPredictions = model.predictAll(predictTestdata).map(lambda r: ((r[0], r[1]), r[2]))
predictionsAndRatingsOnTest   = test.map(lambda r: ((r[0], r[1]), r[2])).join(testPredictions)
testMSE = predictionsAndRatingsOnTest.map(lambda r: (r[1][0] - r[1][1])**2).mean()
testValidationRmse = sqrt(MSE)

In [10]:
print("RMSE for test:\t%.2f" % testValidationRmse)


RMSE for test:	1.02


In [11]:
sc.stop()