In [2]:
from __future__ import print_function

from pyspark import SparkContext

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname


In [16]:
if __name__ == "__main__":
    #get SparkContext instance
    sc = SparkContext.getOrCreate()
    
    #read input data from Book Crossings dataset
    data = sc.textFile("C:\Users\poonam\Downloads\RecommenderSysDataset\Book-Crossings\BX-CSV-Dump\BookCrossings.txt")

    bookCrossings = data.map(lambda l: l.split('\t'))

    ratings = bookCrossings.map(lambda x: Rating(int(x[0]),\
    int(x[1]), float(x[2])))

    training, test = ratings.randomSplit([0.7,0.3])

    training.cache()
    test.cache()

In [17]:
    training.count()

733643

In [18]:
    test.count()

314931

In [19]:
    ranks       = [5, 10]
    lambdas     = [0.1, .01]
    numIters    = [5, 10]
    bestModel   = None
    bestValidationRmse = float("inf")
    bestRank    = 0
    bestLambda  = -1.0
    bestNumIter = -1

In [21]:
    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        model = ALS.train(training, rank, numIter, lmbda)
        testdata = training.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
                          
        predictionsAndRatings = training.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = predictionsAndRatings.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        validationRmse = sqrt(MSE)

        print(rank, lmbda, numIter, validationRmse)
        
        if(validationRmse < bestValidationRmse):
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter

5 0.1 5 1.68662765582
5 0.1 10 1.46425588388
5 0.01 5 1.90009231849
5 0.01 10 1.66498985573
10 0.1 5 1.17131534927
10 0.1 10 1.02733104301
10 0.01 5 1.19848585841
10 0.01 10 0.999651332084


In [22]:
print("Rank = ",bestRank,", Lambda = ",bestLambda,", Iterations =  ",bestNumIter,", RMSE = ",bestValidationRmse)

print("ALS on train:\t\t%.2f" % bestValidationRmse)

Rank =  10 , Lambda =  0.01 , Iterations =   10 , RMSE =  0.999651332084
ALS on train:		1.00


In [23]:
bestModel = ALS.train(training, bestRank, bestNumIter, bestLambda)

In [24]:
predictTestdata = test.map(lambda p: (p[0], p[1]))
testPredictions = model.predictAll(predictTestdata).map(lambda r: ((r[0], r[1]), r[2]))
predictionsAndRatingsOnTest   = test.map(lambda r: ((r[0], r[1]), r[2])).join(testPredictions)
testMSE = predictionsAndRatingsOnTest.map(lambda r: (r[1][0] - r[1][1])**2).mean()
testValidationRmse = sqrt(MSE)

In [25]:
print("ALS on test:\t%.2f" % testValidationRmse)


ALS on test:	1.00


In [26]:
sc.stop()