In [52]:
from __future__ import print_function

import sys
import os
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
import pandas as pd


conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
#sc.setCheckpointDir('checkpoint/')
sqlContext = SQLContext(sc)

USER_ID = 0

In [51]:
sc.stop()

In [53]:
def howFarAreWe(model, against, sizeAgainst):
  # Ignore the rating column  
  againstNoRatings = against.map(lambda x: (int(x[0]), int(x[1])) )

  # Keep the rating to compare against
  againstWiRatings = against.map(lambda x: ((int(x[0]),int(x[1])), int(x[2])) )

  # Make a prediction and map it for later comparison
  # The map has to be ((user,product), rating) not ((product,user), rating)
  predictions = model.predictAll(againstNoRatings).map(lambda p: ( (p[0],p[1]), p[2]) )

  # Returns the pairs (prediction, rating)
  predictionsAndRatings = predictions.join(againstWiRatings).values()

  # Returns the variance
  return sqrt(predictionsAndRatings.map(lambda s: (s[0] - s[1]) ** 2).reduce(add) / float(sizeAgainst))

In [54]:
url ='/home/bella/Downloads/ratings.csv'

In [62]:
df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").load(url)

In [63]:
df.printSchema()

root
 |-- userid: string (nullable = true)
 |-- jobid: string (nullable = true)
 |-- rating: string (nullable = true)



In [64]:
rddUserRatings = df.filter(df.userid == 0).rdd
print(rddUserRatings.count())

9


In [66]:
# Split the data in 3 different sets : training, validating, testing
# 60% 20% 20%
rddRates = df.rdd
rddTraining, rddValidating, rddTesting = rddRates.randomSplit([6,2,2])

#Add user ratings in the training model
rddTraining.union(rddUserRatings)
nbValidating = rddValidating.count()
nbTesting    = rddTesting.count()

print("Training: %d, validation: %d, test: %d" % (rddTraining.count(), nbValidating, rddTesting.count()))

Training: 724, validation: 234, test: 228


In [67]:
ranks  = [5,10,15,20]
reguls = [0.1, 1,10]
iters  = [5,10,20]

finalModel = None
finalRank  = 0
finalRegul = float(0)
finalIter  = -1
finalDist   = float(100)

In [68]:
for cRank, cRegul, cIter in itertools.product(ranks, reguls, iters):

  model = ALS.train(rddTraining, cRank, cIter, float(cRegul))
  dist = howFarAreWe(model, rddValidating, nbValidating)
  if dist < finalDist:
    print("Best so far:%f" % dist)
    finalModel = model
    finalRank  = cRank
    finalRegul = cRegul
    finalIter  = cIter
    finalDist  = dist
#[END train_model]

print("Rank %i" % finalRank) 
print("Regul %f" % finalRegul) 
print("Iter %i" % finalIter)  
print("Dist %f" % finalDist)

Best so far:1.227620
Best so far:1.225185
Best so far:1.166659
Rank 5
Regul 0.100000
Iter 20
Dist 1.166659
