In [1]:
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

In [2]:
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [3]:
!pwd

/Users/kunal/Desktop/MSAN_Coursework/DistrComp/Collaborative-Filtering-/codes


In [6]:
# Data pre-processing in Spark
path = '../data/'
data = sc.textFile(path+'reviews.csv',4).map(lambda x:x.split(','))
header = data.first() #extract header
ratingsRDD = data.filter(lambda row: row != header).map(lambda p: Row(userId=int(p[0]), businessId=int(p[1]),
                                     rating=int(p[2])))
ratings = spark.createDataFrame(ratingsRDD)
train,valid = ratings.randomSplit([0.8,0.2])

# Cached it improve speed
train.cache()
valid.cache()

DataFrame[businessId: bigint, rating: bigint, userId: bigint]

# Model  Training

In [6]:
train.is_cached

True

In [7]:
# coldstartStrategy will ensure that we have no nan value
als = ALS(maxIter=10, regParam=0.001, userCol="userId",nonnegative=True
          ,itemCol="businessId", ratingCol="rating", rank =10)
model = als.fit(train)

In [8]:
pred_trn = model.transform(train)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(pred_trn)
print("RMSE of training data = " + str(rmse))

RMSE of training data = 0.506115137033


In [9]:
pred_vld = model.transform(valid).na.drop("all",subset=['prediction'])
rmse = evaluator.evaluate(pred_vld)
print("RMSE of validation data = " + str(rmse))

RMSE of validation data = 3.93627665809


# Model Tuning

In [None]:
# We need to tune the parameters: maxIter, regParam,rank to achieve better results
cv = CrossValidator().setEstimator(als).setEvaluator(evaluator).setNumFolds(5)
#ParamGridBuilder() – combinations of parameters and their values.
paramGrid = ParamGridBuilder().addGrid(als.regParam,[0.001,0.01,0.1])\
.addGrid(als.rank,[8,10,12]).build()
#setEstimatorParamMaps() takes ParamGridBuilder().
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(train)

In [25]:
print "RMSE : " +  str(evaluator.evaluate(cvmodel.bestModel.transform(valid).na.drop("all",subset=['prediction'])))

RMSE : 4.1125297791


In [18]:
cvmodel.bestModel.transform(valid)

DataFrame[businessId: bigint, rating: bigint, userId: bigint, prediction: float]

# Recommendations

In [26]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

In [27]:
userRecs.take(1)

KeyboardInterrupt: 

In [None]:
movieRecs.take(1)

In [23]:
cvmodel.bestModel.transform(valid).na.drop("all",subset=['prediction']).show()

+----------+------+-------+----------+
|businessId|rating| userId|prediction|
+----------+------+-------+----------+
|       148|     2|1043257| 3.9803114|
|       148|     5| 664361| 15.720967|
|       148|     3|  93196| 1.6424284|
|       148|     3|1094206| 5.3527055|
|       463|     5|1086418| 5.0518208|
|       463|     1| 655357| 0.6180098|
|       463|     5| 481910|  9.026126|
|       471|     5| 346886| 18.823072|
|       471|     5| 720172| 4.8159804|
|       471|     5| 227258| 13.180104|
|       471|     5| 186408|  7.169326|
|       471|     5|1016374| 3.6300378|
|       496|     5| 847817| 15.734599|
|       833|     5| 163506|  5.258196|
|       833|     5| 628952| 5.1324935|
|      1342|     3| 970742|  4.250959|
|      1342|     5| 637151|  2.458761|
|      1342|     5| 984632| 4.0028057|
|      1580|     4|1096478| 3.7303517|
|      1580|     4|1112506| 6.2975397|
+----------+------+-------+----------+
only showing top 20 rows

