In [1]:
import pyspark
from pyspark import SparkFiles
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, col

# Seed shared by the random train/test assignment and by TrainValidationSplit.
seed = 100
# getOrCreate() reuses a SparkContext that is already running, so re-running
# this cell in a live kernel no longer fails the way a second SparkContext()
# call does.
sc = SparkContext.getOrCreate()
# Absolute location of the ratings CSV on the notebook host.
# NOTE(review): path is machine-specific; adjust when running elsewhere.
filePath = "/home/jovyan/work/Personalization/ml-20m/ratings.csv"

In [2]:
# Load the ratings CSV into a Spark DataFrame: the header row supplies the
# column names and the column types are inferred from the data.
sc.addFile(filePath)
sqlContext = SQLContext(sc)
ratings_csv = SparkFiles.get("ratings.csv")
df = sqlContext.read.csv(ratings_csv, header=True, inferSchema=True)

# Expose the raw frame to SQL as the temporary view "df", then keep only the
# three columns used downstream, renamed to user / item / rating.
df.createOrReplaceTempView("df")
df = sqlContext.sql('''
    SELECT 
        userId AS user, 
        movieId AS item,
        rating
    FROM df
''')

In [3]:
# Attach a seeded uniform random draw to every rating; the train/test split
# is derived from this column.
df = df.withColumn('TrainTest', rand(seed=seed))
df_shape = (df.count(), len(df.columns))
print("(row, col): ", df_shape)

(row, col):  (20000263, 4)


In [4]:
# Rows whose random draw is below 0.75 go to training, the rest to test; the
# helper column is dropped from both frames afterwards.
split_key = col('TrainTest')
dftrain = df.filter(split_key < 0.75).drop('TrainTest')
dftest = df.filter(split_key >= 0.75).drop('TrainTest')

In [5]:
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() also emitted a stray "None" line.
dftrain.printSchema()
print("(row, col): ", (dftrain.count(), len(dftrain.columns)))
dftrain.show(n=5)

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: double (nullable = true)

None
(row, col):  (15001032, 3)
+----+----+------+
|user|item|rating|
+----+----+------+
|   1|   2|   3.5|
|   1|  29|   3.5|
|   1|  32|   3.5|
|   1|  47|   3.5|
|   1| 112|   3.5|
+----+----+------+
only showing top 5 rows



In [6]:
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() also emitted a stray "None" line.
dftest.printSchema()
print("(row, col): ", (dftest.count(), len(dftest.columns)))
dftest.show(n=5)

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: double (nullable = true)

None
(row, col):  (4999231, 3)
+----+----+------+
|user|item|rating|
+----+----+------+
|   1|  50|   3.5|
|   1| 223|   4.0|
|   1| 253|   4.0|
|   1| 260|   4.0|
|   1| 541|   4.0|
+----+----+------+
only showing top 5 rows



In [10]:
import os
import math
import operator
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [11]:
# ALS estimator with non-negative factors. No column names are passed, so it
# uses its defaults ("user", "item", "rating"). coldStartStrategy="drop"
# removes rows whose prediction would be NaN, so the evaluator never sees NaN.
# NOTE(review): per the ALS documentation, checkpointInterval is ignored unless
# a checkpoint directory has been set on the SparkContext; no
# sc.setCheckpointDir(...) call is visible in this notebook -- confirm intent.
als = ALS(nonnegative=True, checkpointInterval=3, coldStartStrategy="drop")

# 3 x 3 grid over factor rank and regularisation strength (9 candidates).
# regParam is a float Param, so its candidates are written as floats rather
# than ints; this avoids integer-to-double conversion problems on PySpark
# versions that do not convert grid values themselves.
paramGrid = (
    ParamGridBuilder()
    .addGrid(als.rank, [5, 30, 70])
    .addGrid(als.regParam, [0.1, 1.0, 10.0])
    .build()
)

# RMSE between the "rating" column and the evaluator's default prediction column.
rmse = RegressionEvaluator(metricName="rmse", labelCol="rating")

In [12]:
# Hold-out tuning over paramGrid: every candidate is fitted on a random 66 %
# of the frame given to fit() and scored with the rmse evaluator on the rest.
# Applied to the 75 % training frame this yields roughly 50 % train and
# 25 % validation, next to the separate 25 % test frame.
tvs = TrainValidationSplit(
    estimator=als,
    evaluator=rmse,
    estimatorParamMaps=paramGrid,
    trainRatio=0.66,
    parallelism=3,  # number of candidates evaluated in parallel
    seed=seed,
)

In [13]:
# Run the train/validation search defined by `tvs` on the training frame; the
# returned object is the fitted TrainValidationSplit model used for all
# predictions below.
model = tvs.fit(dftrain)

In [14]:
# Persist the fitted model under the current working directory.
# write().overwrite() replaces an earlier save at the same path, so re-running
# this cell no longer fails because the target directory already exists.
model_path = os.path.join(os.getcwd(), 'ALS_model2')
model.write().overwrite().save(model_path)

In [15]:
# Predictions for the training frame itself; preview the first 20 rows.
trainPred = model.transform(dftrain)
trainPred.show(n=20)

+------+----+------+----------+
|  user|item|rating|prediction|
+------+----+------+----------+
| 74757| 148|   3.5| 2.7267754|
| 96393| 148|   3.0| 2.5813208|
| 97435| 148|   4.0| 3.0522318|
|136222| 148|   2.0|  2.585815|
|137949| 148|   4.0| 3.1322215|
| 19067| 148|   2.0| 1.5319937|
| 87301| 148|   2.0|  2.642569|
| 88527| 148|   2.0| 2.2116013|
|108726| 148|   3.0| 2.6959863|
|123246| 148|   3.0| 3.0632768|
| 20132| 148|   3.0| 2.6023612|
| 22884| 148|   3.0| 2.5250168|
| 96427| 148|   3.0|  3.005178|
| 10303| 148|   3.0|  2.884504|
| 36821| 148|   4.0| 2.8097978|
| 44979| 148|   3.0| 2.9125397|
| 81218| 148|   1.0| 2.3467267|
| 91782| 148|   3.0| 2.9503355|
| 32882| 148|   3.0| 2.5714395|
| 96884| 148|   4.0| 2.6100273|
+------+----+------+----------+
only showing top 20 rows



In [16]:
# Score the held-out test frame and preview five of the predictions.
testPred = model.transform(dftest)
testPred.show(n=5)

+-----+----+------+----------+
| user|item|rating|prediction|
+-----+----+------+----------+
|53338| 148|   1.0| 2.5473833|
|22684| 148|   4.0|  2.804213|
|92852| 148|   3.0| 2.4693103|
|83090| 148|   2.0| 1.9907694|
|13170| 148|   3.0| 1.0899945|
+-----+----+------+----------+
only showing top 5 rows



In [17]:
# RMSE of the test-set predictions against the "rating" column, computed with
# the same evaluator that was used during tuning.
rmse.evaluate(testPred)

0.8144994635502704