### Tutorial on Recommender Systems (Collaborative Filtering)

#### For more information, please visit:
#### https://spark.apache.org/docs/2.1.0/ml-collaborative-filtering.html

In [2]:
# Import the libraries
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Start (or reuse) the Spark session that backs this notebook
spark = (
    SparkSession.builder
    .appName("reco_sys")
    .getOrCreate()
)

In [4]:
# Load the MovieLens ratings CSV; the first row is the header and column types are inferred
data = spark.read.csv(
    'movielens_ratings.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the ratings: each row holds a movie ID, a rating and a user ID
data.show(n=20)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [7]:
# Print the column names and types of the loaded ratings
data.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [8]:
# Create train and test data (80% / 20%).
# A fixed seed makes the split -- and every metric computed from it -- reproducible across runs.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

In [9]:
# The algorithm used for collaborative filtering is Alternating Least Squares (ALS).
# ALS starts from randomly initialised factors, so pin the seed to get the same model on every run.
als = ALS(maxIter=5, userCol='userId', itemCol='movieId', ratingCol='rating', seed=42)

In [10]:
# Fit the ALS estimator on the training split to obtain the recommendation model
model = als.fit(train_data)

In [11]:
# Score the held-out ratings with the fitted model and preview the result
predictions = model.transform(test_data)
predictions.show(n=20)

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    26|-0.58979523|
|     31|   1.0|    19| 0.53235674|
|     31|   1.0|     0| 0.99249953|
|     85|   1.0|    12| 0.49733388|
|     85|   1.0|     5|  1.1904873|
|     85|   1.0|    23| 0.07371688|
|     85|   1.0|    25|  1.7314409|
|     85|   1.0|    29|  1.3394027|
|     65|   2.0|     3|  2.0913997|
|     65|   2.0|     5|  2.8285742|
|     65|   2.0|    15|  1.7267818|
|     65|   1.0|     2|  1.3626434|
|     53|   1.0|    12|  0.6836374|
|     53|   3.0|    13|  1.9102029|
|     53|   2.0|    19|  2.0836706|
|     53|   1.0|     7|  2.7030106|
|     78|   1.0|    22|  1.2372901|
|     78|   1.0|    17| 0.88190645|
|     78|   1.0|    24|  1.0178746|
|     78|   1.0|    11|  1.2543551|
+-------+------+------+-----------+
only showing top 20 rows



In [14]:
# Score with RMSE via a regression evaluator, since the predicted ratings are continuous values
reg = RegressionEvaluator(
    metricName='rmse',
    labelCol="rating",
    predictionCol='prediction',
)

In [15]:
# By default ALS predicts NaN for a user or movie that never appeared in the training split,
# and a single NaN prediction turns the whole RMSE into NaN. Drop those rows before scoring
# so the metric reflects only the ratings the model could actually predict.
rmse = reg.evaluate(predictions.na.drop(subset=["prediction"]))
print("RMSE: ", rmse)

RMSE:  1.0197470955620793


In [16]:
# Pick out user 11's rows from the test split, keeping only the ID columns the model scores on
single_user = (
    test_data
    .where(test_data.userId == 11)
    .select("movieId", "userId")
)

In [17]:
# Predict a rating for each (movie, user) pair in single_user with the fitted model
user_pred = model.transform(single_user)

In [18]:
# Rank the user's movies from highest to lowest predicted rating
user_pred.orderBy(user_pred["prediction"].desc()).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|     18|    11| 3.9303148|
|     51|    11| 3.0288196|
|     76|    11| 2.8124604|
|     48|    11| 2.6151693|
|     30|    11| 2.5955782|
|     66|    11| 2.3128262|
|     80|    11| 2.2977905|
|      6|    11| 2.2801278|
|     36|    11| 2.1252913|
|     43|    11| 2.0249476|
|     77|    11| 1.9049665|
|     35|    11| 1.8851352|
|     21|    11| 1.3030387|
|     10|    11| 1.2650675|
|     78|    11| 1.2543551|
+-------+------+----------+

