In [63]:
Recommender System Background:

*The two most common types of recommender systems are Content-Based and Collaborative Filtering (CF)
-Collaborative Filtering 
     based on "wisdom of the crowd"
     more common (better results)
     spark.ml currently supports model-based collaborative filtering
     spark.ml uses the alternating least squares (ALS) algorithm
        users-described-small set of latent factors-used to predict missing entries
        Matrix Factorization approach to implement a recommendation algorithm
     spark.ml uses the alternating least squares (ALS) algorithm to learn these latent factors

-Content-based recommender systems based on distance/similarity

In [None]:
from pyspark.sql import SparkSession

In [64]:
spark = SparkSession.builder.appName('rec').getOrCreate()

In [29]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [65]:
data = spark.read.csv('spark_master/Spark_for_Machine_Learning/Recommender_Systems/movielens_ratings.csv',inferSchema=True,header=True)

In [66]:
data.head()

Row(movieId=2, rating=3.0, userId=0)

In [67]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [68]:
#Perform a split to evaluate how well our model performed
# Smaller dataset so we will use 0.8 / 0.2
(training, test) = data.randomSplit([.8,.2])

In [69]:
training.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1212|              1212|              1212|
|   mean| 48.96699669966997|1.7863036303630364|14.515676567656765|
| stddev|28.946042800015622|1.1978692341475676|  8.57081552319438|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [73]:
# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5,regParam=0.01,userCol='movieId', itemCol='movieId', ratingCol='rating' )
model = als.fit(training)

In [74]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [75]:
predictions.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     31|   1.0|    26| 1.6298599|
|     31|   1.0|    29| 1.6298599|
|     31|   2.0|    25| 1.6298599|
|     31|   3.0|     8| 1.6298599|
|     85|   1.0|    15| 1.8609217|
|     85|   3.0|     6| 1.8609217|
|     85|   4.0|     7| 1.8609217|
|     65|   1.0|     2| 1.3254828|
|     65|   1.0|    16| 1.3254828|
|     65|   1.0|    28| 1.3254828|
|     65|   2.0|    15| 1.3254828|
|     65|   5.0|    23| 1.3254828|
|     53|   1.0|     6| 2.3590553|
|     78|   1.0|     1| 0.9899168|
|     78|   1.0|    11| 0.9899168|
|     78|   1.0|    24| 0.9899168|
|     78|   1.0|    27| 0.9899168|
|     34|   1.0|    14| 1.8274894|
|     34|   1.0|    16| 1.8274894|
|     34|   1.0|    17| 1.8274894|
+-------+------+------+----------+
only showing top 20 rows



In [76]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

In [77]:
rmse = evaluator.evaluate(predictions)

In [78]:
#The RMSE describes the error in terms of the stars rating column.
print('RMSE:')
print(rmse)

RMSE:
1.1259863097823009


## Result: This RMSE is high at 1.1 considering that the range of scores is from one to five.  This was to be expected due to the small size of the data.

In [80]:
# User had 9 ratings in the test data set 
# Realistically this should be some sort of hold out set!
single_user = test.filter(test['userId']==11).select(['movieId','userId'])

In [81]:
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      6|    11|
|      9|    11|
|     25|    11|
|     32|    11|
|     43|    11|
|     78|    11|
|     80|    11|
|     94|    11|
|     97|    11|
+-------+------+



In [82]:
recommendations = model.transform(single_user)

In [85]:
recommendations.orderBy('prediction', ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|     32|    11| 2.7738624|
|     94|    11| 2.3525496|
|     25|    11| 1.9110643|
|     80|    11| 1.7438934|
|     43|    11| 1.6860056|
|      9|    11| 1.5315741|
|      6|    11| 1.3924892|
|     97|    11|  1.213737|
|     78|    11| 0.9899168|
+-------+------+----------+



## Result: For UserId=11, movieID 32 and 94 would be recommended to him or her.