# Movie recommender on PySpark
> Building a scalable movie recommendation system with PySpark, trained on the MovieLens dataset

- toc: true
- badges: true
- comments: true
- categories: [spark, pyspark, movie]
- image:

## Environment Setup

In [None]:
# Java 8 runtime for Spark; apt output is discarded to keep the cell quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Superseded Spark releases are removed from downloads.apache.org, while
# archive.apache.org keeps every version. -c resumes a partial download and
# does nothing when the archive is already complete, so the cell can be re-run.
!wget -q -c https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
# No -v: listing every extracted file floods the cell output.
!tar -xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os

# Tell Spark where the JDK and the unpacked Spark distribution live.
# NOTE(review): the SPARK_HOME path assumes the archive was unpacked in /content (Colab's working directory).
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [None]:
# Put the PySpark that ships inside SPARK_HOME on sys.path so that
# `import pyspark` works in a fresh kernel.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a session that runs Spark locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# df = spark.createDataFrame([{"hello": "world"} for x in range(1000)])
# df.show(3)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the resolver log out of the notebook.
%pip install -q koalas

In [None]:
# Default Packages (available by Default in Google Colab)
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import random
from pprint import pprint
from matplotlib.lines import Line2D

# Downloaded Packages (not available by Default)
import databricks.koalas

# PySpark Utilities
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import SparkSession, Row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics

# Random Seed
SEED = 1492

# Set-up
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# later releases removed the old names; pick whichever this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')



## Data Loading

In [None]:
# MovieLens archives published by GroupLens: the full dataset and the small
# development subset. Fetched over HTTPS so the download cannot be tampered
# with in transit.
complete_dataset_url = 'https://files.grouplens.org/datasets/movielens/ml-latest.zip'
small_dataset_url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

We also need to define download locations.

In [None]:
import os

# Everything that gets downloaded or extracted lives under ./datasets.
datasets_path = os.path.join('.', 'datasets')
os.makedirs(datasets_path, exist_ok=True)

# Local archive names mirror the file names used on the server.
small_dataset_path, complete_dataset_path = (
    os.path.join(datasets_path, archive_name)
    for archive_name in ('ml-latest-small.zip', 'ml-latest.zip')
)

Now we can proceed with both downloads.

In [None]:
import urllib.request

# urlretrieve saves the response to `filename` and returns a
# (local_filename, headers) tuple for each download.
small_f = urllib.request.urlretrieve(url=small_dataset_url, filename=small_dataset_path)
complete_f = urllib.request.urlretrieve(url=complete_dataset_url, filename=complete_dataset_path)

Both of them are zip files containing a folder with ratings, movies, etc. We need to extract them into their individual folders so we can use each file later on.

In [None]:
import zipfile

# Unpack the small archive first, then the complete one, into the shared
# datasets folder; each zip carries its own top-level directory.
for archive_path in (small_dataset_path, complete_dataset_path):
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(datasets_path)

## Basic example

In [None]:
# Reuse the running session if there is one, otherwise start a new one.
spark = SparkSession\
    .builder\
    .appName("ALSExample")\
    .getOrCreate()

# Sample ratings shipped with the Spark distribution: one
# "userId::movieId::rating::timestamp" record per line.
# os.environ[...] fails with a clear KeyError when SPARK_HOME is not set.
lines = spark.read.text(os.path.join(os.environ['SPARK_HOME'], "data/mllib/als/sample_movielens_ratings.txt")).rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
# Seed the split so the train/test partition is the same on every run.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SEED)

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics:
# rows whose user or movie is missing from the training split are removed from the predictions.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=SEED)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.show()

# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show()

# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
movieSubSetRecs.show()

# Release the session; the next section creates its own.
spark.stop()

## Advanced example

- https://nbviewer.jupyter.org/github/SonalSavaliya/Movie-Recommender-System/blob/master/movie_recommender_using_spark.ipynb
- https://nbviewer.jupyter.org/github/Ansu-John/Movie-Recommender-System/blob/main/Movie%20Recommender%20System.ipynb
- https://nbviewer.jupyter.org/github/assadullah1467/PySpark-Recommendation-Engine/blob/master/Recommender_System_PySpark.ipynb

In [None]:
# Fresh session for the advanced example (the basic example stopped the previous one).
spark = (
    SparkSession.builder
    .appName("Reco-Spark-Example2")
    .getOrCreate()
)

In [None]:
# Ratings from the small MovieLens archive; the header row names the columns
# and Spark infers their types from the values.
ratings_csv_path = os.path.join(datasets_path, 'ml-latest-small', 'ratings.csv')
data = spark.read.csv(ratings_csv_path, header=True, inferSchema=True)

In [None]:
# First rows, inferred column types, and summary statistics of the ratings.
data.show(n=5)
data.printSchema()
ratings_summary = data.describe()
ratings_summary.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+-------+------------------+----------------+------------------+--------------------+
|summary|            userId|         movieId|            rating|           timestamp|
+-------+------------------+----------------+------------------+--------------------+
|  count|            100836|          100836|            100836|              100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|1.2059460873684695E9|
| stddev| 182.6184914635004|35530.9871987003|1.0425292390606342|2.16261035

In [None]:
# Movie metadata (movieId, title, genres) from the same small archive.
movies_csv_path = os.path.join(datasets_path, 'ml-latest-small', 'movies.csv')
titles = spark.read.csv(movies_csv_path, header=True, inferSchema=True)

titles.show(n=5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Attach each movie's title to its ratings. A left join keeps every rating
# even if its movieId has no entry in `titles`.
ratings_with_titles = data.join(titles, on=data.movieId == titles.movieId, how="left")
data = ratings_with_titles.select(data.movieId, titles.title, data.userId, data.rating)
data.show(n=5)

+-------+--------------------+------+------+
|movieId|               title|userId|rating|
+-------+--------------------+------+------+
|      1|    Toy Story (1995)|     1|   4.0|
|      3|Grumpier Old Men ...|     1|   4.0|
|      6|         Heat (1995)|     1|   4.0|
|     47|Seven (a.k.a. Se7...|     1|   5.0|
|     50|Usual Suspects, T...|     1|   5.0|
+-------+--------------------+------+------+
only showing top 5 rows



In [None]:
from pyspark.sql.functions import rand, col, lit

In [None]:
# Ten random rows as a sanity check on the joined data.
data.orderBy(rand()).show(10, False)

# Rating counts per user, then per title: most active first, least active second.
for group_col in ('userId', 'title'):
    for sort_ascending in (False, True):
        counts = data.groupBy(group_col).count()
        counts.orderBy('count', ascending=sort_ascending).show(10, False)

+-------+------------------------------------------------------------------------------+------+------+
|movieId|title                                                                         |userId|rating|
+-------+------------------------------------------------------------------------------+------+------+
|3676   |Eraserhead (1977)                                                             |387   |4.0   |
|1198   |Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)|30    |5.0   |
|34405  |Serenity (2005)                                                               |414   |3.5   |
|52281  |Grindhouse (2007)                                                             |590   |3.0   |
|2278   |Ronin (1998)                                                                  |64    |5.0   |
|1676   |Starship Troopers (1997)                                                      |428   |3.5   |
|48516  |Departed, The (2006)                                            

In [None]:
# The dataset is small, so hold out 20% for testing; the fixed seed makes the
# split repeatable.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Fit the ALS recommender on the training split.
# coldStartStrategy is left at its default ("nan"), so users or movies that are
# absent from the training split get NaN predictions; the cells below show that
# effect and how to handle it.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=5,
    regParam=0.01,
)
model = als.fit(train_data)

In [None]:
# Score the held-out ratings: transform() appends a `prediction` column to the
# test rows. The RMSE itself is computed in a later cell.
predictions = model.transform(test_data)

In [None]:
# Show the first scored rows; note that some predictions come back as NaN.
predictions.show()

+-------+--------------------+------+------+------------+
|movieId|               title|userId|rating|  prediction|
+-------+--------------------+------+------+------------+
|    471|Hudsucker Proxy, ...|   133|   4.0|-0.023575544|
|    471|Hudsucker Proxy, ...|   182|   4.5|   2.7194414|
|    471|Hudsucker Proxy, ...|   387|   3.0|   3.3777792|
|    471|Hudsucker Proxy, ...|   217|   2.0|   1.9056505|
|    471|Hudsucker Proxy, ...|   555|   3.0|   3.0670002|
|    471|Hudsucker Proxy, ...|   176|   5.0|   4.4713492|
|    471|Hudsucker Proxy, ...|   312|   4.0|   4.0964546|
|    471|Hudsucker Proxy, ...|   287|   4.5|  0.77415377|
|    471|Hudsucker Proxy, ...|    32|   3.0|     4.56229|
|    471|Hudsucker Proxy, ...|   373|   5.0| -0.05078125|
|    496|What Happened Was...|   191|   5.0|         NaN|
|    833|High School High ...|   609|   3.0|   1.0141177|
|    833|High School High ...|   492|   4.0|   1.2495432|
|    833|High School High ...|   608|   0.5|   1.1099852|
|   1088|Dirty

In [None]:
# RMSE between the actual ratings and the ALS predictions on the test split.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = nan


A NaN result is due to SPARK-14489: the model cannot predict a rating for users (or items) that have no data in the training split.
A temporary workaround is to exclude rows with NaN predictions or to replace them with a constant, for instance
the overall mean rating. However, to map this to a real business problem, the data scientist, in collaboration with the
business owner, must define what happens when such an event occurs. For example, you can provide no recommendation for
a user until that user has rated a few items. Alternatively, until the user has rated five items, you can use a user-based recommender
system that relies on the user's profile (that's another recommender system to develop).

Replace predicted NaN values with the average rating and evaluate the model:

In [None]:
# Mean rating over the whole ratings frame; used as the stand-in for NaN predictions.
avgRatings = data.agg(F.avg('rating')).first()[0]
print('The average rating in the dataset is: {}'.format(avgRatings))

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
predictions_filled = predictions.na.fill(avgRatings)
print('The root mean squared error for our model is: {}'.format(evaluator.evaluate(predictions_filled)))

The average rating in the dataset is: 3.501556983616962
The root mean squared error for our model is: 1.0846835088076119


Now exclude the rows with NaN predictions instead and evaluate the model again:

In [None]:
# Same metric, but computed only on rows that received a real prediction.
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
predictions_valid = predictions.na.drop()
print ('The root mean squared error for our model is: {}'.format(evaluator.evaluate(predictions_valid)))

The root mean squared error for our model is: 1.0809233240280964


In [None]:
# Held-out (movieId, userId) pairs for user 12, without the true rating.
is_user_12 = test_data['userId'] == 12
single_user = test_data.filter(is_user_12).select('movieId', 'userId')
single_user.show()

# Score those pairs and list them from highest to lowest predicted rating.
recommendations = model.transform(single_user)
recommendations.orderBy('prediction', ascending=False).show()

+-------+------+
|movieId|userId|
+-------+------+
|    357|    12|
|    543|    12|
|    830|    12|
|   2072|    12|
|   2717|    12|
|   4018|    12|
|  40629|    12|
+-------+------+



In [None]:
# Recommend movies to one user from the set of movies they have not rated yet.

# All distinct movies in the ratings data.
unique_movies = data.select('movieId').distinct()
print('Distinct movies: {}'.format(unique_movies.count()))

# Alias 'a' lets the join below refer to this side by qualified name.
a = unique_movies.alias('a')

# The user to recommend for.
user_id = 12

# Movies the active user has already rated, taken from the `data` ratings frame.
watched_movies = data.filter(data['userId'] == user_id).select('movieId').distinct()
print('Movies already rated by user {}: {}'.format(user_id, watched_movies.count()))

# Alias 'b' for the watched side of the join.
b = watched_movies.alias('b')

# Left join: a movie the user has not rated ends up with a null b.movieId.
# Both frames derive from `data`, so the columns are referenced through the
# aliases to keep the two sides of the join unambiguous.
total_movies = a.join(b, col('a.movieId') == col('b.movieId'), how='left')

# Keep only the movies the active user has yet to rate, tagged with the user id.
remaining_movies = total_movies.where(col('b.movieId').isNull()).select(col('a.movieId')).distinct()
remaining_movies = remaining_movies.withColumn("userId", lit(int(user_id)))

# Score the candidates with the ALS model. Movies the model cannot score get a
# NaN prediction, and NaN sorts first in descending order, so those rows are
# dropped before taking the top results.
recommendations = (
    model.transform(remaining_movies)
    .na.drop(subset=['prediction'])
    .orderBy('prediction', ascending=False)
)
recommendations.show(5, False)

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|6654   |12    |NaN       |
|91784  |12    |NaN       |
|1507   |12    |NaN       |
|100068 |12    |NaN       |
|6336   |12    |NaN       |
+-------+------+----------+
only showing top 5 rows

