In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import pandas as pd
import os

In [2]:
# Starting a spark session locally
spark = SparkSession \
    .builder \
    .appName("als-reco") \
    .getOrCreate()

In [3]:
movies_file_path = "data/movies_metadata.csv"
df_movies = spark.read.csv(movies_file_path, header=True).select('id', 'title') \
            .withColumn('id', col('id').cast('int'))

df_movies = df_movies.filter(df_movies['id'].isNotNull())

In [4]:
# There are movie title which are corrupted. Let's find them
df_movies.filter(df_movies['title'].rlike("\[*\]")).toPandas().head()

Unnamed: 0,id,title
0,31357,"[{'iso_639_1': 'en', 'name': 'English'}]"
1,11443,"[{'iso_639_1': 'en', 'name': 'English'}]"
2,807,"[{'iso_639_1': 'en', 'name': 'English'}]"
3,32646,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'..."
4,139408,"[{'iso_639_1': 'hu', 'name': 'Magyar'}, {'iso_..."


In [5]:
# Filtering out the corrupted rows
df_movies = df_movies.filter(~df_movies['title'].rlike("\[*\]"))

In [6]:
def count_nans(df):
    return df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).toPandas()
    
def count_nulls(df):
    return df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).toPandas()

In [7]:
count_nans(df_movies)

Unnamed: 0,id,title
0,0,0


In [8]:
count_nulls(df_movies)

Unnamed: 0,id,title
0,0,0


In [9]:
# There are a few duplicate ids and titles
# movie_met.dropDuplicates(['id']).groupBy('id').count().orderBy('count', ascending=False).toPandas()
df_movies = df_movies.dropDuplicates(['id'])

In [10]:
ratings_file_path = "data/ratings_small.csv"
df_ratings = spark.read.csv(ratings_file_path, header=True).select('userId', 'movieId', 'rating') \
            .withColumn('userId', col('userId').cast('int')) \
            .withColumn('movieId', col('movieId').cast('int')) \
            .withColumn('rating', col('rating').cast('int')) \

df_ratings = df_ratings.filter(df_ratings['userId'].isNotNull())
df_ratings = df_ratings.filter(df_ratings['movieId'].isNotNull())
df_ratings = df_ratings.filter(df_ratings['rating'].isNotNull())

In [11]:
df_ratings.limit(5).toPandas()

Unnamed: 0,userId,movieId,rating
0,1,31,2
1,1,1029,3
2,1,1061,3
3,1,1129,2
4,1,1172,4


In [12]:
(df_training, df_test) = df_ratings.randomSplit([0.8, 0.2])

In [13]:
df_training.limit(5).toPandas()

Unnamed: 0,userId,movieId,rating
0,1,31,2
1,1,1029,3
2,1,1061,3
3,1,1129,2
4,1,1172,4


In [28]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

In [29]:
model = als.fit(df_training)

In [30]:
predictions = model.transform(df_test)

In [31]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [18]:
predictions.toPandas().head()

Unnamed: 0,userId,movieId,rating,prediction
0,440,471,3,2.724551
1,292,471,3,3.933341
2,306,471,3,2.398252
3,537,471,5,4.69733
4,241,471,4,3.135671


In [32]:
rmse = evaluator.evaluate(predictions)

In [33]:
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.1617932978547594


In [21]:
def vary_max_iter(max_iter_list):
    rmse_dict = dict.fromkeys(max_iter_list)
    for max_iter in max_iter_list:
        print(f"Max iter: {max_iter}")
        als = ALS(rank=10, maxIter=max_iter, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
        model = als.fit(df_training)
        predictions = model.transform(df_test)
        rmse_dict[max_iter] = evaluator.evaluate(predictions)
    return rmse_dict

In [26]:
def vary_rank(rank_list):
    rmse_dict = dict.fromkeys(rank_list)
    for rank in rank_list:
        print(f"Rank: {rank}")
        als = ALS(rank=rank, maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
        model = als.fit(df_training)
        predictions = model.transform(df_test)
        rmse_dict[rank] = evaluator.evaluate(predictions)
    return rmse_dict

In [37]:
def vary_reg_param(reg_param_list):
    rmse_dict = dict.fromkeys(reg_param_list)
    for reg_param in reg_param_list:
        print(f"Reg param: {reg_param}")
        als = ALS(rank=10, maxIter=5, regParam=reg_param, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
        model = als.fit(df_training)
        predictions = model.transform(df_test)
        rmse_dict[reg_param] = evaluator.evaluate(predictions)
    return rmse_dict

In [22]:
max_iter_list = [1,5,10,20]
vary_max_iter(max_iter_list)

Max iter: 1
Max iter: 5
Max iter: 10
Max iter: 15
Max iter: 20


In [27]:
rank_list = [5,10,20,30]
vary_rank(rank_list)

Rank: 5
Rank: 10
Rank: 20
Rank: 30


{5: 1.0748606030599175,
 10: 1.1617932978547596,
 20: 1.2707469923070824,
 30: 1.3222966962449554}

In [38]:
reg_param_list = [0.001, 0.01, 0.1, 0.5, 0.9]
vary_reg_param(reg_param_list)

Reg param: 0.001
Reg param: 0.01
Reg param: 0.1
Reg param: 0.5
Reg param: 0.9


{0.001: 1.3511604397441932,
 0.01: 1.1617932978547594,
 0.1: 0.9479697106211431,
 0.5: 1.0441125386217338,
 0.9: 1.2839697740087654}