In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyspark

# Import modules and create Spark session

In [None]:
#import module
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import *
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Build the Spark session used by every cell below (getOrCreate reuses an existing one if present)
appName = "Recommender system in Spark"
spark = SparkSession.builder.appName(appName).getOrCreate()

# Read the files into DataFrames

In [None]:
# Load both CSV files; the first line is the header and column types are inferred from the data
csv_options = {"header": True, "inferSchema": True}
ratings = spark.read.csv('/kaggle/input/movierecommenderdataset/ratings.csv', **csv_options)
movies = spark.read.csv('/kaggle/input/movierecommenderdataset/movies.csv', **csv_options)

# Preview the ratings joined with their movie metadata on the shared "movieId" column
ratings.join(movies, on="movieId").show(n=10)

# Data preparation

In [None]:
# Keep only the three columns the recommender needs
data = ratings.select("userId", "movieId", "rating")

# 75% / 25% train/test split.
# A fixed seed makes the split (and therefore every metric reported below) repeatable
# across "Restart & Run All" for the same input data.
splits = data.randomSplit([0.75, 0.25], seed=42)

# The rating column is renamed to "label", the name the ALS estimator and evaluator below refer to
train = splits[0].withColumnRenamed("rating", "label")
test = splits[1].withColumnRenamed("rating", "label")

# Report the size of each split
train_rows = train.count()
test_rows = test.count()
print ("Number of training data rows:", train_rows)
print ("Number of testing data rows:", test_rows)

In [None]:
# Show the first 10 rows of the training split, then of the testing split
train.show(n=10)
test.show(n=10)

# Define model and train it

In [None]:
from datetime import datetime

# Configure and fit an ALS (Alternating Least Squares) recommender on the training split,
# measuring the wall-clock time from estimator construction to the end of fitting.
start_time = datetime.now()
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="label",
    maxIter=19,
    regParam=0.01,
)
model = als.fit(train)
end_time = datetime.now()

print("Training is done!")
print('Execute time {}'.format(end_time - start_time))

# Predict testing data

In [None]:
# Score the held-out test split with the fitted model
prediction = model.transform(test)

# Show 10 predictions next to the true rating ("label"), with movie titles attached
(
    prediction
    .join(movies, "movieId")
    .select("userId", "title", "prediction", "label")
    .show(n=10, truncate=False)
)

# Evaluate the accuracy of our model

In [None]:
# RegressionEvaluator computes RMSE (Root Mean Square Error) between "label" and "prediction"
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(prediction)
print("Root Mean Square Error (RMSE) = ", rmse)

In [None]:
# Count predictions before and after removing rows whose "prediction" value is missing.
# (The count is computed once and reused: each .count() triggers a full Spark job.)
rootPrediction = prediction.count()
print("Number of original data rows: ", rootPrediction)

# Drop only the rows where the "prediction" column is missing
cleanPrediction = prediction.dropna(how="any", subset=["prediction"])
afterCleanPrediction = cleanPrediction.count()
print("Number of rows after dropping data with missing value: ", afterCleanPrediction)
print("Number of missing data: ", rootPrediction-afterCleanPrediction)

In [None]:
# Re-compute RMSE using only the rows that have a prediction
rmse = evaluator.evaluate(cleanPrediction)
print("Root Mean Square Error (RMSE):", rmse)

# Top 5 recommended movies per user (first 5 users shown)

In [None]:
# Generate the top 5 recommended movies for every user and display the first 5 users.
# The result is not bound to a name and never reused, so it is not cached:
# caching it would only hold executor memory with no way to release it.
model.recommendForAllUsers(5).show(5, False)

# Movie recommendations for a single user

In [None]:
# Pick one user and collect the ids of the movies that user has already rated
anuserId = 39
ratedMovies = [
    row['movieId']
    for row in ratings.where(f.col('userId') == anuserId).select('movieId').collect()
]

# Candidate set: every distinct movie the user has NOT rated, tagged with the user's id
# so that the model can score each (user, movie) pair
movies_to_be_rated = (
    ratings
    .where(~f.col('movieId').isin(ratedMovies))
    .select('movieId')
    .distinct()
    .withColumn('userId', f.lit(anuserId))
)
movies_to_be_rated.orderBy('movieId').show(5)

In [None]:
# Score every candidate movie for the selected user
movie_for_user = model.transform(movies_to_be_rated)

# Discard candidates without a prediction and list the highest predicted ratings first
# (show() with no arguments displays Spark's default number of rows)
(
    movie_for_user
    .dropna(how="any", subset=["prediction"])
    .orderBy('prediction', ascending=False)
    .show()
)

In [None]:
# Generate the top 5 recommended users for each movie and display the first 5 movies
movieRecs = model.recommendForAllItems(5)
movieRecs.show(n=5, truncate=False)

# Improve performance

In [None]:
from datetime import datetime
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# ALS (Alternating Least Squares) estimator to be tuned.
# coldStartStrategy="drop" removes rows for which no prediction can be made,
# so the RMSE computed during tuning is not poisoned by missing predictions.
als = ALS(userCol="userId", itemCol="movieId", ratingCol="label",
          coldStartStrategy="drop", nonnegative=True)

# Hyper-parameter grid: 3 ranks x 3 iteration counts x 3 regularisation values = 27 candidates
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [12, 13, 14])
    .addGrid(als.maxIter, [18, 19, 20])
    .addGrid(als.regParam, [.17, .18, .19])
    .build()
)

# Candidates are compared by RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="label", predictionCol="prediction")

# TrainValidationSplit scores each candidate on ONE random train/validation split
# (this is not k-fold cross validation). The seed fixes that split so the selected
# model is repeatable between runs.
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    seed=42,
)

# Fit every candidate on the training data and time the whole search,
# mirroring the timing reported for the first ALS fit above.
start_time = datetime.now()
model = tvs.fit(train)
end_time = datetime.now()
print("Training is done!")
print('Execute time {}'.format(end_time - start_time))

# Keep the candidate with the best validation metric
best_model = model.bestModel

# Predict testing data with Best model

In [None]:
# Score the held-out test split with the tuned model
prediction = best_model.transform(test)

# Show 10 predictions next to the true rating ("label"), with movie titles attached
(
    prediction
    .join(movies, "movieId")
    .select("userId", "title", "prediction", "label")
    .show(n=10, truncate=False)
)

# Evaluate and See Parameters

In [None]:
# RMSE of the tuned model on the test split
rmse = evaluator.evaluate(prediction)

print("RMSE = " + str(rmse))
print("Best model with parameters")
print("Rank = ", best_model.rank)

# maxIter and regParam are read from the model's parent estimator via the underlying Java object
parent_estimator = best_model._java_obj.parent()
print("MaxIter = ", parent_estimator.getMaxIter())
print("RegParam = ", parent_estimator.getRegParam())

In [None]:
# Top 5 movie recommendations per user from the tuned model; display the first 5 users
user_recs = best_model.recommendForAllUsers(5)
user_recs.show(n=5, truncate=False)