In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print the paths one per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyspark

In [None]:
# Import the ALS recommender and SparkSession, then start (or reuse) a Spark session
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

appName = "Sistem Rekomendasi Film"

# Configure the builder first, then obtain the session from it.
session_builder = SparkSession.builder.master("local").appName(appName)
spark = session_builder.getOrCreate()

# Keep a handle on the underlying SparkContext and display it as the cell output.
sc = spark.sparkContext
sc

In [None]:
# Load the data files into DataFrames and join ratings with movie metadata
csv_options = dict(header=True, inferSchema=True)  # first row is the header; column types are inferred
ratings = spark.read.csv('../input/movielens/ratings.csv', **csv_options)
movies = spark.read.csv('../input/movielens/movies.csv', **csv_options)

# Join on the shared movieId column and preview the first rows.
df = ratings.join(movies, on="movieId")
df.show(n=5)

In [None]:
# Pick the columns that we need
needed_columns = ["userId", "movieId", "rating"]
data = df.select(*needed_columns)
data.show(n=5)

In [None]:
# Split the data with a ratio of 80% training and 20% testing.
# A fixed seed makes the split repeatable, so the row counts and every
# metric computed later are the same on each "Restart & Run All".
splits = data.randomSplit([0.8, 0.2], seed=42)

# Rename the rating column so each split states its role:
# "label" is what the model is trained on, "trueLabel" is the held-out ground truth.
train = splits[0].withColumnRenamed("rating", "label")
test = splits[1].withColumnRenamed("rating", "trueLabel")

train_row = train.count()
test_row = test.count()


print("Total data train is :", train_row)
print("Total data test is:", test_row)

In [None]:
# Modelling
# Fit an ALS collaborative-filtering model on the training split.
# seed fixes the random initialisation of the factors so the fitted model
# (and the RMSE reported later) is reproducible across runs.
# coldStartStrategy is left at its default ("nan"), so users or movies not
# seen during training receive NaN predictions.
als = ALS(
    maxIter=19,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="label",
    seed=42,
)
model = als.fit(train)

In [None]:
# Score the test split and preview a few predictions next to the movie titles
prediction = model.transform(test)

preview_columns = ["userId", "title", "prediction", "trueLabel"]
prediction_preview = prediction.join(movies, "movieId").select(*preview_columns)
prediction_preview.show(n=3, truncate=False)

In [None]:
# Evaluate the predictions against the true ratings using RMSE
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="trueLabel",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(prediction)
print("Root Mean Square Error (RMSE):", rmse)

In [None]:
# The previous result occurs because some predictions are missing (NA),
# so drop the rows without a usable prediction before evaluating again.

# Count once; each count() triggers a full Spark job.
a = prediction.count()
print("Total rows before clean: ", a)

# Keep only rows whose "prediction" column holds a value.
cleanPred = prediction.dropna(how="any", subset=["prediction"])
b = cleanPred.count()
print("Total rows after clean ", b)

# The difference between the two row counts is the number of NA predictions removed.
print("Total NA: ", a-b)

In [None]:
# Re-compute the RMSE, this time on the predictions with NA rows removed
rmse = evaluator.evaluate(dataset=cleanPred)
print("Root Mean Square Error (RMSE):", rmse)

In [None]:
# Generate the top 10 movie recommendations for a small subset of users
user_column = als.getUserCol()  # name of the user id column the model was configured with
users = ratings.select(user_column).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, numItems=10)
userSubsetRecs.show()

In [None]:
# Enable Arrow-based conversion between Spark and pandas DataFrames.
# "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0;
# "spark.sql.execution.arrow.pyspark.enabled" is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
import pandas as pd

import warnings
# Silence every Python warning from this point on.
warnings.simplefilter("ignore")

In [None]:
# Bring the per-user recommendations into pandas and preview the first rows
RecommendbyUser = userSubsetRecs.toPandas()
RecommendbyUser.head(5)

In [None]:
# Generate top 10 user recommendations for a specified set of movies.
# The subset gets its own name so that the `movies` DataFrame loaded from
# movies.csv is not overwritten; earlier cells that join on `movies` to get
# titles keep working when re-run.
movieSubset = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movieSubset, 10)
movieSubSetRecs.show(5)

In [None]:
# Bring the per-movie recommendations into pandas and preview the first rows
RecommendbyMovie = movieSubSetRecs.toPandas()
RecommendbyMovie.head(5)