In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf,col,when
from IPython.display import display, Image


# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("book")
    .getOrCreate()
)

In [None]:
# Load ratings.csv: the first line is the header and column types are inferred.
rating_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../input/databooks/dataset/ratings.csv')
)
rating_df.printSchema()

In [None]:
# Preview the first five rating rows.
rating_df.show(n=5)

In [None]:
# Print summary statistics for the columns of the ratings table.
(
    rating_df
    .summary()
    .show()
)

In [None]:
# Load books.csv: the first line is the header and column types are inferred.
books_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../input/databooks/dataset/books.csv')
)
books_df.printSchema()

In [None]:
# Preview a single row of the books table.
books_df.show(n=1)

In [None]:
# Build the collaborative-filtering (ALS) recommender.
# A fixed seed is passed to both the split and ALS so that the model and the
# reported RMSE are reproducible across "Restart & Run All".
SEED = 42
als = ALS(
    maxIter=10,
    regParam=0.1,
    rank=10,
    userCol="user_id",
    itemCol="book_id",
    ratingCol="rating",
    seed=SEED,
)
# Split the ratings: 80% for training, 20% for validation.
training_df, validation_df = rating_df.randomSplit([0.8, 0.2], seed=SEED)
# Fit the model on the training split and score the validation split.
model = als.fit(training_df)
predictions = model.transform(validation_df)
# With ALS's default cold-start strategy ("nan"), users/books that are missing
# from the training split get a NaN prediction. Drop those rows with an
# explicit isnan() test instead of comparing against np.nan, which only works
# because Spark treats NaN as equal to itself.
new_predictions = predictions.filter(~f.isnan(col("prediction")))
# Evaluate accuracy with RegressionEvaluator (RMSE of prediction vs. rating).
evalutor = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

rmse_score = evalutor.evaluate(new_predictions)
print("Root Mean Square Errr Value :", str(rmse_score))

In [None]:
# Preview the first ten rows of the validation-set predictions.
predictions.show(n=10)

In [None]:
# predictions.join(books_df,"book_id").select("user_id","title","prediction").show(5)

In [None]:
# Pull the two factor matrices out of the fitted model and inspect the latent
# vectors of one example user and one example book.
userFactors = model.userFactors
itemFactors = model.itemFactors
userFactors.sort('id').show(5, False)  # user factor matrix
itemFactors.sort('id').show(5, False)  # item factor matrix

# IDs of the example pair; defined once so the filters and the printed
# message cannot drift apart.
example_user_id = 2171
example_book_id = 148


def _latent_vector(factors, entity_id, kind):
    """Return the `features` value stored in `factors` for the row with this id.

    Raises ValueError (instead of an opaque IndexError) when the id has no
    row in the factor table.
    """
    row = factors.filter(f.col('id') == entity_id).select('features').first()
    if row is None:
        raise ValueError(f'No latent factors found for {kind} {entity_id}')
    return row['features']


userFeature = _latent_vector(userFactors, example_user_id, 'user')
itemFeature = _latent_vector(itemFactors, example_book_id, 'book')
print(userFeature)
print(itemFeature)
# The predicted rating is the dot product of the two latent vectors.
print(f'Predicted rating of user {example_user_id} for book {example_book_id}: '
      + str(np.dot(userFeature, itemFeature)))

# Top 5 recommended books for each user

In [None]:
# Ask the model for five recommendations for every user; the result is cached
# because it is used twice below (show and printSchema).
userRecomments = model.recommendForAllUsers(numItems=5).cache()
userRecomments.show(n=5, truncate=False)
userRecomments.printSchema()

In [None]:
# Rows of `predictions` for user 2171, joined to the books table on book_id
# so that each prediction carries the book's title and cover image URL.
one_user = (
    predictions
    .where(col("user_id") == 2171)
    .join(books_df, on="book_id")
    .select("book_id", "title", 'image_url', 'prediction')
)
one_user.count()

In [None]:
# Print this user's rows (show() defaults spelled out: 20 rows, truncated cells).
one_user.show(n=20, truncate=True)

In [None]:
# For the first five rows of `one_user`, print the title and render the cover
# image from its URL.
for book in one_user.limit(5).collect():
    print(book["title"])
    display(Image(url=book["image_url"]))

# Top unrated books for a user

In [None]:
user_ID = 2171
# Books this user has already rated, collected to the driver as a plain list.
# (Uses user_ID rather than repeating the literal, so changing the user above
# changes the whole cell.)
ratedBooks = (
    rating_df
    .filter(f.col('user_id') == user_ID)
    .select('book_id')
    .rdd.flatMap(lambda x: x)
    .collect()
)
# Candidate books: every book in the ratings data that the user has NOT rated.
# The `~` negation is essential; without it the filter keeps exactly the
# books the user already rated.
book_to_be_rated = (
    rating_df
    .filter(~f.col('book_id').isin(ratedBooks))
    .select('book_id').distinct()
    .withColumn('user_id', f.lit(user_ID))
)
book_to_be_rated.sort('book_id').show(5)
# Score the candidate (user, book) pairs with the trained model.
user_book_predictions = model.transform(book_to_be_rated)
# Show the five highest predictions. The keyword must be spelled `ascending`:
# PySpark silently ignores unknown keywords here, so a misspelling yields an
# ascending sort. NaN predictions are dropped first because Spark orders NaN
# above every other number, which would put them at the top of a descending sort.
(
    user_book_predictions
    .filter(~f.isnan(f.col('prediction')))
    .orderBy('prediction', ascending=False)
    .show(5)
)