In [None]:
from google.colab import drive
drive.mount("/gdrive")
%cd "gdrive"

In [None]:
import os
os.chdir("/gdrive/MyDrive/big_data/RecommendationApplication")
!ls

In [3]:
pip install -q pyspark

[K     |████████████████████████████████| 281.3 MB 8.1 kB/s 
[K     |████████████████████████████████| 198 kB 45.9 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [4]:
from pyspark.sql import SparkSession

In [5]:
sparkSession = SparkSession.builder.appName("ALS Book Recommendation System").getOrCreate()

In [8]:
bookDS = sparkSession.read.csv(path="/gdrive/MyDrive/big_data/RecommendationApplication/books.csv", header=True, inferSchema=True)

In [9]:
ratingDS = sparkSession.read.csv(path="/gdrive/MyDrive/big_data/RecommendationApplication/ratings.csv", header=True, inferSchema=True)

# 1.Data Preprocessing

In [10]:
(train_data, test_data) = ratingDS.randomSplit([0.8, 0.2], seed=42)

# 2.CrossValidator


In [14]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [15]:
als = ALS(userCol="user_id", itemCol="book_id", ratingCol="rating", coldStartStrategy="drop", nonnegative=True, implicitPrefs=False)

In [16]:
# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [1, 10]) \
            .addGrid(als.regParam, [.01, .1]) \
            .build()

In [17]:
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(
           labelCol="rating", 
           predictionCol="prediction",
           metricName="rmse") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  4


In [18]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

In [25]:
#Fit cross validator to the 'train' dataset
cv_model = cv.fit(train_data)
#Extract best model from the cv model above
best_model = cv_model.bestModel


In [26]:
print("**Best Model**")
# Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())
# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

**Best Model**
  Rank: 1
  MaxIter: 10
  RegParam: 0.01


# 3.Predictions

In [27]:
# View the predictions
predictions = best_model.transform(test_data)
rmse = evaluator.evaluate(predictions)
print(rmse)

0.8497654221443084


# 4.Recommendations

In [28]:
bookrecommend = best_model.recommendForAllUsers(numItems=5)
bookrecommend.show()



+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      1|[{5207, 4.332561}...|
|      3|[{5207, 1.1885868...|
|      5|[{5207, 5.37021},...|
|      6|[{5207, 5.489295}...|
|      9|[{5207, 4.3143396...|
|     12|[{5207, 4.7692404...|
|     13|[{5207, 5.168634}...|
|     15|[{5207, 4.3484435...|
|     16|[{5207, 4.2852426...|
|     17|[{5207, 5.4291396...|
|     19|[{5207, 3.9199004...|
|     20|[{5207, 5.64612},...|
|     22|[{5207, 4.237532}...|
|     26|[{5207, 4.3521338...|
|     27|[{5207, 4.888169}...|
|     28|[{5207, 4.0965953...|
|     31|[{5207, 4.4243546...|
|     34|[{5207, 3.6871262...|
|     35|[{5207, 3.8013105...|
|     37|[{5207, 5.204016}...|
+-------+--------------------+
only showing top 20 rows



In [29]:
userrecommend = best_model.recommendForAllItems(numUsers=5)
userrecommend.show(5, truncate=False)



+-------+---------------------------------------------------------------------------------------------------+
|book_id|recommendations                                                                                    |
+-------+---------------------------------------------------------------------------------------------------+
|1      |[{43252, 8.554714}, {21791, 7.818618}, {23353, 7.562179}, {52487, 6.9868517}, {38723, 6.9400334}]  |
|3      |[{43252, 6.018967}, {21791, 5.501061}, {23353, 5.320635}, {52487, 4.915843}, {38723, 4.882902}]    |
|5      |[{43252, 7.7380486}, {21791, 7.0722227}, {23353, 6.840265}, {52487, 6.31986}, {38723, 6.277511}]   |
|6      |[{43252, 7.8131094}, {21791, 7.1408253}, {23353, 6.906617}, {52487, 6.3811646}, {38723, 6.3384047}]|
|9      |[{43252, 6.7856402}, {21791, 6.201765}, {23353, 5.998357}, {52487, 5.542004}, {38723, 5.5048676}]  |
+-------+---------------------------------------------------------------------------------------------------+
only showi

In [30]:
userrecommend.first()

Row(book_id=1, recommendations=[Row(user_id=43252, rating=8.55471420288086), Row(user_id=21791, rating=7.818617820739746), Row(user_id=23353, rating=7.562179088592529), Row(user_id=52487, rating=6.986851692199707), Row(user_id=38723, rating=6.940033435821533)])