# Setup

In [3]:
pip install -q pyspark

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 1.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
  Building wheel for pyspark (setup.py) ... done


In [4]:
from pyspark.sql import SparkSession

from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Load data

In [5]:
# Start (or reuse) the Spark session that the rest of the notebook reads from.
spark = (
    SparkSession.builder
    .appName("ALS Book Recommendation System")
    .getOrCreate()
)

In [6]:
# Book metadata: the first CSV row supplies the column names and Spark infers
# each column's type from the data.
bookDF = spark.read.csv(
    "./data/books.csv",
    inferSchema=True,
    header=True,
)
# Preview the first five rows.
bookDF.show(n=5)

+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------+--------------------+
| id|book_id|best_book_id|work_id|books_count|     isbn|          isbn13|             authors|original_publication_year|      original_title|               title|language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|           image_url|     small_image_url|
+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------

In [7]:
# Ratings table (book_id, user_id, rating): header row gives the column names,
# types are inferred from the data.
ratingDF = spark.read.csv(
    "./data/ratings.csv",
    inferSchema=True,
    header=True,
)
# Preview the first five rows.
ratingDF.show(n=5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



# Data preprocessing

In [8]:
# Random 80/20 split of the ratings into training and test sets, with a fixed
# seed for the sampling.
train_data, test_data = ratingDF.randomSplit(weights=[0.8, 0.2], seed=42)

# Create model

In [9]:
# ALS estimator over the (user_id, book_id, rating) columns, treating ratings
# as explicit feedback with non-negative factors.
# coldStartStrategy="drop" removes prediction rows that come out as NaN, so
# later metrics are not poisoned by users/books the model has no factors for.
als = ALS(
    userCol="user_id",
    itemCol="book_id",
    ratingCol="rating",
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",
    seed=0,
)

# Train model

In [10]:
# Fit the ALS model on the training split only; the test split stays held out.
model = als.fit(dataset=train_data)

# Predictions

In [11]:
# Score the held-out ratings; the result carries a `prediction` column next to
# the observed `rating`.
predictions = model.transform(dataset=test_data)
# Preview the first five scored rows.
predictions.show(n=5)

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|      3|  32592|     5| 3.3556385|
|      7|  19984|     5|  5.052669|
|     20|  32592|     4| 3.8333924|
|     22|  19984|     5|  4.546671|
|     26|  35982|     2|  2.566998|
+-------+-------+------+----------+
only showing top 5 rows



# Evaluate model


In [12]:
# RMSE between the observed `rating` and the model's `prediction` column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

In [13]:
# Compute the RMSE of the test-set predictions and print the raw value.
rmse = evaluator.evaluate(dataset=predictions)
print(rmse)

0.9164684337448652


# Recommendations

In [14]:
# Top 5 book recommendations for every user known to the model; each row holds
# a user_id and its list of recommendations.
bookrecommend = model.recommendForAllUsers(5)
bookrecommend.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      1|[{4154, 4.402269}...|
|      3|[{4336, 1.0218552...|
|      5|[{6070, 4.9343743...|
|      6|[{8409, 4.993943}...|
|      9|[{721, 4.159966},...|
|     12|[{8946, 4.850065}...|
|     13|[{7947, 4.7058764...|
|     15|[{6296, 3.9581952...|
|     16|[{8946, 3.9300737...|
|     17|[{7063, 4.8882537...|
|     19|[{4868, 4.200257}...|
|     20|[{9024, 5.1519156...|
|     22|[{8976, 4.4661946...|
|     26|[{8946, 4.684968}...|
|     27|[{8946, 5.372572}...|
|     28|[{8187, 4.093809}...|
|     31|[{8233, 4.025222}...|
|     34|[{8946, 3.3662136...|
|     35|[{5207, 3.8788683...|
|     37|[{8946, 4.889765}...|
+-------+--------------------+
only showing top 20 rows



In [15]:
# Top 5 users for every book known to the model; each row holds a book_id and
# its list of recommendations.
userrecommend = model.recommendForAllItems(5)
# Show five rows without truncating the recommendations column.
userrecommend.show(n=5, truncate=False)

+-------+---------------------------------------------------------------------------------------------------+
|book_id|recommendations                                                                                    |
+-------+---------------------------------------------------------------------------------------------------+
|1      |[{25515, 5.8924274}, {24688, 5.8916316}, {35231, 5.8735824}, {31925, 5.8597255}, {51626, 5.830609}]|
|3      |[{37765, 5.632052}, {21303, 5.231705}, {27807, 5.185281}, {12099, 5.1783185}, {45665, 5.1698375}]  |
|5      |[{34867, 5.51413}, {8753, 5.4030943}, {49001, 5.383781}, {34272, 5.3764057}, {22349, 5.371452}]    |
|6      |[{7791, 5.6480274}, {24688, 5.6170206}, {42583, 5.433718}, {42571, 5.4329987}, {11522, 5.428794}]  |
|9      |[{24688, 5.0046654}, {37163, 4.9714584}, {53349, 4.966023}, {14816, 4.96223}, {26439, 4.958192}]   |
+-------+---------------------------------------------------------------------------------------------------+
only showing top 5 rows

In [16]:
# Pull the first row back to the driver as a Row object to inspect its nested
# recommendations structure.
userrecommend.head()

Row(book_id=1, recommendations=[Row(user_id=25515, rating=5.892427444458008), Row(user_id=24688, rating=5.891631603240967), Row(user_id=35231, rating=5.873582363128662), Row(user_id=31925, rating=5.859725475311279), Row(user_id=51626, rating=5.83060884475708)])