In [1]:
import pandas as pd
import numpy as np
from pyspark import SparkFiles
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql.functions import mean
from pyspark.sql.functions import lit, col

def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark



spark = init_spark()

# Collaborative Filtering with ALS

###**Check the sparsity of the users-rating matrix to determine how much bias might have been introduced if we used SVD rather than ALS for collaborative filtering.**


In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [3]:
ratings = spark.read.csv("ratings.csv", header=True)
ratings = ratings.withColumn("user_id", ratings["user_id"].cast(IntegerType()))
ratings = ratings.withColumn("book_id", ratings["book_id"].cast(IntegerType()))
ratings = ratings.withColumn("rating", ratings["rating"].cast(IntegerType()))


In [4]:
# Count the total number of ratings in the dataset
numerator = ratings.select("rating").count()

# Count the number of distinct Id's
num_users = ratings.select("user_id").distinct().count()
num_items = ratings.select("book_id").distinct().count()

# Set the denominator equal to the number of users multiplied by the number of items
denominator = num_users * num_items

# Divide the numerator by the denominator
sparsity = (1.0 - (numerator * 1.0)/ denominator) * 100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

The ratings dataframe is  98.88% empty.


In [5]:
# Min num ratings 
print("Item with the fewest ratings: ")
ratings.groupBy("book_id").count().sort('count').show(10)

Item with the fewest ratings: 
+-------+-----+
|book_id|count|
+-------+-----+
|   7803|    8|
|   9345|   11|
|   9486|   24|
|   1935|   33|
|   9315|   36|
|   9553|   41|
|   8181|   45|
|   9479|   48|
|   9260|   48|
|   9705|   50|
+-------+-----+
only showing top 10 rows



In [6]:
# Group data by user_id, count ratings
(ratings.groupBy("user_id")
    .count()
    .filter("`count` >= 5")
    .orderBy('count', ascending=False)
    .show(n = 10))

+-------+-----+
|user_id|count|
+-------+-----+
|  12874|  200|
|  30944|  200|
|  28158|  199|
|  12381|  199|
|  52036|  199|
|   6630|  197|
|  45554|  197|
|  24143|  196|
|   9668|  196|
|  19729|  196|
+-------+-----+
only showing top 10 rows



In [7]:
# Group data by book_id, count ratings
(ratings.groupBy("book_id")
    .count()
    .filter("`count` > 1")
    .orderBy('count', ascending=False)
    .show(n = 10))

+-------+-----+
|book_id|count|
+-------+-----+
|      1|22806|
|      2|21850|
|      4|19088|
|      3|16931|
|      5|16604|
|     17|16549|
|     20|15953|
|     18|15855|
|     23|15657|
|      7|15558|
+-------+-----+
only showing top 10 rows



#**Algorithm - Implementation**

### **Split the data into training and test set to use collaborative filtering using ALS Matrix Factorization**


In [8]:
(training, test) = ratings.randomSplit([0.7, 0.3],0)

In [9]:
print(training.count(),test.count())
test.show(5)

4182526 1793953
+-------+-------+------+
|user_id|book_id|rating|
+-------+-------+------+
|      1|      4|     5|
|      1|     16|     3|
|      1|     33|     4|
|      1|     60|     3|
|      1|     66|     4|
+-------+-------+------+
only showing top 5 rows



###**Import ALS and regression evaluator to find RMSE.**

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# Prediction using global average for comparison

In [11]:
#Evaluate the root mean squared error if we use the global average as a prediction for all rating, Our model should have
#a better performance than this.

average = training.select('user_id','rating').withColumn("user_id",lit(1)).groupBy('user_id').mean()
global_average = average.select('avg(rating)').collect()[0][0]

print("Global Average:",str(global_average))
    
test_avg = test.withColumn('prediction',lit(global_average))
    
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(test_avg)

print("Root-mean-square error = " + str(rmse))


Global Average: 3.920029905372973
Root-mean-square error = 0.9911524232836714


# Basic ALS Recommender

When we're not using explicit data like ratings here, we use the implicitPrefs argument. Sometimes companies don't have explicit data like ratings, but still want to build a recommendation engine based on other metrics like views, clicks, wishlists, etc. When it comes to implicit preference, it is not within the scope of our good books project. ColdstartStrategy is used when we have no data on a user, which may lead to null prediction if the user on the test set has no rating in the training set. We use the 'drop' cold start strategy since we wish to avoid such a situation.

In [12]:
#als with 10 latent factors(default)

als = ALS( userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True, implicitPrefs = False)

als.setSeed(0)
type(als)

pyspark.ml.recommendation.ALS

In [13]:
#ALS with 5 latent factors

als_2 = ALS( userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True,rank=5 ,implicitPrefs = False)

als_2.setSeed(0)

ALS_1576281caf35

In [14]:
#ALS with 30 latent factors

als_3 = ALS( userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True,rank=30 ,implicitPrefs = False)

als_3.setSeed(0)

ALS_5c547b6aa6f7

In [52]:
#ALS with 100 latent factors

als_4 = ALS( userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True,rank=100 ,implicitPrefs = False)

als_4.setSeed(0)

ALS_7ecacd035a50

###**Training the model**

In [53]:
model = als.fit(training)
model_2 = als_2.fit(training)
model_3 = als_3.fit(training)
model_4 = als_4.fit(training)

In [54]:
predictions = model.transform(test)
predictions_2 = model_2.transform(test)
predictions_3 = model_3.transform(test)
predictions_4 = model_4.transform(test)

In [17]:
predictions.show(5)

+-------+-------+------+----------+
|user_id|book_id|rating|prediction|
+-------+-------+------+----------+
|  11141|    148|     3| 3.3516066|
|  33412|    148|     5| 4.0760345|
|  51123|    148|     3| 3.3515449|
|  12367|    148|     5| 3.9604023|
|  19351|    148|     2| 3.0305216|
+-------+-------+------+----------+
only showing top 5 rows



###Evaluation

In [18]:
#Evaluate model 1 (10 latent factors)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.8353404186047991


In [19]:
#Evaluate model 2 (5 latent factors)
rmse_2 = evaluator.evaluate(predictions_2)
print("Root-mean-square error = " + str(rmse_2))

Root-mean-square error = 0.8419515173639188


In [20]:
#Evaluate model 3 (30 latent factors)
rmse_3 = evaluator.evaluate(predictions_3)
print("Root-mean-square error = " + str(rmse_3))

Root-mean-square error = 0.8263747897020755


In [55]:
#Evaluate model 4 (100 latent factors)
rmse_4 = evaluator.evaluate(predictions_4)
print("Root-mean-square error = " + str(rmse_4))

Root-mean-square error = 0.8218106459010788


**RESULT - RMSE is 0.82 which means the predicted ratin deviates from original by an average of this score.**

# ALS Recommender with Bias

In [21]:
#Create user mean and item mean column to get user-item interaction column

user_mean = training.groupBy('user_id').mean('rating')
item_mean = training.groupBy('book_id').mean('rating')

training_bias = training.join(user_mean,['user_id'],'left_outer').select('user_id','book_id','rating',
                                                                       col('avg(rating)').alias('user_mean'))
    
training_bias = training_bias.join(item_mean,['book_id'],'left_outer').select('user_id','book_id','rating','user_mean',
                                                                       col('avg(rating)').alias('item_mean'))

training_bias = training_bias.withColumn("user_item_interaction",training_bias.rating-(training_bias.user_mean+training_bias.item_mean-global_average))


training_bias.show(5)



+-------+-------+------+------------------+----------------+---------------------+
|user_id|book_id|rating|         user_mean|       item_mean|user_item_interaction|
+-------+-------+------+------------------+----------------+---------------------+
|   1645|    148|     4|3.5081967213114753|3.76158940397351|   0.6502437800879872|
|   6397|    148|     4|              3.75|3.76158940397351|   0.4084405013994634|
|   9427|    148|     3| 3.723404255319149|3.76158940397351|  -0.5649637539196859|
|  10206|    148|     5| 3.824324324324324|3.76158940397351|   1.3341161770751389|
|  11317|    148|     4| 3.641025641025641|3.76158940397351|    0.517414860373822|
+-------+-------+------+------------------+----------------+---------------------+
only showing top 5 rows



Models Training

In [22]:
#Train model using user-item interaction

#model 1: 5 latent factor 
als = ALS(maxIter=5, regParam=0.01,rank=5, userCol="user_id", itemCol="book_id", ratingCol="user_item_interaction",coldStartStrategy="drop")
als.setSeed(0)

model = als.fit(training_bias)



In [23]:
#model 2: latent factor 10 (default)
als_2 = ALS(maxIter=5, regParam=0.01,rank=10, userCol="user_id", itemCol="book_id", ratingCol="user_item_interaction",coldStartStrategy="drop")
als_2.setSeed(0)

model_2 = als_2.fit(training_bias)

In [41]:
#model 3: latent factor 2 (default)
als_3 = ALS(maxIter=5, regParam=0.01,rank=2, userCol="user_id", itemCol="book_id", ratingCol="user_item_interaction",coldStartStrategy="drop")
als_3.setSeed(0)

model_3 = als_3.fit(training_bias)

In [46]:
# create user mean, item mean and user-item interaction coloumn for test set
user_mean = test.groupBy('user_id').mean('rating')
item_mean = test.groupBy('book_id').mean('rating')


test_bias =  test.join(user_mean,['user_id'],'left_outer').select('user_id','book_id','rating',
                                                                       col('avg(rating)').alias('user_mean'))
    
test_bias = test_bias.join(item_mean,['book_id'],'left_outer').select('user_id','book_id','rating','user_mean',
                                                                       col('avg(rating)').alias('item_mean'))    

test_bias_2 = test_bias
test_bias_3 = test_bias

Evaluating models with bias

In [25]:
#Evaluaing model 1 (5 latent factors)
test_bias = model.transform(test_bias)

#use predicted user-item interaction to calculate rating
test_bias = test_bias.withColumn('prediction',test_bias.prediction+test_bias.user_mean+test_bias.item_mean-global_average)

test_bias.show(5)

#calculate root mean squared error
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(test_bias)
print("rmse ALS with bias (5 latent factor):", str(rmse))

+-------+-------+------+------------------+-----------------+-----------------+
|user_id|book_id|rating|         user_mean|        item_mean|       prediction|
+-------+-------+------+------------------+-----------------+-----------------+
|  11141|    148|     3|               3.6|3.804783451842275|3.472783456720801|
|  33412|    148|     5| 4.384615384615385|3.804783451842275|4.313194944888169|
|  51123|    148|     3| 3.923076923076923|3.804783451842275|3.696764351663496|
|  12367|    148|     5|4.4324324324324325|3.804783451842275|4.185638736498705|
|  19351|    148|     2|3.5217391304347827|3.804783451842275|3.400049902816894|
+-------+-------+------+------------------+-----------------+-----------------+
only showing top 5 rows

rmse ALS with bias (5 latent factor): 0.8492059827891234


In [26]:
#Evaluaing model 2 (10 latent factors)
test_bias_2 = model_2.transform(test_bias_2)

#use predicted user-item interaction to calculate rating
test_bias_2 = test_bias_2.withColumn('prediction',test_bias_2.prediction+test_bias_2.user_mean+test_bias_2.item_mean-global_average)

test_bias_2.show(5)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(test_bias_2)
print("rmse ALS with bias (10 latent factor):", str(rmse))


+-------+-------+------+------------------+-----------------+------------------+
|user_id|book_id|rating|         user_mean|        item_mean|        prediction|
+-------+-------+------+------------------+-----------------+------------------+
|  11141|    148|     3|               3.6|3.804783451842275| 3.799792227499576|
|  33412|    148|     5| 4.384615384615385|3.804783451842275| 4.333770453363472|
|  51123|    148|     3| 3.923076923076923|3.804783451842275| 4.207602249321846|
|  12367|    148|     5|4.4324324324324325|3.804783451842275|4.0861255931976945|
|  19351|    148|     2|3.5217391304347827|3.804783451842275|3.3278078030251286|
+-------+-------+------+------------------+-----------------+------------------+
only showing top 5 rows

rmse ALS with bias (10 latent factor): 0.8801238494791618


In [47]:
#Evaluaing model 3(2 latent factors)
test_bias_3 = model_3.transform(test_bias_3)

#use predicted user-item interaction to calculate rating
test_bias_3 = test_bias_3.withColumn('prediction',test_bias_3.prediction+test_bias_3.user_mean+test_bias_3.item_mean-global_average)

test_bias_3.show(5)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(test_bias_3)
print("rmse ALS with bias (2 latent factor):", str(rmse))

+-------+-------+------+------------------+-----------------+------------------+
|user_id|book_id|rating|         user_mean|        item_mean|        prediction|
+-------+-------+------+------------------+-----------------+------------------+
|  11141|    148|     3|               3.6|3.804783451842275| 3.578077566974254|
|  33412|    148|     5| 4.384615384615385|3.804783451842275|   4.3202532002626|
|  51123|    148|     3| 3.923076923076923|3.804783451842275|3.8857096823429136|
|  12367|    148|     5|4.4324324324324325|3.804783451842275| 4.268597252232185|
|  19351|    148|     2|3.5217391304347827|3.804783451842275|3.5323379452728054|
+-------+-------+------+------------------+-----------------+------------------+
only showing top 5 rows

rmse ALS with bias (2 latent factor): 0.8376734125294871


In [27]:
#ALS_recommendations = model.recommendForAllUsers(numItems = 10) # n - 10

In [28]:
#ALS_recommendations.show(n = 10)

In [29]:
# Temporary table
#ALS_recommendations.registerTempTable("ALS_recs_temp")

In [30]:
'''
clean_recs = spark.sql("""SELECT user_id,
                            bookIds_and_ratings.book_id AS book_id,
                            bookIds_and_ratings.rating AS prediction
                        FROM ALS_recs_temp
                        LATERAL VIEW explode(recommendations) exploded_table
                            AS bookIds_and_ratings""")
clean_recs.show()
'''

'\nclean_recs = spark.sql("""SELECT user_id,\n                            bookIds_and_ratings.book_id AS book_id,\n                            bookIds_and_ratings.rating AS prediction\n                        FROM ALS_recs_temp\n                        LATERAL VIEW explode(recommendations) exploded_table\n                            AS bookIds_and_ratings""")\nclean_recs.show()\n'

###**Recommendations for unread books**

In [31]:
#(clean_recs.join(ratings, ["user_id", "book_id"], "left").filter(ratings.rating.isNull()).show())

In [32]:
#new_books = (clean_recs.join(ratings, ["user_id", "book_id"], "left").filter(ratings.rating.isNull()))

In [33]:
#print(new_books.count())

In [34]:
#new_books.show(5)