# Part 6 Collaborative Filtering

### Setup

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
import pandas as pd

In [2]:
# Create (or reuse) the Spark session used by every cell in this notebook.
# The builder chain is wrapped in parentheses instead of using backslash
# continuations: a backslash followed by trailing whitespace is a SyntaxError,
# and parentheses make the chain immune to that.
# NOTE(review): "spark.executor.heartbeatInterval" is given without a unit
# suffix - confirm the intended unit against the Spark configuration docs.
# NOTE(review): "spark.local.dir" is a machine-specific absolute path.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.sql.broadcastTimeout", "1200")
    .config("spark.default.parallelism", "32")
    .config("spark.executor.heartbeatInterval", "3600")
    .config("spark.local.dir", "D:\\Data")
    .appName("part6")
    .getOrCreate()
)
# spark.local.dir: remove that setting if you do not have this directory

In [4]:
# Read the three input tables produced by earlier parts of the project.
# Each file has a header row; multiLine is enabled for all of them.
business = spark.read.options(header=True, multiLine=True).csv("part1_dataclean_business.csv")
review = spark.read.options(header=True, multiLine=True).csv("part4_topicmodeling_review.csv")
user = spark.read.options(header=True, multiLine=True).csv("part1_dataclean_user.csv")

In [7]:
# Print the schema of each loaded table under a heading naming the table.
for heading, frame in (
    ("business schema", business),
    ("review schema", review),
    ("user schema", user),
):
    print(heading)
    frame.printSchema()

business schema
root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- is_open: string (nullable = true)
 |-- postal_code: string (nullable = true)

review schema
root
 |-- review_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- text: string (nullable = true)
 |-- polarity: string (nullable = true)
 |-- subjectivity: string (nullable = true)
 |-- compound: string (nullable = true)
 |-- superscore: string (nullable = true)
 |-- Keywords: string (nullable = true)

user schema
root
 |-- user_id: string (nullable = true)
 |-- name: string (nulla

### Prepare Data for ALS

In [8]:
def df_zip_with_index(df, col_name, offset=0):
    """Return a new DataFrame equal to ``df`` plus a leading row-index column.

    Args:
        df: DataFrame whose rows should be numbered.
        col_name: Name of the new nullable ``LongType`` column; it is placed
            before all of the original columns.
        offset: Value added to every index produced by ``zipWithIndex``
            (default 0).

    Returns:
        A DataFrame created through the notebook-level ``spark`` session,
        containing the index column followed by the original columns.
    """
    index_field = StructField(col_name, LongType(), True)
    indexed_schema = StructType([index_field] + df.schema.fields)

    def prepend_index(pair):
        # zipWithIndex yields (row, position) pairs; emit the shifted position
        # first, followed by the row's original values.
        row, position = pair
        return [position + offset] + list(row)

    indexed_rows = df.rdd.zipWithIndex().map(prepend_index)
    return spark.createDataFrame(indexed_rows, indexed_schema)

In [9]:
# Prepare users: number every user row with "user_index", then keep a slim
# user_id -> user_index lookup table.

user_new_df = df_zip_with_index(df=user, col_name="user_index")
user_new_df.show(n=2)

user_index_df = user_new_df.select(["user_id", "user_index"])
user_index_df.show(n=2)

+----------+--------------------+----+------------+-------------------+--------------------+------+-----+-----+----+-------------+
|user_index|             user_id|name|review_count|      yelping_since|             friends|useful|funny| cool|fans|average_stars|
+----------+--------------------+----+------------+-------------------+--------------------+------+-----+-----+----+-------------+
|         0|q_QQ5kBBwlCcbL1s4...|Jane|        1220|2005-03-14 20:26:35|xBDpTUbai0DXrvxCe...| 15038|10030|11291|1357|         3.85|
|         1|dIIKEfOgo0KqUfGQv...|Gabi|        2136|2007-08-10 19:01:51|XPzYf9_mwG2eXYP2B...| 21272|10289|18046|1025|         4.09|
+----------+--------------------+----+------------+-------------------+--------------------+------+-----+-----+----+-------------+
only showing top 2 rows

+--------------------+----------+
|             user_id|user_index|
+--------------------+----------+
|q_QQ5kBBwlCcbL1s4...|         0|
|dIIKEfOgo0KqUfGQv...|         1|
+------------------

In [10]:
# Prepare businesses: number every business row with "business_index" (the
# indexed frame is cached), then keep a slim business_id -> business_index
# lookup table.

business_new_df = df_zip_with_index(df=business, col_name="business_index").cache()
business_new_df.show(n=2)

business_index_df = business_new_df.select(["business_id", "business_index"])
business_index_df.show(n=2)

+--------------+--------------------+--------------------+--------+-----+-----+------------+--------------------+-------------+---------------+-------+-----------+
|business_index|         business_id|                name|    city|state|stars|review_count|          categories|     latitude|      longitude|is_open|postal_code|
+--------------+--------------------+--------------------+--------+-----+-----+------------+--------------------+-------------+---------------+-------+-----------+
|             0|tCbdrRPZA0oiIYSmH...|Flying Elephants ...|Portland|   OR|  4.0|         126|Salad, Soup, Sand...|45.5889058992|-122.5933307507|      1|      97218|
|             1|jx91IMdGOmLOo8h_F...|Cleary's Restaura...|Portland|   OR|  3.5|          19|Nightlife, Sandwi...|   45.5264727|    -122.535323|      1|      97230|
+--------------+--------------------+--------------------+--------+-----+-----+------------+--------------------+-------------+---------------+-------+-----------+
only showing top

In [11]:
# Attach the numeric user_index / business_index to each review by joining
# the review rows against the two lookup tables.

review_df = review.select("user_id", "business_id", "superscore")
review_new_df = (
    review_df
    .join(user_index_df, on="user_id")
    .join(business_index_df, on="business_id")
)
review_new_df.show(n=2)

+--------------------+--------------------+-----------------+----------+--------------+
|         business_id|             user_id|       superscore|user_index|business_index|
+--------------------+--------------------+-----------------+----------+--------------+
|08n38tS38iznDwL_X...|46wwNRiBGIAJuPlk8...|       4.95210625|     28744|          1047|
|08n38tS38iznDwL_X...|M8vydiEEQQnn4Z-wx...|5.186179212673611|    374051|          1047|
+--------------------+--------------------+-----------------+----------+--------------+
only showing top 2 rows



### Build ALS Model

In [12]:
# Build the (user, item, rating) table that ALS consumes: the two numeric
# indexes plus "superscore" cast to float and renamed to "rating".

rating_df = review_new_df.select(
    "user_index",
    "business_index",
    review_new_df["superscore"].cast("float").alias("rating"),
)
rating_df.show(n=2)
rating_df.printSchema()
rating_df.limit(2).toPandas()

+----------+--------------+---------+
|user_index|business_index|   rating|
+----------+--------------+---------+
|     28744|          1047|4.9521065|
|    374051|          1047| 5.186179|
+----------+--------------+---------+
only showing top 2 rows

rating matrix count 417737
root
 |-- user_index: long (nullable = true)
 |-- business_index: long (nullable = true)
 |-- rating: float (nullable = true)



Unnamed: 0,user_index,business_index,rating
0,28744,1047,4.952106
1,374051,1047,5.186179


In [13]:
# 80/20 train/test split with a fixed seed.
train, test = rating_df.randomSplit(weights=[0.8, 0.2], seed=1)

In [None]:
"""
print("start tuning ALS model")

als = ALS(userCol="user_index", itemCol="business_index", ratingCol="rating", coldStartStrategy="drop")
param_grid = ParamGridBuilder()\
    .addGrid(als.rank,[10, 15, 20])\
    .addGrid(als.maxIter,[10, 15, 20])\
    .build()
evaluator = RegressionEvaluator(metricName="rmse",labelCol="rating")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=1)
cv_als_model = cv.fit(train)

als_predictions = cv_als_model.bestModel.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(als_predictions)
print("rmse: " + str(rmse))

best_model = cv_als_model.bestModel
#best_rank is 20
best_model.rank
#best_maxIter is 20
best_model._java_obj.parent().getMaxIter()
# rmse is 1.3383
print("end tuning ALS model")

print("rank:", best_model._java_obj.parent().getRank())
print("maxIter:", best_model._java_obj.parent().getMaxIter())
print("regParam:", best_model._java_obj.parent().getRegParam())
"""

In [14]:
"""
print("start building ALS model")

als = ALS(rank=20, maxIter=20, regParam=0.3, userCol="user_index", itemCol="business_index", ratingCol="rating", \
               coldStartStrategy="drop", seed=1)
als_model = als.fit(train)

als_predictions = als_model.transform(test)
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
# rmse = evaluator.evaluate(als_predictions)
# print("rmse: " + str(rmse))

# save the ALS model
als_model.write().overwrite().save("als_model")
# rmse is 1.2579

print("end building ALS model")

"""

start building ALS model
end building ALS model


In [15]:
# Restore the ALS model saved at the "als_model" path.
als_model = ALSModel.load(path="als_model")

In [16]:
# Ask the model for the 10 highest-scoring businesses for every user.

user_recommendations = als_model.recommendForAllUsers(numItems=10)
user_recommendations.show(n=2)

+----------+--------------------+
|user_index|     recommendations|
+----------+--------------------+
|      1088|[{1183, 4.2916126...|
|      1342|[{2493, 5.264535}...|
+----------+--------------------+
only showing top 2 rows



In [17]:
# Add the user_id column, cache the recommendation DataFrame and show a
# sample of the recommendations.

all_user_recommendations = (
    user_recommendations
    .join(user_index_df, on="user_index", how="inner")
    .cache()
)
all_user_recommendations.show(n=2, truncate=False)

+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|user_index|recommendations                                                                                                                                                                        |user_id               |
+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|1697      |[{1183, 5.1396894}, {788, 5.02356}, {3375, 4.9667706}, {700, 4.8327394}, {268, 4.82203}, {1323, 4.809856}, {2880, 4.807377}, {3359, 4.8005567}, {3526, 4.7902193}, {628, 4.771118}]    |486c4Pznd00sbs2wh7eNyg|
|1806      |[{2493, 3.7883527}, {726, 3.6609554}, {869, 3.5705402}, {89, 3.5580711}, {2523, 3.526672}, {1653, 3.5241573}

In [19]:
# Persist every user's recommendations as parquet, replacing any earlier output.

all_user_recommendations.write.parquet(
    "part6_all_user_recommendations.parquet",
    mode="overwrite",
)

In [21]:
# Read the saved recommendations back, cache them, and inspect a few rows
# and the schema.

loaded_user_recommendations = (
    spark.read
    .load(path="part6_all_user_recommendations.parquet")
    .cache()
)
loaded_user_recommendations.show(n=3)
loaded_user_recommendations.printSchema()

+----------+--------------------+--------------------+
|user_index|     recommendations|             user_id|
+----------+--------------------+--------------------+
|      1199|[{57, 4.4834585},...|T8fXV7fkbBkg8ET6v...|
|      2116|[{3761, 5.422045}...|ZCUf_Uk-eXli07c48...|
|      2684|[{2565, 5.9831705...|3mNz5nQFTIBQm0oU5...|
+----------+--------------------+--------------------+
only showing top 3 rows

root
 |-- user_index: integer (nullable = true)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- business_index: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)
 |-- user_id: string (nullable = true)



In [22]:
def get_collaborative_recommendation(user_id):
    """Return the recommended businesses for one user, best rating first.

    Looks up ``user_id`` in the notebook-level ``loaded_user_recommendations``
    DataFrame, expands its ``recommendations`` array into one row per
    recommended business, and joins those rows with ``business_new_df`` on
    ``business_index``.

    Args:
        user_id: The user id string to look up.

    Returns:
        A DataFrame with the columns of ``business_new_df`` plus ``rating``,
        ordered by ``rating`` descending. If ``user_id`` has no stored
        recommendations the result is empty rather than an error.
    """
    # Stay in the DataFrame API and address the array column by name. The
    # previous RDD round-trip picked the column by position and could not
    # infer a schema when the filter matched no rows.
    recommended_business = (
        loaded_user_recommendations
        .filter(col("user_id") == user_id)
        .select(explode("recommendations").alias("rec"))
        .select(
            # Cast to long/double to keep the column types that schema
            # inference from Python ints/floats produced before.
            col("rec.business_index").cast("long").alias("business_index"),
            col("rec.rating").cast("double").alias("rating"),
        )
    )
    return (
        business_new_df
        .join(recommended_business, "business_index", "inner")
        .orderBy("rating", ascending=False)
    )
    

In [28]:
# Spot-check: take the first user from a seeded 1% sample and display that
# user's recommendations as a pandas table.
random_user = loaded_user_recommendations.sample(fraction=0.01, seed=1).head(1)[0]["user_id"]
print(f"test user id: {random_user}")
get_collaborative_recommendation(random_user).toPandas()


test user idBXgoRfst14LeNLOHTJ0rjA


Unnamed: 0,business_index,business_id,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code,rating
0,3209,CM2DhoaUwvr2bQPZlkOJ4Q,Pasta’s,Portland,OR,5.0,6,"Specialty Food, Restaurants, Food Trucks, Food...",45.547669,-122.6682301,1,97227,2.056291
1,1183,0JUkzQXJKaiAbpruTgDPnw,Yatra PDX,Portland,OR,5.0,9,"Food Stands, Food, Indian, Himalayan/Nepalese,...",45.4641695267,-122.65434729,1,97202,1.952722
2,89,-liZt9xZNvnT0tHW0XTwQA,Kate's Ice Cream,Portland,OR,5.0,25,"Vegan, Food, Ice Cream & Frozen Yogurt, Restau...",45.5283148,-122.6386321,1,97232,1.950059
3,3643,rl8U3o2y4IH7aJBfsMPTew,Division Liquor,Portland,OR,4.0,8,"Beer, Wine & Spirits, Food",45.5060226,-122.4949421,1,97236,1.940054
4,628,YQ6srHagEScNH9iu_DhqUg,Sarah's Cookies,Portland,OR,5.0,10,"Food, Bakeries",45.5643461,-122.5557446,1,97220,1.939659
5,1057,3ZcksUCfKGneyZkXGAmckA,Pixie Retreat Raw'r Laboratorie & Makery,Portland,OR,5.0,40,"Specialty Food, Restaurants, Live/Raw Food, Ve...",45.5110677127,-122.6626858006,1,97214,1.935495
6,635,iPnSI7FZbTtcxipcOT_lLA,Proletariat Butchery,Portland,OR,4.5,14,"Specialty Food, Food, Meat Shops, Butcher",45.5483842,-122.5996826,1,97213,1.925042
7,2493,maa0Pgf6ZD3e5us3kte6Sw,Bees and Beans,Portland,OR,5.0,10,"Food, Desserts, Chocolatiers & Shops, Specialt...",45.508732,-122.654872,1,97214,1.883153
8,1727,bWJoJe-gHtYzfTH5JfW-PQ,MUSE Cheesecakes,Portland,OR,5.0,7,"Desserts, Specialty Food, Food, Patisserie/Cak...",45.5544197,-122.6663494,1,97217,1.880594
9,1735,4bP1O1WA_CVaAEmRkfXe5A,Oregonic Tonic Kombucha,Portland,OR,5.0,5,"Food, Breweries, Kombucha",45.5837298,-122.7268969,1,97203,1.869701
