# Part 6 Collaborative Filtering

### Setup

In [18]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

In [19]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("part6")
    # memory for the driver and for each executor
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    # "0" removes the cap on the size of results collected to the driver
    .config("spark.driver.maxResultSize", "0")
    # "-1" turns automatic broadcast joins off
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.sql.broadcastTimeout", "1200")
    .config("spark.default.parallelism", "32")
    # NOTE(review): no unit suffix is given here -- confirm which unit Spark
    # applies to this value and that it stays below spark.network.timeout.
    .config("spark.executor.heartbeatInterval", "3600")
    .getOrCreate()
)

In [20]:
# Load the review, business and user files of the Yelp academic dataset.
# The paths are relative, so the files must be reachable from where Spark runs.
review = spark.read.json("yelp_academic_dataset_review.json")
business = spark.read.json("yelp_academic_dataset_business.json")
user = spark.read.json("yelp_academic_dataset_user.json")


### Prepare Data for ALS

In [21]:
def df_zip_with_index(df, col_name, offset=0):
    """Return ``df`` with an extra leading column holding a running row index.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame; its columns are kept, in order, after the new one.
    col_name : str
        Name of the new index column (``LongType``, nullable).
    offset : int, optional
        Value added to every index, i.e. the index given to the first row.

    Returns
    -------
    DataFrame
        A new DataFrame created through the notebook-level ``spark`` session.
    """
    index_field = StructField(col_name, LongType(), True)
    indexed_schema = StructType([index_field] + df.schema.fields)

    def prepend_index(pair):
        # zipWithIndex yields (row, position) tuples
        row, position = pair
        return [position + offset] + list(row)

    indexed_rows = df.rdd.zipWithIndex().map(prepend_index)
    return spark.createDataFrame(indexed_rows, indexed_schema)

In [27]:
# prepare user: attach a numeric index to every user row

user_new_df = df_zip_with_index(user, col_name="user_index")
user_new_df.show(n=2)

# lookup table between the original string id and the numeric index
user_index_df = user_new_df.select(["user_id", "user_index"])
user_index_df.show(n=2)

+----------+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+--------------------+-----+----+------------+------+--------------------+-------------------+
|user_index|average_stars|compliment_cool|compliment_cute|compliment_funny|compliment_hot|compliment_list|compliment_more|compliment_note|compliment_photos|compliment_plain|compliment_profile|compliment_writer| cool|               elite|fans|             friends|funny|name|review_count|useful|             user_id|      yelping_since|
+----------+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+--------------------+-----+----+------------+------+--------------------+-------------

In [23]:
# prepare business: attach a numeric index to every business row

business_new_df = df_zip_with_index(business, col_name="business_index")
business_new_df.show(n=2)

# lookup table between the original string id and the numeric index
business_index_df = business_new_df.select(["business_id", "business_index"])
business_index_df.show(n=2)

+--------------+-------------------+--------------------+--------------------+--------------------+--------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|business_index|            address|          attributes|         business_id|          categories|    city|               hours|is_open|     latitude|      longitude|                name|postal_code|review_count|stars|state|
+--------------+-------------------+--------------------+--------------------+--------------------+--------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|             0|       921 Pearl St|{null, null, 'bee...|6iYb2HFDywm3zjuRg...|Gastropubs, Food,...| Boulder|{11:0-23:0, 11:0-...|      1|   40.0175444|   -105.2833481| Oskar Blues Taproom|      80302|          86|  4.0|   CO|
|             1|7000 NE Airport Way|{null, null, u'be...|tCbdrRPZA0oiIYSmH...|Salad, Soup, Sand.

In [28]:
# Attach the numeric user and business indices to every review.
# Both joins are inner joins on the shared id column, so a review whose user
# or business is missing from the lookup tables is dropped.

review_df = review.select(["user_id", "business_id", "stars"])
review_new_df = (
    review_df
    .join(user_index_df, on="user_id")
    .join(business_index_df, on="business_id")
)
review_new_df.show(n=2)

+--------------------+--------------------+-----+----------+--------------+
|         business_id|             user_id|stars|user_index|business_index|
+--------------------+--------------------+-----+----------+--------------+
|-36nnCT71XE0InJXK...|ofyOSbE04KsDt7e9T...|  1.0|    300867|         96101|
|-36nnCT71XE0InJXK...|oLd1zN3dy5cmPCCnd...|  2.0|     85243|         96101|
+--------------------+--------------------+-----+----------+--------------+
only showing top 2 rows



### Build ALS Model

In [None]:
# Build the (user, item, rating) table that ALS trains on.
# The star value is cast to float and exposed under the name "rating".

rating_df = review_new_df.select(
    "user_index",
    "business_index",
    col("stars").cast("float").alias("rating"),
)
rating_df.show(n=2)
print("rating matrix count", rating_df.count())
rating_df.printSchema()
rating_df.limit(2).toPandas()

In [None]:
# Hold out 20% of the ratings for the final evaluation; the seed keeps the
# split reproducible.
(train, test) = rating_df.randomSplit([0.8, 0.2], seed=1)

print("start tuning ALS model")

# coldStartStrategy="drop" discards predictions for users/businesses that were
# not seen during fitting, so they cannot turn the RMSE into NaN.
# seed=1 makes the factor initialisation reproducible, consistent with the
# seeds already used for the split, the cross-validator and the final model.
als = ALS(userCol="user_index", itemCol="business_index", ratingCol="rating",
          coldStartStrategy="drop", seed=1)

# 5 ranks x 3 iteration counts x 5 regularisation values = 75 candidates;
# with 5 folds that is 375 ALS fits, so expect this cell to run for a long time.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 15, 20, 50, 100])
    .addGrid(als.maxIter, [10, 15, 20])
    .addGrid(als.regParam, [.01, .05, .1, .15, 0.3])
    .build()
)

# One evaluator serves both the cross-validation and the hold-out scoring.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=1)
cv_als_model = cv.fit(train)
best_model = cv_als_model.bestModel

# Score the winning model on the held-out split.
als_predictions = best_model.transform(test)
rmse = evaluator.evaluate(als_predictions)
print("rmse: " + str(rmse))

# Recover the winning hyper-parameters through the public API: avgMetrics[i]
# is the cross-validated metric of param_grid[i], so the best entry of
# avgMetrics identifies the parameter map that produced bestModel.
avg_metrics = cv_als_model.avgMetrics
pick_best = max if evaluator.isLargerBetter() else min
best_index = pick_best(range(len(avg_metrics)), key=avg_metrics.__getitem__)
best_params = param_grid[best_index]

# Recorded result of an earlier run: best rank 20, best maxIter 20, rmse 1.3383
print("end tuning ALS model")

print("rank:", best_params[als.rank])
print("maxIter:", best_params[als.maxIter])
print("regParam:", best_params[als.regParam])

In [None]:
print("start building ALS model")

# Final model, fitted with fixed hyper-parameters.
als = ALS(
    userCol="user_index",
    itemCol="business_index",
    ratingCol="rating",
    rank=20,
    maxIter=20,
    regParam=0.3,
    coldStartStrategy="drop",
    seed=1,
)
als_model = als.fit(train)

# Score the model on the held-out split.
als_predictions = als_model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(als_predictions)
print(f"rmse: {rmse}")
# rmse is 1.2579 (recorded from an earlier run)

# save the ALS model, replacing any previous copy at the same path
als_model.write().overwrite().save("als_model")

print("end building ALS model")

In [None]:
# Reload the ALS model saved by the previous cell, so the cells below can run
# without refitting.
als_model = ALSModel.read().load("als_model")

In [None]:
# Ask the model for the 10 highest-scored businesses of every user.

user_recommendations = als_model.recommendForAllUsers(numItems=10)

In [None]:
# Add the user_id column, cache the recommendation DataFrame and show a sample.
# The inner join keeps every column of the recommendations and appends the
# original string user_id looked up through the numeric user_index.

all_user_recommendations = (
    user_recommendations
    .join(
        user_index_df,
        user_recommendations["user_index"] == user_index_df["user_index"],
        "inner",
    )
    .select(
        [user_recommendations[name] for name in user_recommendations.columns]
        + [user_index_df["user_id"]]
    )
)

all_user_recommendations.cache()
all_user_recommendations.show(1, truncate=False)

In [None]:
def get_collaborative_recommendation(user_id):
    """Return the businesses recommended to ``user_id``, best rating first.

    Reads the notebook-level ``all_user_recommendations`` (one row per user
    with an array of recommended items) and ``business_new_df`` (business
    details keyed by ``business_index``).

    Parameters
    ----------
    user_id : str
        Original (string) user id.

    Returns
    -------
    DataFrame
        Columns ``business_id``, ``rating``, ``name``, ``categories``,
        ``stars``, ``review_count``, ``latitude`` and ``longitude``, ordered
        by predicted ``rating`` descending. The result is empty when the user
        has no recommendations.
    """
    # Stay in the DataFrame API: explode() yields zero rows for an unknown
    # user, whereas building a DataFrame from an empty RDD fails during schema
    # inference. Selecting the array by name also avoids relying on its
    # position within the row.
    recommended_business = (
        all_user_recommendations
        .filter(col("user_id") == user_id)
        .select(explode(col("recommendations")).alias("rec"))
        .select(
            col("rec.business_index").alias("business_index"),
            # cast keeps the double-typed rating column callers received before
            col("rec.rating").cast("double").alias("rating"),
        )
    )

    a = recommended_business.alias("a")
    b = business_new_df.alias("b")

    return (
        a.join(b, col("a.business_index") == col("b.business_index"), "inner")
        .select([col("b.business_id"), col("a.rating"), col("b.name"), col("b.categories"),
                 col("b.stars"), col("b.review_count"),
                 col("b.latitude"), col("b.longitude")])
        .orderBy("rating", ascending=False)
    )
    

In [None]:
# test case
# The id gets its own name: `user` is the users DataFrame loaded at the top of
# the notebook, and rebinding it to a string would break re-running the
# "prepare user" cell.
test_user_id = "ZWD8UH1T7QXQr0Eq-mcWYg"
get_collaborative_recommendation(test_user_id).toPandas()


