# Part 6 Collaborative Filtering

### Setup

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
import pandas as pd

In [2]:
# Create (or reuse) the Spark session used by every cell in this notebook.
# NOTE(review): "spark.local.dir" points at a machine-specific absolute path
# (D:\Data) - adjust it before running on another machine.
spark = (
    SparkSession.builder
    .appName("part6")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.sql.broadcastTimeout", "1200")
    .config("spark.default.parallelism", "32")
    .config("spark.executor.heartbeatInterval", "3600")
    .config("spark.local.dir", "D:\\Data")
    .getOrCreate()
)

In [3]:
# Load the three raw Yelp JSON files (paths are relative to the working directory).
business = spark.read.format("json").load("yelp_academic_dataset_business.json")
review = spark.read.format("json").load("yelp_academic_dataset_review.json")
user = spark.read.format("json").load("yelp_academic_dataset_user.json")


### Prepare Data for ALS

In [4]:
def df_zip_with_index(df, col_name, offset=0):
    """Return a new DataFrame equal to `df` with a leading row-index column.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame; its rows are numbered via ``df.rdd.zipWithIndex()``.
    col_name : str
        Name of the index column, which is placed first in the schema.
    offset : int, optional
        Value added to every index (default 0).

    Returns
    -------
    DataFrame
        Created through the notebook-level ``spark`` session; the index column
        is a nullable ``LongType`` followed by the original fields of `df`.
    """
    index_field = StructField(col_name, LongType(), True)
    indexed_schema = StructType([index_field] + df.schema.fields)

    def prepend_index(pair):
        # zipWithIndex yields (row, position) pairs
        row, position = pair
        return [position + offset] + list(row)

    indexed_rows = df.rdd.zipWithIndex().map(prepend_index)
    return spark.createDataFrame(indexed_rows, indexed_schema)

In [5]:
# Users: add a numeric "user_index" column next to the string user_id.

user_new_df = df_zip_with_index(df=user, col_name="user_index")
user_new_df.show(n=2)

# Lookup table between the original user_id and the generated user_index.
user_index_df = user_new_df.select(["user_id", "user_index"])
user_index_df.show(n=2)

+----------+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+--------------------+-----+----+------------+------+--------------------+-------------------+
|user_index|average_stars|compliment_cool|compliment_cute|compliment_funny|compliment_hot|compliment_list|compliment_more|compliment_note|compliment_photos|compliment_plain|compliment_profile|compliment_writer| cool|               elite|fans|             friends|funny|name|review_count|useful|             user_id|      yelping_since|
+----------+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+--------------------+-----+----+------------+------+--------------------+-------------

In [6]:
# Businesses: add a numeric "business_index" column next to the string business_id.

business_new_df = df_zip_with_index(df=business, col_name="business_index")
business_new_df.show(n=2)

# Lookup table between the original business_id and the generated business_index.
business_index_df = business_new_df.select(["business_id", "business_index"])
business_index_df.show(n=2)

+--------------+-------------------+--------------------+--------------------+--------------------+--------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|business_index|            address|          attributes|         business_id|          categories|    city|               hours|is_open|     latitude|      longitude|                name|postal_code|review_count|stars|state|
+--------------+-------------------+--------------------+--------------------+--------------------+--------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|             0|       921 Pearl St|{null, null, 'bee...|6iYb2HFDywm3zjuRg...|Gastropubs, Food,...| Boulder|{11:0-23:0, 11:0-...|      1|   40.0175444|   -105.2833481| Oskar Blues Taproom|      80302|          86|  4.0|   CO|
|             1|7000 NE Airport Way|{null, null, u'be...|tCbdrRPZA0oiIYSmH...|Salad, Soup, Sand.

In [7]:
# Attach the numeric user_index / business_index to every review via the two lookup tables.

review_df = review.select(["user_id", "business_id", "stars"])
review_new_df = (
    review_df
    .join(user_index_df, on="user_id")
    .join(business_index_df, on="business_id")
)
review_new_df.show(n=2)

+--------------------+--------------------+-----+----------+--------------+
|         business_id|             user_id|stars|user_index|business_index|
+--------------------+--------------------+-----+----------+--------------+
|-36nnCT71XE0InJXK...|ofyOSbE04KsDt7e9T...|  1.0|    300867|         96101|
|-36nnCT71XE0InJXK...|oLd1zN3dy5cmPCCnd...|  2.0|     85243|         96101|
+--------------------+--------------------+-----+----------+--------------+
only showing top 2 rows



### Build ALS Model

In [8]:
# Build the (user_index, business_index, rating) triples used as ALS input;
# the review's "stars" value is cast to float and exposed as "rating".

rating_df = review_new_df.select(
    "user_index",
    "business_index",
    col("stars").cast("float").alias("rating"),
)
rating_df.show(n=2)
print("rating matrix count", rating_df.count())
rating_df.printSchema()
rating_df.limit(2).toPandas()

+----------+--------------+------+
|user_index|business_index|rating|
+----------+--------------+------+
|    300867|         96101|   1.0|
|     85243|         96101|   2.0|
+----------+--------------+------+
only showing top 2 rows

rating matrix count 8635403
root
 |-- user_index: long (nullable = true)
 |-- business_index: long (nullable = true)
 |-- rating: float (nullable = true)



Unnamed: 0,user_index,business_index,rating
0,300867,96101,1.0
1,85243,96101,2.0


In [9]:
# 80% / 20% train/test split of the ratings, using a fixed seed.
train, test = rating_df.randomSplit(weights=[0.8, 0.2], seed=1)

In [10]:
"""
print("start tuning ALS model")

als = ALS(userCol="user_index", itemCol="business_index", ratingCol="rating", coldStartStrategy="drop")
param_grid = ParamGridBuilder()\
    .addGrid(als.rank,[10, 15, 20])\
    .addGrid(als.maxIter,[10, 15, 20])\
    .build()
evaluator = RegressionEvaluator(metricName="rmse",labelCol="rating")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=1)
cv_als_model = cv.fit(train)

als_predictions = cv_als_model.bestModel.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(als_predictions)
print("rmse: " + str(rmse))

best_model = cv_als_model.bestModel
#best_rank is 20
best_model.rank
#best_maxIter is 20
best_model._java_obj.parent().getMaxIter()
# rmse is 1.3383
print("end tuning ALS model")

print("rank:", best_model._java_obj.parent().getRank())
print("maxIter:", best_model._java_obj.parent().getMaxIter())
print("regParam:", best_model._java_obj.parent().getRegParam())
"""

'\nprint("start tuning ALS model")\n\nals = ALS(userCol="user_index", itemCol="business_index", ratingCol="rating", coldStartStrategy="drop")\nparam_grid = ParamGridBuilder()    .addGrid(als.rank,[10, 15, 20])    .addGrid(als.maxIter,[10, 15, 20])    .build()\nevaluator = RegressionEvaluator(metricName="rmse",labelCol="rating")\n\ncv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=1)\ncv_als_model = cv.fit(train)\n\nals_predictions = cv_als_model.bestModel.transform(test)\nevaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")\nrmse = evaluator.evaluate(als_predictions)\nprint("rmse: " + str(rmse))\n\nbest_model = cv_als_model.bestModel\n#best_rank is 20\nbest_model.rank\n#best_maxIter is 20\nbest_model._java_obj.parent().getMaxIter()\n# rmse is 1.3383\nprint("end tuning ALS model")\n\nprint("rank:", best_model._java_obj.parent().getRank())\nprint("maxIter:", best_model._java_obj.parent().ge

In [11]:
"""
print("start building ALS model")

als = ALS(rank=20, maxIter=20, regParam=0.3, userCol="user_index", itemCol="business_index", ratingCol="rating", \
               coldStartStrategy="drop", seed=1)
als_model = als.fit(train)

als_predictions = als_model.transform(test)
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
# rmse = evaluator.evaluate(als_predictions)
# print("rmse: " + str(rmse))

# save the ALS model
als_model.write().overwrite().save("als_model")
# rmse is 1.2579

print("end building ALS model")
"""

'\nprint("start building ALS model")\n\nals = ALS(rank=20, maxIter=20, regParam=0.3, userCol="user_index", itemCol="business_index", ratingCol="rating",                coldStartStrategy="drop", seed=1)\nals_model = als.fit(train)\n\nals_predictions = als_model.transform(test)\n# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")\n# rmse = evaluator.evaluate(als_predictions)\n# print("rmse: " + str(rmse))\n\n# save the ALS model\nals_model.write().overwrite().save("als_model")\n# rmse is 1.2579\n\nprint("end building ALS model")\n'

In [12]:
# Restore the ALS model stored under the relative path "als_model".
als_model = ALSModel.read().load("als_model")

In [13]:
# Ask the model for the top 10 businesses for every user and preview two rows.

user_recommendations = als_model.recommendForAllUsers(numItems=10)
user_recommendations.show(n=2)

+----------+--------------------+
|user_index|     recommendations|
+----------+--------------------+
|       148|[{60336, 4.452312...|
|       463|[{159581, 4.88538...|
+----------+--------------------+
only showing top 2 rows



In [25]:
# Add the original user_id to each recommendation row, cache the result, and show a sample.

all_user_recommendations = (
    user_recommendations
    .join(user_index_df, on="user_index", how="inner")
    .cache()
)
all_user_recommendations.show(n=2, truncate=False)

+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|user_index|recommendations                                                                                                                                                                                      |user_id               |
+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|26        |[{430, 4.8179026}, {38749, 4.6943216}, {3629, 4.6805577}, {53050, 4.632521}, {118534, 4.621344}, {87959, 4.6168857}, {152872, 4.607834}, {126090, 4.597147}, {136161, 4.5937386}, {150086, 4.591945}]|EeKFz5FcrlajMbm0tyFn7Q|
|29        |[{155142, 5.5051017}, {29716, 5.3834133}, {71295, 5.

In [42]:
# Persist the recommendations as parquet. Overwrite mode lets this cell be
# re-run: without it the save raises once the output path already exists.
all_user_recommendations.write.format("parquet").mode("overwrite").save("part6_all_user_recommendations.parquet")

In [43]:
# Test load: read the saved recommendations back and check rows and schema.
# The parquet format is named explicitly (as the writer does) instead of
# relying on the session's default data source.
loaded_user_recommendations = spark.read.format("parquet").load("part6_all_user_recommendations.parquet")
loaded_user_recommendations.show(3)
loaded_user_recommendations.printSchema()

+----------+--------------------+--------------------+
|user_index|     recommendations|             user_id|
+----------+--------------------+--------------------+
|        74|[{29716, 5.204533...|2-qjCTBWu0E892IUf...|
|       280|[{159581, 4.95877...|g1v5bMkJlg63n_wZz...|
|       322|[{67031, 5.239844...|hDhX63HdUlGNl3DBe...|
+----------+--------------------+--------------------+
only showing top 3 rows

root
 |-- user_index: integer (nullable = true)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- business_index: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)
 |-- user_id: string (nullable = true)



In [44]:
def get_collaborative_recommendation(user_id):
    """Return the businesses recommended for one user, best predicted rating first.

    Parameters
    ----------
    user_id : str
        Original Yelp ``user_id`` to look up in ``loaded_user_recommendations``.

    Returns
    -------
    DataFrame
        One row per recommended business: ``business_index``, the predicted
        ``rating``, then the remaining columns of ``business_new_df``, ordered
        by ``rating`` descending. The result is empty when no stored
        recommendations match `user_id`.

    Notes
    -----
    Reads the notebook-level ``loaded_user_recommendations`` and
    ``business_new_df`` DataFrames.
    """
    recommended_business = (
        loaded_user_recommendations
        .filter(col("user_id") == user_id)
        # Explode the array by column name instead of going through an RDD
        # with an inferred schema: that inference fails on an empty result
        # (unknown user) and depended on the column's position in the row.
        .select(explode("recommendations").alias("recommendation"))
        .select("recommendation.business_index", "recommendation.rating")
    )
    return (
        recommended_business
        .join(business_new_df, "business_index", "inner")
        .orderBy("rating", ascending=False)
    )
    

In [45]:
# test case

# Use a dedicated name here: assigning the id string to `user` would replace
# the users DataFrame loaded at the top of the notebook and break any re-run
# of the cells that read it.
test_user_id = "EeKFz5FcrlajMbm0tyFn7Q"
get_collaborative_recommendation(test_user_id).toPandas()




Unnamed: 0,business_index,rating,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,430,4.817903,"5339 N IH-35, Ste 100","(None, None, None, None, None, None, None, Non...",e3SRzV_dN_t5ZQW9TtK1qQ,"Family Practice, Doctors, Health & Medical",Austin,,1,30.198704,-97.761154,"Jeffrey F Bocchicchio, MD",78723,6,5.0,TX
1,38749,4.694322,"2600 Longhorn Blvd, Ste 107",,Yxx6A4Yj7qZAzV8qe78nyw,"Auto Repair, Automotive, Tires",Austin,"(9:0-15:0, 9:0-15:0, None, None, 9:0-15:0, 9:0...",0,30.380994,-97.725697,SoulSpeed Performance,78758,7,5.0,TX
2,3629,4.680558,2210 S 1st Street,"(None, None, None, None, None, None, None, Non...",Ss0eE3EYBBXX_U8_uJZxYA,"Sporting Goods, Bikes, Shopping",Austin,"(10:0-19:0, 10:0-19:0, 10:0-19:0, None, 10:0-1...",0,30.244053,-97.758649,Southside Bicycle Shop,78704,9,4.0,TX
3,53050,4.632521,9926 Circle Dr,"(None, None, u'beer_and_wine', None, None, Non...",BxcvlSGdj8alNt-yX7mYQg,"Bars, Karaoke, Active Life, Nightlife, Golf",Austin,,0,30.246176,-97.927125,Circle Country Club,78736,5,5.0,TX
4,118534,4.621344,,"(None, None, None, None, None, None, None, Non...",8suLHIUEGMLf4oOJ4f5Ziw,"Shades & Blinds, Building Supplies, Solar Inst...",Austin,"(8:0-19:0, 8:0-19:0, None, None, 8:0-19:0, 8:0...",1,30.353932,-97.736056,Josh Hobbs Solar Screen Services,78766,27,5.0,TX
5,87959,4.616886,,"(None, None, None, None, None, None, None, Non...",vt_sr5QqKr8R6jk0lgGwZA,"Food, Desserts",Austin,,0,30.272921,-97.744386,Two Gals Cupcakery,78701,5,5.0,TX
6,152872,4.607834,,"(None, None, None, None, None, None, None, Non...",87ls7-pCdQ7IQY-0Oscbiw,"Eyelash Service, Beauty & Spas, Nail Salons",Austin,"(10:0-19:0, None, 10:0-18:0, None, 10:0-19:0, ...",0,30.168207,-97.81776,Nail Lounge By Chandra White,78748,10,4.5,TX
7,126090,4.597147,8504 South Congress,"(None, None, u'beer_and_wine', {'romantic': Fa...",Oq5s3TSuJAPokLKbc7RkMA,"Pizza, Restaurants",Austin,"(17:0-1:0, 17:0-1:0, 17:0-1:0, 17:0-1:0, 17:0-...",0,30.172641,-97.786324,Via 313 Pizza,78745,19,5.0,TX
8,136161,4.593739,"4207 James Casey St, Ste 115","(None, None, None, None, None, None, None, Non...",xraLRVJAK13RCXwThLu4lw,"Acupuncture, Health & Medical",Austin,"(9:0-18:0, 9:0-18:0, None, None, 9:0-18:0, 9:0...",1,30.224732,-97.773718,AcuTouch Acupuncture and Herb Center,78745,8,5.0,TX
9,150086,4.591945,6001 Techni Center Dr,"(None, None, None, None, None, None, None, Tru...",SA1PCjoASj60IKSKiE6Hhg,"Food, CSA",Austin,"(8:0-15:0, 8:0-15:0, None, None, 8:0-15:0, 8:0...",1,30.276598,-97.671343,Good Flow Honey,78721,10,5.0,TX
