# Part 6 Collaborative Filtering

### Setup

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession


In [None]:
# Build (or reuse) the Spark session used by every cell of this notebook.
# All options are passed as plain strings exactly as written here.
# NOTE(review): unit-less values (e.g. the heartbeat interval and broadcast
# timeout) are interpreted in whatever default unit Spark defines for that
# key -- confirm against the Spark configuration reference.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.sql.broadcastTimeout", "1200")
    .config("spark.default.parallelism", "180")
    .config("spark.executor.heartbeatInterval", "3600")
    .appName("part6")
    .getOrCreate()
)

In [None]:
# Load the three Yelp JSON datasets (business, review, user), in that order.
business, review, user = (
    spark.read.json(dataset_path)
    for dataset_path in (
        "yelp_academic_dataset_business.json",
        "yelp_academic_dataset_review.json",
        "yelp_academic_dataset_user.json",
    )
)


### Prepare Data for ALS


In [None]:
# prepare user
# ALS needs integer ids, so give every user_id a sequential integer index.
# The id is read by column NAME rather than by position (x[0]): the column
# order of a DataFrame loaded with spark.read.json is decided by schema
# inference, so position 0 is not guaranteed to be user_id.
user_index_df = spark.createDataFrame(
    user.select("user_id").rdd.map(lambda row: row["user_id"]).zipWithIndex(),
    StructType([
        StructField("user_id", StringType(), True),
        StructField("user_index", IntegerType(), True),
    ]),
)

user_index_df.show(1)

# Keep every original user column and append the new integer index.
a = user.alias("a")
b = user_index_df.alias("b")
user_new_df = a.join(b, col("a.user_id") == col("b.user_id"), "inner") \
    .select([col("a." + x) for x in a.columns] + [col("b.user_index")])
user_new_df.select("user_index", "user_id", "user_name").show(2)

In [None]:
# prepare business
# ALS needs integer ids, so give every business_id a sequential integer index.
# The id is read by column NAME rather than by position (x[0]): the column
# order of a DataFrame loaded with spark.read.json is decided by schema
# inference, so position 0 is not guaranteed to be business_id.
business_index_df = spark.createDataFrame(
    business.select("business_id").rdd.map(lambda row: row["business_id"]).zipWithIndex(),
    StructType([
        StructField("business_id", StringType(), True),
        StructField("business_index", IntegerType(), True),
    ]),
)

business_index_df.show(1)

# Keep every original business column and append the new integer index.
a = business.alias("a")
b = business_index_df.alias("b")
business_new_df = a.join(b, col("a.business_id") == col("b.business_id"), "inner") \
    .select([col("a." + x) for x in a.columns] + [col("b.business_index")])
business_new_df.select("business_index", "business_id", "business_name").show(1)

In [None]:
# Attach the integer user and business indices to every review row.

# Only the three columns needed to build ratings are kept.
review_df = review.select("user_id", "business_id", "stars")

# Inner join on user_id: adds user_index to each review.
review_user_index_df = (
    review_df
    .join(user_index_df, on="user_id", how="inner")
    .select("user_id", "business_id", "stars", "user_index")
)

# Inner join on business_id: adds business_index; the explicit select keeps
# the column order user_id, business_id, stars, user_index, business_index.
review_new_df = (
    review_user_index_df
    .join(business_index_df, on="business_id", how="inner")
    .select("user_id", "business_id", "stars", "user_index", "business_index")
)
review_new_df.show(2)

### Build ALS Model

In [None]:
# Build the (user_index, business_index, rating) table that ALS is trained on;
# the review stars are cast to float and exposed under the name "rating".
rating_df = review_new_df.select(
    "user_index",
    "business_index",
    col("stars").cast("float").alias("rating"),
)

# Quick sanity checks: one sample row, total row count, schema, pandas preview.
rating_df.show(1)
print("rating matrix count", rating_df.count())
rating_df.printSchema()
rating_df.limit(1).toPandas()

In [None]:
# Split the ratings 80% / 20% into training and test sets; the fixed seed is
# passed so the split is requested with the same random seed on every run.
(train, test) = rating_df.randomSplit([0.8, 0.2], seed=1)

# The triple-quoted string below is NOT executed. It preserves the grid search
# (CrossValidator over rank and maxIter, 5 folds) that was used to pick the
# hyper-parameters, together with the results noted at the time:
# best rank 20, best maxIter 20, rmse 1.3383.
"""
als = ALS(userCol="user_index", itemCol="business_index", ratingCol="rating", coldStartStrategy="drop")
param_grid = ParamGridBuilder().addGrid(als.rank,[10, 15, 20])\
    .addGrid(als.maxIter,[10, 15, 20]).build()
evaluator = RegressionEvaluator(metricName="rmse",labelCol="rating")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=123)
cv_als_model = cv.fit(train)

als_predictions = cv_als_model.bestModel.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(als_predictions)
print("rmse: " + str(rmse))

best_model = cv_als_model.bestModel
#best_rank is 20
best_model.rank
#best_maxIter is 20
best_model._java_obj.parent().getMaxIter()
# rmse is 1.3383
"""

In [None]:
# Train the final ALS model. rank and maxIter match the values recorded from
# the earlier grid search; rows whose prediction would be NaN are dropped
# (coldStartStrategy="drop") so the RMSE below stays defined.
als = ALS(
    userCol="user_index",
    itemCol="business_index",
    ratingCol="rating",
    rank=20,
    maxIter=20,
    regParam=0.3,
    coldStartStrategy="drop",
    seed=1,
)
als_model = als.fit(train)

# Score the held-out test set and report the root-mean-square error.
als_predictions = als_model.transform(test)
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(als_predictions)
print("rmse: " + str(rmse))

# Persist the fitted model, replacing any previously saved copy.
als_model.write().overwrite().save("als_model")
# rmse is 1.2579

In [None]:
# Load the ALS model from the "als_model" path (the same path the training
# cell saves to), so the cells below can run without refitting the model.
als_model = ALSModel.load("als_model")

In [None]:
# Ask the model for the 10 highest-scored businesses of every user.

user_recommendations = als_model.recommendForAllUsers(numItems=10)

In [None]:
# Add the user_id column, cache the recommendation DataFrame and show a
# sample of the recommendations.

# Inner join on user_index; the select keeps the model's own columns first
# and appends user_id as the last column.
all_user_recommendations = (
    user_recommendations
    .join(user_index_df, on="user_index", how="inner")
    .select(*user_recommendations.columns, "user_id")
)

all_user_recommendations.cache()
all_user_recommendations.show(1, truncate=False)

In [None]:
def get_collaborative_recommendation(user_id):
    """Return the ALS recommendations of one user with business details.

    Filters the notebook-level ``all_user_recommendations`` DataFrame by
    ``user_id``, unnests its ``recommendations`` array into one row per
    recommended business, and joins those rows with ``business_new_df`` on
    ``business_index``.

    Args:
        user_id: The user id to look up (compared against the ``user_id``
            column of ``all_user_recommendations``).

    Returns:
        A DataFrame with the columns business_id, rating, business_name,
        categories, stars, review_count, latitude and longitude, ordered by
        rating in descending order. The DataFrame is empty when the user has
        no recommendation row.
    """
    # Unnest the array column inside Spark instead of going through
    # rdd.flatMap + createDataFrame: that route needs an extra job to infer
    # the schema and raises ValueError when the filtered RDD is empty
    # (i.e. for a user_id that has no recommendations).
    recommended_business = (
        all_user_recommendations
        .filter(col("user_id") == user_id)
        .select(explode(col("recommendations")).alias("rec"))
        .select(col("rec.business_index"), col("rec.rating"))
    )

    a = recommended_business.alias("a")
    b = business_new_df.alias("b")

    return a.join(b, col("a.business_index") == col("b.business_index"), "inner") \
        .select([col("b.business_id"), col("a.rating"), col("b.business_name"), col("b.categories"),
                 col("b.stars"), col("b.review_count"),
                 col("b.latitude"), col("b.longitude")]) \
        .orderBy("rating", ascending=False)
    

In [None]:
# test case
# A dedicated variable name is used on purpose: assigning the id to `user`
# would overwrite the `user` DataFrame loaded at the top of the notebook and
# break any later re-run of the "prepare user" cell.
test_user_id = "ZWD8UH1T7QXQr0Eq-mcWYg"
get_collaborative_recommendation(test_user_id).toPandas()


