# Part 8 Hybrid Recommendation

In [14]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *
import numpy as np

In [15]:
# Create (or reuse) the Spark session used by every cell below.
# NOTE(review): "spark.executor.heartbeatInterval" is given without a unit
# suffix, and "spark.local.dir" points at a machine-specific Windows path —
# confirm both before running elsewhere.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.sql.broadcastTimeout", "1200")
    .config("spark.default.parallelism", "32")
    .config("spark.executor.heartbeatInterval", "3600")
    .config("spark.local.dir", "D:\\Data")
    .appName("part8")
    .getOrCreate()
)

In [16]:
# Load the three CSV inputs. No schema is supplied or inferred here, so the
# columns are read with Spark's default CSV typing.
review = spark.read.csv(
    path="part4_topicmodeling_review.csv",
    header=True,
    multiLine=True,
)
business = spark.read.csv(
    path="part1_dataclean_business.csv",
    header=True,
    multiLine=True,
)
user = spark.read.csv(
    path="part1_dataclean_user.csv",
    header=True,
    multiLine=True,
)

In [17]:
# define utility functions

def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vector1: first vector (anything ``np.dot`` accepts).
        vector2: second vector, same length as ``vector1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of the NaN that
        a 0/0 division would produce, so callers can sort on the result.
    """
    norm1 = np.sqrt(np.dot(vector1, vector1))
    norm2 = np.sqrt(np.dot(vector2, vector2))
    # Guard the division: a zero vector has no direction to compare.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(vector1, vector2) / norm1 / norm2

### Prepare Content Based Filtering

In [18]:
# Load the fitted pipeline model; it is used later to turn free-text
# keywords into a vector.
pipeline_model = PipelineModel.load("content_based_pipeline_model")


# Load every business vector and preview a few rows.
all_business_vectors_df = spark.read.load("part5_all_business_vectors.parquet")
all_business_vectors_df.show(3)

# Pull the first two columns of every row to the driver as plain tuples so
# the similarity helpers can iterate over them in Python.
all_business_vectors = (
    all_business_vectors_df.rdd
    .map(lambda row: (row[0], row[1]))
    .collect()
)
print(all_business_vectors[1][1])

def get_similar_business(business_ids, limit=10):
    """Return the businesses most similar to any of ``business_ids``.

    For each input id that has a vector in the driver-side
    ``all_business_vectors`` list, every stored business is scored by cosine
    similarity against that vector, the input id itself is removed, and the
    top ``limit`` rows are kept. The per-id results are unioned and the
    overall top ``limit`` rows by score are returned.

    Args:
        business_ids: iterable of business ids to find neighbours for. Ids
            without a stored vector are skipped.
        limit: maximum number of rows kept per input id and in the result.

    Returns:
        A Spark DataFrame with columns ``business_id`` and ``score``, sorted
        by ``score`` descending. It is empty when no input id has a vector.
        A business close to several input ids can appear more than once, and
        only the id currently being processed is filtered out of its own
        neighbour list.
    """
    # Scores are cosine similarities (floats), so the column is a double.
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("score", DoubleType(), True),
    ])
    similar_businesses_df = spark.createDataFrame([], schema)

    for b_id in business_ids:
        # First stored vector for this id, or None when the id is unknown.
        input_vec = next(
            (vec for bid, vec in all_business_vectors if bid == b_id), None)
        if input_vec is None:
            continue

        scored = [(bid, float(cosine_similarity(input_vec, vec)))
                  for bid, vec in all_business_vectors]
        similar_business_df = spark.createDataFrame(
            spark.sparkContext.parallelize(scored), ["business_id", "score"]) \
            .orderBy("score", ascending=False)

        # A business is trivially most similar to itself; drop it.
        similar_business_df = similar_business_df \
            .filter(col("business_id") != b_id).limit(limit)
        similar_businesses_df = similar_businesses_df.union(similar_business_df)

    return similar_businesses_df.orderBy("score", ascending=False).limit(limit)

def get_user_recommendation(user_id, limit = 10):
    """Recommend businesses similar to ones the user reviewed favourably.

    Takes the user's reviews with ``superscore >= 3.0``, draws an unseeded
    50% sample without replacement, keeps at most five of them, and asks
    ``get_similar_business`` for neighbours of those businesses. Because the
    sample is unseeded, repeated calls can return different results.

    Args:
        user_id: id of the user to recommend for.
        limit: maximum number of recommendations.

    Returns:
        The DataFrame produced by ``content_based_get_details`` for the
        similar businesses.
    """
    # Log the requested id (not the global `user` DataFrame).
    print(f"[content based filtering] user id: {user_id}")
    user_reviewed_businesses = (
        review.filter((col("user_id") == user_id) & (col("superscore") >= 3.0))
        .sample(False, 0.5)
        .limit(5)
        .select("business_id")
    )
    business_ids = [row.business_id for row in user_reviewed_businesses.collect()]
    print(f"[content based filtering] user reviewed businesses: {business_ids}")
    return content_based_get_details(get_similar_business(business_ids, limit))

def get_keywords_recommendation(keywords, limit=10):
    """Recommend the businesses whose vectors best match free-text keywords.

    The keywords are wrapped in a one-row DataFrame, run through
    ``pipeline_model`` to obtain its ``word_vector`` output, and compared by
    cosine similarity with every entry of ``all_business_vectors``.

    Args:
        keywords: text to search for.
        limit: maximum number of businesses to return.

    Returns:
        The DataFrame produced by ``content_based_get_details`` for the
        ``limit`` highest-scoring businesses.
    """
    print(f"[content based filtering] keywords: {keywords}")
    # The pipeline expects a `text` column; the business_id of 0 is only a
    # placeholder for this synthetic row.
    keywords_df = spark.sparkContext.parallelize([(0, keywords)]).toDF(["business_id", "text"])

    # Transform the keywords to a vector.
    transformed_keywords_df = pipeline_model.transform(keywords_df)
    keywords_vector = transformed_keywords_df.select("word_vector").collect()[0][0]

    # Score every stored business against the keyword vector on the driver.
    scored = [(bid, float(cosine_similarity(keywords_vector, vec)))
              for bid, vec in all_business_vectors]

    # Keep only the `limit` most similar businesses.
    sim_bus_byword_df = spark.createDataFrame(
        spark.sparkContext.parallelize(scored), ["business_id", "score"]) \
        .orderBy("score", ascending=False) \
        .limit(limit)

    return content_based_get_details(sim_bus_byword_df)

def content_based_get_details(recommended_business):
    """Attach business details to scored recommendations.

    Inner-joins ``recommended_business`` with the ``business`` DataFrame on
    ``business_id`` and returns the rows ordered by ``score``, highest first.
    """
    with_details = recommended_business.join(business, "business_id", "inner")
    return with_details.orderBy("score", ascending=False)

+--------------------+--------------------+
|         business_id|         word_vector|
+--------------------+--------------------+
|0OLouFEhEm3pIh4Sx...|[0.00706271847664...|
|M4zae56hA6sb6V2Ty...|[-0.0525487022407...|
|W60O4ast9uAq03n7n...|[-0.0027295319850...|
+--------------------+--------------------+
only showing top 3 rows

[-0.05254870224070885,0.03538502551234829,-0.03854379182152756,-0.053777138675465165,0.11863246697258918,-0.042976504449101866,-0.06302669620455528,-0.010523899725802371,-0.05600440347677558,-0.051089978157957947,-0.12485574790768786,-0.01871904768133106,-0.01511064755262284,-0.012473998580663153,-0.0065393127728708925,-0.018541000201754955,0.026146981704871602,-0.0016333255095140166,0.019860426191859133,-0.02298519021158624,-0.06866251518718708,-0.04173724704121691,0.058374670250007615,-0.033756228102148386,0.036534135101342076,0.031458382954585626,0.014802017625441538,0.04208449111690074,0.05030599064622714,0.02105374494679037,-0.0195642894819269,0.00341606

In [None]:
# verify the loading process for content based
# test case 1: neighbours of two explicit business ids, with details attached

(
    content_based_get_details(
        get_similar_business(["Npm0cjoyWwyV13OULL9qOA", "ZKPqinA-7gkDA8-yzubSYw"])
    )
    .toPandas()
)

In [None]:
# test case 2: content-based recommendations for a single user id.
# The output can differ between runs because get_user_recommendation samples
# the user's reviews without a seed.

get_user_recommendation("uLhdaZUsVvT0gbNTdOSzDg").toPandas()

In [None]:
# test case 3: content-based recommendations for a free-text keyword.

get_keywords_recommendation("burger").toPandas()


### Prepare Collaborative Filtering 

In [None]:
# Per-user recommendation lists; cached because each call to
# get_collaborative_recommendation filters this DataFrame again.
loaded_user_recommendations = spark.read.load("part6_all_user_recommendations.parquet").cache()
loaded_user_recommendations.show(3)
loaded_user_recommendations.printSchema()

# Business rows carrying the `business_index` column that the recommendations
# are joined on.
business_new_df = spark.read.load("part6_business_with_index.parquet")
business_new_df.show(2)

def get_collaborative_recommendation(user_id):
    """Return the stored collaborative-filtering recommendations for a user.

    The second column of ``loaded_user_recommendations`` holds the user's
    recommendation list. It is exploded into one row per recommendation and
    joined with ``business_new_df`` on ``business_index``.

    Args:
        user_id: id of the user to look up.

    Returns:
        A Spark DataFrame of business rows plus the recommendation fields,
        ordered by ``rating`` descending. It is empty (rather than raising)
        when the user has no stored recommendations.
    """
    # The list column is addressed by position, not by name.
    # NOTE(review): assumes it is an array of structs with `business_index`
    # and `rating` fields — confirm against the printSchema() output.
    recs_column = loaded_user_recommendations.columns[1]
    recommended_business = (
        loaded_user_recommendations
        .filter(col("user_id") == user_id)
        .select(explode(col(recs_column)).alias("recommendation"))
        .select("recommendation.*")
    )
    return business_new_df.join(recommended_business, "business_index", "inner")\
             .orderBy("rating", ascending = False)

# Smoke test: take one user id from a seeded sample (fraction 0.01) and show
# that user's collaborative recommendations.
# NOTE(review): head(1)[0] raises IndexError if the sample comes back empty.
random_user = loaded_user_recommendations.sample(0.01, seed=1).head(1)[0].user_id
print(f"[collaborative filtering] test user id: {random_user}")
get_collaborative_recommendation(random_user).toPandas()

### Prepare Friend Recommendation

In [None]:
# Only the user id and the comma-separated `friends` string are needed for
# friend-based recommendations.
user_friends = user.select("user_id", "friends")

def get_friends_recommendation(user_id, limit=10):
    """Recommend businesses that the user's friends reviewed favourably.

    Reads the user's comma-separated ``friends`` string, finds reviews by
    those friends with ``superscore >= 3.0``, and keeps the ``limit``
    businesses with the highest friend ``superscore``.

    Args:
        user_id: id of the user to recommend for.
        limit: maximum number of businesses to return.

    Returns:
        The DataFrame produced by ``get_business_details`` for the selected
        businesses, or ``None`` when the user is unknown, has no friends
        listed, or no friend has a qualifying review.
    """
    # Local alias so the aggregate does not depend on star-imported names
    # that shadow builtins such as `max`.
    from pyspark.sql import functions as F

    # One job instead of count() followed by collect().
    friends_row = (
        user_friends.filter(col("user_id") == user_id).select("friends").first()
    )
    if friends_row is None:
        return None
    # A null `friends` value is treated the same as an empty string.
    friend_str = (friends_row[0] or "").replace(" ", "")
    if friend_str == "":
        return None
    friend_list = friend_str.split(",")

    # Rank each business by its best friend superscore. Aggregating before
    # sorting gives one row per business while keeping the ranking, which an
    # orderBy followed by distinct() does not guarantee. The cast makes the
    # sort numeric even though the CSV was loaded without a schema.
    top_businesses = (
        review.filter(
            (col("superscore") >= 3.0) & (col("user_id").isin(friend_list)))
        .groupBy("business_id")
        .agg(F.max(col("superscore").cast("double")).alias("best_superscore"))
        .orderBy(F.col("best_superscore").desc(), "business_id")
        .limit(limit)
        .collect()
    )
    if not top_businesses:
        return None
    business_list = [row.business_id for row in top_businesses]
    return get_business_details(business_list)

def get_business_details(business_list):
    """Return the ``business`` rows whose id is in ``business_list``.

    Rows are ordered by the ``stars`` column, highest first.
    """
    is_requested = col("business_id").isin(business_list)
    matching = business.filter(is_requested)
    return matching.orderBy("stars", ascending=False)
# Smoke test for the friend-based recommender.
# NOTE(review): get_friends_recommendation returns None when the user is
# unknown or has no usable friends/reviews; .toPandas() would then raise
# AttributeError.
get_friends_recommendation("uLhdaZUsVvT0gbNTdOSzDg").toPandas()