# Part 8 Hybrid Recommendation

In [12]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *
import numpy as np
import folium
import html

In [13]:
# Build (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("part8")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    # 0 means no cap on the size of results collected to the driver (per Spark configuration docs).
    .config("spark.driver.maxResultSize", "0")
    # -1 turns automatic broadcast joins off (per Spark SQL configuration docs).
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.sql.broadcastTimeout", "1200")
    .config("spark.default.parallelism", "32")
    # NOTE(review): no unit suffix is given here; confirm the intended unit.
    .config("spark.executor.heartbeatInterval", "3600")
    # NOTE(review): machine-specific scratch directory; adjust when running elsewhere.
    .config("spark.local.dir", "D:\\Data")
    .getOrCreate()
)

In [14]:
# Each input is a CSV with a header row; multiLine lets quoted fields span several lines.
# No schema is supplied or inferred here.
review = spark.read.csv(
    path="part4_topicmodeling_review.csv",
    header=True,
    multiLine=True,
)
# `business` is cached because several functions below join or filter against it.
business = spark.read.csv(
    path="part1_dataclean_business.csv",
    header=True,
    multiLine=True,
).cache()
user = spark.read.csv(
    path="part1_dataclean_user.csv",
    header=True,
    multiLine=True,
)

In [15]:
# define utility functions

def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two equal-length numeric vectors.

    Args:
        vector1: First vector (anything ``np.dot`` accepts).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero length.
        Without that guard the division yields NaN, and Spark orders NaN
        above every real number, so an all-zero vector would rank first in
        a descending sort on the score.
    """
    norm1 = np.sqrt(np.dot(vector1, vector1))
    norm2 = np.sqrt(np.dot(vector2, vector2))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(vector1, vector2) / norm1 / norm2
def display_in_map(business_df, center=(45.5, -122.5), zoom_start=10):
    """Plot every business in ``business_df`` as a green marker on a folium map.

    Args:
        business_df: DataFrame exposing ``toPandas()`` whose rows carry
            ``name``, ``stars``, ``review_count``, ``latitude`` and
            ``longitude``.
        center: ``(latitude, longitude)`` the map is initially centred on.
        zoom_start: Initial folium zoom level.

    Returns:
        The populated ``folium.Map``. Rows without coordinates are skipped.
    """
    folium_map = folium.Map(location=list(center), zoom_start=zoom_start)
    for _, r in business_df.toPandas().iterrows():
        # A marker cannot be placed without both coordinates; skip such rows
        # instead of aborting the whole map.
        if pd.isna(r.latitude) or pd.isna(r.longitude):
            continue
        # `r["name"]` rather than `r.name`: on a pandas row, `.name` is the index label.
        # str() keeps html.escape from failing on a missing (non-string) name.
        popup_html = (
            html.escape(str(r["name"]))
            + '<br>' + 'stars: ' + str(r.stars)
            + '<br>' + 'reviews count: ' + str(r.review_count)
        )
        folium.Marker(
            location=[float(r.latitude), float(r.longitude)],
            popup=popup_html,
            icon=folium.Icon(color='green')).add_to(folium_map)
    return folium_map

### Prepare Content Based Filtering

In [16]:
# Load the fitted pipeline used to turn free text into a word vector.
pipeline_model = PipelineModel.load("content_based_pipeline_model")

# Load the precomputed vector of every business and keep the DataFrame cached.
all_business_vectors_df = (
    spark.read
    .load("part5_all_business_vectors.parquet")
    .cache()
)
all_business_vectors_df.show(3)

# Bring the vectors to the driver as plain (first column, second column) tuples
# so the similarity functions can iterate over them in Python.
all_business_vectors = [(row[0], row[1]) for row in all_business_vectors_df.collect()]
print(all_business_vectors[1][1])

+--------------------+--------------------+
|         business_id|         word_vector|
+--------------------+--------------------+
|0OLouFEhEm3pIh4Sx...|[0.00706271847664...|
|M4zae56hA6sb6V2Ty...|[-0.0525487022407...|
|W60O4ast9uAq03n7n...|[-0.0027295319850...|
+--------------------+--------------------+
only showing top 3 rows

[-0.05254870224070885,0.03538502551234829,-0.03854379182152756,-0.053777138675465165,0.11863246697258918,-0.042976504449101866,-0.06302669620455528,-0.010523899725802371,-0.05600440347677558,-0.051089978157957947,-0.12485574790768786,-0.01871904768133106,-0.01511064755262284,-0.012473998580663153,-0.0065393127728708925,-0.018541000201754955,0.026146981704871602,-0.0016333255095140166,0.019860426191859133,-0.02298519021158624,-0.06866251518718708,-0.04173724704121691,0.058374670250007615,-0.033756228102148386,0.036534135101342076,0.031458382954585626,0.014802017625441538,0.04208449111690074,0.05030599064622714,0.02105374494679037,-0.0195642894819269,0.00341606

In [17]:
def get_similar_business(business_ids, limit=10):
    """Return the businesses most similar to any of the given businesses.

    For each known id in ``business_ids`` the cosine similarity to every
    business vector is computed, the business itself is excluded and the top
    ``limit`` matches are kept. The per-business results are unioned and the
    overall top ``limit`` rows by score are returned. Results are not
    de-duplicated, so a business similar to several inputs can appear more
    than once.

    Args:
        business_ids: Iterable of business ids; ids with no stored vector are skipped.
        limit: Maximum number of rows kept per input business and overall.

    Returns:
        Spark DataFrame with columns ``business_id`` (string) and ``score`` (double).
    """
    schema = StructType([
        StructField("business_id", StringType(), True),
        # Cosine scores are floating point, so the column must be a double.
        StructField("score", DoubleType(), True),
    ])
    similar_businesses_df = spark.createDataFrame([], schema)

    # Index the vectors once instead of scanning the whole list per id.
    # setdefault keeps the first occurrence, like a front-to-back scan would.
    vectors_by_id = {}
    for known_id, vector in all_business_vectors:
        vectors_by_id.setdefault(known_id, vector)

    for b_id in business_ids:
        input_vec = vectors_by_id.get(b_id)
        if input_vec is None:
            # Unknown business id: nothing to compare against.
            continue
        similar_business_rdd = spark.sparkContext.parallelize(
            [(other_id, float(cosine_similarity(input_vec, vector)))
             for other_id, vector in all_business_vectors])

        # Passing the schema names and types the columns directly, so no
        # inference pass or renaming is needed.
        similar_business_df = (
            spark.createDataFrame(similar_business_rdd, schema)
            .filter(col("business_id") != b_id)
            .orderBy("score", ascending=False)
            .limit(limit)
        )
        similar_businesses_df = similar_businesses_df.union(similar_business_df)
    return similar_businesses_df.orderBy("score", ascending=False).limit(limit)

def get_user_recommendation(user_id, limit=10):
    """Recommend businesses similar to the ones a user rated highly.

    Picks up to ``limit`` businesses the user reviewed with a superscore of at
    least 3.0, preferring the highest superscores, and passes them to
    ``get_similar_business``.

    Args:
        user_id: Id of the user to recommend for.
        limit: Maximum number of seed businesses and of recommendations.

    Returns:
        DataFrame of recommended businesses with their details, or ``None``
        when the user has no qualifying review.
    """
    print(f"[content based filtering] user id: {user_id}")
    # `review` is read from CSV without a schema, so superscore is cast to a
    # number before comparing and ordering. Aggregating per business and
    # sorting afterwards keeps the best-scored businesses; calling distinct()
    # after orderBy() does not preserve that order before limit().
    liked_rows = (
        review
        .filter(col("user_id") == user_id)
        .withColumn("superscore", col("superscore").cast("double"))
        .filter(col("superscore") >= 3.0)
        .groupBy("business_id")
        .agg({"superscore": "max"})
        .withColumnRenamed("max(superscore)", "best_superscore")
        .orderBy("best_superscore", ascending=False)
        .limit(limit)
        .collect()
    )
    if not liked_rows:
        return None
    business_list = [row.business_id for row in liked_rows]
    print(f"[content based filtering] user reviewed businesses: {business_list}")
    return content_based_get_details(get_similar_business(business_list, limit))

def get_keywords_recommendation(keywords, limit=10):
    """Recommend the businesses whose vectors are closest to a keyword query.

    Args:
        keywords: Free-text query; it is run through ``pipeline_model`` to
            obtain a ``word_vector``.
        limit: Maximum number of businesses to return.

    Returns:
        DataFrame of the top ``limit`` businesses with their details, ordered
        by descending similarity score.
    """
    print(f"[content based filtering] keywords: {keywords}")
    # The pipeline expects a DataFrame; wrap the query in a one-row frame
    # with a placeholder business id.
    keywords_df = spark.sparkContext.parallelize([(0, keywords)]).toDF(["business_id", "text"])

    # Transform the keywords into a vector.
    keywords_vector = (
        pipeline_model.transform(keywords_df)
        .select("word_vector")
        .collect()[0][0]
    )

    # Score every business against the query vector.
    sim_bus_byword_rdd = spark.sparkContext.parallelize(
        [(other_id, float(cosine_similarity(keywords_vector, vector)))
         for other_id, vector in all_business_vectors])

    # Keep only the `limit` best-scoring businesses.
    sim_bus_byword_df = (
        spark.createDataFrame(sim_bus_byword_rdd)
        .withColumnRenamed("_1", "business_id")
        .withColumnRenamed("_2", "score")
        .orderBy("score", ascending=False)
        .limit(limit)
    )
    return content_based_get_details(sim_bus_byword_df)

def content_based_get_details(recommended_business):
    """Attach the `business` columns to scored recommendations, best score first.

    `recommended_business` must have `business_id` and `score` columns; rows
    whose id has no match in `business` are dropped by the inner join.
    """
    with_details = recommended_business.join(business, on="business_id", how="inner")
    return with_details.sort("score", ascending=False)

In [18]:
# Check that the content-based pieces load and work together.
# test case 1: businesses similar to two given business ids
(
    content_based_get_details(
        get_similar_business(["Npm0cjoyWwyV13OULL9qOA", "ZKPqinA-7gkDA8-yzubSYw"])
    )
    .toPandas()
)

Unnamed: 0,business_id,score,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code
0,J5pyv-h-OgbJJ1d7KFQXkg,0.959442,Dairy Hill Ice Cream,Portland,OR,4.5,61,"Bakeries, Ice Cream & Frozen Yogurt, Desserts,...",45.477952,-122.6958435774,1,97239
1,C3MsUBX9Zt76COT3l-OAtg,0.918731,Ruby Jewel,Portland,OR,4.5,29,"Food, Desserts, Ice Cream & Frozen Yogurt",45.5345046219,-122.6986144446,1,97210
2,sH-aYKTMVQ8TN7_wv7tr5A,0.909337,Hurry Back Ice Cream,Portland,OR,5.0,85,"Food Trucks, Food, Ice Cream & Frozen Yogurt",45.467275,-122.653168,1,97202
3,D3ND25B6h3xOJVuohZxMtg,0.904133,Cool Moon Ice Cream,Portland,OR,4.5,463,"Ice Cream & Frozen Yogurt, Food, Coffee & Tea,...",45.5287951382,-122.6825249787,1,97209
4,XUA4QMW3_lHPmmLwwDiZDQ,0.900201,Cloud City Ice Cream,Portland,OR,4.5,414,"Food, Coffee & Tea, Ice Cream & Frozen Yogurt,...",45.4795487,-122.6159691,1,97206
5,xGiqtSj3GmfnOFTsVAK_JQ,0.899289,Handel's Homemade Ice Cream,Portland,OR,4.0,85,"Food, Ice Cream & Frozen Yogurt",45.5282597,-122.8148562,1,97229
6,3i-vTnuordT7_iQW8UroFw,0.893876,Fifty Licks Ice Cream,Portland,OR,4.0,245,"Food Trucks, Food, Ice Cream & Frozen Yogurt",45.5227216,-122.6374919,1,97214
7,nVM8SLX5Yp3E6NNmL7tAPQ,0.89118,Fifty Licks,Portland,OR,4.0,478,"Food Trucks, Ice Cream & Frozen Yogurt, Food",45.503615,-122.6450414642,1,97202
8,DU9QRdyekAFrR3GzjDaB5g,0.88946,Ruby Jewel,Portland,OR,4.0,434,"Bakeries, Caterers, Ice Cream & Frozen Yogurt,...",45.5497015429,-122.675727682,1,97227
9,2lqbPc3KrbaFTZJCw0t2TQ,0.889111,Ruby Jewel,Portland,OR,4.0,743,"Event Planning & Services, Caterers, Specialty...",45.5220396,-122.6830297,1,97205


In [None]:
# test case 2: content-based recommendation for a user id
(
    get_user_recommendation("uLhdaZUsVvT0gbNTdOSzDg")
    .toPandas()
)

[content based filtering] user id: uLhdaZUsVvT0gbNTdOSzDg
[content based filtering] user reviewed businesses: ['sekbE_TpoPkVWIHMu0uBiA', '4Hc4QRv8PBlTXi9jm2s5cw']


In [None]:
# test case 3: content-based recommendation from a keyword query
(
    get_keywords_recommendation("burger")
    .toPandas()
)


### Prepare Collaborative Filtering 

In [None]:
# Per-user recommendations saved by part 6; cached for repeated lookups.
loaded_user_recommendations = (
    spark.read
    .load("part6_all_user_recommendations.parquet")
    .cache()
)
loaded_user_recommendations.show(3)
loaded_user_recommendations.printSchema()

# Businesses saved by part 6 together with the index used to join recommendations.
business_new_df = (
    spark.read
    .load("part6_business_with_index.parquet")
    .cache()
)
business_new_df.show(2)

In [None]:
def get_collaborative_recommendation(user_id, limit=10):
    """Return the stored collaborative-filtering recommendations for a user.

    Args:
        user_id: Id of the user to look up in ``loaded_user_recommendations``.
        limit: Maximum number of businesses to return.

    Returns:
        DataFrame of up to ``limit`` businesses joined with their details and
        ordered by descending ``rating``, or ``None`` when the user has no
        stored recommendations.
    """
    filtered_business = loaded_user_recommendations.filter(col("user_id") == user_id)
    if not filtered_business.head(1):
        return None

    # The second column holds the user's list of recommendation records.
    # Exploding it inside Spark gives one row per record without a round trip
    # through a Python RDD and schema inference.
    recommendations_column = filtered_business.columns[1]
    recommended_business = (
        filtered_business
        .select(explode(col(recommendations_column)).alias("recommendation"))
        .select("recommendation.*")
    )
    return business_new_df.join(recommended_business, "business_index", "inner")\
             .orderBy("rating", ascending=False).limit(limit)

# Smoke test: take one user from a seeded 1% sample and show their recommendations.
random_user = (
    loaded_user_recommendations
    .sample(0.01, seed=1)
    .head(1)[0]
    .user_id
)
print(f"[collaborative filtering] test user id: {random_user}")
(
    get_collaborative_recommendation(random_user)
    .toPandas()
)

### Prepare Friend Recommendation

In [None]:
# Only the friend list is needed per user; cache this narrow projection for repeated lookups.
user_friends = (
    user
    .select("user_id", "friends")
    .cache()
)

def get_friends_recommendation(user_id, limit=10):
    """Recommend businesses that a user's friends rated highly.

    Args:
        user_id: Id of the user whose friends are consulted.
        limit: Maximum number of businesses to return.

    Returns:
        DataFrame of up to ``limit`` businesses with their details, or
        ``None`` when the user is unknown, has no friends listed, or the
        friends have no review with a superscore of at least 3.0.
    """
    friend_row = (
        user_friends
        .filter(col("user_id") == user_id)
        .select("friends")
        .first()
    )
    if friend_row is None:
        return None

    # `friends` is a comma-separated string; a null value is treated like an
    # empty list, and empty fragments are discarded.
    friend_str = (friend_row[0] or "").replace(" ", "")
    friend_list = [friend_id for friend_id in friend_str.split(",") if friend_id]
    if not friend_list:
        return None

    # `review` is read from CSV without a schema, so superscore is cast to a
    # number before comparing and ordering. Aggregating per business and
    # sorting afterwards keeps the best-scored businesses; calling distinct()
    # after orderBy() does not preserve that order before limit().
    friends_businesses = (
        review
        .filter(col("user_id").isin(friend_list))
        .withColumn("superscore", col("superscore").cast("double"))
        .filter(col("superscore") >= 3.0)
        .groupBy("business_id")
        .agg({"superscore": "max"})
        .withColumnRenamed("max(superscore)", "best_superscore")
        .orderBy("best_superscore", ascending=False)
        .limit(limit)
        .collect()
    )
    if not friends_businesses:
        return None
    business_list = [row.business_id for row in friends_businesses]
    return get_business_details(business_list)

def get_business_details(business_list):
    """Return the `business` rows whose id is in `business_list`, highest `stars` first."""
    matching_businesses = business.where(col("business_id").isin(business_list))
    return matching_businesses.sort("stars", ascending=False)
# Smoke test: friend-based recommendation for a user id.
(
    get_friends_recommendation("uLhdaZUsVvT0gbNTdOSzDg")
    .toPandas()
)


### Prepare Location Based Recommendation

In [None]:
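# TODO: implement location-based recommendation (intended to top up the hybrid result when it has fewer than 10 businesses)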

### Hybrid Recommendation

In [None]:
# final hybrid recommendation model

def get_hybrid_recommendation_by_user(user_id,
                                      content_based_limit=5,
                                      collaborative_based_limit=3,
                                      friend_recommendation_limit=2,
                                      total_limit=10):
    """Combine content-based, collaborative and friend-based recommendations.

    Each recommender is run in turn; its result is printed and appended to
    the hybrid result. The combined result is then de-duplicated by business
    name and printed.

    Args:
        user_id: Id of the user to recommend for.
        content_based_limit: Limit passed to ``get_user_recommendation``.
        collaborative_based_limit: Limit passed to ``get_collaborative_recommendation``.
        friend_recommendation_limit: Limit passed to ``get_friends_recommendation``.
        total_limit: Target number of businesses; when fewer are found, the
            shortfall is reported (location-based fill-in is not implemented yet).

    Returns:
        Spark DataFrame of the combined recommendations. It is empty when no
        recommender produced a result.
    """
    hybrid_result = spark.createDataFrame(spark.sparkContext.emptyRDD(), business.schema)

    # (label, recommender call, columns that are not part of `business`).
    # The calls are wrapped in lambdas so each runs after its label is printed.
    sources = (
        ("content based: ",
         lambda: get_user_recommendation(user_id, content_based_limit),
         ("score",)),
        ("collaborative based: ",
         lambda: get_collaborative_recommendation(user_id, collaborative_based_limit),
         ("rating", "business_index")),
        ("friend based: ",
         lambda: get_friends_recommendation(user_id, friend_recommendation_limit),
         ()),
    )
    for label, recommend, extra_columns in sources:
        print(label)
        source_result = recommend()
        if source_result is not None:
            print(source_result.toPandas())
            if extra_columns:
                source_result = source_result.drop(*extra_columns)
            # NOTE(review): union matches columns by position; this assumes each
            # source yields the `business` columns in the same order once the
            # extra columns are dropped - confirm.
            hybrid_result = hybrid_result.union(source_result)
        print("end \n\n")

    print("hybrid result: ")
    hybrid_result = hybrid_result.repartition(1).dropDuplicates(["name"])
    # Materialise once and reuse it for both the printout and the row count.
    hybrid_pdf = hybrid_result.toPandas()
    print(hybrid_pdf)
    print("end \n\n")

    number_of_result = len(hybrid_pdf)
    if number_of_result < total_limit:
        # TODO add location based result
        # then union
        print(f"need {total_limit-number_of_result} business from location based")
    return hybrid_result

In [None]:
# test case 1: valid user id; the combined recommendations are also drawn on a map

final_result = get_hybrid_recommendation_by_user(user_id="uLhdaZUsVvT0gbNTdOSzDg")
display_in_map(business_df=final_result)

In [None]:
# test case 2: invalid user id (same id with a leading "1")
final_result = get_hybrid_recommendation_by_user(user_id="1uLhdaZUsVvT0gbNTdOSzDg")