# Part 8 Hybrid Recommendation

In [6]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import KMeansModel
import numpy as np
import folium
import html

In [7]:
# Build (or reuse) the notebook's Spark session. Parentheses replace the
# backslash continuations so a stray trailing space cannot break the chain.
spark = (
    SparkSession.builder
    # Memory for the driver and for each executor.
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    # Per the Spark configuration docs, 0 means no cap on collected result size.
    .config("spark.driver.maxResultSize", "0")
    # Per the Spark configuration docs, -1 disables automatic broadcast joins.
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.sql.broadcastTimeout", "1200")
    .config("spark.default.parallelism", "32")
    # NOTE(review): no time unit is given here - confirm the intended interval.
    .config("spark.executor.heartbeatInterval", "3600")
    # NOTE(review): machine-specific scratch directory; adjust on other hosts.
    .config("spark.local.dir", "D:\\Data")
    .appName("part8")
    .getOrCreate()
)

In [8]:
# Every input is a CSV file with a header row; multiLine allows a quoted field
# to span several physical lines. No schema is given, so columns load as strings.
review = spark.read.options(header=True, multiLine=True).csv("part4_topicmodeling_review.csv")
business = spark.read.options(header=True, multiLine=True).csv("part1_dataclean_business.csv")
user = spark.read.options(header=True, multiLine=True).csv("part1_dataclean_user.csv")

AnalysisException: Path does not exist: file:/C:/Users/wujiwubing/Downloads/CS5344Final-main/part4_topicmodeling_review.csv;

In [34]:
# define utility functions

def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Parameters
    ----------
    vector1, vector2 : array-like
        Any inputs accepted by ``np.dot``.

    Returns
    -------
    float
        ``dot(v1, v2) / |v1| / |v2|``. When either vector has zero norm the
        ratio is undefined (it used to evaluate to NaN); ``0.0`` is returned
        instead so callers can sort and compare scores safely.
    """
    norm1 = np.sqrt(np.dot(vector1, vector1))
    norm2 = np.sqrt(np.dot(vector2, vector2))
    if norm1 == 0 or norm2 == 0:
        # A zero vector has no direction: report "no similarity", not NaN.
        return 0.0
    return np.dot(vector1, vector2) / norm1 / norm2
def display_in_map(business_df, center=(45.5, -122.5), zoom_start=10):
    """Draw one green marker per business on a folium map.

    Parameters
    ----------
    business_df : Spark DataFrame
        Must provide the columns ``name``, ``stars``, ``review_count``,
        ``latitude`` and ``longitude``. It is converted with ``toPandas()``,
        so it should be small.
    center : (float, float), optional
        Initial map centre as (latitude, longitude).
    zoom_start : int, optional
        Initial zoom level.

    Returns
    -------
    folium.Map
        The map with all markers added. Rows without coordinates are skipped.
    """
    folium_map = folium.Map(location=list(center), zoom_start=zoom_start)
    for _, r in business_df.toPandas().iterrows():
        if pd.isna(r.latitude) or pd.isna(r.longitude):
            # A marker cannot be placed without coordinates.
            continue
        # r["name"] rather than r.name: on a pandas row, .name is the index label.
        name = "" if pd.isna(r["name"]) else str(r["name"])
        # Every value is escaped because the popup is rendered as HTML.
        popup = (html.escape(name)
                 + '<br>' + 'stars: ' + html.escape(str(r.stars))
                 + '<br>' + 'reviews count: ' + html.escape(str(r.review_count)))
        folium.Marker(
            # Coordinates may arrive as strings when read from CSV without a schema.
            location=[float(r.latitude), float(r.longitude)],
            popup=popup,
            icon=folium.Icon(color='green')).add_to(folium_map)
    return folium_map

### Prepare Content Based Filtering

In [11]:
# Fitted pipeline that maps a "text" column to a "word_vector" column.
pipeline_model = PipelineModel.load("content_based_pipeline_model")

# Precomputed (business_id, word_vector) rows.
all_business_vectors_df = spark.read.load("part5_all_business_vectors.parquet")
all_business_vectors_df.show(3)

# Keep the vectors on the driver as a list of (business_id, word_vector) tuples;
# the similarity functions iterate over this list directly.
all_business_vectors = [(row[0], row[1]) for row in all_business_vectors_df.collect()]
print(all_business_vectors[1][1])

+--------------------+--------------------+
|         business_id|         word_vector|
+--------------------+--------------------+
|0OLouFEhEm3pIh4Sx...|[0.00706271847664...|
|M4zae56hA6sb6V2Ty...|[-0.0525487022407...|
|W60O4ast9uAq03n7n...|[-0.0027295319850...|
+--------------------+--------------------+
only showing top 3 rows

[-0.05254870224070885,0.03538502551234829,-0.03854379182152756,-0.053777138675465165,0.11863246697258918,-0.042976504449101866,-0.06302669620455528,-0.010523899725802371,-0.05600440347677558,-0.051089978157957947,-0.12485574790768786,-0.01871904768133106,-0.01511064755262284,-0.012473998580663153,-0.0065393127728708925,-0.018541000201754955,0.026146981704871602,-0.0016333255095140166,0.019860426191859133,-0.02298519021158624,-0.06866251518718708,-0.04173724704121691,0.058374670250007615,-0.033756228102148386,0.036534135101342076,0.031458382954585626,0.014802017625441538,0.04208449111690074,0.05030599064622714,0.02105374494679037,-0.0195642894819269,0.00341606

In [None]:
def get_similar_business(business_ids, limit=10):
    """Return the businesses most similar to any of ``business_ids``.

    Parameters
    ----------
    business_ids : list of str
        Ids to find neighbours for. Ids without a stored vector are ignored.
    limit : int, optional
        Maximum number of rows to return.

    Returns
    -------
    Spark DataFrame
        Columns ``business_id`` (string) and ``score`` (double), ordered by
        descending score. Each business appears once, with its best score
        over all inputs; the input businesses themselves are never returned.
        The frame is empty when none of the ids has a vector.
    """
    schema = StructType([
                            StructField("business_id", StringType(), True),
                            # cosine scores are fractional, so this must not be an integer column
                            StructField("score", DoubleType(), True)
                        ])

    # Index the vectors once instead of scanning the whole list for every id.
    # setdefault keeps the first vector if an id occurs more than once.
    vectors_by_id = {}
    for known_id, known_vec in all_business_vectors:
        vectors_by_id.setdefault(known_id, known_vec)

    requested = set(business_ids)
    best_scores = {}
    for b_id in business_ids:
        input_vec = vectors_by_id.get(b_id)
        if input_vec is None:
            # unknown business id: nothing to compare against
            continue
        # The scores were always computed on the driver (the generator handed to
        # parallelize() is consumed locally), so a plain loop does the same work
        # without a Spark job per input id.
        for other_id, other_vec in all_business_vectors:
            if other_id in requested:
                continue
            score = float(cosine_similarity(input_vec, other_vec))
            # "nan > x" is False, so undefined scores are dropped here as well.
            if score > best_scores.get(other_id, float("-inf")):
                best_scores[other_id] = score

    top = sorted(best_scores.items(), key=lambda pair: pair[1], reverse=True)[:limit]
    return spark.createDataFrame(top, schema).orderBy("score", ascending=False)

def get_user_recommendation(user_id, limit = 10):
    """Content-based recommendations derived from a user's best-rated reviews.

    Parameters
    ----------
    user_id : str
        Id of the user to recommend for.
    limit : int, optional
        Maximum number of seed businesses taken from the user's reviews and
        maximum number of recommendations returned.

    Returns
    -------
    Spark DataFrame or None
        Similar businesses joined with their details, or ``None`` when the
        user has no review with ``superscore >= 3.0``.
    """
    # Local alias: the notebook star-imports pyspark.sql.functions, which
    # replaces builtins such as max(); being explicit avoids any ambiguity.
    from pyspark.sql import functions as F

    print(f"[content based filtering] user id: {user_id}")
    # The review CSV is read without a schema, so compare and sort numerically.
    superscore = F.col("superscore").cast("double")
    # One row per business with its best score. Ordering AFTER the aggregation
    # keeps "top N by superscore" meaningful; a distinct() placed after an
    # orderBy() does not preserve the order.
    user_reviewed_businesses = (
        review
        .filter((F.col("user_id") == user_id) & (superscore >= 3.0))
        .groupBy("business_id")
        .agg(F.max(superscore).alias("best_superscore"))
        .orderBy(F.col("best_superscore").desc())
        .limit(limit)
        .collect()
    )
    if len(user_reviewed_businesses)==0:
        return None
    business_list = [row.business_id for row in user_reviewed_businesses]
    print(f"[content based filtering] user reviewed businesses: {business_list}")
    return content_based_get_details(get_similar_business(business_list, limit))

def get_keywords_recommendation(keywords, limit=10):
    """Recommend the businesses whose word vector is closest to ``keywords``.

    Parameters
    ----------
    keywords : str
        Free-text query, vectorised with the loaded ``pipeline_model``.
    limit : int, optional
        Maximum number of businesses to return.

    Returns
    -------
    Spark DataFrame
        Top-scoring businesses joined with their details, best first. Empty
        when the query produces an all-zero vector.
    """
    print(f"[content based filtering] keywords: {keywords}")
    keywords_df = spark.sparkContext.parallelize([(0, keywords)]).toDF(["business_id", "text"])

    # transform the keywords to a vector
    transformed_keywords_df = pipeline_model.transform(keywords_df)
    keywords_vector = transformed_keywords_df.select("word_vector").collect()[0][0]

    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("score", DoubleType(), True),
    ])

    scored = []
    # An all-zero query vector (presumably what the pipeline yields for unknown
    # words - confirm) has no direction, so every score would be undefined.
    if np.dot(keywords_vector, keywords_vector) > 0:
        for b_id, business_vec in all_business_vectors:
            score = float(cosine_similarity(keywords_vector, business_vec))
            if score == score:  # False only for NaN, which Spark would sort to the top
                scored.append((b_id, score))

    # keep only the `limit` best matches (applied once)
    top = sorted(scored, key=lambda pair: pair[1], reverse=True)[:limit]
    sim_bus_byword_df = spark.createDataFrame(top, schema)
    return content_based_get_details(sim_bus_byword_df)

def content_based_get_details(recommended_business):
    """Attach the `business` details to scored recommendations, best score first.

    `recommended_business` needs the columns `business_id` and `score`; rows
    whose id is not present in `business` are dropped by the inner join.
    """
    with_details = recommended_business.join(business, on="business_id", how="inner")
    return with_details.orderBy("score", ascending=False)

In [12]:
# Verify that the content-based artifacts loaded correctly.
# Test case 1: look up the businesses most similar to two known business ids
# and display them, joined with their details, as a pandas DataFrame.

content_based_get_details(
    get_similar_business(["Npm0cjoyWwyV13OULL9qOA", "ZKPqinA-7gkDA8-yzubSYw"]))\
    .toPandas()

Unnamed: 0,business_id,score,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code
0,J5pyv-h-OgbJJ1d7KFQXkg,0.959442,Dairy Hill Ice Cream,Portland,OR,4.5,61,"Bakeries, Ice Cream & Frozen Yogurt, Desserts,...",45.477952,-122.6958435774,1,97239
1,C3MsUBX9Zt76COT3l-OAtg,0.918731,Ruby Jewel,Portland,OR,4.5,29,"Food, Desserts, Ice Cream & Frozen Yogurt",45.5345046219,-122.6986144446,1,97210
2,sH-aYKTMVQ8TN7_wv7tr5A,0.909337,Hurry Back Ice Cream,Portland,OR,5.0,85,"Food Trucks, Food, Ice Cream & Frozen Yogurt",45.467275,-122.653168,1,97202
3,D3ND25B6h3xOJVuohZxMtg,0.904133,Cool Moon Ice Cream,Portland,OR,4.5,463,"Ice Cream & Frozen Yogurt, Food, Coffee & Tea,...",45.5287951382,-122.6825249787,1,97209
4,XUA4QMW3_lHPmmLwwDiZDQ,0.900201,Cloud City Ice Cream,Portland,OR,4.5,414,"Food, Coffee & Tea, Ice Cream & Frozen Yogurt,...",45.4795487,-122.6159691,1,97206
5,xGiqtSj3GmfnOFTsVAK_JQ,0.899289,Handel's Homemade Ice Cream,Portland,OR,4.0,85,"Food, Ice Cream & Frozen Yogurt",45.5282597,-122.8148562,1,97229
6,3i-vTnuordT7_iQW8UroFw,0.893876,Fifty Licks Ice Cream,Portland,OR,4.0,245,"Food Trucks, Food, Ice Cream & Frozen Yogurt",45.5227216,-122.6374919,1,97214
7,nVM8SLX5Yp3E6NNmL7tAPQ,0.89118,Fifty Licks,Portland,OR,4.0,478,"Food Trucks, Ice Cream & Frozen Yogurt, Food",45.503615,-122.6450414642,1,97202
8,DU9QRdyekAFrR3GzjDaB5g,0.88946,Ruby Jewel,Portland,OR,4.0,434,"Bakeries, Caterers, Ice Cream & Frozen Yogurt,...",45.5497015429,-122.675727682,1,97227
9,2lqbPc3KrbaFTZJCw0t2TQ,0.889111,Ruby Jewel,Portland,OR,4.0,743,"Event Planning & Services, Caterers, Specialty...",45.5220396,-122.6830297,1,97205


In [13]:
# Test case 2: content-based recommendations for a single user id.
# get_user_recommendation returns None when the user has no review with
# superscore >= 3.0, in which case .toPandas() below would fail.

get_user_recommendation("uLhdaZUsVvT0gbNTdOSzDg").toPandas()

[content based filtering] user id: DataFrame[user_id: string, name: string, review_count: string, yelping_since: string, friends: string, useful: string, funny: string, cool: string, fans: string, average_stars: string]
[content based filtering] user reviewed businesses: ['sekbE_TpoPkVWIHMu0uBiA', '4Hc4QRv8PBlTXi9jm2s5cw']


Unnamed: 0,business_id,score,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code
0,IyLLref8p5xTcuO7SpHf-g,0.999162,Salt & Straw,Portland,OR,4.5,1704,"Restaurants, Food, Food Stands, Ice Cream & Fr...",45.5592731,-122.644136,1,97211
1,o_L9Ss4boqq6ZEF9xeSH6Q,0.996246,Salt & Straw,Portland,OR,4.5,3672,"Food, Bakeries, Ice Cream & Frozen Yogurt, Cof...",45.5289444,-122.6982801,1,97210
2,xzBdKVBwoPwDNLN7ZwVa2w,0.978557,Yoko's Japanese Restaurant,Portland,OR,4.0,373,"Sushi Bars, Coffee & Tea, Japanese, Restaurant...",45.493212,-122.636056,1,97202
3,XUA4QMW3_lHPmmLwwDiZDQ,0.977217,Cloud City Ice Cream,Portland,OR,4.5,414,"Food, Coffee & Tea, Ice Cream & Frozen Yogurt,...",45.4795487,-122.6159691,1,97206
4,nVM8SLX5Yp3E6NNmL7tAPQ,0.9745,Fifty Licks,Portland,OR,4.0,478,"Food Trucks, Ice Cream & Frozen Yogurt, Food",45.503615,-122.6450414642,1,97202
5,mNSNd3RY7kamIDKyw0hHpQ,0.969481,What's The Scoop?,Portland,OR,4.0,366,"American (Traditional), Restaurants, Ice Cream...",45.5489817156,-122.6665197543,1,97227
6,ZZTG_ttcN3YFQA6vuWZdYw,0.968871,Sushi & Sushi,Portland,OR,3.0,122,"Sushi Bars, Restaurants, Japanese",45.4827352122,-122.5772265611,1,97266
7,3i-vTnuordT7_iQW8UroFw,0.968238,Fifty Licks Ice Cream,Portland,OR,4.0,245,"Food Trucks, Food, Ice Cream & Frozen Yogurt",45.5227216,-122.6374919,1,97214
8,zIUFzFUy4Ja0TWMClV0xRQ,0.967478,Marinepolis Sushi Land,Portland,OR,2.5,310,"Conveyor Belt Sushi, Sushi Bars, Restaurants, ...",45.524275,-122.68101,1,97209
9,D3ND25B6h3xOJVuohZxMtg,0.967204,Cool Moon Ice Cream,Portland,OR,4.5,463,"Ice Cream & Frozen Yogurt, Food, Coffee & Tea,...",45.5287951382,-122.6825249787,1,97209


In [14]:
# Test case 3: content-based recommendations for a free-text keyword query.

get_keywords_recommendation("burger").toPandas()


[content based filtering] keywords: burger


Unnamed: 0,business_id,score,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code
0,RZEEoKF4odsIg_GV70xD_Q,0.627719,Union Burger,Portland,OR,5.0,46,"Restaurants, Food Trucks, Food, Burgers",45.5763354,-122.6616957039,1,97211
1,EkyssuFKGgcMX6FpmVjiDA,0.602341,Killer Burger,Portland,OR,4.0,797,"Restaurants, Burgers",45.5381080913,-122.6151960203,1,97213
2,QreidCBt3HUOesM8gnULqw,0.595073,Pickle & Salt,Portland,OR,4.5,15,"Food, Halal, Food Trucks, Burgers, Restaurants",45.5227501,-122.6747903,1,97209
3,Hlvpm0izAXRQM1uZn1H35A,0.591029,Little Big Burger,Portland,OR,4.0,949,"Burgers, Restaurants, Fast Food",45.5241461354,-122.6810082251,1,97209
4,gcTc8abc8Jfpmg3t1ejp3g,0.581785,Killer Burger,Portland,OR,4.0,411,"Restaurants, Burgers",45.4596066,-122.6468698,1,97202
5,Jelhk2X7EJIMwUlO8RAN5g,0.581083,Fuller's Burger Shack,Portland,OR,4.0,48,"American (Traditional), American (New), Burger...",45.5731634,-122.5575475,1,97220
6,JyPNR8kk5uBrhEMfuZoCug,0.580262,Little Big Burger,Portland,OR,3.5,73,"Food, Burgers, Restaurants, Vegetarian, Fast Food",45.5116533,-122.644857,1,97214
7,jUnRs-5ZGecrN-sm8iiNjA,0.579537,Little Big Burger,Portland,OR,3.5,232,"Restaurants, Fast Food, American (Traditional)...",45.5501187,-122.6758684,1,97227
8,Yvsezs2OxUazHhw_kIYIPQ,0.575546,Bless Your Heart Burgers 33rd Avenue,Portland,OR,4.0,63,"Burgers, Restaurants, Food Trucks, Food, Ameri...",45.5618172,-122.6305022,1,97211
9,TJoJ_1aQXRobOP-UITj5tw,0.575242,Little Big Burger,Portland,OR,4.0,306,"Restaurants, Burgers, Fast Food",45.529483,-122.69858,1,97210


### Prepare Collaborative Filtering 

In [17]:
# Precomputed per-user recommendations; cached because every call to
# get_collaborative_recommendation filters this DataFrame.
loaded_user_recommendations = spark.read.load("part6_all_user_recommendations.parquet").cache()
loaded_user_recommendations.show(3)
# Each row carries an array of (business_index, rating) structs - see the schema printed below.
loaded_user_recommendations.printSchema()

# Business details keyed by business_index, used to resolve the recommended indices.
business_new_df = spark.read.load("part6_business_with_index.parquet")
business_new_df.show(2)

+----------+--------------------+--------------------+
|user_index|     recommendations|             user_id|
+----------+--------------------+--------------------+
|      1199|[{57, 4.4834585},...|T8fXV7fkbBkg8ET6v...|
|      2116|[{3761, 5.422045}...|ZCUf_Uk-eXli07c48...|
|      2684|[{2565, 5.9831705...|3mNz5nQFTIBQm0oU5...|
+----------+--------------------+--------------------+
only showing top 3 rows

root
 |-- user_index: integer (nullable = true)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- business_index: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)
 |-- user_id: string (nullable = true)

+--------------+--------------------+--------------------+--------+-----+-----+------------+--------------------+-------------+---------------+-------+-----------+
|business_index|         business_id|                name|    city|state|stars|review_count|          categories|     latitude|      longitu

Unnamed: 0,business_index,business_id,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code,rating
0,3209,CM2DhoaUwvr2bQPZlkOJ4Q,Pasta’s,Portland,OR,5.0,6,"Specialty Food, Restaurants, Food Trucks, Food...",45.547669,-122.6682301,1,97227,2.056291
1,1183,0JUkzQXJKaiAbpruTgDPnw,Yatra PDX,Portland,OR,5.0,9,"Food Stands, Food, Indian, Himalayan/Nepalese,...",45.4641695267,-122.65434729,1,97202,1.952722
2,89,-liZt9xZNvnT0tHW0XTwQA,Kate's Ice Cream,Portland,OR,5.0,25,"Vegan, Food, Ice Cream & Frozen Yogurt, Restau...",45.5283148,-122.6386321,1,97232,1.950059
3,3643,rl8U3o2y4IH7aJBfsMPTew,Division Liquor,Portland,OR,4.0,8,"Beer, Wine & Spirits, Food",45.5060226,-122.4949421,1,97236,1.940054
4,628,YQ6srHagEScNH9iu_DhqUg,Sarah's Cookies,Portland,OR,5.0,10,"Food, Bakeries",45.5643461,-122.5557446,1,97220,1.939659
5,1057,3ZcksUCfKGneyZkXGAmckA,Pixie Retreat Raw'r Laboratorie & Makery,Portland,OR,5.0,40,"Specialty Food, Restaurants, Live/Raw Food, Ve...",45.5110677127,-122.6626858006,1,97214,1.935495
6,635,iPnSI7FZbTtcxipcOT_lLA,Proletariat Butchery,Portland,OR,4.5,14,"Specialty Food, Food, Meat Shops, Butcher",45.5483842,-122.5996826,1,97213,1.925042
7,2493,maa0Pgf6ZD3e5us3kte6Sw,Bees and Beans,Portland,OR,5.0,10,"Food, Desserts, Chocolatiers & Shops, Specialt...",45.508732,-122.654872,1,97214,1.883153
8,1727,bWJoJe-gHtYzfTH5JfW-PQ,MUSE Cheesecakes,Portland,OR,5.0,7,"Desserts, Specialty Food, Food, Patisserie/Cak...",45.5544197,-122.6663494,1,97217,1.880594
9,1735,4bP1O1WA_CVaAEmRkfXe5A,Oregonic Tonic Kombucha,Portland,OR,5.0,5,"Food, Breweries, Kombucha",45.5837298,-122.7268969,1,97203,1.869701


In [44]:
def get_collaborative_recommendation(user_id, limit=10):
    """Look up the precomputed collaborative-filtering recommendations of a user.

    Parameters
    ----------
    user_id : str
        Id of the user to recommend for.
    limit : int, optional
        Maximum number of businesses to return.

    Returns
    -------
    Spark DataFrame or None
        Business details plus a ``rating`` column, highest rating first, or
        ``None`` when the user has no precomputed recommendations row.
    """
    # Local alias so the lookups do not depend on the notebook's star imports.
    from pyspark.sql import functions as F

    filtered_business = loaded_user_recommendations.filter(F.col("user_id") == user_id)
    if not filtered_business.head(1):
        return None
    # Flatten the array of (business_index, rating) structs by column name.
    # This avoids the positional p[1] lookup and the RDD schema inference,
    # which cannot infer a schema when the array is empty.
    recommended_business = (
        filtered_business
        .select(F.explode("recommendations").alias("recommendation"))
        .select("recommendation.business_index", "recommendation.rating")
    )
    return business_new_df.join(recommended_business, "business_index", "inner")\
             .orderBy("rating", ascending = False).limit(limit)

# Smoke test: take the first user of a seeded 1% sample of the precomputed
# recommendations and display that user's collaborative-filtering results.
# NOTE(review): head(1)[0] raises IndexError if the sample comes back empty.
random_user = loaded_user_recommendations.sample(0.01, seed=1).head(1)[0].user_id
print(f"[collaborative filtering] test user id: {random_user}")
get_collaborative_recommendation(random_user).toPandas()

[collaborative filtering] test user id: BXgoRfst14LeNLOHTJ0rjA


Unnamed: 0,business_index,business_id,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code,rating
0,3209,CM2DhoaUwvr2bQPZlkOJ4Q,Pasta’s,Portland,OR,5.0,6,"Specialty Food, Restaurants, Food Trucks, Food...",45.547669,-122.6682301,1,97227,2.056291
1,1183,0JUkzQXJKaiAbpruTgDPnw,Yatra PDX,Portland,OR,5.0,9,"Food Stands, Food, Indian, Himalayan/Nepalese,...",45.4641695267,-122.65434729,1,97202,1.952722
2,89,-liZt9xZNvnT0tHW0XTwQA,Kate's Ice Cream,Portland,OR,5.0,25,"Vegan, Food, Ice Cream & Frozen Yogurt, Restau...",45.5283148,-122.6386321,1,97232,1.950059
3,3643,rl8U3o2y4IH7aJBfsMPTew,Division Liquor,Portland,OR,4.0,8,"Beer, Wine & Spirits, Food",45.5060226,-122.4949421,1,97236,1.940054
4,628,YQ6srHagEScNH9iu_DhqUg,Sarah's Cookies,Portland,OR,5.0,10,"Food, Bakeries",45.5643461,-122.5557446,1,97220,1.939659
5,1057,3ZcksUCfKGneyZkXGAmckA,Pixie Retreat Raw'r Laboratorie & Makery,Portland,OR,5.0,40,"Specialty Food, Restaurants, Live/Raw Food, Ve...",45.5110677127,-122.6626858006,1,97214,1.935495
6,635,iPnSI7FZbTtcxipcOT_lLA,Proletariat Butchery,Portland,OR,4.5,14,"Specialty Food, Food, Meat Shops, Butcher",45.5483842,-122.5996826,1,97213,1.925042
7,2493,maa0Pgf6ZD3e5us3kte6Sw,Bees and Beans,Portland,OR,5.0,10,"Food, Desserts, Chocolatiers & Shops, Specialt...",45.508732,-122.654872,1,97214,1.883153
8,1727,bWJoJe-gHtYzfTH5JfW-PQ,MUSE Cheesecakes,Portland,OR,5.0,7,"Desserts, Specialty Food, Food, Patisserie/Cak...",45.5544197,-122.6663494,1,97217,1.880594
9,1735,4bP1O1WA_CVaAEmRkfXe5A,Oregonic Tonic Kombucha,Portland,OR,5.0,5,"Food, Breweries, Kombucha",45.5837298,-122.7268969,1,97203,1.869701


### Prepare Friend Recommendation

In [10]:
# Narrow projection of `user` holding only what the friend lookup needs:
# the user's id and the raw `friends` string.
user_friends = user.select("user_id", "friends")

def get_friends_recommendation(user_id, limit=10):
    """Recommend businesses that the user's friends rated well.

    Parameters
    ----------
    user_id : str
        Id of the user to recommend for.
    limit : int, optional
        Maximum number of businesses to return.

    Returns
    -------
    Spark DataFrame or None
        Details of up to ``limit`` businesses reviewed by friends with
        ``superscore >= 3.0``, or ``None`` when the user is unknown, has no
        friends listed, or the friends have no such review.
    """
    # Local alias: the notebook star-imports pyspark.sql.functions, which
    # replaces builtins such as max(); being explicit avoids any ambiguity.
    from pyspark.sql import functions as F

    # head(1) fetches the row in one job (previously count() followed by collect()).
    friend_rows = user_friends.filter(F.col("user_id") == user_id).select("friends").head(1)
    if not friend_rows:
        return None
    # `friends` is a comma-separated id list; it can be null for some users.
    friend_str = (friend_rows[0][0] or "").replace(" ", "")
    # drop empty tokens produced by stray or trailing commas
    friend_list = [friend_id for friend_id in friend_str.split(",") if friend_id]
    if not friend_list:
        return None

    # The review CSV is read without a schema, so compare and sort numerically.
    superscore = F.col("superscore").cast("double")
    # Aggregate first and order afterwards: a distinct() placed after an
    # orderBy() does not preserve the order, which made the limit arbitrary.
    friends_businesses = (
        review
        .filter((superscore >= 3.0) & F.col("user_id").isin(friend_list))
        .groupBy("business_id")
        .agg(F.max(superscore).alias("best_superscore"))
        .orderBy(F.col("best_superscore").desc())
        .limit(limit)
        .collect()
    )
    if len(friends_businesses)==0:
        return None
    business_list = [row.business_id for row in friends_businesses]
    return get_business_details(business_list)

def get_business_details(business_list):
    """Return the `business` rows whose id is in `business_list`, highest `stars` first."""
    is_requested = col("business_id").isin(business_list)
    matching = business.where(is_requested)
    return matching.orderBy("stars", ascending=False)
# Smoke test: friend-based recommendations for one user id, shown as a pandas
# DataFrame (get_friends_recommendation returns None when nothing is found).
get_friends_recommendation("uLhdaZUsVvT0gbNTdOSzDg").toPandas()


Unnamed: 0,business_id,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code
0,n73rxa6e6-fTIxQzfv4BuA,Mother’s Bistro & Bar,Portland,OR,4.5,4466,"American (Traditional), Restaurants, Bars, Nig...",45.522196,-122.673868,1,97204
1,1JKK_kmQ6Kxc6NzhQhcAOA,Sivalai Thai Restaurant,Portland,OR,4.5,833,"Restaurants, Thai",45.5190945,-122.6131884,1,97215
2,9P-lp3AWDXGayDqJz9VPwQ,Marukin Ramen,Portland,OR,4.0,616,"Restaurants, Ramen, Noodles, Japanese",45.522407,-122.65937,1,97214
3,z1QhExIrJH0cnBdQKrsk0g,Piazza Italia,Portland,OR,4.0,973,"Restaurants, Shopping, Salad, Delis, Art Galle...",45.5287804,-122.6829301,1,97209
4,nEaWb1R9wFRI6sTWBkymAA,Kizuki Ramen & Izakaya,Portland,OR,4.0,955,"Ramen, Japanese, Restaurants, Izakaya, Noodles",45.5217636,-122.8021536,1,97229
5,jXnMy7-VQ7eqRBtjXAacuQ,Lucky Devil Lounge,Portland,OR,4.0,112,"Restaurants, Bars, Adult Entertainment, Nightl...",45.5017514,-122.6590969,1,97202
6,4VT8kIUlTa4d3mwWrGJ5Zg,Pepe le Moko,Portland,OR,4.0,439,"Cocktail Bars, Nightlife, Bars, Restaurants, S...",45.5218730302,-122.6811734763,1,97205
7,d69y3CN9_SQKrEnXXqQr8g,King Burrito Mexican Food,Portland,OR,4.0,534,"Mexican, Restaurants, Breakfast & Brunch, Burgers",45.5769598,-122.6970866,1,97217
8,nb6XRTOkX2mtWianzv819Q,Shigezo Izakaya,Portland,OR,3.5,678,"Tapas/Small Plates, Japanese, Ramen, Restauran...",45.5176574,-122.6825351,1,97205
9,qgq0usjnOA8qTWHdxbuzZA,Eclectic Kitchen,Portland,OR,3.5,137,"Food, Salad, Sandwiches, Restaurants, Bakeries...",45.548143,-122.6118289,1,97213


### Prepare Location Based Recommendation

In [11]:
# TODO: the location-based recommendation is still work in progress.
# Path of the saved KMeans model loaded by read_model().
kmeans_model_path = 'kmean_model'

def read_model(file_path):
    """Load a saved KMeansModel from ``file_path`` and return it.

    The path argument is honoured; the previous body ignored it and always
    loaded the notebook-level ``kmeans_model_path``.
    """
    return KMeansModel.load(file_path)

# get cluster
def get_label(latitude,longtitude,kmeans_model):
    """Return the KMeans cluster index for a single coordinate pair.

    Parameters
    ----------
    latitude, longtitude : float or numeric str
        Coordinates of the point; converted with ``float()`` because values
        read from CSV without a schema arrive as strings.
    kmeans_model : KMeansModel
        Fitted model; presumably trained on a ``features`` column assembled
        from latitude and longitude - confirm against the training notebook.

    Returns
    -------
    int
        Value of the model's ``prediction`` column for the point.
    """
    # VectorAssembler is not imported by the notebook's import cell.
    from pyspark.ml.feature import VectorAssembler

    # Named feature_cols rather than col so pyspark's col() is not shadowed.
    feature_cols = ['latitude','longitude']
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    new_user = spark.createDataFrame([(float(latitude), float(longtitude))], feature_cols)
    input_data = assembler.transform(new_user)
    res = kmeans_model.transform(input_data)
    return res.select('prediction').collect()[0][0]
# choose the top 10 restaurants (ordered by stars) in the target cluster
def location_based_recommand(latitude,longtitude,kmeans_model,prediction_df=None,limit=10):
    """Return the best-rated businesses in the cluster containing a coordinate.

    Parameters
    ----------
    latitude, longtitude : float
        Coordinates of the query point.
    kmeans_model : KMeansModel
        Fitted model used to assign the point to a cluster.
    prediction_df : Spark DataFrame, optional
        Businesses already labelled with a ``prediction`` column. When
        omitted, the notebook-level ``prediction`` DataFrame is used, as
        before (it must have been defined by an earlier cell).
    limit : int, optional
        Maximum number of businesses to return.

    Returns
    -------
    Spark DataFrame
        Up to ``limit`` rows of the matching cluster, ordered by stars and
        then review count, both descending.
    """
    # Local alias so the sort keys do not depend on the notebook's star imports.
    from pyspark.sql import functions as F

    cluster = get_label(latitude,longtitude,kmeans_model)
    if prediction_df is None:
        prediction_df = prediction
    same_cluster = prediction_df.filter(prediction_df['prediction'] == cluster)
    # Cast before sorting: if these columns are strings (CSV read without a
    # schema) a plain orderBy would rank "99" above "4466".
    return same_cluster.orderBy(
        F.col('stars').cast('double').desc(),
        F.col('review_count').cast('double').desc(),
    ).limit(limit)
# test
def lbrs_test(model,dataset):
    """Visual smoke test of the location-based recommender.

    Samples one row from ``model.transform(dataset)``, asks
    ``location_based_recommand`` for businesses around its coordinates,
    prints both, and plots them on a map (``rec`` is 0 for the sampled point
    and 1 for the recommendations).

    Parameters
    ----------
    model : KMeansModel
        Fitted clustering model.
    dataset : Spark DataFrame
        Businesses to sample from; it must already contain whatever input
        column ``model.transform`` expects.

    Returns
    -------
    None
    """
    import os
    # `px` is used below but is not imported by the notebook's import cell.
    import plotly.express as px

    pred = model.transform(dataset)
    test_point = pred.sample(False,0.1).limit(1)
    # Collect the coordinates once (previously the same query ran twice).
    coords = test_point.select('latitude','longitude').collect()
    if not coords:
        # a 10% sample of a small dataset can be empty
        print("no test point could be sampled from the dataset")
        return
    rec_point = location_based_recommand(coords[0][0],coords[0][1],model)
    test_point =  test_point.withColumn('rec',lit(0))
    print("test_point_data:")
    test_point.select("business_id",'name','stars','review_count','latitude','longitude','prediction').show()
    rec_point = rec_point.withColumn('rec',lit(1))
    print("recomment_point_data:")
    rec_point.select("business_id",'name','stars','review_count','latitude','longitude','prediction').show()
    # NOTE(review): union() matches columns by position - this assumes both
    # frames share the same column order; confirm.
    res_for_graph = test_point.union(rec_point)

    plot_df = res_for_graph.toPandas()
    # The plot needs numeric coordinates and marker sizes; the columns may be strings.
    numeric_cols = ["latitude", "longitude", "review_count"]
    for column_name in numeric_cols:
        plot_df[column_name] = pd.to_numeric(plot_df[column_name], errors="coerce")
    plot_df = plot_df.dropna(subset=numeric_cols)

    # Never keep an access token in the notebook: read it from the environment.
    # NOTE(review): "carto-darkmatter" should render without a token - confirm.
    token = os.environ.get("MAPBOX_ACCESS_TOKEN")
    mapbox_layout = dict(bearing=0)
    if token:
        mapbox_layout["accesstoken"] = token
    fig = px.scatter_mapbox(plot_df, lat="latitude", lon="longitude", color="rec", size="review_count",color_continuous_scale=px.colors.diverging.Portland_r,hover_data= ['name'], zoom=10)
    fig.update_layout(autosize=True,mapbox_style="carto-darkmatter",mapbox=mapbox_layout)
    fig.show()
    

### Hybrid Recommendation

In [48]:
# final hybrid recommendation model

def get_hybrid_recommendation_by_user(user_id, 
                                      content_based_limit=5, 
                                      collaborative_based_limit=3,
                                      friend_recommendation_limit=2,
                                      target_total=10):
    """Combine content-based, collaborative and friend-based recommendations.

    Each recommender is asked for its own quota; the non-empty results are
    reduced to the columns of ``business``, concatenated, and de-duplicated
    by business name.

    Parameters
    ----------
    user_id : str
        Id of the user to recommend for.
    content_based_limit, collaborative_based_limit, friend_recommendation_limit : int, optional
        Number of businesses requested from each recommender.
    target_total : int, optional
        Desired size of the final list. When fewer businesses are found, the
        (unfinished) location-based step is run and the shortfall is printed.

    Returns
    -------
    Spark DataFrame
        Rows with the schema of ``business``; empty when no recommender
        returns anything.
    """
    business_columns = business.columns
    hybrid_result = spark.createDataFrame(spark.sparkContext.emptyRDD(), business.schema)

    # (label printed before the stage, recommender, quota) - one loop replaces
    # three copy-pasted stanzas.
    sources = [
        ("content based: ", get_user_recommendation, content_based_limit),
        ("collaborative based: ", get_collaborative_recommendation, collaborative_based_limit),
        ("friend based: ", get_friends_recommendation, friend_recommendation_limit),
    ]
    for label, recommend, source_limit in sources:
        print(label)
        source_result = recommend(user_id, source_limit)
        if source_result is not None :
            print(source_result.toPandas())
            # union() is positional: selecting the business columns by name
            # removes helper columns (score, rating, business_index) and
            # guarantees a matching column order.
            hybrid_result = hybrid_result.union(source_result.select(*business_columns))
        print("end \n\n")

    print("hybrid result: ")
    # NOTE(review): de-duplicating on "name" also merges different branches
    # that share a name - confirm this is intended rather than "business_id".
    hybrid_result = hybrid_result.repartition(1).dropDuplicates(["name"])
    # Materialise once and reuse it for the row count (a separate count()
    # would recompute the whole pipeline).
    hybrid_pdf = hybrid_result.toPandas()
    print(hybrid_pdf)
    print("end \n\n")

    number_of_result = len(hybrid_pdf)
    if  number_of_result< target_total:
        #todo: the location-based step only plots a diagnostic test case for
        # now; its recommendations are not yet merged into hybrid_result
        model = read_model(kmeans_model_path)
        lbrs_test(model,business)
        print(f"need {target_total-number_of_result} business from location based")
    return hybrid_result

In [36]:
# test case 1: valid user id
# Run the hybrid recommender, then plot the returned businesses on a map.

final_result = get_hybrid_recommendation_by_user("uLhdaZUsVvT0gbNTdOSzDg")
display_in_map(final_result)

content based: 
[content based filtering] user id: DataFrame[user_id: string, name: string, review_count: string, yelping_since: string, friends: string, useful: string, funny: string, cool: string, fans: string, average_stars: string]
[content based filtering] user reviewed businesses: ['sekbE_TpoPkVWIHMu0uBiA', '4Hc4QRv8PBlTXi9jm2s5cw']
              business_id     score                        name      city  \
0  IyLLref8p5xTcuO7SpHf-g  0.999162                Salt & Straw  Portland   
1  o_L9Ss4boqq6ZEF9xeSH6Q  0.996246                Salt & Straw  Portland   
2  xzBdKVBwoPwDNLN7ZwVa2w  0.978557  Yoko's Japanese Restaurant  Portland   
3  XUA4QMW3_lHPmmLwwDiZDQ  0.977217        Cloud City Ice Cream  Portland   
4  nVM8SLX5Yp3E6NNmL7tAPQ  0.974500                 Fifty Licks  Portland   

  state stars review_count                                         categories  \
0    OR   4.5         1704  Restaurants, Food, Food Stands, Ice Cream & Fr...   
1    OR   4.5         3672  Food, 

In [49]:
# test case 2: invalid user id
# None of the three recommenders knows this id, so the hybrid result is expected to be empty.
final_result = get_hybrid_recommendation_by_user("1uLhdaZUsVvT0gbNTdOSzDg")

content based: 
[content based filtering] user id: DataFrame[user_id: string, name: string, review_count: string, yelping_since: string, friends: string, useful: string, funny: string, cool: string, fans: string, average_stars: string]
end 


collaborative based: 
end 


friend based: 
end 


hybrid result: 
Empty DataFrame
Columns: [business_id, name, city, state, stars, review_count, categories, latitude, longitude, is_open, postal_code]
Index: []
end 


need 10 business from location based
