# Part 5 Content Based Filtering

### Setup

In [26]:
from operator import add
from pyspark.sql import SparkSession
from pyspark.ml.feature import *
from pyspark.ml import Pipeline, PipelineModel
import numpy as np
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [27]:
spark = SparkSession.builder\
    .config("spark.driver.memory", "8g")\
    .config("spark.executor.memory","8g")\
    .config("spark.driver.maxResultSize","0")\
    .config("spark.sql.autoBroadcastJoinThreshold","-1")\
    .config("spark.sql.broadcastTimeout","1200")\
    .config("spark.default.parallelism", "8")\
    .config("spark.network.timeout","7200s")\
    .config("spark.executor.heartbeatInterval", "3600s")\
    .config("spark.local.dir","D:\\Data")\
    .appName("pipeline").getOrCreate()
# spark.local.dir: disable if your do not have this dir

### Read in Pre-processed Data

In [28]:
business = spark.read.csv("part1_dataclean_business.csv",header=True, multiLine=True)
review = spark.read.csv("part4_topicmodeling_review.csv",header=True, multiLine=True)

### Join Review Text by Business ID

In [29]:
reviews_text_rdd = review.select("business_id", "text").rdd
reviews_by_business_rdd = reviews_text_rdd.map(tuple).reduceByKey(add)  
reviews_by_business_df = spark.createDataFrame(reviews_by_business_rdd)
reviews_by_business_df = reviews_by_business_df.withColumnRenamed("_1", "business_id").withColumnRenamed("_2", "text")

### Build Content Based Filtering Pipeline

In [30]:
"""
# prepare the pipeline

regex_tokenizer = RegexTokenizer(gaps = False, pattern = "\w+", inputCol = "text", outputCol = "token")
stop_words_remover = StopWordsRemover(inputCol = "token", outputCol = "non_stop_word")
count_vectorizer = CountVectorizer(inputCol="non_stop_word", outputCol="raw_feature")
idf = IDF(inputCol="raw_feature", outputCol="idf_vector")
word_2_vector = Word2Vec(vectorSize = 100, minCount = 5, inputCol = "non_stop_word", outputCol = "word_vector", seed=1)
vector_assembler = VectorAssembler(inputCols=["idf_vector", "word_vector"], outputCol="combined_vector")

# fit the pipeline

pipeline = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectorizer, idf, word_2_vector, vector_assembler])
pipeline_model = pipeline.fit(reviews_by_business_df)

# save the pipeline model

pipeline_model.write().overwrite().save("content_based_pipeline_model")
"""

'\n# prepare the pipeline\n\nregex_tokenizer = RegexTokenizer(gaps = False, pattern = "\\w+", inputCol = "text", outputCol = "token")\nstop_words_remover = StopWordsRemover(inputCol = "token", outputCol = "non_stop_word")\ncount_vectorizer = CountVectorizer(inputCol="non_stop_word", outputCol="raw_feature")\nidf = IDF(inputCol="raw_feature", outputCol="idf_vector")\nword_2_vector = Word2Vec(vectorSize = 100, minCount = 5, inputCol = "non_stop_word", outputCol = "word_vector", seed=1)\nvector_assembler = VectorAssembler(inputCols=["idf_vector", "word_vector"], outputCol="combined_vector")\n\n# fit the pipeline\n\npipeline = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectorizer, idf, word_2_vector, vector_assembler])\npipeline_model = pipeline.fit(reviews_by_business_df)\n\n# save the pipeline model\n\npipeline_model.write().overwrite().save("content_based_pipeline_model")\n'

In [31]:
# load the pipeline model

pipeline_model = PipelineModel.load("content_based_pipeline_model")
all_business_vectors_df = pipeline_model.transform(reviews_by_business_df)

In [32]:
all_business_vectors_df.printSchema()
all_business_vectors_df.show(3)

# save all business vectors

all_business_vectors_df.select("business_id", "word_vector").write.mode("overwrite")\
    .format("parquet").save("part7_all_business_vectors.parquet")

root
 |-- business_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- non_stop_word: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- raw_feature: vector (nullable = true)
 |-- idf_vector: vector (nullable = true)
 |-- word_vector: vector (nullable = true)
 |-- combined_vector: vector (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|       non_stop_word|         raw_feature|          idf_vector|         word_vector|     combined_vector|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|5oed6H5F8qZxNzELq...|A must!!! 

First...|[a, must, f

In [33]:
# test load all business vectors

all_business_vectors_df = spark.read.load("part7_all_business_vectors.parquet")
all_business_vectors_df.show(3)

all_business_vectors = all_business_vectors_df.rdd.map(lambda x: (x[0], x[1])).collect()
print(all_business_vectors[1][1])

+--------------------+--------------------+
|         business_id|         word_vector|
+--------------------+--------------------+
|0OLouFEhEm3pIh4Sx...|[0.00706271847664...|
|M4zae56hA6sb6V2Ty...|[-0.0525487022407...|
|W60O4ast9uAq03n7n...|[-0.0027295319850...|
+--------------------+--------------------+
only showing top 3 rows

[-0.05254870224070885,0.03538502551234829,-0.03854379182152756,-0.053777138675465165,0.11863246697258918,-0.042976504449101866,-0.06302669620455528,-0.010523899725802371,-0.05600440347677558,-0.051089978157957947,-0.12485574790768786,-0.01871904768133106,-0.01511064755262284,-0.012473998580663153,-0.0065393127728708925,-0.018541000201754955,0.026146981704871602,-0.0016333255095140166,0.019860426191859133,-0.02298519021158624,-0.06866251518718708,-0.04173724704121691,0.058374670250007615,-0.033756228102148386,0.036534135101342076,0.031458382954585626,0.014802017625441538,0.04208449111690074,0.05030599064622714,0.02105374494679037,-0.0195642894819269,0.00341606

In [86]:
# utility functions

def cosine_similarity(vector1, vector2): 
    return np.dot(vector1, vector2) / np.sqrt(np.dot(vector1, vector1)) / np.sqrt(np.dot(vector2, vector2)) 


def get_business_details(recommended_business):
    return recommended_business.join(business, "business_id", "inner").orderBy("score", ascending=False)
            

### Get Similar Business by List of Business IDs

In [88]:
def get_similar_business(business_ids, limit=10):
    schema = StructType([   
                            StructField("business_id", StringType(), True),
                            StructField("score", IntegerType(), True)
                        ])
    similar_businesses_df = spark.createDataFrame([], schema)
    
    for b_id in business_ids:
        try:
            input_vec = [(r[1]) for r in all_business_vectors if r[0] == b_id][0]
        except:
            continue
        similar_business_rdd = spark.sparkContext.parallelize(
            (i[0], float(cosine_similarity(input_vec, i[1]))) for i in all_business_vectors)

        similar_business_df = spark.createDataFrame(similar_business_rdd) \
            .withColumnRenamed("_1", "business_id") \
            .withColumnRenamed("_2", "score") \
            .orderBy("score", ascending = False)
            
        similar_business_df = similar_business_df.filter(col("business_id") != b_id).limit(limit)
        similar_businesses_df = similar_businesses_df \
                                    .union(similar_business_df)
    return similar_businesses_df.orderBy("score", ascending=False).limit(limit)

In [89]:
# test get similar business

bids = ["Npm0cjoyWwyV13OULL9qOA", "ZKPqinA-7gkDA8-yzubSYw"]

print("input restaurants details:")
business.select("business_id","name", "categories") \
    .filter(business.business_id.isin(bids) == True).show(truncate=False)
    
# get top 10 similar business
similar_business = get_business_details(get_similar_business(bids))

print("top 10 similar restaurants: ")
similar_business.toPandas()

input restaurants details:
+----------------------+---------------------------+-------------------------------+
|business_id           |name                       |categories                     |
+----------------------+---------------------------+-------------------------------+
|Npm0cjoyWwyV13OULL9qOA|Nana's Ice Cream Scoop Shop|Food, Ice Cream & Frozen Yogurt|
|ZKPqinA-7gkDA8-yzubSYw|Dairy Hill Ice Cream       |Ice Cream & Frozen Yogurt, Food|
+----------------------+---------------------------+-------------------------------+

top 10 similar restaurants: 


Unnamed: 0,business_id,score,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code
0,J5pyv-h-OgbJJ1d7KFQXkg,0.959442,Dairy Hill Ice Cream,Portland,OR,4.5,61,"Bakeries, Ice Cream & Frozen Yogurt, Desserts,...",45.477952,-122.6958435774,1,97239
1,C3MsUBX9Zt76COT3l-OAtg,0.918731,Ruby Jewel,Portland,OR,4.5,29,"Food, Desserts, Ice Cream & Frozen Yogurt",45.5345046219,-122.6986144446,1,97210
2,sH-aYKTMVQ8TN7_wv7tr5A,0.909337,Hurry Back Ice Cream,Portland,OR,5.0,85,"Food Trucks, Food, Ice Cream & Frozen Yogurt",45.467275,-122.653168,1,97202
3,D3ND25B6h3xOJVuohZxMtg,0.904133,Cool Moon Ice Cream,Portland,OR,4.5,463,"Ice Cream & Frozen Yogurt, Food, Coffee & Tea,...",45.5287951382,-122.6825249787,1,97209
4,XUA4QMW3_lHPmmLwwDiZDQ,0.900201,Cloud City Ice Cream,Portland,OR,4.5,414,"Food, Coffee & Tea, Ice Cream & Frozen Yogurt,...",45.4795487,-122.6159691,1,97206
5,xGiqtSj3GmfnOFTsVAK_JQ,0.899289,Handel's Homemade Ice Cream,Portland,OR,4.0,85,"Food, Ice Cream & Frozen Yogurt",45.5282597,-122.8148562,1,97229
6,3i-vTnuordT7_iQW8UroFw,0.893876,Fifty Licks Ice Cream,Portland,OR,4.0,245,"Food Trucks, Food, Ice Cream & Frozen Yogurt",45.5227216,-122.6374919,1,97214
7,nVM8SLX5Yp3E6NNmL7tAPQ,0.89118,Fifty Licks,Portland,OR,4.0,478,"Food Trucks, Ice Cream & Frozen Yogurt, Food",45.503615,-122.6450414642,1,97202
8,DU9QRdyekAFrR3GzjDaB5g,0.88946,Ruby Jewel,Portland,OR,4.0,434,"Bakeries, Caterers, Ice Cream & Frozen Yogurt,...",45.5497015429,-122.675727682,1,97227
9,2lqbPc3KrbaFTZJCw0t2TQ,0.889111,Ruby Jewel,Portland,OR,4.0,743,"Event Planning & Services, Caterers, Specialty...",45.5220396,-122.6830297,1,97205


### Get Similar Business by User ID

In [90]:
def get_user_recommendation(user_id, limit = 10):
    print(f"user id: {user}")
    user_reviewed_businesses = review.filter((col("user_id") == user_id) & (col("superscore")>=3.0))\
        .sample(False, 0.5).limit(5).select("business_id")
    business_ids = [i.business_id for i in user_reviewed_businesses.collect()]
    print(f"user reviewed businesses: {business_ids}")
    return get_business_details(get_similar_business(business_ids, limit))

# test using user id
user = "uLhdaZUsVvT0gbNTdOSzDg"
user_recommendations =get_user_recommendation(user)
    
print("top 10 similar restaurants: ")
user_recommendations.toPandas()

user id: uLhdaZUsVvT0gbNTdOSzDg
user reviewed businesses: ['sekbE_TpoPkVWIHMu0uBiA']
top 10 similar restaurants: 


Unnamed: 0,business_id,score,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code
0,xzBdKVBwoPwDNLN7ZwVa2w,0.978557,Yoko's Japanese Restaurant,Portland,OR,4.0,373,"Sushi Bars, Coffee & Tea, Japanese, Restaurant...",45.493212,-122.636056,1,97202
1,ZZTG_ttcN3YFQA6vuWZdYw,0.968871,Sushi & Sushi,Portland,OR,3.0,122,"Sushi Bars, Restaurants, Japanese",45.4827352122,-122.5772265611,1,97266
2,zIUFzFUy4Ja0TWMClV0xRQ,0.967478,Marinepolis Sushi Land,Portland,OR,2.5,310,"Conveyor Belt Sushi, Sushi Bars, Restaurants, ...",45.524275,-122.68101,1,97209
3,eWghaE-97gjo6XXFm-2eRg,0.966427,Takahashi Restaurant,Portland,OR,4.0,220,"Restaurants, Japanese, Sushi Bars",45.4894604,-122.5565751,1,97266
4,iyu3qzaLyOsLDkxf-XqcyA,0.965699,Mio Sushi,Portland,OR,3.5,152,"Sushi Bars, Restaurants, Japanese, Soup, Asian...",45.4641036987,-122.6466827393,1,97202
5,Rn6Znnus_rryA2KlUjP0zA,0.965465,Mio Sushi Hawthorne,Portland,OR,3.0,149,"Sushi Bars, Restaurants, Japanese",45.5119111,-122.6214655,1,97214
6,bg_YgINQ5X5s5ZfYAb-ifw,0.963899,Marinepolis Sushi Land,Portland,OR,2.5,169,"Japanese, Restaurants, Sushi Bars",45.3944013,-122.7503309,1,97224
7,MzIrEXhiTALvtSfXOvheKA,0.960251,Hot Pot 'n Sushi,Portland,OR,2.5,296,"Japanese, Sushi Bars, Restaurants",45.5730768476,-122.5576400757,1,97220
8,1dZSC57NMNIssNnnam7hug,0.958813,O'sushi,Portland,OR,3.5,213,"Sushi Bars, Japanese, Soup, Restaurants",45.5020521,-122.5780035,1,97266
9,2d7e7hkSvlBwlawrOWbP1w,0.958707,Fujiyama Sushi Bar,Portland,OR,3.5,117,"Sushi Bars, Restaurants, Japanese",45.5179319,-122.5573978,1,97216


### Get Similar Business by Keywords

In [93]:
def get_keywords_recommendation(keywords, limit=10):
    
    print(f"keywords: {keywords}")
    keywords_df = spark.sparkContext.parallelize([(0, keywords)]).toDF(['business_id', 'text'])
    
    # transform the the keywords to vectors
    
    transformed_keywords_df = pipeline_model.transform(keywords_df)    
    keywords_vector = transformed_keywords_df.select('word_vector').collect()[0][0]
    sim_bus_byword_rdd = spark.sparkContext\
        .parallelize((i[0], float(cosine_similarity(keywords_vector, i[1]))) for i in all_business_vectors)

    sim_bus_byword_df = spark.createDataFrame(sim_bus_byword_rdd) \
         .withColumnRenamed('_1', 'business_id') \
         .withColumnRenamed('_2', 'score') \
         .orderBy("score", ascending = False) \
         .limit(limit)
    
    # return top 10 similar businesses
    
    a = sim_bus_byword_df.limit(limit)
    return get_business_details(a)

# test using keywords

get_keywords_recommendation("chinese noodles").toPandas()

keywords: chinese noodles


Unnamed: 0,business_id,score,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code
0,usQRZ_mE5sPCiq9G7aJ64Q,0.638887,Kenny's Noodle House,Portland,OR,4.0,505,"Restaurants, Noodles, Cantonese, Chinese",45.4978746,-122.5775489,1,97266
1,dgTamELrc44HhmGkjdMaTQ,0.63108,Chinese Delicacy,Portland,OR,4.0,112,"Korean, Chinese, Restaurants",45.4765891,-122.579343,1,97266
2,amza5c9G7ZSKg_lFAuVHGw,0.600734,Thai Bungalow PDX,Portland,OR,4.5,21,"Food Trucks, Restaurants, Food, Thai",45.516638,-122.62887,1,97214
3,QgS0W43X-9MlHe3jMTDltg,0.599276,Good Taste,Portland,OR,4.0,416,"Cantonese, Barbeque, Chinese, Noodles, Restaur...",45.5235116,-122.6741571,1,97209
4,A5B8MEce5DHyf3Vyl3s9KQ,0.589289,Drunken Noodle,Portland,OR,4.5,5,"Restaurants, Thai",45.496178,-122.671694,1,97239
5,UeyTWGFukURbiA1JG-52FQ,0.588343,YUI,Portland,OR,4.5,9,"Asian Fusion, Food, Salad, Desserts, Restauran...",45.5630425,-122.6351619,1,97211
6,ok2ivADDFpchAI_RB5FU3A,0.587357,Yong Kang Street,Portland,OR,3.0,111,"Taiwanese, Noodles, Dim Sum, Chinese, Restaurants",45.5184023,-122.6772626,1,97204
7,tXEC_kp_owg-9iN-iQvlNw,0.572733,Yang‘s Noodle,Portland,OR,4.5,98,"Chinese, Noodles, Restaurants",45.5593645,-122.6483421,1,97211
8,mdVogU5UdVoHpubJMX7E2w,0.570026,Sunita's Thai Kitchen,Portland,OR,5.0,8,"Bubble Tea, Coffee & Tea, Restaurants, Food, Thai",45.486783,-122.770581,1,97225
9,ADm5xkCMPiW1Mf7KxcvOtQ,0.564171,Taipei Noodle Haus,Portland,OR,3.5,91,"Restaurants, Korean, Taiwanese, Chinese, Noodles",45.5331828,-122.5431283,1,97220


In [95]:
# more test

get_keywords_recommendation("burger").toPandas()

keywords: burger


Unnamed: 0,business_id,score,name,city,state,stars,review_count,categories,latitude,longitude,is_open,postal_code
0,RZEEoKF4odsIg_GV70xD_Q,0.627719,Union Burger,Portland,OR,5.0,46,"Restaurants, Food Trucks, Food, Burgers",45.5763354,-122.6616957039,1,97211
1,EkyssuFKGgcMX6FpmVjiDA,0.602341,Killer Burger,Portland,OR,4.0,797,"Restaurants, Burgers",45.5381080913,-122.6151960203,1,97213
2,QreidCBt3HUOesM8gnULqw,0.595073,Pickle & Salt,Portland,OR,4.5,15,"Food, Halal, Food Trucks, Burgers, Restaurants",45.5227501,-122.6747903,1,97209
3,Hlvpm0izAXRQM1uZn1H35A,0.591029,Little Big Burger,Portland,OR,4.0,949,"Burgers, Restaurants, Fast Food",45.5241461354,-122.6810082251,1,97209
4,gcTc8abc8Jfpmg3t1ejp3g,0.581785,Killer Burger,Portland,OR,4.0,411,"Restaurants, Burgers",45.4596066,-122.6468698,1,97202
5,Jelhk2X7EJIMwUlO8RAN5g,0.581083,Fuller's Burger Shack,Portland,OR,4.0,48,"American (Traditional), American (New), Burger...",45.5731634,-122.5575475,1,97220
6,JyPNR8kk5uBrhEMfuZoCug,0.580262,Little Big Burger,Portland,OR,3.5,73,"Food, Burgers, Restaurants, Vegetarian, Fast Food",45.5116533,-122.644857,1,97214
7,jUnRs-5ZGecrN-sm8iiNjA,0.579537,Little Big Burger,Portland,OR,3.5,232,"Restaurants, Fast Food, American (Traditional)...",45.5501187,-122.6758684,1,97227
8,Yvsezs2OxUazHhw_kIYIPQ,0.575546,Bless Your Heart Burgers 33rd Avenue,Portland,OR,4.0,63,"Burgers, Restaurants, Food Trucks, Food, Ameri...",45.5618172,-122.6305022,1,97211
9,TJoJ_1aQXRobOP-UITj5tw,0.575242,Little Big Burger,Portland,OR,4.0,306,"Restaurants, Burgers, Fast Food",45.529483,-122.69858,1,97210
