# Part 5 Content Based Filtering

### Setup

In [1]:
from operator import add
from pyspark.sql import SparkSession
from pyspark.ml.feature import *
from pyspark.ml import Pipeline, PipelineModel
import numpy as np
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [4]:
spark = SparkSession.builder\
    .config("spark.driver.memory", "8g")\
    .config("spark.executor.memory","8g")\
    .config("spark.driver.maxResultSize","0")\
    .config("spark.sql.autoBroadcastJoinThreshold","-1")\
    .config("spark.sql.broadcastTimeout","1200")\
    .config("spark.default.parallelism", "8")\
    .config("spark.network.timeout","7200s")\
    .config("spark.executor.heartbeatInterval", "3600s")\
    .appName("pipeline").getOrCreate()

### Read in Pre-processed Data

In [5]:
business = spark.read.csv("business.csv",header=True, multiLine=True)
review = spark.read.csv("part4_topicmodeling_review.csv",header=True, multiLine=True)

In [6]:
business.count()

4127

In [7]:
business.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- is_open: string (nullable = true)
 |-- postal_code: string (nullable = true)



In [8]:
review.count()

559331

In [9]:
review.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- text: string (nullable = true)
 |-- polarity: string (nullable = true)
 |-- subjectivity: string (nullable = true)
 |-- compound: string (nullable = true)
 |-- superscore: string (nullable = true)
 |-- Keywords: string (nullable = true)



### Join Review Text by Business ID

In [10]:
reviews_text_rdd = review.select("business_id", "text").rdd
reviews_by_business_rdd = reviews_text_rdd.map(tuple).reduceByKey(add)  
reviews_by_business_df = spark.createDataFrame(reviews_by_business_rdd)
reviews_by_business_df = reviews_by_business_df.withColumnRenamed("_1", "business_id").withColumnRenamed("_2", "text")

### Build Content Based Filtering Pipeline

In [12]:
# prepare the pipeline

regex_tokenizer = RegexTokenizer(gaps = False, pattern = "\w+", inputCol = "text", outputCol = "token")
stop_words_remover = StopWordsRemover(inputCol = "token", outputCol = "non_stop_word")
count_vectorizer = CountVectorizer(inputCol="non_stop_word", outputCol="raw_feature")
idf = IDF(inputCol="raw_feature", outputCol="idf_vector")
word_2_vector = Word2Vec(vectorSize = 100, minCount = 5, inputCol = "non_stop_word", outputCol = "word_vector", seed=1)
vector_assembler = VectorAssembler(inputCols=["idf_vector", "word_vector"], outputCol="combined_vector")

# fit the pipeline

pipeline = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectorizer, idf, word_2_vector, vector_assembler])
pipeline_model = pipeline.fit(reviews_by_business_df)

# save the pipeline model

pipeline_model.write().overwrite().save("content_based_pipeline_model")

In [13]:
# load the pipeline model

pipeline_model = PipelineModel.load("content_based_pipeline_model")
reviews_by_business_transformed_df = pipeline_model.transform(reviews_by_business_df)


In [14]:
reviews_by_business_transformed_df.printSchema()


root
 |-- business_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- non_stop_word: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- raw_feature: vector (nullable = true)
 |-- idf_vector: vector (nullable = true)
 |-- word_vector: vector (nullable = true)
 |-- combined_vector: vector (nullable = true)



In [16]:
reviews_by_business_transformed_df.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|       non_stop_word|         raw_feature|          idf_vector|         word_vector|     combined_vector|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|fNkX4JGvBMmLSoAON...|We were pleasantl...|[we, were, pleasa...|[pleasantly, surp...|(157702,[0,1,2,3,...|(157702,[0,1,2,3,...|[-0.0113749849423...|(157802,[0,1,2,3,...|
|itqW3t8LhYjWvjZDf...|Love Love Love. B...|[love, love, love...|[love, love, love...|(157702,[0,1,2,3,...|(157702,[0,1,2,3,...|[0.03365884937757...|(157802,[0,1,2,3,...|
|yYUbaE9QH9EDEiSyE...|Everything is exp...|[everything, is, ...|[everything, expi...|(157702,[1,2,3,4,...|(157702,[1,2,3,4,...|[-0.0166842110826...|(1

In [17]:
all_business_vectors = reviews_by_business_transformed_df.select("business_id", "word_vector").rdd.map(lambda x: (x[0], x[1])).collect()
all_business_vectors[1][1]

DenseVector([0.0337, 0.0258, 0.0975, -0.0251, -0.0183, 0.0104, -0.0191, -0.0004, 0.0107, -0.0216, -0.02, -0.1039, 0.0377, -0.0424, -0.0413, -0.0157, 0.1056, -0.0221, -0.0412, 0.0683, 0.1046, -0.0004, -0.0083, 0.1137, 0.0526, 0.0502, -0.0112, -0.0918, 0.017, 0.0628, 0.02, 0.006, -0.0134, -0.0392, -0.0904, -0.027, -0.03, -0.0466, -0.0095, -0.0016, -0.0039, -0.0911, -0.0658, 0.0561, -0.0076, -0.0629, -0.0688, -0.1342, -0.0652, -0.0511, 0.009, -0.0549, -0.0209, 0.0329, 0.0416, 0.0066, -0.0041, 0.023, -0.0147, -0.0546, 0.0309, 0.0121, -0.102, -0.0544, 0.0064, -0.0097, 0.0161, 0.0183, 0.0246, -0.0799, -0.033, 0.0232, -0.0119, 0.0041, -0.0496, -0.0715, -0.0002, -0.0063, -0.1076, -0.114, -0.0161, -0.0055, -0.0068, 0.0439, -0.0018, 0.0396, 0.0522, -0.0358, -0.0836, 0.0279, 0.0664, -0.0052, -0.0258, -0.0565, 0.0806, -0.0318, 0.0352, -0.0528, 0.0401, 0.0223])

In [18]:
# utility functions

def cosine_similarity(vector1, vector2): 
    return np.dot(vector1, vector2) / np.sqrt(np.dot(vector1, vector1)) / np.sqrt(np.dot(vector2, vector2)) 


def get_business_details(in_business):
    a = in_business.alias("a")
    b = business.alias("b")
    return a.join(b, col("a.business_id") == col("b.business_id"), 'inner') \
             .select([col('a.'+x) for x in a.columns] + [col('b.name'),col('b.categories'),
                                                           col('b.stars'),col('b.review_count'),
                                                           col('b.latitude'),col('b.longitude')])
    

In [30]:
def get_similar_business(business_ids, limit=10):
    schema = StructType([   
                            StructField("business_id", StringType(), True)
                            ,StructField("score", IntegerType(), True)
                            ,StructField("input_business_id", StringType(), True)
                        ])
    similar_businesses_df = spark.createDataFrame([], schema)
    
    for b_id in business_ids:
        try:
            input_vec = [(r[1]) for r in all_business_vectors if r[0] == b_id][0]
        except:
            continue
        similar_business_rdd = spark.sparkContext.parallelize(
            (i[0], float(cosine_similarity(input_vec, i[1]))) for i in all_business_vectors)

        similar_business_df = spark.createDataFrame(similar_business_rdd) \
            .withColumnRenamed("_1", "business_id") \
            .withColumnRenamed("_2", "score") \
            .orderBy("score", ascending = False)
            
        similar_business_df = similar_business_df.filter(col("business_id") != b_id).limit(limit)
        similar_business_df = similar_business_df.withColumn("input_business_id", lit(b_id))
        similar_businesses_df = similar_businesses_df \
                                    .union(similar_business_df)
    return similar_businesses_df

# test 
bids = ['Dl2vgi5W_nbe-A97D0zgfA', 'RtUvSWO_UZ8V3Wpj0n077w','itqW3t8LhYjWvjZDf-HkhQ', 'yYUbaE9QH9EDEiSyEGyZ9g']

print('\ninput restaurants details:')
business.select('business_id','name', 'categories') \
    .filter(business.business_id.isin(bids) == True).show(truncate=False)
    
# get top 10 similar business
similar_business = get_business_details(get_similar_business(bids))

print('Top 10 similar restaurants for each input restaurant are:"')
similar_business.toPandas()


input restaurants details:
+----------------------+---------------------+-----------------------------------------------------------------------------------+
|business_id           |name                 |categories                                                                         |
+----------------------+---------------------+-----------------------------------------------------------------------------------+
|itqW3t8LhYjWvjZDf-HkhQ|Oracle Coffee Company|Food, Bakeries, Vegan, Coffee & Tea, Internet Cafes, Coffee Roasteries, Restaurants|
|yYUbaE9QH9EDEiSyEGyZ9g|Lents 1 Stop Market  |Convenience Stores, Delis, Restaurants, Food, Juice Bars & Smoothies               |
+----------------------+---------------------+-----------------------------------------------------------------------------------+

Top 10 similar restaurants for each input restaurant are:"


Unnamed: 0,business_id,score,input_business_id,name,categories,stars,review_count,latitude,longitude
0,17pcPIDZ_hqQ7NcTrFwPsQ,0.97372,itqW3t8LhYjWvjZDf-HkhQ,Nossa Familia Coffee,"Restaurants, Coffee & Tea, Coffee Roasteries, ...",4.5,211,45.5299427,-122.6847716
1,LNsROS-qBdtH-NKtCKqrKA,0.940162,yYUbaE9QH9EDEiSyEGyZ9g,50th Market,"Food, Beer, Wine & Spirits, Convenience Stores...",2.5,5,45.5013256,-122.6108413
2,dRMSM-8cISIgtH60XYEDYw,0.969932,itqW3t8LhYjWvjZDf-HkhQ,Case Study Coffee,"Coffee & Tea, Food",4.0,472,45.5192434281,-122.6822642762
3,lZCuDuBYPOaDV8k8tlQlSg,0.967796,itqW3t8LhYjWvjZDf-HkhQ,Barista,"Food, Coffee & Tea",4.5,425,45.5269539,-122.6845432
4,z6aMvwRopS3fZ3P-t2wPhw,0.968823,itqW3t8LhYjWvjZDf-HkhQ,Heart,"Coffee Roasteries, Food",3.5,55,45.5219662,-122.6829152
5,Ly8E8wrBxzXQIBoE_FuxYA,0.966878,itqW3t8LhYjWvjZDf-HkhQ,Never Coffee,"Food, Coffee & Tea",4.5,132,45.5167547,-122.6186539
6,vWqf0BZ8vcMI0z7Vm2z0QQ,0.946222,yYUbaE9QH9EDEiSyEGyZ9g,Walmart Supercenter,"Mobile Phones, Drugstores, Fashion, Food, Shop...",2.0,143,45.4913102,-122.5750681
7,RQOixECuPg6KFTGsPoGZcA,0.946203,yYUbaE9QH9EDEiSyEGyZ9g,Walgreens,"Shopping, Convenience Stores, Beauty & Spas, F...",2.0,23,45.5230196,-122.5797317
8,EbF6YAseIRfaJFk7FgbWEg,0.950295,yYUbaE9QH9EDEiSyEGyZ9g,Plaid Pantry Markets,"Convenience Stores, Food",2.0,14,45.5174322,-122.6540713
9,gmXy3qJFGqk771purRwJdQ,0.947143,yYUbaE9QH9EDEiSyEGyZ9g,7-Eleven,"Convenience Stores, Coffee & Tea, Food",2.5,14,45.4780350998,-122.5632861257
