/
YelpRecommendation.py
244 lines (172 loc) · 12.1 KB
/
YelpRecommendation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/usr/bin/env python
# coding: utf-8
# # Recommendation Engine for Yelp Restaurants in San Francisco - CA
# ## Authors: Dannie Vo and Stew Seo
import numpy as np
import pandas as pd
import json
from elasticsearch import Elasticsearch
from jproperties import Properties
import matplotlib.pyplot as plt
from IPython.display import display, Image
# The %matplotlib magic only exists inside IPython/Jupyter. When this file is
# executed by a plain Python interpreter `get_ipython` is undefined, so the
# magic is skipped instead of aborting the whole script with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# ### Connecting to Elasticsearch:
def create_df_from_ES_index(yelp_ES_config_path,ES_index_name):
    """Load the documents of an Elasticsearch index into a flat DataFrame.

    Parameters
    ----------
    yelp_ES_config_path : str
        Path to a ``.properties`` file that provides the ``Cloud_ID`` and
        ``Elastic_pw`` keys used to connect.
    ES_index_name : str
        Name of the index to read.

    Returns
    -------
    pandas.DataFrame
        One row per distinct document (by ``_id``), built from each hit's
        ``_source``; nested objects are flattened by ``pd.json_normalize``.
    """
    # Connect to Elasticsearch using Cloud_id and password stored in properties file
    configs = Properties()
    with open(yelp_ES_config_path, 'rb') as config_file:
        configs.load(config_file)
    client = Elasticsearch(
        cloud_id=configs.get('Cloud_ID').data,
        basic_auth=("elastic", configs.get('Elastic_pw').data),
    )

    page_size = 10000
    # Get the first batch of documents
    resp = client.search(index=ES_index_name, query={"match_all": {}}, size=page_size)

    # Keyed by document id: the follow-up queries use an inclusive ("gte")
    # timestamp bound, so documents at the boundary are returned again and
    # must not be counted twice.
    hits_by_id = {}
    previous_max_ts = None
    while True:
        page = resp['hits']['hits']
        known_before = len(hits_by_id)
        for hit in page:
            hits_by_id.setdefault(hit['_id'], hit)

        # relation == 'gte' means the reported total is only a lower bound,
        # i.e. there may be more documents than were returned.
        if resp['hits']['total']['relation'] != 'gte' or not page:
            break

        max_ts = max(hit['_source']['timestamp'] for hit in page)
        if max_ts == previous_max_ts and len(hits_by_id) == known_before:
            # The same query would be issued again without having learned
            # anything new; stop rather than loop forever.
            # NOTE(review): results are not sorted by timestamp, so this
            # pagination scheme may still miss documents - confirm or switch
            # to a sorted search_after / scroll based approach.
            break
        previous_max_ts = max_ts
        resp = client.search(
            index=ES_index_name,
            query={"range": {"timestamp": {"gte": max_ts}}},
            size=page_size,
        )

    # Normalise all sources in one call: DataFrame.append was removed in
    # pandas 2.0 and copying the frame once per row is quadratic.
    sources = [hit['_source'] for hit in hits_by_id.values()]
    return pd.json_normalize(sources).reset_index(drop=True)
# Build the dataframe from the documents stored in the given Elasticsearch index
df = create_df_from_ES_index(
    yelp_ES_config_path='/Users/dannie/Documents/Projects/restaurant-recommendation-system/Yelp_ES_config.properties',
    ES_index_name='yelp-fusion-restaurants-sf',
)
# ### Data Cleaning:
def data_cleaning(df, city):
    """Clean the raw restaurant dataframe and keep only the rows for ``city``.

    The input frame is not modified; a cleaned copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw restaurant data. Must contain the columns ``price``,
        ``categories`` (list of dicts with a ``title`` key), ``transactions``
        (list of str), ``location.display_address`` (list),
        ``location.state``, ``location.city`` and ``location.zip_code``.
    city : str
        Only rows whose ``location.city`` equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        Cleaned rows with a fresh 0..n-1 index and three extra count columns
        ``delivery``, ``pickup`` and ``restaurant_reservation``.
    """
    allowed_transactions = ['delivery', 'pickup', 'restaurant_reservation']
    valid_prices = ['$', '$$', '$$$', '$$$$', 'Unknown']
    unused_columns = ['hours', 'phone', 'is_closed', 'timestamp',
                      'messaging.url', 'messaging.use_case_text']

    # Remove restaurants with empty categories. The explicit copy keeps the
    # caller's dataframe untouched and avoids assigning into a filtered slice.
    df = df[df['categories'].apply(len) != 0].copy()

    # Re-assign restaurants with NA price to Unknown
    # (the former "'$$$$' -> '$$$$'" re-assignment was a no-op and is gone)
    df['price'] = df['price'].fillna('Unknown')

    # Convert the list of alias/title dictionaries into the list of category titles
    df['categories'] = df['categories'].apply(lambda cats: [c['title'] for c in cats])

    # Keep only the delivery, pickup and restaurant reservation transactions
    df['transactions'] = df['transactions'].apply(
        lambda ts: [t for t in ts if t in allowed_transactions])

    # Convert the list of displayed address parts into a single string
    df['location.display_address'] = df['location.display_address'].apply(
        lambda parts: ', '.join(map(str, parts)))

    # Drop unused columns; some only hold values specific to when the document
    # was indexed. errors='ignore' because optional columns may be absent.
    df = df.drop(columns=unused_columns, errors='ignore')

    # One count column per transaction type. explode() yields one row per
    # list element (NaN for an empty list), groupby(level=0) folds them back
    # to one row per restaurant. reindex guarantees all three columns exist
    # even when a transaction type never occurs in the data.
    transaction_df = (
        pd.get_dummies(df['transactions'].explode())
        .groupby(level=0).sum()
        .reindex(columns=allowed_transactions, fill_value=0)
    )
    df = df.join(transaction_df)

    # Only keep restaurants priced '$', '$$', '$$$', '$$$$' or 'Unknown'
    df = df[df['price'].isin(valid_prices)].copy()

    # For restaurants without a state, take it from the last part of the
    # display address (the first word of that part).
    missing_state = df['location.state'].isna()
    if missing_state.any():
        df.loc[missing_state, 'location.state'] = (
            df.loc[missing_state, 'location.display_address']
            .str.split(', ')
            .apply(lambda parts: parts[-1].split(' ')[0])
        )

    # Only keep the restaurants within the city
    df = df[df['location.city'] == city]

    # Only keep restaurants whose zip code consists of digits. isdigit()
    # yields NaN for a missing zip code, which cannot be used as a mask;
    # eq(True) turns those into False so such rows are dropped.
    df = df[df['location.zip_code'].str.isdigit().eq(True)]
    return df.reset_index(drop=True)
# Replace the raw dataframe with its cleaned, San Francisco-only version
df = data_cleaning(df, city='San Francisco')
# ### Data Exploration:
def show_image(name):
    """Print the address and display the photos of every restaurant called ``name``.

    The lookup is an exact match on the ``name`` column of the module-level
    ``df``; each matching row is reported separately.
    """
    matches = df[df.name == name].reset_index(drop=True)
    for _, row in matches.iterrows():
        print('Restaurant:',name,'\t Location:',row['location.display_address'])
        # Each entry of 'photos' is handed to IPython's Image for rendering
        for photo in row['photos']:
            display(Image(photo))
# #### Show the restaurant's main images on Yelp
show_image("Brenda's French Soul Food")
show_image("Tartine Bakery")

# NOTE: tables below are passed to display() explicitly. A bare expression is
# only rendered as the last line of a notebook cell; in this exported script
# its value would be computed and silently discarded.

# #### Top 5 restaurants with the most reviews:
display(df.sort_values(by='review_count',ascending=False).head()[['name','review_count','categories','rating','display_phone','price','location.display_address','delivery','pickup']])

# #### Total restaurants by Yelp ratings:
# Computed once and reused for both the table and the chart
rating_counts = df.groupby('rating').name.agg(['count']).reset_index(drop=False)
display(rating_counts)
fig = px.bar(rating_counts, x="rating", y="count", color="rating", text_auto=True,
             title = "Total Restaurants by Yelp Ratings")
fig.update_layout(xaxis_title = "Rating",yaxis_title = "Total Restaurants")
fig.show()

# #### Calculating the total number of restaurants by categories
# Get the restaurant counts for all categories
# Note: 1 restaurant could have multiple categories
# explode() gives one row per (restaurant, category) pair without building a
# temporary Series per restaurant as apply(pd.Series).stack() did.
cat = (
    df['categories'].explode()
    .value_counts()
    .rename_axis('category')
    .reset_index(name='count')
)
top_categories = cat.head(10)
fig = go.Figure(data=[go.Pie(labels=top_categories.category, values=top_categories['count'],hole = 0.3,
                             pull = [0.2,0,0,0,0,0,0,0,0,0])])
fig.update_layout(title = 'Top 10 categories')
fig.show()

# #### All restaurants by location
fig = px.scatter_mapbox(df, lat="coordinates.latitude", lon="coordinates.longitude", color="location.zip_code",
                        zoom=11, hover_name = "name", hover_data = ['rating','review_count','transactions','categories'])
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(title = 'SF Restaurant Locations')
fig.show()

# #### Total restaurants breakdown by location (zipcode) and Yelp rating scale:
zip_rating_counts = df.groupby(['location.zip_code','rating']).name.agg(['count']).reset_index(drop=False)
display(zip_rating_counts)
fig = px.sunburst(zip_rating_counts, path=['location.zip_code','rating'], values='count',
                  title = 'Total Restaurants by Zipcode and Rating')
fig.show()

# #### Restaurants by Zipcode with Delivery/Pickup service:
delivery_pickup = df.groupby('location.zip_code').agg({'delivery':'sum','pickup':'sum'}).reset_index(drop=False)
# The five zip codes with the most restaurants are derived from the data
# (previously a hard-coded list) so the chart always matches its title.
top_zipcodes = df['location.zip_code'].value_counts().head(5).index
top_zip_delivery_pickup = delivery_pickup[delivery_pickup['location.zip_code'].isin(top_zipcodes)]
display(top_zip_delivery_pickup)
fig = px.bar(top_zip_delivery_pickup,
             x=['delivery','pickup'], y="location.zip_code", orientation='h', barmode='group',
             title='Total Restaurants by Delivery/Pickup Service for Top 5 Zipcodes With The Most Restaurants',
             labels = {'location.zip_code':'Zipcode','variable':'Type','value':'Total Restaurants'})
fig.show()

# #### Restaurants by Price Range/Rating:
rest_by_price = df.groupby('price').name.agg(['count']).reset_index(drop=False).sort_values(by='count',ascending=False)
fig = px.treemap(rest_by_price, path=['price'], values='count', title = 'Total Restaurants by Price')
fig.update_traces(root_color="lightgrey")
fig.show()
# ### Building the Recommendation Engine:
# This function returns an updated dataframe with the restaurant names and a final content column
# that combines all of the conditions we want the recommendation system to consider.
# The final content column is cleaned so that it is ready for the pre-processing step before building the recommendation.
def create_final_content(similar_conditions, source_df=None):
    """Build the text that the recommendation engine compares restaurants on.

    Parameters
    ----------
    similar_conditions : str or list of str
        Column name(s) to combine: any of ``price``, ``categories``,
        ``rating``, ``review_count`` and ``transactions``.
    source_df : pandas.DataFrame, optional
        Restaurant data to use. Defaults to the module-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``final_content`` (the chosen columns converted
        to text and joined with single spaces), with duplicate
        (name, final_content) pairs removed and a fresh 0..n-1 index.
    """
    data = (df if source_df is None else source_df).copy()

    # Price levels as alphanumeric tokens. The previous tokens ('$', '2$',
    # '3$', '4$') contain no run of two word characters, which is what
    # CountVectorizer's default token pattern requires, so price was ignored.
    data['price'] = data['price'].map(
        {'$': 'price1', '$$': 'price2', '$$$': 'price3', '$$$$': 'price4', 'Unknown': 'unknown'})

    # Remove '_' from each transaction, then join them into one string
    data['transactions'] = data['transactions'].apply(
        lambda items: ' '.join(item.replace('_', '') for item in items))

    # Bucket review_count. The lowest edge is 0 (it used to be 1) so that a
    # restaurant without reviews lands in group1 instead of becoming NaN.
    group_bin = [0, 100, 200, 400, 600, 800, 1000, 2000, np.inf]
    group_name = ['group1','group2','group3','group4','group5','group6','group7','group8']
    data['review_count'] = pd.cut(data['review_count'], bins=group_bin, labels=group_name,
                                  include_lowest=True)

    # Bucket ratings into left-closed intervals: [1,2) -> r1, ..., [5,inf) -> r5
    data['rating'] = pd.cut(data['rating'], bins=[1,2,3,4,5,np.inf],
                            labels=['r1','r2','r3','r4','r5'],
                            include_lowest=True, right=False)

    # Lower-case each category and remove its spaces, then join into one string
    data['categories'] = data['categories'].apply(
        lambda items: ' '.join(item.lower().replace(' ', '') for item in items))

    # A single condition (string) is handled as a one-element list, so both
    # cases go through the same code path and always produce plain strings.
    columns = similar_conditions if isinstance(similar_conditions, list) else [similar_conditions]
    data['final_content'] = data[columns].astype(str).apply(' '.join, axis=1)

    # Drop duplicates based on name and the content the restaurants are compared on
    return data[['name','final_content']].drop_duplicates().reset_index(drop=True)
def build_Yelp_recommendation_system(similar_conditions, restaurant_name, top_n):
    """Print the ``top_n`` restaurants most similar to ``restaurant_name``.

    Similarity is the cosine similarity between word-count vectors of the
    ``final_content`` text produced by ``create_final_content``.

    Parameters
    ----------
    similar_conditions : str or list of str
        Field(s) to compare on, passed through to ``create_final_content``.
    restaurant_name : str
        Exact name of the reference restaurant. When several rows share the
        name, the first one is used as the reference.
    top_n : int
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no restaurant is called ``restaurant_name``.
    """
    # Create the updated df with the final content column based on similar conditions
    data = create_final_content(similar_conditions)

    # Get the chosen restaurant's index based on name
    matches = data[data.name == restaurant_name].index
    if len(matches) == 0:
        # Same exception type as the former bare `.index[0]`, with a usable message
        raise IndexError(
            f"Restaurant {restaurant_name!r} was not found; the name must match exactly.")
    restaurant_ind = matches[0]

    # Count matrix for the content column, without English stop words.
    # NOTE(review): CountVectorizer's default token pattern only keeps tokens
    # of two or more word characters, so shorter or symbol-only tokens in
    # final_content do not contribute to the similarity - confirm intended.
    cnt_matrix = CountVectorizer(stop_words='english').fit_transform(data['final_content'])

    # Only the chosen restaurant's row of the similarity matrix is needed, so
    # compare that single row with all rows instead of building the full
    # N x N matrix. The slice keeps the row two-dimensional.
    scores = cosine_similarity(cnt_matrix[restaurant_ind:restaurant_ind + 1], cnt_matrix).ravel()

    # Sort (index, score) pairs from the highest to the lowest score
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Other restaurants can tie with the chosen one at a score of 1, so the
    # chosen restaurant is not necessarily first; exclude it by index.
    recommended = [idx for idx, _ in ranked if idx != restaurant_ind][:top_n]

    print('If the restaurant name shows up in the recommended list more than once, it means that while the restaurant has the same name, it has different conditions listed compared to the other location.')
    print('Below is a list of top',top_n, 'recommended restaurants based on',', '.join(similar_conditions) if isinstance(similar_conditions,list) else similar_conditions, 'compared to',restaurant_name+':')
    for idx in recommended:
        print(data.name.loc[idx])
# #### Build the recommendation system: print the top-N recommended restaurants for a chosen restaurant name, based on one or more restaurant fields:
# #### Please choose one or more of these fields: price, categories, rating, review_count, transactions
#
# Ten recommendations for 'San Tung' compared on four fields
build_Yelp_recommendation_system(
    similar_conditions=['price','categories','rating','review_count'],
    restaurant_name='San Tung',
    top_n=10,
)
# Ten recommendations for 'Arsicault' compared on categories only
build_Yelp_recommendation_system(
    similar_conditions='categories',
    restaurant_name='Arsicault',
    top_n=10,
)