# Restaurant Recommender

Andrea Lopez\
Yuriko Nishijima\
Stephen Connelly

In [1]:
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer
import warnings
warnings.filterwarnings('ignore')

### Import Data

In [3]:
df_business = pd.read_json('yelp_academic_dataset_business.json', lines=True)

In [4]:
df_reviews = pd.read_json('yelp_academic_dataset_review.json', lines=True)

In [5]:
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [6]:
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [7]:
# Grab relevant columns from reviews: stars and text
df_reviews = df_reviews[['business_id', 'user_id', 'stars', 'text']]

### Clean text

In [8]:
# Get stopwords 
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = []
for word in stopwords.words('english'):
    s = [char for char in word if char not in string.punctuation]
    stop.append(''.join(s))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrealopez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return " ".join([word for word in nopunc.split() if word.lower() not in stop])

In [10]:
df_reviews['text'] = df_reviews['text'].apply(text_process)

In [12]:
#Split train test
vld_size=0.15 # size of valid data
X_train, X_valid, y_train, y_valid = train_test_split(df_reviews['text'], df_reviews['business_id'], test_size = vld_size) 

### Create matrices

In [13]:
user_text_df = df_reviews[['user_id','text']]
business_text_df = df_reviews[['business_id', 'text']]

In [14]:
user_text_df.head()

Unnamed: 0,user_id,text
0,mh_-eMZ6K5RLWhZyISBhwA,decide eat aware going take 2 hours beginning ...
1,OyoGAe7OKpv6SyGZT5g77Q,Ive taken lot spin classes years nothing compa...
2,8g_iMtfSiwikVnbP2etR0A,Family diner buffet Eclectic assortment large ...
3,_7bHUi9Uuf5__HHc_Q8guQ,Wow Yummy different delicious favorite lamb cu...
4,bcjbaE6dDog4jkNY91ncLQ,Cute interior owner gave us tour upcoming pati...


In [15]:
business_text_df.head()

Unnamed: 0,business_id,text
0,XQfwVwDr-v0ZS3_CbbE5Xw,decide eat aware going take 2 hours beginning ...
1,7ATYjTIgM3jUlt4UM3IypQ,Ive taken lot spin classes years nothing compa...
2,YjUWPpI6HXG530lwP-fb2A,Family diner buffet Eclectic assortment large ...
3,kxX2SOes4o-D3ZQBkiMRfA,Wow Yummy different delicious favorite lamb cu...
4,e4Vwtrqf-wpJfwesgvdgxQ,Cute interior owner gave us tour upcoming pati...


### Vectorizer with features

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
#userid vectorizer
userid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=5000)
userid_vectors = userid_vectorizer.fit_transform(user_text_df['text'])
userid_vectors.shape

(1092810, 5000)

In [18]:
#Business id vectorizer
businessid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=5000)
businessid_vectors = businessid_vectorizer.fit_transform(business_text_df['text'])
businessid_vectors.shape

(1092810, 5000)

### Matrix Factorization

In [38]:
pip install --upgrade pip

Collecting pip
  Downloading pip-23.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.0.1
    Uninstalling pip-23.0.1:
      Successfully uninstalled pip-23.0.1
Successfully installed pip-23.1
Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install --upgrade pandas

Collecting pandas
  Downloading pandas-2.0.0-cp38-cp38-macosx_10_9_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m507.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting python-dateutil>=2.8.2 (from pandas)
  Using cached python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Collecting numpy>=1.20.3 (from pandas)
  Downloading numpy-1.24.2-cp38-cp38-macosx_10_9_x86_64.whl (19.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m576.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tzdata, python-dateutil, numpy, pandas
  Attempting uninstall: python-dateutil
    Found existing installation: python-dateutil 2.8.1
    Uninstalling python-dateutil-2.8.1:
      Successfully uninstalled python-dateutil-2.8.1
  Attempting uninstall: numpy
    Found existin

In [36]:
# Convert 'stars' column to float data type
df_reviews['stars'] = df_reviews['stars'].astype(float)
# Create user x business matrix
userid_rating_matrix = pd.pivot_table(df_reviews, values='stars', index=['user_id'], columns=['business_id'])
userid_rating_matrix.shape

TypeError: pivot_table() got an unexpected keyword argument 'sparse'

In [27]:
# Create P and Q matrices: user 
P = pd.DataFrame(userid_vectors.toarray(), index=user_text_df.index,
                 columns=userid_vectorizer.get_feature_names())
Q = pd.DataFrame(businessid_vectors.toarray(), index=business_text_df.index,
                 columns=businessid_vectorizer.get_feature_names())

In [28]:
Q.head()

Unnamed: 0,0,1,10,100,1000,1010,1015,1030,10pm,11,...,young,younger,yuck,yum,yummy,zero,zone,zoo,zucchini,​
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.169282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.198746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.233754,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
P.head()

Unnamed: 0,0,1,10,100,1000,1010,1015,1030,10pm,11,...,young,younger,yuck,yum,yummy,zero,zone,zoo,zucchini,​
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.169282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.198746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.233754,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Gradient Decent

In [30]:
def matrix_factorization(R, P, Q, steps=25, gamma=0.001,lamda=0.02):
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e= e + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        
    return P,Q

In [31]:
%%time
P, Q = matrix_factorization(userid_rating_matrix, P, Q, steps=25, gamma=0.001,lamda=0.02)

NameError: name 'userid_rating_matrix' is not defined

In [None]:
Q.head()

In [None]:
Q.iloc[0].sort_values(ascending=False).head(10)

In [None]:
# Store P, Q and vectorizer in pickle file
import pickle
output = open('yelp_recommendation_model_8.pkl', 'wb')
pickle.dump(P,output)
pickle.dump(Q,output)
pickle.dump(userid_vectorizer,output)
output.close()

### Prediction and Filter based on location

In [None]:
# Calculate the distance between the user and each restaurant using the Haversine formula
from math import radians, sin, cos, sqrt, atan2

def distance(lat1, lon1, lat2, lon2):
    R = 6371 # Earth's radius in kilometers
    dLat = radians(lat2-lat1)
    dLon = radians(lon2-lon1)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    a = sin(dLat/2) * sin(dLat/2) + sin(dLon/2) * sin(dLon/2) * cos(lat1) * cos(lat2)
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    distance = R * c
    return distance


In [None]:
words = "i want to have dinner with beautiful views"
# user's location
lat = "35"
long = "1000"

# save distance in its own column
df_business["distance"] = df_business.apply(lambda row: distance(lat, long, row["latitude"], row["longitude"]), axis=1)

# filter the restaurants by distance
distance_threshold = 5 # kilometers
filtered_business = df_business[df_business["distance"] <= distance_threshold]


In [None]:
test_df= pd.DataFrame([words], columns=['text','latitude','longitude'])

test_df['text'] = test_df['text'].apply(text_process)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=userid_vectorizer.get_feature_names())

predictItemRating=pd.DataFrame(np.dot(test_v_df.loc[0],Q.T),index=Q.index,columns=['Rating'])
topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:7]

for i in topRecommendations.index:
    print(df_business[df_business['business_id']==i]['name'].iloc[0])
    print(df_business[df_business['business_id']==i]['categories'].iloc[0])
    print(str(df_business[df_business['business_id']==i]['stars'].iloc[0])+ ' '+str(df_business[df_business['business_id']==i]['review_count'].iloc[0]))
    print('')