# Purpose

We want to build a tool for users to search for food businesses (restaurant, cafes) based on certain keywords such as ambience. 

* We aggregate all of the reviews for each restaurant and process them. 
* We remove stop words, tokenize the remaining words and also filter out infrequent words in order to get a collection of words that we think “describe” the restaurant.
* We treat the aggregated review tokens for each restaurant as a document and input this list of documents to an Latent Semantic Indexing (LSI) model to identify categories. 
* The user input search query is the “example” document that need to be matched to the identified categories. 
* Based on the categories that user’s search query matches best with, it is clustered with restaurants with similar categories. We output the top 50 best matches of the model but also control for restaurants with poor ratings (<3.5).


# 0. Preparing the dataset

In [1]:
import json
import os
import ast
import pandas as pd

In [3]:
#importing the yelp businesses dataset
datastr=open('yelp_academic_dataset_business.json','r')
datajson=datastr.readlines()
check=[json.loads(x) for x in datajson]
ind=range(0,len(check))
business=pd.DataFrame(check,index=ind,columns=check[0].keys())

In [2]:
#importing the yelp reviews dataset
datastr=open('yelp_academic_dataset_review.json','r')
datajson=datastr.readlines()
check=[json.loads(x) for x in datajson]
ind=range(0,len(check))
reviews=pd.DataFrame(check,index=ind,columns=check[0].keys())

In [4]:
len(business[business['state']=='NY'])

19

CHOOSE THE MOST FREQUENT OCCURENCES OF FOOD BUSINESS AND ONLY US STATES

In [6]:
#To see most frequent occurences of food categories to determine what to filter business dataset on
import collections as c
emptylist=[]
temp1=[emptylist.extend(x.split(', ')) for x in business['categories'] if pd.isnull(x)==False]
category_count=c.Counter(emptylist)
category_count.most_common(500)

[('Restaurants', 57173),
 ('Shopping', 30231),
 ('Food', 27118),
 ('Beauty & Spas', 18967),
 ('Home Services', 18634),
 ('Health & Medical', 16157),
 ('Local Services', 12906),
 ('Automotive', 12656),
 ('Nightlife', 12438),
 ('Bars', 10853),
 ('Event Planning & Services', 9774),
 ('Active Life', 9119),
 ('Fashion', 7406),
 ('Coffee & Tea', 6936),
 ('Sandwiches', 6912),
 ('Hair Salons', 6825),
 ('Fast Food', 6812),
 ('American (Traditional)', 6659),
 ('Pizza', 6603),
 ('Home & Garden', 6020),
 ('Auto Repair', 5877),
 ('Hotels & Travel', 5808),
 ('Arts & Entertainment', 5794),
 ('Professional Services', 5620),
 ('Doctors', 5450),
 ('Real Estate', 5295),
 ('Burgers', 5126),
 ('Breakfast & Brunch', 5023),
 ('Nail Salons', 4839),
 ('Fitness & Instruction', 4559),
 ('Italian', 4550),
 ('Mexican', 4419),
 ('Specialty Food', 4304),
 ('Chinese', 4247),
 ('American (New)', 4230),
 ('Pets', 4001),
 ('Hair Removal', 3766),
 ('Bakeries', 3509),
 ('Dentists', 3436),
 ('Grocery', 3380),
 ('Skin Care'

In [10]:
#defining functions to filter
def category_filter(category):
    foodstuff=('Restaurants','Food','Nightlife', 'Bars', 'Fast Food', 'Sandwiches', 
               'Coffee & Tea', 'American (Traditional)', 'Pizza', 'Breakfast & Brunch', 
               'Bakeries', 'Burgers', 'Italian', 'Mexican', 'Chinese', 'American (New)', 
               'Cafes', 'Ice Cream & Frozen Yogurt', 'Food Trucks', 'Desserts', 'Seafood', 
               'Salad', 'Sushi Bars', 'Delis', 'Juice Bars & Smoothies', 'Steakhouses')
    if pd.isnull(category['categories']) == False:
        temp=category['categories'].split(', ')
        checksum=sum([foodstuff.count(x) for x in temp])
        return checksum
    else:
        return 0
    
def cleanstate(input_state):
    import numpy as np
    state_list = ('AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL',
                  'IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT',
                  'NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI',
                  'SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY')
    if input_state not in state_list:
        return np.NaN
    return input_state

In [11]:
#filtering for only restaurants and only US states
business['restaurant_flag']=business.apply(category_filter,axis=1)
business['state'] = business['state'].apply(cleanstate)
filtered=business[business['restaurant_flag']>0]
filtered=filtered[filtered['state'].notnull()]
len(filtered)*100/len(business)

24.943661747784912

CREATE RESTAURANT REVIEWS

In [12]:
restaurant_reviews=reviews[reviews["business_id"].isin(filtered["business_id"])]

In [13]:
#basic descriptions
#restaurant_reviews[restaurant_reviews['user_id']=='msQe1u7Z_XuqjGoqhB0J5g']
print(len(restaurant_reviews))
print(len(restaurant_reviews['business_id'].unique()))
print(len(restaurant_reviews['user_id'].unique()))

3365595
47042
951201


In [14]:
#desired fields
user_rating_count=restaurant_reviews[['user_id','business_id']].groupby(['user_id']).count()
user_rating_count=user_rating_count.rename(columns = {'business_id':'user_rating_count'})
average_rating_restaurant=restaurant_reviews[['business_id','stars']].groupby(['business_id']).mean()
average_rating_restaurant=average_rating_restaurant.rename(columns = {'stars':'average_rating_rest'})
count_rating_restaurant=restaurant_reviews[['business_id','stars']].groupby(['business_id']).count()
count_rating_restaurant=count_rating_restaurant.rename(columns = {'stars':'rest_review_count'})

In [15]:
#merging desired columns back to main reviews dataset
restaurant_reviews=restaurant_reviews.merge(user_rating_count,how='left',on='user_id')
restaurant_reviews=restaurant_reviews.merge(average_rating_restaurant,how='left',on=['business_id'])
restaurant_reviews=restaurant_reviews.merge(count_rating_restaurant,how='left',on=['business_id'])

# 1. Set Up

### Import libraries and load dataset

In [17]:
#selecting only desired columns
business_details = filtered[['business_id','name','stars','review_count']]
documents = restaurant_reviews[['business_id','text']]

In [None]:
#Must aggregate reviews for each restaurant
import collections as c
only_rests=set(rest_reviews['business_id'])
final_output=c.defaultdict()

def lolwut(df):
    for i in only_rests:
        emptylist=''
        temp=df[df['business_id']==i]
        for j in range(len(temp)):
            emptylist+=(temp['text'].iloc[j])+'\n'
        final_output[i]=emptylist
    return final_output

aggregated_reviews=lolwut(documents)

In [89]:
#converting dict to dataframe for easier manipulation
reviews=pd.DataFrame()
reviews['business_id']=list(aggregated_reviews)
reviews['text']=list(aggregated_reviews.values())

In [90]:
#Merging business names and stars with the review dataset
import pandas as pd
reviews=reviews.merge(business_details,how='left',on='business_id')
reviews.head()

Unnamed: 0,business_id,text,name,stars,review_count
0,OH3baEaklANPe1farAKgRg,"Why 4 stars? It's McDonalds, that's why. Its c...",McDonald's,2.0,28
1,UdEmYOnk2iJDY9lpEPAlJQ,"I've never had a bad experience at this place,...",Floridino's Pizza & Pasta,4.0,420
2,iAD5r5vNOzc1AP6UA5yFiQ,This place has a serious problem with unwelcom...,Bella Fresco,4.0,131
3,dNDAdeE8olF4eZaslNtBZw,Sister in town from Ontario Canada and wanted ...,Manny's Mexican Restaurant,2.5,127
4,j-6MgWWlotFZAQ23LZrTPQ,"I've been eying the Trattoria since it opened,...",Queen Creek Olive Mill Trattoria,3.5,24


In [91]:
len(reviews)

24386

# 2. Cleaning reviews

In [92]:
#cleaning reviews and removing stop words to be input in the model
#Trying to use LDA to model topics for reviews
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import pprint
documents=reviews['text'] #need only the text
stoplist = list(STOPWORDS)
texts = [[word for word in document.lower().split() if (word not in stoplist) & (word.isalnum())]
         for document in documents if isinstance(document,str)] #tokenizing

In [93]:
len(texts) #checking to see if same length

24386

In [94]:
#In the above code, we tokenize all of the reviews and remove stop words
#remove words that appear only once 
#frequency of the token across ALL reviews

#counting the frequency of token across ALL reviews
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
#removing tokens that occur only once across all reviews
texts = [[token for token in text if frequency[token] > 10] for text in texts] #if frequence(token)>1

In [95]:
dictionary = corpora.Dictionary(texts) #word id-word pairs
corpus = [dictionary.doc2bow(text) for text in texts] #word id-frequence of word pairs

In [96]:
#Checking frequency of the 'contextual' words we would like to identify

import collections as c
emptylist=[]
[emptylist.extend(x) for x in texts]
len(emptylist)
count=c.Counter(emptylist)
count['candle']

93

# 3. LDA and LSI Model

In [71]:
#Set parameters
num_topics = 100 #can experiment with this, for the number of reviews we have, more topics==contextual words!
passes = 20
lda = LdaModel(corpus,
              id2word=dictionary,
              num_topics=num_topics,
              passes=passes)

lda.print_topics()

  diff = np.log(self.expElogbeta)


[(19,
  '0.028*"sure" + 0.027*"dinner" + 0.025*"pretty" + 0.025*"table" + 0.024*"server" + 0.021*"food" + 0.020*"seated" + 0.017*"ended" + 0.015*"experience" + 0.015*"especially"'),
 (47,
  '0.348*"hot" + 0.087*"chili" + 0.055*"dog" + 0.036*"decent" + 0.028*"visits" + 0.023*"break" + 0.020*"larger" + 0.020*"sold" + 0.017*"written" + 0.016*"beautifully"'),
 (89,
  '0.115*"fish" + 0.090*"clean" + 0.071*"cheap" + 0.053*"place" + 0.037*"lunch" + 0.032*"expect" + 0.031*"nice" + 0.030*"size" + 0.030*"coconut" + 0.026*"great"'),
 (82,
  '0.075*"staff" + 0.063*"great" + 0.050*"friendly" + 0.048*"food" + 0.038*"restaurant" + 0.030*"place" + 0.025*"helpful" + 0.024*"visit" + 0.022*"visiting" + 0.022*"owners"'),
 (64,
  '0.183*"open" + 0.073*"expensive" + 0.062*"days" + 0.052*"weeks" + 0.050*"ago" + 0.047*"comfortable" + 0.046*"hungry" + 0.038*"job" + 0.037*"casual" + 0.035*"cozy"'),
 (79,
  '0.071*"minutes" + 0.034*"time" + 0.033*"order" + 0.031*"food" + 0.030*"waited" + 0.027*"service" + 0.025*

In [108]:
#after multiple attempts, trying LSI model, instead of LDA

from gensim.similarities.docsim import Similarity
from gensim import corpora, models, similarities
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)

doc=input('What are you searching for today?')
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
sims[0][0]

indices=[sims[i][0] for i in range(50) if (reviews['stars'].iloc[sims[i][0]]>=3.5)]
reviews[['name','stars','text']].iloc[indices]

What are you searching for today?nice ambiance crowded


  if np.issubdtype(vec.dtype, np.int):


Unnamed: 0,name,stars,text
7896,Subway,5.0,Employees are always super nice and efficient....
7222,Kung Fu Tea,4.0,Nice little shop . Had quite a variety. the k...
6881,Harley's Liquor & Bait,4.5,"Nice people, not a bad selection, and close to..."
20122,George's Italian Deli,4.5,The George is the best. The ladies and the own...
19300,Giant Eagle Market District,4.0,"Without a doubt, Giant Eagle has nailed it wit..."
7994,Panini Bread and Grill,5.0,"It is so nice to have a neighborhood soup , sa..."
23358,Trader Joe's,4.0,Fantastic place. A nice change of pace from y...
16981,Putters Bar & Grill,4.0,The lady's were very nice behind the bar! I k...
5443,Thunderbird Lounge,3.5,"Great cocktails ! Nice atmosphere, very nice m..."
7235,Aldi,4.0,Cheap wine! Cheap fruit! Cheap everything! And...
