In [126]:
import os
import json
from pymongo import MongoClient

import numpy as np
import pandas as pd
from math import sqrt
import re
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize, wordpunct_tokenize, WhitespaceTokenizer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from nltk.corpus import stopwords

from pycorenlp import StanfordCoreNLP
from rake_nltk import Rake
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import Normalizer

import json

In [127]:
with open('restaurant_reviews.json', 'r') as f:
     data = json.load(f)

In [131]:
data[0]

{'name': 'Barrel Head Brewhouse',
 'overall-rating': '4.0',
 'reviews-total-number': 427,
 'text_review': ['<p lang="en">The perfect thing to do before a music festival (Outside Lands) is BOTTOMLESS BRUNCH! So I used my trusty yelp search and stumbled upon this joint. The reviews seemed to hit all marks for the perfect brunch to start our festival journey. <br/><br/>MY FELLOW YELPERS DID NOT STEER US WRONG!<br/><br/>1.) SERVICE- MICKEY was the best server i have encountered in a very long time. She knew what we needed before we even did. It was like we bought a $15,000 dollar bottle service table. She basically made us feel like she was apart of our crew and we were sad we had to leave her at the end.<br/><br/>2.) FOOD- Spectacular, cooked to perfection and met everyones specific tastes. The most notable item being the breakfast sandwich. What kind of flaky awesome bread was that!!<br/><br/>3.) ATMOSPHERE- Beautiful modern industrial feel, comfy booths and a bustling crowd but no wait 

In [132]:
#total number of reviews#
totalreview=0
for i in data:
    totalreview+=int(i['reviews-total-number'])
totalreview

223447

In [118]:
##add in the urls for each restaurant
import csv
with open('restaurant_url.csv', 'r') as f:
    reader = csv.reader(f)
    restaurant_url = list(reader)
    
restaurant_url.remove(restaurant_url[0])
restaurant_url = [y for x in restaurant_url for y in x]

for i in range(0,50):
    data[i]['url']=restaurant_url[241+i]
    
for i in range(50,len(data)):
    data[i]['url']=restaurant_url[i-50]
    


In [119]:
stop_word=stopwords.words('english')
stop_word.extend(['restaurant', 'outdoor', 'indoor', 'seating', 'happy hour', 'happy hours','opening','call',
                 'minutes','minute','hour','hours'])

#open the original_food list for the search of food names
with open('foodlist copy.txt','r') as f:
    l = f.readlines()
original_food = []
for i in l:
    original_food.append(i.replace('\n', '').lower())

    
#Calculating the lower bound wilson score for each restaurant based on 
#total reviews, and #total positive reviews
    
def confidence(total_positive, total):
    n = total
    if n == 0:
        return 0
    z = 1.96 #95% CI
    phat = total_positive / n
    return ((phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n))

#analyze each restaurant  
restaurants_dish=[]
for k in data:
    #clean up the text_review on single restaurant basis
    rest_review=k['text_review']
    rest_review=[i.replace('<p lang="en">', '').replace('<br/><br/>', '').replace('</p>', '')
                 for i in rest_review]
    rest_review=[re.sub(r'<.*>', '',i) for i in rest_review]
    
    #sentiment for each review
    review_sentiment=[]
    for review in rest_review:
        blob = TextBlob(review)
        #keep the polarity only
        review_sentiment.append(blob.sentiment[0])
    
    #Calculating the lower bound wilson score for each restaurant
    total_positive=sum([True for i in review_sentiment if i>=0])
    total=len(review_sentiment)
    wilson_CI=confidence(total_positive, total)
    
    
    # Create a CountVectorizer for the original reviews, without parsing out the keywords
    cv = CountVectorizer(ngram_range=(2, 3),stop_words=stop_word, token_pattern="\\b[a-z][a-z]+\\b")
    rest_review_vecs=cv.fit_transform(rest_review)
    #count the number of mentions of each word phrase
    #rest_review_vecs_sentiment=(rest_review_vecs.T.multiply(np.array(review_sentiment))).T
    word_counts=list(zip(cv.get_feature_names(),rest_review_vecs.sum(axis=0).tolist()[0]))
    word_counts_sorted=sorted(word_counts, key=lambda x: x[1], reverse=True)

    #search for food items in the top mentioned word phrases
    mentioned_food=[]
    for i in original_food:
        sub=i
        mentioned_food.append([s for s in word_counts_sorted[0:10] if sub in s[0]])
        mentioned_food=[x for x in mentioned_food if x != []]
    
    recommend_list=set(sum(mentioned_food, []))
    recommend_list=sorted(recommend_list, key=lambda x: x[1], reverse=True)


    restaurant_dish={}
    restaurant_dish['Name']=k['name']
    restaurant_dish['Total Number of Reviews']=k['reviews-total-number']
    restaurant_dish['Overall Rating']=k['overall-rating']
    restaurant_dish['Wilson Score']=wilson_CI
    restaurant_dish['Restaurant Link']=k['url']
    restaurant_dish['Recommended Dish']=recommend_list
    
    restaurants_dish.append(restaurant_dish)

In [120]:
#store final results in json file for html to use
# with open('restaurants_dish.json', 'w') as f:
#      json.dump(restaurants_dish, f)
        
with open('restaurants_dish.json', 'r') as f:
     restaurants_dish = json.load(f)

In [129]:
with open('restaurants_dish.json', 'r') as f:
     restaurants_dish = json.load(f)
len(restaurants_dish)

290

In [116]:
###code for flask app###
#when customer input a dish name, returns a list of restaurants ordered by their wilson score
recommended_restaurants=[]
for i in restaurants_dish:
    recommended_restaurants.append(list(set([(i['Name'], i['Overall Rating'],i['Total Number of Reviews'],float(i['Wilson Score']), i['Restaurant Link']) for s in i['Recommended Dish'] 
                                    if 'beer' in s[0]])))
recommended_restaurants=sum([x for x in recommended_restaurants if x != []], [])                                                                    
recommended_restaurants=sorted(recommended_restaurants, key=lambda x: x[-2], reverse=True) 

recommended_restaurants_formatted=[]
for i in recommended_restaurants:
    recommended_restaurants_formatted.append([i[0], "Yelp's Rating"+': '+  i[1],"Rachel's Rating"+': ' + str(round((i[-2])*100)),
                                              [s['Recommended Dish'] for s in restaurants_dish if i[0] in s['Name']][0],i[-1] ])
for i in recommended_restaurants_formatted:
    i[3]=[j[0]+':' +str(j[1])+' mentions' for j in i[3]]

recommended_restaurants_formatted

[['Hogwash',
  "Yelp's Rating: 4.5",
  "Rachel's Rating: 94",
  ['fried pickles:135 mentions',
   'curry fries:124 mentions',
   'beer selection:119 mentions',
   'great beer:53 mentions',
   'sausage sandwich:50 mentions',
   'duck egg:49 mentions',
   'beers tap:38 mentions'],
  'https://www.yelp.com/biz/hogwash-san-francisco?osq=Restaurants'],
 ['The Dark Horse Inn',
  "Yelp's Rating: 4.5",
  "Rachel's Rating: 94",
  ['beer selection:57 mentions',
   'pulled pork:49 mentions',
   'mac cheese:43 mentions',
   'fried pickles:28 mentions',
   'fish tacos:22 mentions'],
  'https://www.yelp.com/biz/the-dark-horse-inn-san-francisco?osq=Restaurants'],
 ['Mikkeller Bar',
  "Yelp's Rating: 4.0",
  "Rachel's Rating: 93",
  ['beer selection:155 mentions',
   'great beer:56 mentions',
   'beers tap:54 mentions',
   'beer list:52 mentions',
   'mac cheese:49 mentions',
   'craft beer:46 mentions'],
  'https://www.yelp.com/biz/mikkeller-bar-san-francisco?osq=Restaurants'],
 ['Fermentation Lab',
 

## Sentiment score for each review

In [31]:
review_sentiment=[]
for review in rest_review:
    blob = TextBlob(review)
    #keep the polarity only
    review_sentiment.append(blob.sentiment[0])
#review_sentiment

## combining word_vectors and count vectorization

In [10]:
stop_word=stopwords.words('english')
stop_word.extend(['restaurant', 'outdoor', 'indoor', 'seating', 'happy hour', 'happy hours','opening','call'])

#open the original_food list for the search of food names
with open('foodlist copy.txt','r') as f:
    l = f.readlines()
original_food = []
for i in l:
    original_food.append(i.replace('\n', '').lower())
    
#analyze each restaurant  
restaurants_dish=[]
for k in data[0:1]:
    #clean up the text_review on single restaurant basis
    rest_review=k['text_review']
    rest_review=[i.replace('<p lang="en">', '').replace('<br/><br/>', '').replace('</p>', '')
                 for i in rest_review]
    rest_review=[re.sub(r'<.*>', '',i) for i in rest_review]
    
    # Create a CountVectorizer for the original reviews, without parsing out the keywords
    cv = CountVectorizer(ngram_range=(2, 3),stop_words=stop_word, token_pattern="\\b[a-z][a-z]+\\b")
    rest_review_vecs=cv.fit_transform(rest_review)
    rest_review_df=pd.DataFrame(rest_review_vecs.todense(), columns=[cv.get_feature_names()])
    #count the number of mentions of each word phrase
    word_counts = list(zip(rest_review_df.columns, rest_review_df.sum(axis=0)))
    word_counts_sorted=sorted(word_counts, key=lambda x: x[1], reverse=True)

    #search for food items in the top mentioned word phrases
    mentioned_food=[]
    for i in original_food:
        sub=i
        mentioned_food.append([s for s in word_counts_sorted[0:10] if sub in s[0]])
        mentioned_food=[x for x in mentioned_food if x != []]
    
    recommend_list=set(sum(mentioned_food, []))
    recommend_list=sorted(recommend_list, key=lambda x: x[1], reverse=True)
    
    ## Dimension Reduction, feature extraction 
    lsa = TruncatedSVD(50, algorithm = 'arpack')
    reduced_mat = lsa.fit_transform(rest_review_vecs)
    reduced_mat = Normalizer(copy=False).fit_transform(reduced_mat)
    word_vecs=lsa.components_


    ##Find similar words based on vectors
    def get_similar_docs(target_vec, corpus_vecs, num_res=5):
        sim_scores = np.dot(target_vec.T, corpus_vecs)
        return np.argsort(-sim_scores)[:num_res]

    index1=rest_review_df.columns.get_loc(recommend_list[0][0])
    similar_index1=get_similar_docs(word_vecs[:, index1],word_vecs)
    similar_dish_list=[]
    for i in similar_index1:
        similar_dish_list.append(rest_review_df.columns[i])


    
    #combine word_vec similar dish and the top counted similar dish
    recommended_dish=set(similar_dish_list+recommend_list)

    restaurant_dish={}
    restaurant_dish['name']=k['name']
    restaurant_dish['reviews-total-number']=k['reviews-total-number']
    restaurant_dish['overall-rating']=k['overall-rating']
    restaurant_dish['recommended_dish']=recommended_dish
    
    restaurants_dish.append(restaurant_dish)

In [11]:
restaurant_dish

{'name': 'Barrel Head Brewhouse',
 'overall-rating': '4.0',
 'recommended_dish': {('fish chips', 23),
  'seems like',
  'happy hour',
  'great beer',
  'beer selection',
  ('beer selection', 37),
  ('great beer', 25),
  'food great'},
 'reviews-total-number': 427}

In [None]:
## Stanford Core NLP

In [38]:
# # Text with some entities
# ner_text = rest_review[0]

# # Create Tokens
# tokens = pos_tag(word_tokenize(ner_text))

# # Extract entities from token list
# entities = ne_chunk(tokens)
# print(entities)

In [8]:
# nlp = StanfordCoreNLP('http://localhost:9000')
# res = nlp.annotate(rest_review[1],
#                    properties={
#                        'annotators': 'sentiment',
#                        'outputFormat': 'json',
#                        'timeout': 2000,
#                    })

# for s in res["sentences"]:
#     print (s["index"],
#         " ".join([t["word"] for t in s["tokens"]]),
#          s["sentimentValue"], s["sentiment"])

In [9]:
# res = nlp.annotate(rest_review[1],
#                    properties={
#                        'annotators': 'ner',
#                        'outputFormat': 'json',
#                        'timeout': 2000,
#                    })
# res
# # for s in res["sentences"]:
# #     print (s["index"],
# #         " ".join([t["word"] for t in s["tokens"]]),
# #              s["NN"], s["NNS"])