# Zomato Sentiment Analysis using LSTM

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import sys
import time
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
# from geopy.geocoders import Nominatim
# from geopy.geocoders import Nominatim, MapQuest # Either you can use MapQuest with API Key or use Nominatim

from ipywidgets import FloatSlider
from ipywidgets import interact,Select
from ipywidgets import Text
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
import warnings
import pickle
warnings.filterwarnings('ignore')

### Data Initialization

#### Data Description



- **url**:
contains the url of the restaurant in the zomato website

- **address**:
contains the address of the restaurant in Bengaluru

- **name**:
contains the name of the restaurant

- **online_order**:
whether online ordering is available in the restaurant or not

- **book_table**:
table book option available or not

- **rate**:
contains the overall rating of the restaurant out of 5

- **votes**:
contains total number of rating for the restaurant as of the above mentioned date

- **phone**:
contains the phone number of the restaurant

- **location**:
contains the neighborhood in which the restaurant is located

- **rest_type**:
restaurant type

- **dish_liked**:
dishes people liked in the restaurant

- **cuisines**:
food styles, separated by comma

- **approx_cost(for two people)**:
contains the approximate cost for meal for two people

- **reviews_list**:
list of tuples containing reviews for the restaurant, each tuple

- **menu_item**:
contains list of menus available in the restaurant

- **listed_in(type)**:
type of meal

- **listed_in(city)**:
contains the neighborhood in which the restaurant is listed


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
!ln -s /content/gdrive/My\ Drive/ /mydrive


Mounted at /content/gdrive


In [3]:
!cp /mydrive/NITW/NITW-06-SeqL/zomato.csv /content
!cp /mydrive/NITW/NITW-06-SeqL/zomato_locations.csv /content
# !cp /mydrive/NITW/NITW-06-SeqL/zomato_ratings.csv /content
# !cp /mydrive/NITW/NITW-06-SeqL/zomato_tokenizer.pkl /content
!cp /mydrive/NITW/NITW-06-SeqL/zomato_sentiment_rest.h5 /content

In [4]:
data=pd.read_csv('zomato.csv')
print(data.shape)
data.head(2)

(51717, 17)


Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari


In [None]:
# data.columns
data.reviews_list.iloc[0]

Python’s `eval()` allows you to evaluate arbitrary Python expressions from a string-based or compiled-code-based input. This function can be handy when you’re trying to dynamically evaluate Python expressions from any input that comes as a string or a compiled code object.

In [None]:
reviewList = eval(data.reviews_list.iloc[0])
print('No of ratings:', len(reviewList))
for rtuple in reviewList:
  rating = float(rtuple[0].split()[1])
  review = rtuple[1].split('\n')[1:]
  review = ' '.join(review)
  review = review.strip()
  print(rating)
  print(review)
  print('------------------------------------------------------------------------------------')

No of ratings: 12
4.0
A beautiful place to dine in.The interiors take you back to the Mughal era. The lightings are just perfect.We went there on the occasion of Christmas and so they had only limited items available. But the taste and service was not compromised at all.The only complaint is that the breads could have been better.Would surely like to come here again.
------------------------------------------------------------------------------------
4.0
I was here for dinner with my family on a weekday. The restaurant was completely empty. Ambience is good with some good old hindi music. Seating arrangement are good too. We ordered masala papad, panner and baby corn starters, lemon and corrionder soup, butter roti, olive and chilli paratha. Food was fresh and good, service is good too. Good for family hangout. Cheers
------------------------------------------------------------------------------------
2.0
Its a restaurant near to Banashankari BDA. Me along with few of my office friends

#### Missing Data and column information

In [None]:
data.isna().sum()

url                                0
address                            0
name                               0
online_order                       0
book_table                         0
rate                            7775
votes                              0
phone                           1208
location                          21
rest_type                        227
dish_liked                     28078
cuisines                          45
approx_cost(for two people)      346
reviews_list                       0
menu_item                          0
listed_in(type)                    0
listed_in(city)                    0
dtype: int64

In [5]:
def check_miss(data):
    '''
    data: requires a DataFrame object.
    ---
    returns: A DataFrame with details about missing values
    '''
    cnull=[sum(data[y].isnull()) for y in data.columns]
    miss=pd.DataFrame({'Col_name':data.columns,'Null Values':
                        [any(data[x].isnull()) for x in data.columns],
                    'Count_Nulls':cnull,
                    'Percentage_Nulls':list((np.array(cnull)*100)/data.shape[0]),
                    'Type':data.dtypes.values
                      })
    miss.sort_values(by='Count_Nulls',ascending=False,inplace=True)
    if miss.Count_Nulls.iloc[0]==0:
        print('No missing values present')
    else:
        print("Column {} have most number of missing values".format(str(miss.Col_name.iloc[0])))
    return miss

check_miss(data)

Column dish_liked have most number of missing values


Unnamed: 0,Col_name,Null Values,Count_Nulls,Percentage_Nulls,Type
10,dish_liked,True,28078,54.291626,object
5,rate,True,7775,15.033741,object
7,phone,True,1208,2.335789,object
12,approx_cost(for two people),True,346,0.669026,object
9,rest_type,True,227,0.438927,object
11,cuisines,True,45,0.087012,object
8,location,True,21,0.040606,object
15,listed_in(type),False,0,0.0,object
14,menu_item,False,0,0.0,object
13,reviews_list,False,0,0.0,object


In [None]:
data.rate.head(10)

In [6]:
data.phone.fillna('NA',inplace=True)
data.rate.fillna('0/5',inplace=True)
data.dish_liked.fillna('None',inplace=True)
data['approx_cost(for two people)'].fillna('0',inplace=True)
data['rest_type'].fillna('Unknown',inplace=True)
data['location'].fillna('Unknown',inplace=True)
data['cuisines'].fillna('Unknown',inplace=True)
data['ratings']=data.rate.apply(lambda x: float(x.split('/')[0]) if len(x)>3 else 0)

In [None]:
data[['rate','ratings']].head(10)

Unnamed: 0,rate,ratings
0,4.1/5,4.1
1,4.1/5,4.1
2,3.8/5,3.8
3,3.7/5,3.7
4,3.8/5,3.8
5,3.8/5,3.8
6,3.6/5,3.6
7,4.6/5,4.6
8,4.0/5,4.0
9,4.2/5,4.2


In [7]:
check_miss(data)

No missing values present


Unnamed: 0,Col_name,Null Values,Count_Nulls,Percentage_Nulls,Type
0,url,False,0,0.0,object
1,address,False,0,0.0,object
16,listed_in(city),False,0,0.0,object
15,listed_in(type),False,0,0.0,object
14,menu_item,False,0,0.0,object
13,reviews_list,False,0,0.0,object
12,approx_cost(for two people),False,0,0.0,object
11,cuisines,False,0,0.0,object
10,dish_liked,False,0,0.0,object
9,rest_type,False,0,0.0,object


### Exploratory Data Analysis

#### Top restaurant chains in Banglore

In [None]:
data['name'].value_counts()

Cafe Coffee Day                                            96
Onesta                                                     85
Just Bake                                                  73
Empire Restaurant                                          71
Five Star Chicken                                          70
                                                           ..
Natis                                                       1
Shreyas                                                     1
Vishwas Kabab Centre                                        1
Taj Biryani Centre                                          1
Plunge - Sheraton Grand Bengaluru Whitefield Hotel &...     1
Name: name, Length: 8792, dtype: int64

In [None]:
# Getting top 20 restaurant chains
chains=data['name'].value_counts()[:20]
fig=px.bar(
            x=chains,
            y=chains.index,
            title='top 20 restaurant chains'.capitalize(),
            color_continuous_scale=['#f07167','#00afb9'],
            color=chains,
            labels=dict(x="Number of outlets", y="Chain", color="Outlets")
            )
fig.show()

##### Observation:
- CCD has most number of chains around the city
- Most of the chains are related to bakery, quick bites and cafes. Which is not unncommon for a tech city.

In [None]:
print(data.shape)
data.head(2)

(51717, 18)


Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city),ratings
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,4.1
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,4.1


#### Most Common Restaurant Type

In [None]:
# Getting top 20 restaurant chains
chains=data['rest_type'].value_counts()[:20]
fig=px.bar(
            x=chains,
            y=chains.index,
            title='top 20 restaurant chains'.capitalize(),
            color_continuous_scale=['#f07167','#00afb9'],
            color=chains,
            labels=dict(x="Number of outlets", y="Chain", color="Outlets")
            )
fig.show()

##### Observation:
- As Banglore known as the tech capital, people prefer quick bites over any other restaurant.

#### Restaurants that do not accept online orders

In [None]:
x=data.online_order.value_counts()
#colors = ['#00afb9','#f07167']
#hole - Sets the fraction of the radius to cut out of the pie. Use this to make a donut chart.
trace=go.Pie(labels=x.index,values=x,showlegend=False,hole=0.33,text=[str(round(i*100,3))+'%' for i in x/sum(x)],
             hovertext='Accepting Online Order',hoverinfo='label+text',textinfo='text')
layout=go.Layout(title="Accepting vs not accepting online orders")
fig=go.Figure(data=[trace],layout=layout)
fig.show()

#### Restaurants with table booking

In [None]:
x=data.book_table.value_counts()
#colors = ['#00afb9','#f07167']

trace=go.Pie(labels=x.index,values=x,textinfo='text', showlegend=False,hoverinfo='label+text',
             text=[str(round(i*100,3))+'%' for i in x/sum(x)])
layout=go.Layout(title="Restaurants with table booking")
fig=go.Figure(data=[trace],layout=layout)
fig.show()

##### Observation:
- Very few restaurant provide table bookings

#### Let's put some emphasis on ratings and cost

In [8]:
rating=data.rate.dropna().apply(lambda x : float(x.split('/')[0]) if (len(x)>3)  else np.nan ).dropna()
px.histogram(rating)
# px.histogram(data.ratings)

##### Observation:
- Around of the ratings are between 3 and 4 which seems a little uniform.
- Only a few restaurants with ratings more than 4.5.

#### Distribution of Cost

In [None]:
data['approx_cost(for two people)'].tail(10)

51707    2,000
51708    1,200
51709      800
51710      900
51711      800
51712    1,500
51713      600
51714    2,000
51715    2,500
51716    1,500
Name: approx_cost(for two people), dtype: object

In [9]:
cost_dist=data[['rate','approx_cost(for two people)','online_order']]
cost_dist['rate']=cost_dist['rate'].apply(lambda x: float(x.split('/')[0]) if len(x)>3 else 0)
cost_dist['approx_cost(for two people)']=cost_dist['approx_cost(for two people)'].apply(lambda x: int(x.replace(',','')))

px.histogram(cost_dist['approx_cost(for two people)'])

##### Observation:
- Most of the resturant are in <1000 range

In [10]:
# Lets define ranges for cost
# cheap = 0-500
# affordable = 501-1000
# Average = 1001 - 2000
# Costly = 2001- 4000
# Luxorious = 4001+
def cost_cat(cost):
    if cost <=500:
        return 'cheap'
    elif cost<=1000:
        return 'Affordable'
    elif cost<=2000:
        return 'Average'
    elif cost<=4000:
        return 'Costly'
    else:
        return 'Luxorious'
    return
cost_dist['value']=cost_dist['approx_cost(for two people)'].apply(cost_cat)
cost_dist.head()

Unnamed: 0,rate,approx_cost(for two people),online_order,value
0,4.1,800,Yes,Affordable
1,4.1,800,Yes,Affordable
2,3.8,800,Yes,Affordable
3,3.7,300,No,cheap
4,3.8,600,No,Affordable


In [11]:
cost_dist.tail()

Unnamed: 0,rate,approx_cost(for two people),online_order,value
51712,3.6,1500,No,Average
51713,0.0,600,No,Affordable
51714,0.0,2000,No,Average
51715,4.3,2500,No,Costly
51716,3.4,1500,No,Average


In [None]:
px.scatter(cost_dist,y='rate',x='approx_cost(for two people)',color='value')

##### Obsiervation:
- Around 50% of the restaurants are cheap
- 75% of the restaurants provide average cuisine for 2 people


#### What people prefer?

In [12]:
def cost_cat2(cost):
    cost=float(cost.replace(',',''))
    if cost <=500:
        return 'cheap'
    elif cost<=1000:
        return 'Affordable'
    elif cost<=2000:
        return 'Average'
    elif cost<=4000:
        return 'Costly'
    else:
        return 'Luxorious'
    return


data['value']=data['approx_cost(for two people)'].apply(cost_cat2)

# Votes is the number of people giving ratings
px.bar(data.groupby('value').agg('mean'),y='votes')

##### Observation:
- More number of people prefer to rate an average/costly restaurant.

#### Places famous for food

In [None]:
# Getting top 20 restaurant chains
chains=data['location'].value_counts()[:20]

fig=px.bar(
            x=chains,
            y=chains.index,
            title='top 20 restaurant locations'.capitalize(),
            color_continuous_scale=['#f07167','#00afb9'],
            color=chains,
            labels=dict(x="Number of outlets", y="Chain", color="Outlets")
            )
fig.show()

##### Observation:
- BTM layout is really famous for food and we can see it has more than 5000 restaurants

#### Get lat and long using geopy MapQuest

In [None]:
from geopy.geocoders import MapQuest

##### Get your own API key from: https://developer.mapquest.com/

- After creating the account. Go to My Application and copy and paste the key here:


![](https://i.imgur.com/XRIrHAW.png)

In [None]:
# You need to enter your own API key
new_api_key='Your API Key'
locator=MapQuest(new_api_key)
out=locator.geocode("942, 21st Main Road, 2nd Stage, Banashankari, Bangalore")
out

In [None]:
# Change this to True if you want to run the next cell
to_run=False

In [None]:
# This will take sometime, you can use the locations.csv provided in the zip instead
# You can run this but it will provide similar results
# This below code will get all the laglong coordinates from MapQuest.
# We have taken unique locations in order to reduce the server requests and avoid getting blocked by the API
# This takes around 5-6 hours to get all the data
if to_run==True:
    locations={'Address':list(data.address.unique()),'lat':[None for x in data.address.unique()],'long':[None for x in data.address.unique()]}
    t_loca=len(data.address.unique())
    diterator2= tqdm(range(t_loca))
    for index in diterator2:
        if locations['lat'][index]!=None:
            continue

        try:
            location = locator.geocode(locations['Address'][index])
        except:
            locations['lat'][index]  = np.nan
            locations['long'][index] = np.nan
            continue

        if location is None:
            locations['lat'][index]  = np.nan
            locations['long'][index] = np.nan
        else:
            locations['lat'][index]  = location.latitude
            locations['long'][index] = location.longitude
        time.sleep(0.15)
    pd.DataFrame(locations).to_csv('locations.csv',index=False)
else:
    print('Skipped getting cordinates')

#### Get lat and long from zomato_restaurant_locations.csv

In [13]:
locations=pd.read_csv('zomato_locations.csv')
print(locations.shape)
print(data.shape)
locations.head()


(51717, 4)
(51717, 19)


Unnamed: 0,name,address,lat,long
0,Jalsa,"942, 21st Main Road, 2nd Stage, Banashankari, ...",12.97912,77.5913
1,Spice Elephant,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",12.97912,77.5913
2,San Churro Cafe,"1112, Next to KIMS Medical College, 17th Cross...",12.97912,77.5913
3,Addhuri Udupi Bhojana,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",12.97912,77.5913
4,Grand Village,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",12.767697,77.310733


In [14]:
# data = data.merge(locations[['address','lat','long']], how='inner', on='address')
data = data.merge(locations[['lat','long']], left_index=True, right_index=True)
print(data.shape)
data.columns

(51717, 21)


Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)', 'ratings', 'value', 'lat',
       'long'],
      dtype='object')

In [15]:
print(data.shape)
data.head(2)

(51717, 21)


Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,...,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city),ratings,value,lat,long
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,...,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,4.1,Affordable,12.97912,77.5913
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,...,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,4.1,Affordable,12.97912,77.5913


In [None]:
# lat,long=[],[]
# for i in tqdm(range(data.shape[0])):
#     cord=locations[locations.address==data.address[i]]
#     if not pd.isna(cord.lat.values[0]):
#         lat.append(cord.lat.values[0])
#         long.append(cord.long.values[0])

# data['lat']=lat
# data['long']=long

In [16]:
check_miss(data)

No missing values present


Unnamed: 0,Col_name,Null Values,Count_Nulls,Percentage_Nulls,Type
0,url,False,0,0.0,object
11,cuisines,False,0,0.0,object
19,lat,False,0,0.0,float64
18,value,False,0,0.0,object
17,ratings,False,0,0.0,float64
16,listed_in(city),False,0,0.0,object
15,listed_in(type),False,0,0.0,object
14,menu_item,False,0,0.0,object
13,reviews_list,False,0,0.0,object
12,approx_cost(for two people),False,0,0.0,object


There are no missing values now.

#### Best Cuisine Restaurants (uses mapbox)

In [None]:
data.head(2)

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,...,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city),ratings,value,lat,long
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,...,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,4.1,Affordable,12.97912,77.5913
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,...,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,4.1,Affordable,12.97912,77.5913


In [None]:
data.cuisines.head()

0    North Indian, Mughlai, Chinese
1       Chinese, North Indian, Thai
2            Cafe, Mexican, Italian
3        South Indian, North Indian
4          North Indian, Rajasthani
Name: cuisines, dtype: object

In [None]:
from ipywidgets import interact

def is_cuisine_offered(offered,cuisine):
    if cuisine in offered:
        return True
    else:
        return False

def get_cuisines(data):
    cuisines=[]
    for cuisines_offered in data.cuisines:
        cuisines.extend(cuisines_offered.split(','))
    cuisines=list(set([x.strip() for x in cuisines]))
    return sorted(cuisines)

In [None]:
MAPBOX_ACCESSTOKEN = 'pk.eyJ1IjoiY3JpbXNvbnN5bnRoZSIsImEiOiJja29pd3Fiam0xYnlwMm9wd2sybzlhdGd4In0.91o8UY67rFX4KUv9lnqIAg'

In [None]:
@interact
def cuisines(cuisine=get_cuisines(data)):
    map_data=data[data.cuisines.apply(is_cuisine_offered,args=[cuisine])]
    lats = map_data.lat
    lons = map_data.long
    px.set_mapbox_access_token(MAPBOX_ACCESSTOKEN)
    fig = px.scatter_mapbox(map_data, lat=lats, lon=lons,hover_name='name',hover_data=['ratings','votes'],
                            color="ratings", size="votes",
                  color_continuous_scale=px.colors.sequential.algae, size_max=15, zoom=8)
    fig.show()

interactive(children=(Dropdown(description='cuisine', options=('Afghan', 'Afghani', 'African', 'American', 'An…

#### Restaurants of different cuisines at custom rates (uses mapbox)

In [None]:
data['cost for 2']=data['approx_cost(for two people)'].str.replace(',','')
data['cost for 2']= data['cost for 2'].astype(float)

In [None]:
@interact
def budget_rest(cost=FloatSlider(value=1000,min=500,max=6000,step=100),cuisine=get_cuisines(data)):
    map_data=data[(data['cost for 2']>=cost) & (data.cuisines.apply(is_cuisine_offered,args=[cuisine])) ]

    if map_data.shape[0]!=0:
        lats = map_data.lat
        lons = map_data.long
        px.set_mapbox_access_token(MAPBOX_ACCESSTOKEN)
        fig = px.scatter_mapbox(map_data, lat=lats, lon=lons,hover_name='name',hover_data=['ratings','votes'],
                                color="ratings", size="votes",
                      color_continuous_scale=px.colors.sequential.Purples, size_max=15, zoom=8)
        fig.show()
    else:
        print('No restaurants found')

#### Let's Analyze the restaurant chains (uses mapbox)

In [None]:
# To get restaurant types
def get_rest_types(data):
    rest_types=[]
    for rest_type in data.rest_type:
        rest_types.extend(rest_type.split(','))
    rest_types=list(set([x.strip() for x in rest_types]))
    return sorted(rest_types)

# To check whether the restaurant is of particular type
def is_rest_type(rest_types_present,rest_type):
    if rest_type in rest_types_present:
        return True
    else:
        return False

In [None]:
# Gets the lat and long cordinates for restaurant chains in different areas
def getChainlat(data,areas,name):
    lat=[]
    for area in areas:
        val=data[(data.name==name)&(data.location==area)].head(1).lat.values[0]
        lat.append(val)
    return lat

def getChainlong(data,areas,name):
    long=[]
    for area in areas:
        val=data[(data.name==name)&(data.location==area)].head(1).long.values[0]
        long.append(val)
    return long

In [None]:
@interact
def restaurant_chains(rest_type=Select(options=get_rest_types(data),value='Food Court')):
    top_rest_types=data[data.rest_type.apply(is_rest_type,args=[rest_type]) ]
    t5=top_rest_types.name.value_counts()[:5]
    traces=[]
    for name in t5.index:
        data_chain=data[data.name==name].location.value_counts().reset_index()
        data_chain.columns=['Name','count']
        data_chain['lat']=getChainlat(data,data_chain['Name'],name)
        data_chain['long']=getChainlong(data,data_chain['Name'],name)
        data_chain['text']=data_chain['Name']+'<br>'+data_chain['count'].astype(str)
        trace =  go.Scattermapbox(
                lat=data_chain['lat'],
                lon=data_chain['long'],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=data_chain['count']*4
                ),
                text=data_chain['text'],name=name
            )
        traces.append(trace)
    layout = go.Layout(title="Top 5 {} Restaurant chains locations around Banglore".format(rest_type),
        autosize=True,
        hovermode='closest',
        mapbox=dict(
            accesstoken=MAPBOX_ACCESSTOKEN,
            bearing=0,style="streets",
            center=dict(
                lat=12.96,
                lon=77.59
            ),
            pitch=0,
            zoom=10
        ),
    )


    fig = go.Figure(data=traces, layout=layout)
    fig.show()

#### Wordcloud: Dish Liked

In [None]:
data.dish_liked.head()

0    Pasta, Lunch Buffet, Masala Papad, Paneer Laja...
1    Momos, Lunch Buffet, Chocolate Nirvana, Thai G...
2    Churros, Cannelloni, Minestrone Soup, Hot Choc...
3                                          Masala Dosa
4                                  Panipuri, Gol Gappe
Name: dish_liked, dtype: object

In [None]:
data['dish_liked']=data['dish_liked'].apply(lambda x : x.split(',') if type(x)==str else [''])

In [None]:
data.dish_liked.head()

0    [Pasta,  Lunch Buffet,  Masala Papad,  Paneer ...
1    [Momos,  Lunch Buffet,  Chocolate Nirvana,  Th...
2    [Churros,  Cannelloni,  Minestrone Soup,  Hot ...
3                                        [Masala Dosa]
4                               [Panipuri,  Gol Gappe]
Name: dish_liked, dtype: object

In [None]:
from matplotlib.cm import Oranges,Reds,Purples,pink,Greens,Greys,Blues
import random

In [None]:
@interact
def produce_wordcloud(rest=Select(options=get_rest_types(data),value='Food Court')):
    plt.figure(figsize=(10,15))
    corpus=data[data['rest_type']==rest]['dish_liked'].values.tolist()
    corpus=','.join(x  for list_words in corpus for x in list_words)
    color=random.choice([ Oranges,Reds,Purples,pink,Greens,Greys,Blues])
    wordcloud = WordCloud(max_font_size=None, background_color='#fcf4ee', collocations=False,colormap =color,
                  width=750, height=750).generate(corpus)
    plt.imshow(wordcloud)
    # plt.title(rest,loc='left',**{'color':color.get_over(),'backgroundcolor':'#fcf4ee'})
    plt.title(rest,loc='left')
    plt.axis("off")
    plt.show()

interactive(children=(Select(description='rest', index=12, options=('Bakery', 'Bar', 'Beverage Shop', 'Bhojana…

## Analyzing Reviews

In [None]:
data['reviews_list'].iloc[0]

'[(\'Rated 4.0\', \'RATED\\n  A beautiful place to dine in.The interiors take you back to the Mughal era. The lightings are just perfect.We went there on the occasion of Christmas and so they had only limited items available. But the taste and service was not compromised at all.The only complaint is that the breads could have been better.Would surely like to come here again.\'), (\'Rated 4.0\', \'RATED\\n  I was here for dinner with my family on a weekday. The restaurant was completely empty. Ambience is good with some good old hindi music. Seating arrangement are good too. We ordered masala papad, panner and baby corn starters, lemon and corrionder soup, butter roti, olive and chilli paratha. Food was fresh and good, service is good too. Good for family hangout.\\nCheers\'), (\'Rated 2.0\', \'RATED\\n  Its a restaurant near to Banashankari BDA. Me along with few of my office friends visited to have buffet but unfortunately they only provide veg buffet. On inquiring they said this plac

We will go through the reviews and ratings of each restaurant and create data for analysis.


In [None]:
reviewList = eval(data.reviews_list.iloc[0])
print('No of ratings:', len(reviewList))
for score, doc in reviewList:
  rating = float(score.split()[1])
  review = doc.split('\n')[1:]
  review = ' '.join(review)
  review = review.strip()
  print(rating)
  print(review)
  print('------------------------------------------------------------------------------------')

No of ratings: 12
4.0
A beautiful place to dine in.The interiors take you back to the Mughal era. The lightings are just perfect.We went there on the occasion of Christmas and so they had only limited items available. But the taste and service was not compromised at all.The only complaint is that the breads could have been better.Would surely like to come here again.
------------------------------------------------------------------------------------
4.0
I was here for dinner with my family on a weekday. The restaurant was completely empty. Ambience is good with some good old hindi music. Seating arrangement are good too. We ordered masala papad, panner and baby corn starters, lemon and corrionder soup, butter roti, olive and chilli paratha. Food was fresh and good, service is good too. Good for family hangout. Cheers
------------------------------------------------------------------------------------
2.0
Its a restaurant near to Banashankari BDA. Me along with few of my office friends

In [17]:
all_ratings = []

for name,ratings in tqdm(zip(data['name'],data['reviews_list'])):
    ratings = eval(ratings)
    for score, doc in ratings:
        if score:
            score = score.strip("Rated").strip()
            doc = doc.strip('RATED').strip()
            score = float(score)
            all_ratings.append([name,score, doc])

51717it [00:26, 1941.13it/s]


In [None]:
len(all_ratings)

1319968

In [None]:
all_ratings[1]

['Jalsa',
 4.0,
 'I was here for dinner with my family on a weekday. The restaurant was completely empty. Ambience is good with some good old hindi music. Seating arrangement are good too. We ordered masala papad, panner and baby corn starters, lemon and corrionder soup, butter roti, olive and chilli paratha. Food was fresh and good, service is good too. Good for family hangout.\nCheers']

In [18]:


rating_data=pd.DataFrame(all_ratings,columns=['Name','Rating','Review'])
rating_data['Review']=rating_data['Review'].apply(lambda x : re.sub('[^a-zA-Z0-9\s]',"",x))
print(rating_data.shape)
rating_data.head()
# rating_data.to_csv('zomato_ratings.csv',index=None)
# !cp /content/zomato_ratings.csv /mydrive/NITW/NITW-06-SeqL/

(1319968, 3)


Unnamed: 0,Name,Rating,Review
0,Jalsa,4.0,A beautiful place to dine inThe interiors take...
1,Jalsa,4.0,I was here for dinner with my family on a week...
2,Jalsa,2.0,Its a restaurant near to Banashankari BDA Me a...
3,Jalsa,4.0,We went here on a weekend and one of us had th...
4,Jalsa,5.0,The best thing about the place is its ambiance...


#### Worldcloud: Reviews

In [None]:
# We are just taking first 100 restaurant names. For other we should use the function.
import random
@interact
def produce_wordcloud(rest=Select(options=rating_data.Name.unique()[:100],value='Jalsa')):
    plt.figure(figsize=(10,15))
    corpus=rating_data[rating_data['Name']==rest]['Review'].values.tolist()
    corpus=' '.join(x  for x in corpus)
    color=random.choice([ 'Oranges','Reds','Purples','Greens','Greys','Blues'])
    # wordcloud = WordCloud(max_font_size=None, background_color='#fcf4ee', collocations=False,colormap =color,
    #               width=750, height=750).generate(corpus)
    wordcloud = WordCloud(max_font_size=None, background_color='#fcf4ee', collocations=False,colormap = color,
                  width=750, height=750).generate(corpus)
    plt.imshow(wordcloud)
    # plt.title(rest,loc='left',**{'color':color.get_over(),'backgroundcolor':'#fcf4ee'})
    plt.title(rest,loc='left')
    plt.axis("off")
    plt.show()

#### Ratings

In [19]:

rating=rating_data['Rating'].value_counts()
px.bar(x=rating.index,y=rating,color=rating,color_continuous_scale='BrBg',title='Ratings from Reviews')

## Sentiment Analysis

At first we will have to divide the comment as negative and positive based on their ratings. We are defining 2.5 as mid point if raitng is less than 2.5 then it should be a negative comment and vice versa.


In [20]:
print(rating_data.shape)
rating_data.head()

(1319968, 3)


Unnamed: 0,Name,Rating,Review
0,Jalsa,4.0,A beautiful place to dine inThe interiors take...
1,Jalsa,4.0,I was here for dinner with my family on a week...
2,Jalsa,2.0,Its a restaurant near to Banashankari BDA Me a...
3,Jalsa,4.0,We went here on a weekend and one of us had th...
4,Jalsa,5.0,The best thing about the place is its ambiance...


In [21]:
rating_data['Sent']=rating_data['Rating'].apply(lambda x: 1 if int(x)>2.5 else 0)
rating_data.head(10)

Unnamed: 0,Name,Rating,Review,Sent
0,Jalsa,4.0,A beautiful place to dine inThe interiors take...,1
1,Jalsa,4.0,I was here for dinner with my family on a week...,1
2,Jalsa,2.0,Its a restaurant near to Banashankari BDA Me a...,0
3,Jalsa,4.0,We went here on a weekend and one of us had th...,1
4,Jalsa,5.0,The best thing about the place is its ambiance...,1
5,Jalsa,5.0,Great food and pleasant ambience Expensive but...,1
6,Jalsa,4.0,Good ambience with tasty food\nCheese chilli p...,1
7,Jalsa,4.0,You cant go wrong with Jalsa Never been a fan ...,1
8,Jalsa,5.0,Overdelighted by the service and food provided...,1
9,Jalsa,4.0,The place is nice and comfortable Food wise al...,1


In [22]:
#rating_data.isna().sum()
print(rating_data.shape)
rating_data.head()

(1319968, 4)


Unnamed: 0,Name,Rating,Review,Sent
0,Jalsa,4.0,A beautiful place to dine inThe interiors take...,1
1,Jalsa,4.0,I was here for dinner with my family on a week...,1
2,Jalsa,2.0,Its a restaurant near to Banashankari BDA Me a...,0
3,Jalsa,4.0,We went here on a weekend and one of us had th...,1
4,Jalsa,5.0,The best thing about the place is its ambiance...,1


**tf.keras.preprocessing.text.Tokenizer:**
- This class allows to vectorize a text corpus, by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or into a vector where the coefficient for each token could be binary, based on word count, based on tf-idf...
- By default, all punctuation is removed, turning the texts into space-separated sequences of words (words maybe include the ' character). These sequences are then split into lists of tokens. They will then be indexed or vectorized.
- 0 is a reserved index that won't be assigned to any word

In [23]:
max_features=3000
tokenizer=Tokenizer(num_words=max_features,split=' ')
rating_data['Review'].fillna('', inplace=True)
tokenizer.fit_on_texts(rating_data['Review'].values)
# pickle.dump(tokenizer, 'zomato_tokenizer.pkl')
# !cp /content/zomato_tokenizer.pkl /mydrive/NITW/NITW-06-SeqL/

In [None]:
# !cp /mydrive/NITW/NITW-06-SeqL/zomato_tokenizer.pkl /content

In [24]:
rating_data.Review[0]

'A beautiful place to dine inThe interiors take you back to the Mughal era The lightings are just perfectWe went there on the occasion of Christmas and so they had only limited items available But the taste and service was not compromised at allThe only complaint is that the breads could have been betterWould surely like to come here again'

In [25]:
len(rating_data.Review[0])

340

In [26]:

X = tokenizer.texts_to_sequences(rating_data['Review'].values)
X = pad_sequences(X)
# tokenizer will assign integer to each word in the sentence using the vocab (with max_features setting)
# pad sequences will pad each sentence with 0, so that all sentences will be of the same length (here 194)

In [27]:
print(X.shape, type(X))
X[5000]

(1319968, 194) <class 'numpy.ndarray'>


array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [28]:
X.shape

(1319968, 194)

In [29]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    3,
        308,    9,    5,  705,  507,  253,   23,  1

In [None]:
# pickle.dump(tokenizer,open('zomato_tokenizer.pkl','wb'))
# !cp /content/zomato_tokenizer.pkl /mydrive/NITW/NITW-06-SeqL/

### Building LSTM Model

tf.keras.layers.Embedding
-  input_dim
-  output_dim
-  input_length=None


In [None]:
X.shape

In [30]:
embed_dim = 64  # size of the output embedded vector
lstm_out = 64

model = Sequential()
# embedding layer will generate a vector for each token in the input sequence (vector size specified by embed_dim)
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 194, 64)           192000    
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 225,154
Trainable params: 225,154
Non-trainable params: 0
_________________________________________________________________
None


### Training and Testing Data

In [None]:
rating_data['Sent'][:5]

In [31]:
Y = pd.get_dummies(rating_data['Sent'].astype(int)).values
print(Y.shape, type(Y))
Y[:5]

(1319968, 2) <class 'numpy.ndarray'>


array([[0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1]], dtype=uint8)

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 13)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(989976, 194) (989976, 2)
(329992, 194) (329992, 2)


In [36]:
batch_size = 3200
to_run=False

In [None]:
# Takes about 45 minutes to train...use the saved model

if to_run:
    model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size)
    model.save_weights('zomato_sentiment_rest.h5')
    !cp /content/zomato_sentiment_rest.h5 /mydrive/NITW/NITW-06-SeqL/

Epoch 1/10
310/310 [==============================] - 280s 885ms/step - loss: 0.2160 - accuracy: 0.9130
Epoch 2/10
310/310 [==============================] - 272s 878ms/step - loss: 0.1359 - accuracy: 0.9480
Epoch 3/10
310/310 [==============================] - 272s 879ms/step - loss: 0.1179 - accuracy: 0.9562
Epoch 4/10
310/310 [==============================] - 272s 878ms/step - loss: 0.1053 - accuracy: 0.9619
Epoch 5/10
310/310 [==============================] - 270s 870ms/step - loss: 0.0963 - accuracy: 0.9654
Epoch 6/10
310/310 [==============================] - 270s 872ms/step - loss: 0.0878 - accuracy: 0.9686
Epoch 7/10
310/310 [==============================] - 272s 877ms/step - loss: 0.0813 - accuracy: 0.9711
Epoch 8/10
310/310 [==============================] - 269s 869ms/step - loss: 0.0751 - accuracy: 0.9732
Epoch 9/10
310/310 [==============================] - 266s 858ms/step - loss: 0.0698 - accuracy: 0.9750
Epoch 10/10
310/310 [==============================] - 264s 852ms/step - loss: 0.0650 - accuracy: 0.9771

### Loading Weights : Use this to avoid training time.

We have already trained the model for you. Please use the weights file and load it into the model.

In [35]:
if not to_run:
    model.load_weights('zomato_sentiment_rest.h5')

### Validating our model

In [None]:
# Y_pred = model.predict(X_test)
# acc = calc_accuracy(Y_pred, Y_test)

In [38]:
batch_size = 3200
validation_size = 1500

X_validate = X_test[-validation_size:]  # last 1500 rows of X_test
Y_validate = Y_test[-validation_size:]  # last 1500 rows of Y_test
X_test = X_test[:-validation_size]      # All rows of X_test other than the last 1500 rows
Y_test = Y_test[:-validation_size]      # All rows of Y_test other than the last 1500 rows
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

103/103 - 8s - loss: 0.0634 - accuracy: 0.9783 - 8s/epoch - 75ms/step
score: 0.06
acc: 0.98


In [39]:
import IPython
def view_sentiment(stype,color,text):
    IPython.display.clear_output()
    fig=plt.figure()
    ax=fig.add_axes([0,0,1,1])
    ax.set_axis_off()
    fig.text(0.5,0.5, text,weight='heavy',color='white',size=16,
            bbox={'boxstyle':stype,'facecolor': color, 'alpha': 0.5, 'pad': 2})
    fig.show()

In [None]:
# text = "It is a really good place"
# x = tokenizer.texts_to_sequences([text])
# print(x)
# x = pad_sequences(x, maxlen=X_test.shape[1], padding='pre')
# print(x)

**tf.keras.utils.pad_sequences**
- sequences
- maxlen=None
- padding='pre'





In [40]:
!cp /mydrive/NITW/NITW-06-SeqL/zomato_tokenizer.pkl /content

In [42]:
@interact
def get_sentiment(text=Text("It is a really good place")):
    global tokenizer
    if text!='':
        if len(text.split(' '))>=2:
            max_features=3000
            # tokenizer=pickle.load(open('zomato_tokenizer.pkl','rb'))
            x = tokenizer.texts_to_sequences([text])
            x = pad_sequences(x)
            # x = pad_sequences(x, maxlen=X_test.shape[1], padding='pre')
            # if model.predict_classes(x)[0]!=1:
            prediction = model.predict(x)
            print(prediction)
            zeroProbability, oneProbability = prediction[0][0], prediction[0][1]
            print(zeroProbability, oneProbability)
            print("Review:", text)
            if zeroProbability > oneProbability:
                print("Sentiment prediction: Negative")
            else:
                print("Sentiment prediction: Positive")
            # if zeroProbability > oneProbability:
            #     view_sentiment('Sawtooth','red','Negative')
            # else:
            #     view_sentiment('Round','green','Positive')
        else:
            # view_sentiment('Circle','grey','Enter Text')
            print("Length of review text is less than 2 words")
    else:
        # view_sentiment('Circle','grey','Enter Text')
        print("Nothing entered for review")

interactive(children=(Text(value='It is a really good place', description='text'), Output()), _dom_classes=('w…

In [44]:
def get_sentiment(text):
    global tokenizer
    if text!='':
        if len(text.split(' '))>=2:
            max_features=3000
            # tokenizer=pickle.load(open('zomato_tokenizer.pkl','rb'))
            x = tokenizer.texts_to_sequences([text])
            x = pad_sequences(x)
            # x = pad_sequences(x, maxlen=X_test.shape[1], padding='pre')
            # if model.predict_classes(x)[0]!=1:
            prediction = model.predict(x)
            print(prediction)
            zeroProbability, oneProbability = prediction[0][0], prediction[0][1]
            print(zeroProbability, oneProbability)
            print("Review:", text)
            if zeroProbability > oneProbability:
                print("Sentiment prediction: Negative")
            else:
                print("Sentiment prediction: Positive")
            # if zeroProbability > oneProbability:
            #     view_sentiment('Sawtooth','red','Negative')
            # else:
            #     view_sentiment('Round','green','Positive')
        else:
            # view_sentiment('Circle','grey','Enter Text')
            print("Length of review text is less than 2 words")
    else:
        # view_sentiment('Circle','grey','Enter Text')
        print("Nothing entered for review")

text = input("Enter the review:")
get_sentiment(text)

Enter the review:The food was really good but the ambience was not good
[[0.10463862 0.8953614 ]]
0.10463862 0.8953614
Review: The food was really good but the ambience was not good
Sentiment prediction: Positive
