In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of contents:
0. **Goal**

1. **Importing libraries and loading the dataset**

2. **Exploring and cleaning the dataset**
    - Checking for duplicates
    - Removing duplicate entries
    - Removing unnecessary columns
    - Renaming some columns to simpler names
    - Fixing the "ratings" column values
    - Fixing the "cost_for_two" column datatype
    - Fixing the name of an important restaurant

3. **Extracting info from dataset**
    * Extracting latitudes and longitudes from 'location'
    * Extracting the cuisines and their occurence count
    * Extracting the feature Popularity Score

4. **Insights and Visualizations**
    * Does accepting online orders increase ratings?
    * How does localities affect restaurants
        * Localities with high number of restaurants
        * Average rating of restaurants by locality
        * Highest average cost by locality
        
    * Exploring the feature 'Cuisines' and its relation with localities
        * Number of restaurants per cuisine
        * Locations where north indian cuisine is popular
        * Locations where cafes are popular
        * Locations with highest number of restaurants serving desserts
        
    * Best restaurants by cuisine and cost
        * Best north indian cuisine restaurants
        * Best south indian cuisine restaurants
        * Best cafes in bangalore
        * Best chinese cuisine restaurants
        * Most expensive and high rated restaurants 
        * Cheap and best places to eat 

# 0. Goal
**My goal is to explore the dataset to find out how location, cuisine and cost affect the ratings and number of votes, and how all of these are interrelated.**

# 1. Importing libraries and loading the dataset

In [None]:
import json
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from plotly.offline import init_notebook_mode, iplot, plot
init_notebook_mode(connected=True)

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="Duke_SkyCrawler")

In [None]:
# Load the Zomato Bangalore restaurants CSV from the Kaggle input directory into a DataFrame.
df_raw = pd.read_csv('/kaggle/input/zomato-bangalore-restaurants/zomato.csv')

# 2. Exploring and cleaning the dataset.

In [None]:
df_raw.head()

In [None]:
df_raw.info() 

## 2.1 Checking for duplicates entries

In [None]:
# Number of distinct restaurant names (a missing name, if any, counts as one value).
df_raw['name'].nunique(dropna=False)

## EXPLAINATION: Even though the number of entries are more than 51k, unique entries (unique by name) are only 8792
##          Let's look into more detail

In [None]:
# Sort by name (then listing type) so repeated listings of one restaurant sit next to each other.
preview_columns = ['name', 'phone', 'rate', 'votes', 'rest_type',
                   'location', 'listed_in(type)', 'listed_in(city)']

df_sortby_name = df_raw.sort_values(by=['name', 'listed_in(type)'])
df_sortby_name[preview_columns].tail(20)

## EXPLAINATION: It can be clearly seen that some restaurants are repeated (look at their phone, rating, votes, location)
##          and that same restaurant is listed under different type and city (see "listed_in(type)" and "listed_in(city)")
##          So it becomes necessary to remove these duplicate entries 

## 2.2 Removing duplicate entries

In [None]:
# Keep one row per (address, name, location, rest_type) combination and renumber the rows from 0.
df_raw = (
    df_raw
    .drop_duplicates(subset=['address', 'name', 'location', 'rest_type'])
    .reset_index(drop=True)
)

## EXPLAINATION: removed duplicates, for that the columns used to identify duplicates are 'address', 'name', 'location', 'rest_type'
##          Also index was reset.

## 2.3 Removing unnecessary columns for this EDA

In [None]:
# 'url' and 'phone' are not used by the rest of this analysis.
df_raw = df_raw.drop(columns=['url', 'phone'])

## EXPLAINATION: Dropped the 'url' and 'phone' columns directly from 'df_raw'; the rest of the notebook keeps working on 'df_raw'.

## 2.4 Renaming some columns to simpler names

In [None]:
# Shorter column names used throughout the rest of the notebook.
simpler_names = {
    'rate': 'ratings',
    'approx_cost(for two people)': 'cost_for_two',
    'listed_in(type)': 'rest_category',
    'listed_in(city)': 'city',
}
df_raw = df_raw.rename(columns=simpler_names)

## 2.5 Fixing the "ratings" column values

In [None]:
# Map the non-numeric placeholders ('NEW', '-', missing) to '0'.
# The result is assigned back to the column: calling replace(inplace=True) on
# `df_raw.ratings` is a chained in-place update, which pandas warns about and which
# no longer modifies `df_raw` when Copy-on-Write is enabled.
df_raw['ratings'] = df_raw['ratings'].replace({'NEW': '0',
                                               np.nan: '0',
                                               '-': '0'})
## EXPLAINATION: Replaced values like 'NEW', 'NaN', '-' in the rate columns with zero

In [None]:
# Convert rating strings such as '4.1/5' (or the '0' placeholder) to floats by parsing the
# part before the '/' instead of eval()-ing text read from the CSV.
# NOTE(review): assumes every rating is expressed out of 5, which is what the previous
# `eval(x) * 5` relied on to produce a 0-5 value — confirm against the data.
# Parsing this way also makes the cell safe to re-run on an already numeric column.
df_raw['ratings'] = pd.to_numeric(
    df_raw['ratings'].astype(str).str.split('/').str[0].str.strip()
)

## EXPLAINATION: Changed the data type of the column rate from string to float value (from fraction to decimal) 

## 2.6 Fixing the data type of column 'cost_for_two'

In [None]:
def to_float(number):
    """Convert a cost value to ``float``.

    Parameters
    ----------
    number : str, int, float or None
        The raw value. Strings may use ',' as a thousands separator (e.g. ``'1,200'``).

    Returns
    -------
    float
        The parsed number, or NaN when ``number`` is of a type ``float()`` cannot
        accept (e.g. ``None``).

    Raises
    ------
    ValueError
        If the value is a string that is still not numeric after removing commas.
    """
    try:
        return float(number)
    except ValueError:
        # Strings like '1,200' fail the direct conversion; retry without the separators.
        return float(number.replace(',', ''))
    except TypeError:
        # Return an explicit missing value instead of silently falling through to None.
        return float('nan')
    
# Apply to_float element-wise to the 'cost_for_two' column and store the result back.
df_raw['cost_for_two'] = df_raw['cost_for_two'].apply(to_float)

## EXPLAINATION: Changed the data type of the column cost_for_two from string to float for easier analysis ahead.

## 2.7 Fixing the name of one important restaurant which otherwise creates issues

In [None]:
# df_raw.query('address == "Shangri-La Hotel, 56-6B, Palace Road, Bengaluru"') ## to find index number
# NOTE(review): 10131 is a hard-coded row label. It presumably is the row found with the query
# above and is only meaningful for the index produced by the earlier drop_duplicates/reset_index
# step — confirm it still points at the intended row if the input file or de-duplication changes.
df_raw.loc[10131, 'name'] = 'Shangri-La Hotel'

## EXPLAINATION: simply updated the corrupted value with 'Shangri-La Hotel' 

# 3. Extracting info from dataset

## 3.1 Extracting latitudes and longitudes from 'location'

In [None]:
dict_latlng = {}

def latlng(i):
    """Return ``[latitude, longitude]`` for the location of row ``i`` of ``df_raw``.

    Results are memoised in the module-level ``dict_latlng`` (keyed by the location
    string), so the geocoder is queried at most once per successfully resolved location.

    Parameters
    ----------
    i : int
        Index label used to look up ``df_raw.location``.

    Returns
    -------
    list
        ``[latitude, longitude]`` of ``"<location>, Bangalore"``.

    Raises
    ------
    AttributeError
        If the geocoder returns no match. The exception type is kept because the
        calling loop relies on it to collect unresolved rows; nothing is cached in
        that case.
    """
    place = df_raw.location[i]
    if place in dict_latlng:
        return dict_latlng[place]

    match = geolocator.geocode(f'{place}, Bangalore')
    if match is None:
        # Previously this surfaced as an accidental attribute access on None.
        raise AttributeError(f'no geocoding result for location {place!r}')

    dict_latlng[place] = [match.latitude, match.longitude]
    return dict_latlng[place]
    

## EXPLAINATION: created a function which returns the latitude and longitude for the location at a given index of the dataset.
##          The function also stores the coordinates of every location in a dictionary, so
##          that it doesn't query the geocoder again and again for the same location, which also makes it
##          much faster (only 94 calls instead of 12000)!

In [None]:
# Geocode every row's location; remember the rows whose lookup raised AttributeError.
err_indexes = []
for row_label in range(len(df_raw)):
    try:
        latlng(row_label)
    except AttributeError:
        err_indexes.append(row_label)
    
## EXPLAINATION: this code runs the latlng function for the whole dataset and creates a list of indexes of those
##          locations for which NOMINATIM can't find the latitudes and longitudes.
##          Let's check which locations are those.

In [None]:
df_raw.loc[err_indexes, :]

## EXPLAINATION: here it can be observed that the only location for which NOMINATIM can't find latitudes 
##          and longitudes is the Rammurthy Nagar, this can be easily found out from google and 
##          manually entered in the dictionary. 

In [None]:
# Manually supplied coordinates, stored as [latitude, longitude] like the entries latlng() writes.
Rammurthy_Nagar_latlng = [13.016494, 77.677325]
dict_latlng['Rammurthy Nagar'] = Rammurthy_Nagar_latlng

## EXPLAINATION: manually added latitude and longitude of Rammurthy Nagar

In [None]:
# Look up each row's location in dict_latlng; element 0 is the latitude, element 1 the longitude.
df_raw['latitude'] = df_raw['location'].map(lambda place: dict_latlng[place][0])
df_raw['longitude'] = df_raw['location'].map(lambda place: dict_latlng[place][1])

## EXPLAINATION: here I finally add new columns and fill them with latitudes and longitudes from the dictionary

## 3.2 Extracting the cuisines and their occurence count

In [None]:
cuisines_list = df_raw.cuisines.tolist()

# Entries without a string ``split`` method (presumably NaN for rows with no cuisines listed)
# cannot be split; record their positions instead.
cuisine_err_indexes = [
    position
    for position, entry in enumerate(cuisines_list)
    if not hasattr(entry, 'split')
]

# Flatten every comma-separated cuisines entry into one list of individual cuisine names.
all_cuisines = [
    cuisine_name
    for entry in cuisines_list
    if hasattr(entry, 'split')
    for cuisine_name in entry.split(', ')
]
    
## EXPLAINATION: Here a list 'all_cuisines' is created containing all the occurences of cuisine names in the dataset
##          I will further create a dataframe with columns cuisine name and number of occurences 

## Run this code to see what values created Attribute error =>

# for i in range(0, len(cuisine_err_indexes)):
#      print(type(cuisines_list[cuisine_err_indexes[i]]))

In [None]:
# Count every cuisine in a single pass with value_counts() instead of calling
# list.count() once per unique cuisine (quadratic in the number of entries).
# Rows now come out ordered by descending count rather than in set-iteration order.
cuisine_counts = pd.Series(all_cuisines, dtype='object').value_counts()
df_cuisines = cuisine_counts.rename_axis('Cuisine').reset_index(name='Occurence_count')

## EXPLAINATION: Here a new dataframe is created, 'df_cuisines' which holds cuisine name and number of times it occured in original dataset

## 3.3 Extracting "Popularity Score" (experiment)

In [None]:
# Popularity score: the rating weighted by the number of votes it is based on.
df_raw['Pop_Score'] = df_raw['ratings'].mul(df_raw['votes'])

# 4. Insights and Visualizations

## 4.1 Does accepting online orders increase rating?

In [None]:
# Summary statistics of ratings and votes, split by whether online orders are accepted.
grp_online_order = df_raw.groupby('online_order')
grp_online_order[['ratings', 'votes']].describe()

In [None]:
# Summarise the ratings once per group instead of re-running describe() over every
# column for each trace.
online_order_stats = grp_online_order['ratings'].describe()

# Derive the pie labels from the group keys so they cannot drift from the order of the values.
# NOTE(review): presumes the 'online_order' values are 'No'/'Yes'; any other key is shown as-is.
order_labels = {'No': "Don't Accept", 'Yes': 'Accept'}
pie_labels = [order_labels.get(key, key) for key in online_order_stats.index]

fig = make_subplots(rows=1,
                    cols=2,
                    specs=[[{'type': 'domain'}, {'type': 'xy'}]],
                    subplot_titles=['Restaurants that accept online orders', 'Affect on average rating'])

# Left: share of restaurants per group (count of rated rows).
fig.add_trace(go.Pie(labels=pie_labels,
                     values=online_order_stats['count'],
                     hoverinfo='label+value',
                     textinfo='label+percent',
                     marker_colors=['#fc9272', '#43a2ca']),
              row=1,
              col=1)

# Right: mean rating per group.
fig.add_trace(go.Bar(x=online_order_stats.index,
                     y=online_order_stats['mean'],
                     hoverinfo='x+y',
                     marker_color=['#fc9272', '#43a2ca']),
              row=1,
              col=2)

fig.update_xaxes(title_text="Accepts orders online?", title_font_size=15, row=1, col=2)
fig.update_yaxes(title_text="Average rating", title_font_size=15, row=1, col=2)

fig.update_layout(height=600,
                  width=1000,
                  title=dict(text='Order Online?',
                             font_size=32,
                             x=0.5),
                  showlegend=False)

It can be clearly seen that restaurants which offer online ordering get a higher rating on average than those which don't. One possible reason is a bias in the data, since the Zomato app is mainly used for ordering online.

## 4.2 How does Localities affect restaurants

### 4.2.1 Localities with High number of Restaurants

In [None]:
# The 20 locations with the most restaurants.
restaurants_per_location = df_raw.groupby('location').size()
temp = restaurants_per_location.sort_values(ascending=False).head(20)
temp.index

## EXPLAINATION: From now I will use these Locations only for analysis as these locations are the ones
##          which hosts most of the restaurants.

In [None]:
# Bar chart: number of restaurants for each of the selected locations.
fig = px.bar(
    x=temp.index,
    y=temp,
    color=temp.index,
    color_discrete_sequence=px.colors.qualitative.Pastel,
    labels={'x': 'Location', 'y': 'Number of Restaurants'},
    title="Popular Restaurant Localities",
)

# Each update_* call returns the figure, so the chain ends by displaying it.
(
    fig.update_xaxes(title_font_size=20)
    .update_yaxes(title_font_size=20)
    .update_layout(height=600, width=1000, title_x=0.5, title_font_size=32, showlegend=False)
)

## EXPLAINATION: Popular restaurant localities are found by selecting the localities with the highest number
##          of restaurants; the top 20 are chosen.

A simple google search tell us that Whitefield is known for its tech parks and upmarket apartment complexes, lively Whitefield is also a shopping and entertainment hub. Upscale malls like Phoenix Marketcity and VR Bengaluru house global brands, movie theaters, live music, and alfresco bars. These are the reasons that Whitefield hosts a large number of eateries. Also BTM which is a high growth neighbourhood and Electronic City which is the information technology hub in Bangalore (according to wikipedia) are not far behind in numbers.

### 4.2.2 Average rating of Restaurants by Locality

In [None]:
# Rating statistics per location: keep the 20 locations with the most rated restaurants,
# then the 15 of those with the highest mean rating.
ratings_by_location = df_raw.groupby('location')['ratings'].describe()
temp = (
    ratings_by_location
    .sort_values('count', ascending=False).head(20)
    .sort_values('mean', ascending=False).head(15)
)
temp

In [None]:
# Bar chart: mean rating per location, with the restaurant count shown on hover.
fig = px.bar(
    data_frame=temp,
    x=temp.index,
    y=temp['mean'],
    color=temp.index,
    color_discrete_sequence=px.colors.qualitative.Pastel,
    labels={'x': 'Location', 'y': 'Average Rating'},
    title="Average rating of Restaurants by Locality",
    hover_data={'Restaurant_count': temp['count'], 'mean': False},
)

# Each update_* call returns the figure, so the chain ends by displaying it.
(
    fig.update_xaxes(title_font_size=20)
    .update_yaxes(title='Mean Rating', title_font_size=20)
    .update_layout(height=600, width=1000, title_x=0.5, title_font_size=32, showlegend=False)
)

## EXPLAINATION: Average rating of restaurants is calculated by selecting top 20 locations with
##          highest number of restaurants and then sorting them by highest mean rating
##          of the restaurants and then again selecting top 15.

Koramangala comes first with average rating of 3.5. Also it hosts around 270 restaurants. Rest of the locations are not that far behind.

### 4.2.3 Highest Average cost by Locality

In [None]:
# Cost statistics per location: keep the 20 locations with the most priced restaurants,
# then the 15 of those with the highest mean cost for two.
cost_by_location = df_raw.groupby('location')['cost_for_two'].describe()
temp = (
    cost_by_location
    .sort_values('count', ascending=False).head(20)
    .sort_values('mean', ascending=False).head(15)
)
temp

In [None]:
# Bar chart: mean cost for two per location, with count and cost shown on hover.
fig = px.bar(
    data_frame=temp,
    x=temp.index,
    y=temp['mean'],
    color=temp.index,
    color_discrete_sequence=px.colors.qualitative.Pastel,
    labels={'x': 'Location', 'y': 'Average Cost for two'},
    title="Highest average 'cost for two' by Locality",
    hover_data={
        'Restaurant_count': temp['count'],
        'Cost_for_two': temp['mean'],
        'mean': False,
    },
)

# Each update_* call returns the figure, so the chain ends by displaying it.
(
    fig.update_xaxes(title='Location', title_font_size=20)
    .update_yaxes(title='Average Cost for two', title_font_size=20)
    .update_layout(height=600, width=1000, title_x=0.5, title_font_size=32, showlegend=False)
)

## EXPLAINATION: Average cost for 2 of restaurants is calculated by selecting top 20 locations with
##          highest number of restaurants and then sorting them by highest mean cost_for_two
##          of the restaurants and then again selecting top 15.

According to wikipedia Indiranagar is one of the most expensive areas in the city, which is clearly reflected in this graph.

## 4.3 Exploring Cuisines and its relation with Localities

### 4.3.1 Number of Restaurants per cuisine

In [None]:
df_cuisines.sort_values('Occurence_count', ascending = False).head(10)

In [None]:
# Treemap: one tile per cuisine, sized and coloured by how often it is listed.
fig = px.treemap(
    df_cuisines,
    path=['Cuisine'],
    values='Occurence_count',
    title='Cuisines listed in Restaurants',
    color='Occurence_count',
    color_continuous_scale=px.colors.sequential.Purpor,
    labels={'Occurence_count': 'No. of Restaurants'},
)

fig.update_layout(
    height=600,
    width=1000,
    title_x=0.5,
    title_font_size=32,
    margin={'r': 20, 'l': 20, 'b': 20, 't': 70},
)

Since there are so many cuisines, there's no limit to what we can find from the data so I will stick to the cuisines which interests me.

### 4.3.2 Locations where North Indian Cuisine is Popular

In [None]:
# Keep only the rows that have a cuisines entry.
df_filtered = df_raw.loc[df_raw['cuisines'].notna(), :]

## EXPLAINATION: Here I first filtered out all the rows which didn't have any cuisine listed;
##          there were 22 of them in total.

In [None]:
cuisine = 'North Indian'

# Restaurants whose cuisines text contains the cuisine name (plain substring match).
serves_cuisine = df_filtered['cuisines'].str.contains(cuisine, regex=False)
df_byCuisine = df_filtered.loc[serves_cuisine, :]

# Rating statistics per location; keep the 10 locations with the most such restaurants.
df_North_Indian = (
    df_byCuisine.groupby('location')['ratings']
    .describe()
    .sort_values('count', ascending=False)
    .head(10)
)
df_North_Indian

## EXPLAINATION: I have filtered all those rows which had the cuisine 
##          mentioned in the cuisine variable. Then I grouped them by location
##          and sorted them by number of restaurants(count), will do same for other cuisines too!!

In [None]:
# Bar chart: restaurant count per location, coloured by the mean rating.
fig = px.bar(
    data_frame=df_North_Indian,
    x=df_North_Indian.index,
    y=df_North_Indian['count'],
    color=df_North_Indian['mean'],
    color_continuous_scale=px.colors.sequential.Magenta,
    labels={'mean': 'Avg Rating'},
    title="Locations in which North Indian is Popular",
    hover_data={
        'Restaurant_count': df_North_Indian['count'],
        'Average_rating': df_North_Indian['mean'],
        'count': False,
        'mean': False,
    },
)

# Each update_* call returns the figure, so the chain ends by displaying it.
(
    fig.update_xaxes(title='Location', title_font_size=20)
    .update_yaxes(title='Number of Restaurants', title_font_size=20)
    .update_layout(height=600, width=1000, title_x=0.5, title_font_size=32)
)

With an average rating of 2.57, Whitefield hosts the largest number of restaurants serving North Indian cuisine. Although fewer in number, Indiranagar hosts 170 restaurants serving North Indian cuisine, with an average rating of 3.

### 4.3.3 Locations where Cafes are Popular

In [None]:
cuisine = 'Cafe'

# Restaurants whose cuisines text contains the cuisine name (plain substring match).
serves_cuisine = df_filtered['cuisines'].str.contains(cuisine, regex=False)
df_byCuisine = df_filtered.loc[serves_cuisine, :]

# Rating statistics per location; keep the 10 locations with the most such restaurants.
df_Cafe = (
    df_byCuisine.groupby('location')['ratings']
    .describe()
    .sort_values('count', ascending=False)
    .head(10)
)
df_Cafe

In [None]:
# Bar chart: cafe count per location, coloured by the mean rating.
fig = px.bar(
    data_frame=df_Cafe,
    x=df_Cafe.index,
    y=df_Cafe['count'],
    color=df_Cafe['mean'],
    color_continuous_scale=px.colors.sequential.Redor,
    labels={'mean': 'Avg Rating'},
    title="Locations in which Cafes are Popular",
    hover_data={
        'Restaurant_count': df_Cafe['count'],
        'Average_rating': df_Cafe['mean'],
        'count': False,
        'mean': False,
    },
)

# Each update_* call returns the figure, so the chain ends by displaying it.
(
    fig.update_xaxes(title='Location', title_font_size=20)
    .update_yaxes(title='Number of Restaurants', title_font_size=20)
    .update_layout(height=600, width=1000, title_x=0.5, title_font_size=32)
)

Indiranagar is the clear winner in this category with restaurants having an avg rating of 3.7. While Koramangala hosts less number but has the highest avg rating in the lot.

### 4.3.4 Locations with highest number of Restaurants serving desserts

In [None]:
cuisine = 'Desserts'

# Restaurants whose cuisines text contains the cuisine name (plain substring match).
serves_cuisine = df_filtered['cuisines'].str.contains(cuisine, regex=False)
df_byCuisine = df_filtered.loc[serves_cuisine, :]

# Rating statistics per location; keep the 10 locations with the most such restaurants.
df_Desserts = (
    df_byCuisine.groupby('location')['ratings']
    .describe()
    .sort_values('count', ascending=False)
    .head(10)
)
df_Desserts

In [None]:
# Bar chart: count of restaurants listing desserts per location, coloured by the mean rating.
fig = px.bar(
    data_frame=df_Desserts,
    x=df_Desserts.index,
    y=df_Desserts['count'],
    color=df_Desserts['mean'],
    color_continuous_scale=px.colors.sequential.Blugrn,
    labels={'mean': 'Avg Rating'},
    title="Locations in which Desserts are listed the most",
    hover_data={
        'Restaurant_count': df_Desserts['count'],
        'Average_rating': df_Desserts['mean'],
        'count': False,
        'mean': False,
    },
)

# Each update_* call returns the figure, so the chain ends by displaying it.
(
    fig.update_xaxes(title='Location', title_font_size=20)
    .update_yaxes(title='Number of Restaurants', title_font_size=20)
    .update_layout(height=600, width=1000, title_x=0.5, title_font_size=32)
)

Whitefield, a name which keeps coming up, also wins this category, although higher average ratings are found in HSR, Indiranagar, Jayanagar and Koramangala.

## 4.4 Best Restaurants by cuisines and cost

### 4.4.1 Best North Indian Cuisine Restaurants

In [None]:
# Rows that have a cuisines entry, then one subset per cuisine of interest
# (plain substring match on the cuisines text).
df_filtered = df_raw.loc[df_raw['cuisines'].notna(), :]
listed_cuisines = df_filtered['cuisines']

df_northindian = df_filtered.loc[listed_cuisines.str.contains('North Indian', regex=False), :]
df_southindian = df_filtered.loc[listed_cuisines.str.contains('South Indian', regex=False), :]
df_cafe = df_filtered.loc[listed_cuisines.str.contains('Cafe', regex=False), :]
df_chinese = df_filtered.loc[listed_cuisines.str.contains('Chinese', regex=False), :]

## EXPLAINATION: already explained above!!

In [None]:
# Top 10 North Indian restaurants by Pop_Score, indexed by restaurant name.
# The table is rebuilt from df_filtered on every run: the previous version overwrote
# df_northindian with a frame that no longer had 'Pop_Score', so re-running the cell
# raised a KeyError.
df_northindian = (
    df_filtered.loc[df_filtered.cuisines.str.contains('North Indian', regex=False), :]
    .sort_values('Pop_Score', ascending=False)
    .head(10)
    .loc[:, ['name', 'location', 'ratings', 'votes', 'cost_for_two']]
    .set_index('name')
)

fig = ff.create_table(df_northindian, index=True, index_title="Restaurant Name")
fig.update_layout(width=1200)

## EXPLAINATION: Sorted values by Pop_Score in the dataframe created in the last cell, and filtered for only 5 columns
##          then created plotly's figure_factory table out of the filtered and sorted dataframe just created. 
##          done same for other cuisines too! 

Byg Brewski Brewing Company is the largest brewpub in the whole of Asia which is spread over 65k square feet. Also it has 5 bar areas with its own themes. Because of this Byg Brewski comes first in this list with highest rating and highest number of votes too.

### 4.4.2 Best South Indian Cuisine Restaurants

In [None]:
# Top 10 South Indian restaurants by Pop_Score, indexed by restaurant name.
# The table is rebuilt from df_filtered on every run: the previous version overwrote
# df_southindian with a frame that no longer had 'Pop_Score', so re-running the cell
# raised a KeyError.
df_southindian = (
    df_filtered.loc[df_filtered.cuisines.str.contains('South Indian', regex=False), :]
    .sort_values('Pop_Score', ascending=False)
    .head(10)
    .loc[:, ['name', 'location', 'ratings', 'votes', 'cost_for_two']]
    .set_index('name')
)

fig = ff.create_table(df_southindian, index=True, index_title="Restaurant Name")
fig.update_layout(width=1200)

Byg Brewski Again!! Another restaurant that came up 5 times will have to be very good, at avg rating of 4.3 and large number of votes Empire Restaurant chain comes second.

### 4.4.3 Best Cafes in Bangalore

In [None]:
# Top 10 cafes by Pop_Score, indexed by restaurant name.
# The table is rebuilt from df_filtered on every run: the previous version overwrote
# df_cafe with a frame that no longer had 'Pop_Score', so re-running the cell
# raised a KeyError.
df_cafe = (
    df_filtered.loc[df_filtered.cuisines.str.contains('Cafe', regex=False), :]
    .sort_values('Pop_Score', ascending=False)
    .head(10)
    .loc[:, ['name', 'location', 'ratings', 'votes', 'cost_for_two']]
    .set_index('name')
)

fig = ff.create_table(df_cafe, index=True, index_title="Restaurant Name")
fig.update_layout(width=1200)

With 4.5 rating on google and close to 37k votes Truffles cafe comes first in this list of best cafes according to zomato

### 4.4.4 Best Chinese Cuisine Restaurants 

In [None]:
# Top 10 Chinese restaurants by Pop_Score, indexed by restaurant name.
# The table is rebuilt from df_filtered on every run: the previous version overwrote
# df_chinese with a frame that no longer had 'Pop_Score', so re-running the cell
# raised a KeyError.
df_chinese = (
    df_filtered.loc[df_filtered.cuisines.str.contains('Chinese', regex=False), :]
    .sort_values('Pop_Score', ascending=False)
    .head(10)
    .loc[:, ['name', 'location', 'ratings', 'votes', 'cost_for_two']]
    .set_index('name')
)

fig = ff.create_table(df_chinese, index=True, index_title="Restaurant Name")
fig.update_layout(width=1200)

Empire Restaurant Chain again!! 

### 4.4.5 Most Expensive and Best Restaurants in Bangalore

In [None]:
## Expensive and best regardless of any factor.
# Sort by cost (then Pop_Score) descending, keep rows with Pop_Score >= 1800, take the first 10.
df_expensive_n_best = (
    df_raw
    .sort_values(['cost_for_two', 'Pop_Score'], ascending=False)
    .query('Pop_Score >= 1800')
    .head(10)
    .loc[:, ['name', 'rest_type', 'location', 'ratings', 'votes', 'cost_for_two']]
    .set_index('name')
)

fig = ff.create_table(df_expensive_n_best, index=True, index_title="Restaurant Name")
fig.update_layout(width=1400)

It must be noted that all of the restaurants/hotels on this list comes under the Fine Dining type, naturally these are the most expensive as well as highest rated eateries in the whole of Bangalore. With first place taken by JW Marriott.

### 4.4.6 Cheap and Best Places to eat

In [None]:
# Sort by cost (then Pop_Score) ascending, keep rows with Pop_Score >= 1200, take the first 10.
df_cheap_n_best = (
    df_raw
    .sort_values(['cost_for_two', 'Pop_Score'], ascending=True)
    .query('Pop_Score >= 1200')
    .head(10)
    .loc[:, ['name', 'rest_type', 'location', 'ratings', 'votes', 'cost_for_two']]
    .set_index('name')
)

fig = ff.create_table(df_cheap_n_best, index=True, index_title="Restaurant Name")
fig.update_layout(width=1400)

All of the eateries on this list, with one exception, cost only 100 for two people. Looking a little closer, most of these places serve coffee, with a few exceptions; their menus would need a deeper look. In any case, their lower price doesn't mean lower ratings — quite the opposite — which clearly makes them the best value for money.