In [None]:
import ast
import math
import os
import re
import sqlite3
from statistics import mode

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as py
import seaborn as sns
from folium.plugins import HeatMap
from geopy.geocoders import Nominatim
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sqlalchemy import create_engine

#sns.set()
plt.style.use('ggplot')

## If a person wants to open a new restaurant: 

* Does the demography of an area matter?
* Does the theme of the restaurant matter?
* Is a food chain category restaurant likely to have more customers than its counter part?
* Are any neighborhoods similar based on the type of food?
* What kind of food is more popular in each locality?

## Read the dataset

In [None]:
# Directory and file name of the Kaggle input are kept as separate variables
file_path = '../input/zomato-bangalore-restaurants'
file_name = '/zomato.csv'
# Read the CSV and discard fully duplicated rows in one chained expression
zomato = pd.read_csv(file_path + file_name, sep = ',').drop_duplicates()
zomato.head(2)

## Quick exploration

In [None]:
print('Number of rows: {} \nNumber of columns: {}'.format(zomato.shape[0],zomato.shape[1]))

In [None]:
num_cols = zomato.select_dtypes(include = 'number').columns.size
cat_cols = zomato.shape[1] - num_cols
print('Number of numerical columns: {} \nNumber of categorical columns: {}'.format(num_cols, cat_cols))

In [None]:
zomato.info()

In [None]:
zomato.describe(include = ['O']).T

## Insights so far

 1. There are roughly 9000 different restaurant names, so it seems most of them are just same restaurants with different locations.
 1. Some restaurants have no address because they work only with delivery.
 1. Most restaurants allow online ordering.
 1. Most restaurants don't allow table booking.
 1. Some restaurants have no rating since they are new.

## There's definitely presence of outliers in the votes column due to the oldest restaurants that have the highest numbers of votes. Most restaurants have less than 200 votes.

# Null values

In [None]:
zomato.isnull().sum()

## Handling null values in location

In [None]:
zomato.location.nunique()

In [None]:
zomato.location.value_counts(ascending = False)

### Different methodologies could be used to handle these null values: either replacing them with the most frequent location of all, or grouping by other features and finding the most common location within each group.

In [None]:
zomato['listed_in(city)'].nunique()

### We'll group by the neighborhood in which each restaurant was listed ('listed_in(city)') and take the mode of the location, since it makes sense that restaurants should be close to where they were listed.

In [None]:
# Most frequent location within each 'listed_in(city)' group.
# pandas' mode is used instead of statistics.mode because it ignores NaN (the very values
# being imputed) and does not raise when two locations are tied; on a tie the first of the
# sorted modes is taken. A group with no known location at all yields NaN.
city_listed_mode = zomato.groupby('listed_in(city)').location.agg(
    lambda s: s.mode().iat[0] if s.notna().any() else np.nan)
city_listed_mode

In [None]:
# Look up, for every row, the modal location of the area it is listed in,
# and use it only where the location is missing
location_by_listing = zomato['listed_in(city)'].map(city_listed_mode)
zomato['location'] = zomato.location.fillna(location_by_listing)
zomato.location.isnull().sum()

# Null values in cuisines

In [None]:
zomato.cuisines.sample(10)

### Let's see the total number of different cuisines

In [None]:
#These are all the different cuisines and combinations in the dataset, naturally there'll be a lot
zomato.cuisines.value_counts(ascending = True)

In [None]:
#This is the number of unique cuisines in the dataset
len({x.strip(' ') for lis in zomato.cuisines.dropna() for x in lis.split(',')})

### There are 107 different types of cuisines. Now let's find out which one is the most common

In [None]:
# Flatten the comma-separated cuisine strings into one long Series of individual cuisines
cuisines = (
    zomato.cuisines.dropna()
    .str.split(',')
    .explode()
    .str.strip(' ')
    .reset_index(drop = True)
    .rename('Bangalore Cuisines')
)
cuisines.value_counts(ascending = False).head()

## Let's make it friendly to the eye

In [None]:
sns.set()
plt.figure(figsize = (6,6))
cuisines.value_counts(ascending = False).head(10).plot(kind = 'pie', autopct = '%.2f', pctdistance = 0.8)
plt.ylabel('Top 10 Bangalore Cuisines', labelpad = 50)
plt.show()

## We could use the same methodology to deal with the missing values. However, the missing values only cover a tiny percentage of the total data, and the grouping would be larger since there are more than 100 different cuisines. It's safe to just fill the missing values with the  most common cuisine which is North Indian.

In [None]:
zomato.loc[zomato.cuisines.isnull(), 'cuisines'] = 'North Indian'
zomato.cuisines.isnull().sum()

# Now for restaurant type 'rest_type'

In [None]:
zomato.rest_type.isnull().sum()

In [None]:
zomato.rest_type.sample(5)

In [None]:
zomato.rest_type.value_counts(ascending = False)

In [None]:
zomato[['rest_type', 'listed_in(type)']].sample(10)

## There could be a correlation between the restaurant type and the type in which it was listed, so we'll group by the latter and get the most common restaurant types to replace the missing values

## We could also just replace the missing values with the most common type of all, but given the amount of missing values in this feature, we'll use the first method

In [None]:
types = [x.strip(' ') for lis in zomato.rest_type.dropna() for x in lis.split(',')]
types = pd.Series(data = types, name = 'Bangalore Restaurant Types')
types.value_counts(ascending = False).head(10)

## The vast majority of restaurants are Quick Bites, followed by Casual Dining

In [None]:
plt.figure(figsize = (8,6))
sns.countplot(y = types)
plt.xlabel('Count')
plt.show()

In [None]:
# Most frequent restaurant type within each 'listed_in(type)' group.
# pandas' mode ignores the NaN entries that are about to be imputed and does not raise on
# ties (the first of the sorted modes is taken); a group with no known type yields NaN.
res_types_mode = zomato.groupby('listed_in(type)')['rest_type'].agg(
    lambda s: s.mode().iat[0] if s.notna().any() else np.nan)
res_types_mode

In [None]:
# Map every row to the modal restaurant type of its listing type,
# then fill only the rows whose rest_type is missing
rest_type_by_listing = zomato['listed_in(type)'].map(res_types_mode)
zomato['rest_type'] = zomato.rest_type.fillna(rest_type_by_listing)
zomato.rest_type.isnull().sum()

In [None]:
zomato.isnull().sum()

# For approx_cost(for two people)

## Let's see if there's any correlation between rate (since it says whether a restaurant is new or not) and the approximate cost of a meal for two people

In [None]:
zomato[zomato.rate.str.lower() == 'new'][['rate', 'approx_cost(for two people)']].sample(10)

## It seems that even for the new restaurants they could estimate the cost for a two people meal so we'll use the restaurant type and the location to estimate this feature.

In [None]:
zomato[['rest_type','location']].sample(5)

In [None]:
res_typ_and_loc = zomato[['rest_type','location',
                          'approx_cost(for two people)']].dropna(subset = ['approx_cost(for two people)']).copy()

In [None]:
X = res_typ_and_loc.drop(columns = 'approx_cost(for two people)')
y = res_typ_and_loc['approx_cost(for two people)']

In [None]:
y.hist()

In [None]:
y[y.str.contains(',')]

## We discover that the approximate cost of a meal for two people is str, and contains numbers with commas instead of dots, so we proceed to eliminate the commas and convert to int type

In [None]:
y = y.str.replace(',','')
y = y.astype('int')

## We observe that this feature's distribution is positively skewed, so we'll transform it with the logarithm base 2 in order for it to be close to a normal distribution.

In [None]:
y.hist()

In [None]:
ylog2 = np.log2(y)
ylog2.hist()

# We're creating a quick model to try to predict the missing approximate two people meal prices, using the restaurant locations and the restaurant types. We just transformed the target so it has a normal distribution, therefore it works better with some regression models.

## Encode and standardize X

In [None]:
le_rest_type = LabelEncoder()
le_location = LabelEncoder()

In [None]:
X.rest_type = le_rest_type.fit_transform(X['rest_type'])
X.location = le_location.fit_transform(X['location'])

In [None]:
X.head()

In [None]:
scale = StandardScaler()

In [None]:
X_scaled = scale.fit_transform(X)
X_scaled[:3]

In [None]:
ylog2[:3]

In [None]:
rf_reg = RandomForestRegressor()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, ylog2, test_size = 0.25, random_state = 7)

In [None]:
rf_reg.fit(X_train, y_train)

In [None]:
y_pred = rf_reg.predict(X_test)

In [None]:
r2_score(y_test, y_pred)

In [None]:
cross_val_score(estimator = rf_reg, X = X_scaled, y = ylog2, scoring = 'r2', cv = 5).mean()

## We obtain an R² of about 0.68 with a random forest regressor (R² is the share of variance explained, not an accuracy). We could tune it a bit to see if it performs better.

In [None]:
params = {'n_estimators': [100, 200, 300, 400, 500], 'min_samples_split':[2, 3, 4], 'min_samples_leaf':[1, 2, 3]}

In [None]:
clf = RandomizedSearchCV(rf_reg, params, random_state=0)

In [None]:
search = clf.fit(X_scaled, ylog2)

In [None]:
search.best_score_

In [None]:
search.best_params_

## We'll try a couple of more models and stick to the best performing one. Then we'll tune it a bit and use it for imputing this feature.

## Stochastic Gradient Descent Regressor

In [None]:
stg_reg = SGDRegressor()

In [None]:
stg_reg.fit(X_train, y_train)

In [None]:
stg_reg.score(X_test, y_test)

## Decision Tree Regressor

In [None]:
dt_reg = DecisionTreeRegressor()

In [None]:
dt_reg.fit(X_train, y_train)

In [None]:
dt_reg.score(X_test, y_test)

In [None]:
cross_val_score(dt_reg, X_scaled, ylog2, scoring = 'r2', cv = 5).mean()

## Gradient Boosting Regressor

In [None]:
gb_reg = GradientBoostingRegressor()

In [None]:
gb_reg.fit(X_train, y_train)

In [None]:
gb_reg.score(X_test, y_test)

In [None]:
cross_val_score(gb_reg, X_scaled, ylog2, scoring = 'r2', cv = 5).mean()

## We'll use the tuned Random Forest Regressor and train it with all the data

In [None]:
# Hyper-parameters are hard-coded (presumably the best ones found by the randomized search above).
# The seed is fixed so the imputed costs written back into the dataset are the same on every
# Restart & Run All; 7 is the seed already used for train_test_split.
best_rf_reg = RandomForestRegressor(n_estimators = 400, min_samples_split = 4, min_samples_leaf = 2,
                                    random_state = 7)

In [None]:
X_scaled[:4]

In [None]:
ylog2[:3]

## We adjust the model

In [None]:
best_rf_reg.fit(X_scaled, ylog2)

## Now we select the rest_type and location of all the rows whose approximate cost for two people is missing. Then we encode and standardize them, predict the target and reverse the log2 transformation.

In [None]:
X_test_real = zomato[zomato['approx_cost(for two people)'].isnull()][['rest_type','location']]

In [None]:
X_test_real.head()

## We use the same instantiated LabelEncoder for each feature so it transforms the data with the same original parameters. That is to say, so it assigns the same tags it did with the first data.

In [None]:
X_test_real['rest_type'] = le_rest_type.transform(X_test_real['rest_type'])
X_test_real['location'] = le_location.transform(X_test_real['location'])

In [None]:
X_test_real.head()

## Now we standardize

In [None]:
# Reuse the scaler fitted on the training features. fit_transform would re-estimate the
# mean/std from the rows being imputed, so the model would receive inputs on a different
# scale from the one it was trained with.
X_test_real = scale.transform(X_test_real)

In [None]:
X_test_real[:3]

## Now we predict the approximate cost of a two people meal using our best model and the preprocessed data.

In [None]:
approx_cost_two_people_new = best_rf_reg.predict(X_test_real)

In [None]:
approx_cost_two_people_new[:5]

## Now we revert the log2 transformation to get the real approximate costs.

In [None]:
approx_cost_two_people_new_reversed = 2**approx_cost_two_people_new

In [None]:
approx_cost_two_people_new_reversed[:5]

## We still have to remove the commas from the original feature and then transform it into float type.

In [None]:
zomato['approx_cost(for two people)'] = zomato['approx_cost(for two people)'].str.replace(',', '')
zomato['approx_cost(for two people)'] = zomato['approx_cost(for two people)'].astype('float')

## We replace the missing values of the original dataset with these predictions.

In [None]:
zomato.loc[zomato['approx_cost(for two people)'].isnull(), 
           'approx_cost(for two people)'] = approx_cost_two_people_new_reversed

In [None]:
zomato['approx_cost(for two people)'].isnull().sum()

In [None]:
zomato['approx_cost(for two people)'].dtypes

In [None]:
zomato['approx_cost(for two people)'].hist()

In [None]:
zomato['approx_cost(for two people)'].describe(percentiles = [0.25, 0.5, 0.75, 0.9])

In [None]:
zomato.isnull().sum()

## For phone

In [None]:
zomato.phone.sample(5)

## For now we won't bother handling the missing values for this feature since we'll probably end up dropping it anyways during a possible feature selection.

## For rate

In [None]:
zomato.rate.sample(10)

## Let's remove the "/5"

In [None]:
# Keep only the part before the slash of ratings such as "4.1/5"; missing ratings stay missing
zomato['rate'] = zomato.rate.str.split('/').str[0].str.strip(' ')

In [None]:
zomato.rate.sample(10)

In [None]:
zomato[zomato.votes == 0][['rate', 'votes']].sample(10)

## It is observed that probably all the missing values are the ones that have 0 votes, which should mean that it's a new restaurant.

In [None]:
zomato[(zomato.rate.isnull()) | (zomato.rate.str.lower() == 'new')].shape

In [None]:
zomato[zomato.votes == 0].shape

In [None]:
zomato[(zomato.rate.notnull()) & (zomato.votes == 0) & (zomato.rate.str.lower() != 'new')][['rate', 'votes']]

## We discover that there are also normal rates even when the number of votes is 0, which makes absolutely no sense. There are also "-" which might indicate missing values. We'll replace all these with NANs.

In [None]:
zomato.loc[zomato.votes == 0, 'rate'] = np.nan

In [None]:
# Non-numeric placeholders such as "NEW" or "-" left on rows that do have votes become NaN
# instead of making the conversion raise; every remaining value is stored as float.
zomato.rate = pd.to_numeric(zomato.rate, errors = 'coerce').astype('float')

In [None]:
zomato.dropna(subset = ['rate']).rate.hist()

## We'll create a new column that determines whether a restaurant is new or not.

In [None]:
# 1 when the restaurant has no rating, 0 otherwise
zomato['is_new'] = zomato.rate.isna().astype(int)

In [None]:
zomato[['rate', 'is_new']].sample(10)

In [None]:
zomato[zomato.rate.isnull()][['rate', 'is_new']].sample(5)

In [None]:
zomato.isnull().sum()

## We'll do more analysis of the rate later. For now the focus is on dealing with the missing values, so we'll move on to dish_liked

In [None]:
zomato.dish_liked.sample(10, random_state = 7).dropna()

In [None]:
zomato.dish_liked.dropna().dtypes

In [None]:
type(zomato.dish_liked[0])

## More than half of this feature are missing values, so it would be dropped if we were doing feature engineering. However, it's interesting to know things like: how many different liked dishes there are, which ones are the most and less frequent in general and by zone/restaurant type, etc. 

## There are empty lists in the feature menu_item, let's take a look

In [None]:
zomato.menu_item.head()

In [None]:
zomato.menu_item.describe()

In [None]:
zomato.menu_item.sample(10)

In [None]:
zomato.menu_item.nunique()

In [None]:
zomato.menu_item[0]

In [None]:
type(zomato.menu_item[0])

In [None]:
zomato[zomato.menu_item == '[]'].shape

## There are nearly 40 thousand records with empty square brackets, hence this feature cannot provide a general insight about menu items. We'll replace them with NANs and turn the remaining records into lists.

In [None]:
zomato[zomato.menu_item != '[]'].menu_item.iloc[:4]

In [None]:
zomato.menu_item = zomato.menu_item.replace('[]', value = np.nan)

In [None]:
zomato.menu_item.isnull().sum()

In [None]:
zomato[zomato.menu_item.notnull()].menu_item.iloc[:4]

In [None]:
def _parse_menu(raw_menu):
    """Convert one stored menu into a list of dish names.

    The stored value looks like the text of a Python list (TODO confirm for every row), so it
    is parsed with ast.literal_eval. Splitting on ',' kept the quotes inside every item and cut
    dish names that contain a comma. Values that are already lists are returned unchanged so the
    cell can be re-run; text that cannot be parsed falls back to the previous plain split.
    """
    if isinstance(raw_menu, list):
        return raw_menu
    try:
        parsed = ast.literal_eval(raw_menu)
    except (ValueError, SyntaxError):
        return raw_menu.strip('[]').split(',')
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return raw_menu.strip('[]').split(',')

has_menu = zomato.menu_item.notnull()
zomato.loc[has_menu, 'menu_item'] = zomato.loc[has_menu, 'menu_item'].apply(_parse_menu)

In [None]:
zomato[zomato.menu_item.notnull()].menu_item.iloc[0]

## Another feature that contains empty square brackets is reviews_list. We'll explore it a little bit and clean it.

In [None]:
zomato.reviews_list.sample(5, random_state = 7)

In [None]:
zomato.reviews_list[0]

In [None]:
zomato[zomato.reviews_list == '[]'].shape

In [None]:
zomato.reviews_list.isnull().sum()

In [None]:
zomato.reviews_list = zomato.reviews_list.replace(to_replace = '[]', value = np.nan)

In [None]:
zomato.reviews_list.isnull().sum()

# There are cases with no reviews and still the restaurant has been voted and rated. Maybe it's because only the votes with comments count for this feature.

In [None]:
zomato[(zomato.reviews_list.isnull())&(zomato.votes != 0)].rate.sample(10)

# Let's answer some questions

## Which ones are the zones with the most restaurants in Bangalore?

In [None]:
#Load the previously created CSV holding the coordinates of every neighborhood (obtained with geopy)
locations = pd.read_csv('../input/bangalore-neighborhood-locations/Coordinates.csv')
#The Coordenates column comes back from the file as text, so each value is parsed into a tuple
locations.Coordenates = locations.Coordenates.map(ast.literal_eval)
locations.head()

In [None]:
#Count the amount of restaurants in each neighborhood
restaurants = zomato.location.value_counts().reset_index(drop = False)
restaurants.columns = ['Name', 'Count']
restaurants

In [None]:
#Join (left outer join) the two previous datasets using the Name column as the common one
restaurants = restaurants.merge(locations, how = 'left', on = 'Name')
restaurants

In [None]:
#The * operator unpacks the tuples of the Coordenates series so that zip regroups them
#into one tuple with all the first elements (lat) and one with all the second elements (lon)
lat, lon = zip(*restaurants.Coordenates)

## Number of restaurants in Bangalore

In [None]:
restaurants['lat'] = lat
restaurants['lon'] = lon
basemap = folium.Map(location = [12.97, 77.59], zoom_start = 11, control_scale = True)
HeatMap(restaurants[['lat','lon','Count']].values.tolist(), zoom = 20, radius = 15, 
        min_opacity = 1, name = 'Restaurant density in Bangalore').add_to(basemap)
basemap

## It is clear that most of restaurants gather in the center, with another smaller cluster in the southeast.

# Does the theme of the restaurant matter when opening a new one? We could relate the "theme" of a restaurant with its type or with its cuisine.

## We'll measure the amount of success based on the number of votes and rating. Though a high number of votes is also associated with the antiquity of the restaurant it can also measure its popularity.

In [None]:
# All the unique cuisines in the dataset
{x.strip(' ') for lis in zomato.cuisines.dropna() for x in lis.split(',')}

## We can already suspect the most liked and popular cuisines based on the rate and number of votes might be: continental, north indian, south indian, mediterranean, italian and american.

In [None]:
zomato.groupby('cuisines')[['rate', 'votes']].mean().sort_values(by = ['rate', 'votes'], ascending = False).head(15)

## We could even create a new feature as the combination of the two previous ones, which might attempt to measure the success. We'll multiply both features.

In [None]:
zomato['success'] = zomato.rate * zomato.votes

In [None]:
zomato.groupby('cuisines').success.mean().sort_values(ascending = False).head(15)

## Once again, we see that possibly the preferred cuisines are: continental, mediterranean, north and south indian, italian, american...

## Let's look at the lower rates and see if there are cuisines in common.

In [None]:
lower_rates = zomato[zomato.rate < 3].cuisines

In [None]:
low_cuisines = [x.strip(' ') for items in lower_rates for x in items.split(',')]

In [None]:
low_cuisines[:20]

In [None]:
pd.Series(low_cuisines).value_counts(ascending = False)

## This information might not be reliable since some cuisines can appear very frequently compared to others, so they would appear both in the highest ranked and the lowest. Let's take a look at the number of records for each individual cuisine style.

In [None]:
#Each individual cuisine in a list
ind_cuisines = list({x.strip(' ') for lis in zomato.cuisines.dropna() for x in lis.split(',')})
#One row per (record, cuisine) pair so that every cuisine is matched by its exact name.
#The previous substring search (str.contains) counted e.g. 'North Indian' records as 'Indian'
#and interpreted cuisine names as regular expressions.
cuisine_rows = (zomato.cuisines.dropna().str.split(',').explode().str.strip(' ')
                .rename_axis('row').reset_index(name = 'cuisine').drop_duplicates())
records_per_cuisine = cuisine_rows['cuisine'].value_counts()
#The amount of records in which each of those individual cuisines appears
cui_count = [int(records_per_cuisine.get(x, 0)) for x in ind_cuisines]
cuisine_count = pd.DataFrame(data = {'Cuisine':ind_cuisines, 'Count':cui_count})

In [None]:
cuisine_count.sort_values(by = 'Count', ascending = False).head(10)

In [None]:
plt.figure(figsize = (7, 20))
sns.barplot(x = 'Count', y = 'Cuisine', data = cuisine_count, palette = 'hls')
#plt.xticks(rotation = 90)
plt.title('Number of appearances of each cuisine', fontsize = 14)
plt.show()

## Let's find the average "success" rate per individual cuisine

In [None]:
#Keep only the first record of every restaurant name: repeated names (e.g. chains) are
#reduced to a single row rather than removed, and the index is renumbered from 0
zomato2 = zomato.drop_duplicates(subset = ['name'], ignore_index = True, inplace = False)

In [None]:
un_cuisines = {x.strip(' ') for lis in zomato2.cuisines for x in lis.split(',')}
un_cuisines

In [None]:
#Average success per individual cuisine, matching every cuisine by its exact name.
#str.contains did substring/regex matching, so e.g. 'North Indian' restaurants were also
#averaged into 'Indian'.
success_rows = (zomato2.cuisines.str.split(',').explode().str.strip(' ')
                .rename_axis('row').reset_index(name = 'cuisine').drop_duplicates())
#Attach each restaurant's success to every cuisine it offers
success_rows['success'] = zomato2.success.reindex(success_rows['row']).values
success_by_cuisine = success_rows.groupby('cuisine').success.mean()
#cui keeps the iteration order of un_cuisines so later cells can pair it with other per-cuisine lists
cui = list(un_cuisines)
avg_success = [success_by_cuisine.get(i, np.nan) for i in cui]

In [None]:
cuisine_success = pd.DataFrame(data = {'Cuisines':cui,'Avg_success':avg_success})

In [None]:
cuisine_success.dropna(inplace = True)
cuisine_success.reset_index(drop = True, inplace = True)

In [None]:
cuisine_success.sort_values(by = 'Avg_success', ascending = False).head(10)

In [None]:
plt.style.use('ggplot')
sns.barplot(x = 'Cuisines', y = 'Avg_success', data = cuisine_success[cuisine_success.Avg_success > 4000], palette = 'deep')
plt.xticks(rotation = 90)
plt.title('Top cuisines by avg success rate', fontsize = 14)

In [None]:
#Number of restaurants in zomato2 offering each individual cuisine, in the iteration order of
#un_cuisines. Cuisines are matched by exact name: str.contains did substring/regex matching and
#counted e.g. 'North Indian' restaurants as 'Indian' too.
count_rows = (zomato2.cuisines.str.split(',').explode().str.strip(' ')
              .rename_axis('row').reset_index(name = 'cuisine').drop_duplicates())
restaurants_per_cuisine = count_rows['cuisine'].value_counts()
count = [int(restaurants_per_cuisine.get(i, 0)) for i in un_cuisines]

In [None]:
#Top most popular individual cuisines
cui_count = pd.DataFrame(data = {'Cuisine':cui, 'Count':count})
cui_count.sort_values(by = 'Count', ascending = False)

In [None]:
sns.set()
labels = cui_count.sort_values(by = 'Count', ascending = False).Cuisine.head(10)
values = cui_count.sort_values(by = 'Count', ascending = False).Count.head(10)
plt.pie(x = values, labels = labels, radius = 1.5)
plt.title('Top cuisines per number of appearances', fontsize = 14, y = 1.15)
plt.show()

## We'll use SQL to join these last two tables together on the cuisine name since we forgot to name both columns the same.

In [None]:
engine = create_engine('sqlite://', echo = False)

In [None]:
cuisine_success.to_sql(name = 'cuisine_success_sql', con = engine, if_exists = 'replace', index = False)
cui_count.to_sql(name = 'cui_count_sql', con = engine, if_exists = 'replace', index = False)

In [None]:
#Engine.execute() was removed in SQLAlchemy 2.0, so pandas runs the query through the engine
#and returns the joined rows as a DataFrame with columns Cuisine, Avg_success and Count
result = pd.read_sql_query("""
    SELECT cui_count_sql.Cuisine, Avg_success, Count
    FROM cuisine_success_sql
    JOIN cui_count_sql ON cuisine_success_sql.Cuisines = cui_count_sql.Cuisine
""", con = engine)

In [None]:
final = pd.DataFrame(result, columns = ['Cuisine', 'Avg_success', 'Count'])

## We cannot base this analysis solely on the success rate since there are apparently very successful cuisines that appear only a couple of times in the entire dataset. We need to analyse both the most successful and popular ones.

In [None]:
final.sort_values(by = 'Avg_success', ascending = False)

In [None]:
final.Count.describe(percentiles = [0.25, 0.5, 0.75, 0.9])

In [None]:
plt.figure(figsize = (6,6))
sns.boxplot(y = 'Count', data = final)

In [None]:
#Find outliers in the count feature using interquartile range. These outliers are going to be the most
#popular cuisines in Bangalore.
q1 = final.Count.quantile(0.25)
q3 = final.Count.quantile(0.75)
iqr = q3 - q1
final[final.Count > q3 + 1.5 * iqr].sort_values(by = 'Count', ascending = False)

## These are the cuisines whose count is more than 1.5 IQR above Q3 and that have the highest success value. Among the top cuisines there are: Italian, continental, cafe, Indian and Chinese. However, the top cuisines with the highest number of records are: Indian, North Indian and Chinese! So it seems that, apart from their own gastronomy, Chinese is one of the most demanded ones. Biryani and South Indian cuisines are also some of the most relevant ones when it comes to presence and overall success.

In [None]:
plt.style.use('ggplot')
final[final.Count > 400].plot(x = 'Cuisine', y = ['Avg_success', 'Count'], kind = 'bar', figsize = (8, 6))
plt.title('Top cuisines with highest count and success rate', fontsize = 14, fontweight = 'bold')
plt.show()

# Naturally, the ones that appear the most are also the ones which success is not that high. SO IF I WANTED TO OPEN A RESTAURANT IN BANGALORE I'D GREATLY CONSIDER MAKING EITHER INDIAN OR NORTH INDIAN FOOD, FOLLOWED BY ITALIAN, INDIAN, CHINESE, BIRYANI, CONTINENTAL OR FAST FOOD.

## Is a food chain category restaurant likely to have more customers than its counter part?
## Food chain restaurant definition: a restaurant chain is a set of related restaurants in many different locations that are either under shared corporate ownership (e.g., McDonald's in the U.S.) or franchising agreements. Fast food restaurants are the most common, but sit-down restaurant chains also exist.
## So based on its definition we could track food chain restaurants in two different ways: one might be looking at the restaurants with multiple different locations, and the second one would be looking for "Fast Food" type restaurants. Let's start with the first one since it's the closest to the strict definition.

In [None]:
#Get the top 3 most popular restaurant chains in Bangalore
topchains = zomato.drop_duplicates(subset = ['name', 'address']).name.value_counts(ascending = False).head(3).index

In [None]:
#Function to gather the information needed for the scattermapbox of each restaurant chain
def get_chains(chain_name, data = None, coords = None):
    """Count the outlets of one restaurant chain per neighborhood and attach map coordinates.

    Parameters
    ----------
    chain_name : str
        Restaurant name, compared for equality with the ``name`` column.
    data : DataFrame, optional
        Restaurant records with ``name``, ``address`` and ``location`` columns.
        Defaults to the notebook-level ``zomato`` frame.
    coords : DataFrame, optional
        Lookup table with a ``Name`` column and a ``Coordenates`` column of (lat, lon) tuples.
        Defaults to the notebook-level ``locations`` frame.

    Returns
    -------
    DataFrame
        One row per neighborhood with ``Name``, ``Count``, the columns of ``coords``,
        ``lat``, ``lon`` and ``Chain``. Neighborhoods missing from ``coords`` get NaN
        for ``lat`` and ``lon``.
    """
    data = zomato if data is None else data
    coords = locations if coords is None else coords
    #Deduplicate first, then filter with a mask built from the deduplicated frame itself
    #(the old mask came from the full frame and had to be reindexed by pandas)
    outlets = data.drop_duplicates(subset = ['name', 'address'])
    outlets = outlets[outlets.name == chain_name]
    df = outlets.location.value_counts().reset_index()
    df.columns = ['Name','Count']
    df = df.merge(coords, how = 'left', on = 'Name')
    #Element-wise extraction tolerates missing coordinates and an empty result,
    #both of which made zip(*df.Coordenates) raise
    df['lat'] = df.Coordenates.apply(lambda c: c[0] if isinstance(c, (tuple, list)) else np.nan)
    df['lon'] = df.Coordenates.apply(lambda c: c[1] if isinstance(c, (tuple, list)) else np.nan)
    df['Chain'] = chain_name
    return df

In [None]:
#Make list with the three datasets
dt = [get_chains(x) for x in topchains]

In [None]:
#Stack the per-chain datasets on top of each other with a fresh 0..n-1 index
chains = pd.concat(dt, ignore_index=True)

In [None]:
#Create column with text to use in each marker
chains['text'] = chains['Name'] + ': ' + chains['Count'].astype('str')

In [None]:
#Function to create a scattermapbox object to be plotted for a given restaurant chain
def get_dots(chain_name):
    """Return the Scattermapbox trace for `chain_name`, built from the notebook-level `chains` frame.

    One marker is drawn per row of the chain, sized at 8 times the row's Count and
    labelled with the row's `text` value.
    """
    chain_rows = chains.loc[chains.Chain == chain_name]
    marker_style = go.scattermapbox.Marker(size = chain_rows['Count']*8)
    return go.Scattermapbox(
        lat = chain_rows['lat'],
        lon = chain_rows['lon'],
        mode = 'markers',
        marker = marker_style,
        text = chain_rows['text'],
        name = chain_name)

In [None]:
data = [get_dots(name) for name in topchains]

In [None]:
#Mapbox access token passed to the map layouts below. Set the MAPBOX_ACCESS_TOKEN environment
#variable to use your own token; otherwise the public token from the Mapbox website that ships
#with this notebook is used.
mapbox_access_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoidG9tYXNyb3pvIiwiYSI6ImNrbTEyNjdsMTB5dDMyb21yN3oyaHppN3QifQ.V2AliHyRDFpJMGFjjehPLw')

In [None]:
#Customize layout and feed it to the iplot method from plotly.offline
layout = go.Layout(title = "Top 3 chains locations in Bangalore",        
        hovermode = 'closest',
        mapbox = dict(
            accesstoken = mapbox_access_token,
            style = 'streets',
            center = dict(lat = 12.96, lon = 77.59),
            zoom = 10)
                  )

fig = dict(data = data, layout = layout)
py.iplot(fig, filename = 'Bangalore Mapbox')

## Cafe coffee day mainly clusters in the center and south. Just Bake mainly clusters in the south. Domino's Pizza spreads mostly in the south and east yet it's the most dispersed.

In [None]:
#Remove records with the same address since we need to find restaurants with multiple different locations.
zomato3 = zomato.drop_duplicates(subset = ['address','name'], keep = 'first', ignore_index = True)

In [None]:
#Dataset with restaurant names that appear multiple times. The argument keep=False allows us to keep all the duplicates
zomato3 = zomato3[zomato3.name.duplicated(keep = False)]
zomato3.reset_index(drop = True, inplace = True)

## Let's take a look at the first restaurant

In [None]:
zomato3[zomato3.name == 'Cafe Coffee Day'].head()

## Let's see the reviews for the case with the lowest rate.

In [None]:
min_rate = zomato3[zomato3.name == 'Cafe Coffee Day'].rate.min()
index = zomato3[(zomato3.name == 'Cafe Coffee Day') & (zomato3.rate == min_rate)].reviews_list.index[0]

In [None]:
#To get the list within the string
ast.literal_eval(zomato3.loc[index, 'reviews_list'])

In [None]:
low_rate_ccd = ast.literal_eval(zomato3.loc[index, 'reviews_list'])

In [None]:
#Each review is unpacked as a (rating text, comment text) pair; judging by the markers removed
#below they look like ('Rated 1.0', 'RATED\n  some comment') -- TODO confirm against the data
Name = []
Ratings = []
Comments = []
for rate_text, comment_text in low_rate_ccd:
    #Drop the leading 'Rated' label and the spaces/commas around the number
    if rate_text.startswith('Rated'):
        rate_text = rate_text[len('Rated'):]
    #Drop only the leading 'RATED\n' marker. str.strip('RATED\n') treats its argument as a set of
    #characters, so it also removed capital R/A/T/E/D letters from the end of the comment.
    if comment_text.startswith('RATED\n'):
        comment_text = comment_text[len('RATED\n'):]
    Name.append('Cafe Coffee Day')
    Ratings.append(rate_text.strip(' ,'))
    #Remove the line breaks inside the comment and the padding around it
    Comments.append(comment_text.replace('\n', '').strip())
low_ccd = pd.DataFrame({'rest_name':Name, 'Rate':Ratings, 'Comments':Comments})
low_ccd

In [None]:
for comment in low_ccd.Comments:
    print(comment+'\n')

## Now let's find out which restaurants are the ones with the most different locations

In [None]:
sns.set()
zomato3.name.value_counts(ascending = False).head(10).plot(kind = 'pie', radius = 1.5, autopct = '%.1f', 
                                                           textprops = dict(size=13))
plt.axis('off')
plt.title('Top 10 chain restaurants with the most appearances', fontsize = 16, y = 1.2)
plt.show()

In [None]:
zomato3.name.value_counts().describe(percentiles = [0.25, 0.5, 0.75, 0.95])

In [None]:
#Get the names of the top 10 chain restaurants
top_chain = zomato3.name.value_counts(ascending = False).head(10).index

In [None]:
top_chain

In [None]:
#Get the average rate, votes and success for each one of the top chain restaurants
zomato3[zomato3.name.isin(top_chain)].groupby('name')[['rate', 'votes', 'success']].mean().sort_values(by = ['rate'], 
                                                                                                       ascending = False)

In [None]:
plt.style.use('ggplot')
zomato3[zomato3.name.isin(top_chain)].groupby('name')[['rate', 'votes']].mean().plot(kind = 'bar', 
                                                                                     secondary_y = 'rate', figsize = (8,7))
plt.show()

## Now let's see the average rate and number of votes both for chain restaurants and for their counterpart.

In [None]:
#zomato2 holds ONE record per restaurant name (drop_duplicates on 'name'), so it still contains a row
#for every chain; zomato3 holds only the records whose name appears at more than one distinct address.
#NOTE(review): the "unique" figures below therefore include chains -- confirm this is the intended baseline.
print(f'Records for unique restaurants : {zomato2.shape[0]} \nRecords for chain restaurants: {zomato3.shape[0]} \
      \nAvg rate for unique restaurants: {zomato2.rate.mean()} \nAvg rate for chain restaurants: {zomato3.rate.mean()} \
      \nAvg votes for unique restaurants: {zomato2.votes.mean()} \nAvg votes for chain restaurants: {zomato3.votes.mean()}')

In [None]:
#2x2 grid of boxplots: rate on the top row, votes on the bottom row,
#unique restaurants on the left and chain restaurants on the right
plt.figure(figsize = (6,7))
panels = [
    ('rate', zomato2, 'Unique restaurants', 'Rate'),
    ('rate', zomato3, 'Chain restaurants', ''),
    ('votes', zomato2, 'Unique restaurants', 'Votes'),
    ('votes', zomato3, 'Chain restaurants', ''),
]
for position, (column, frame, panel_title, axis_label) in enumerate(panels, start = 1):
    plt.subplot(2, 2, position)
    sns.boxplot(y = column, data = frame)
    plt.title(panel_title, fontsize = 15)
    #Only the left-hand panels carry a y label; the right-hand ones are blanked
    if axis_label:
        plt.ylabel(axis_label, fontsize = 13)
    else:
        plt.ylabel('')

plt.tight_layout()
plt.show()

## Finally, let's see the most popular cuisines for both the unique and chain restaurants.

In [None]:
unique_rest_cuisines = pd.Series([x.strip(' ') for lis in zomato2.cuisines.dropna() for x in lis.split(',')])
chain_rest_cuisines = pd.Series([x.strip(' ') for lis in zomato3.cuisines.dropna() for x in lis.split(',')])

In [None]:
plt.figure(figsize = (12, 6))
plt.subplot(1, 2, 1)
unique_rest_cuisines.value_counts(ascending = False).head(10).plot(kind = 'pie', radius = 1.5, cmap = 'Set3')
plt.title('Top 10 cuisines in unique restaurants', fontsize = 14, y = 1.2)
plt.subplot(1, 2, 2)
chain_rest_cuisines.value_counts(ascending = False).head(10).plot(kind = 'pie', radius = 1.5, cmap = 'Set3')
plt.title('Top 10 cuisines in chain restaurants', fontsize = 14, y = 1.2)
plt.tight_layout()
plt.show()

# Apparently there's no substantial difference between chain restaurants and their counter parts when it comes to success or popularity. However it is clear that in both cases the top cuisines are indian, chinese and fast food. In the case of chain restaurants, desserts and bakery are also some of the most popular ones.

# Are any neighborhoods similar based on the type of food?

## We can get the most popular type of food (cuisine) per neighborhood and then group them using this feature. It is also important to see the neighborhoods in which there are more restaurants!

In [None]:
zomato.location.value_counts(ascending = False).describe(percentiles = [0.25, 0.5, 0.75, 0.95])

In [None]:
#Distinct values of the location column, in order of first appearance
neighborhoods = zomato['location'].unique()

In [None]:
#Get the most popular cuisine (most frequent value of the cuisines column) for each individual neighborhood.
#The result keeps the same order and length as `neighborhoods`, so both can be paired positionally.
def _first_mode(values):
    """Return the first mode of `values` (missing values ignored) or NaN when no mode exists."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan

#A single groupby replaces one full-dataframe filter per neighborhood
cuisine_mode_by_location = zomato.groupby('location').cuisines.agg(_first_mode).to_dict()

#Neighborhoods without any usable cuisine (including a missing location, which groupby leaves out)
#get NaN instead of raising, as indexing an empty mode with [0] would
neighborhoods_cui_mode = [cuisine_mode_by_location.get(name, np.nan) for name in neighborhoods]

# This answers the question: what kind of food is more popular in each locality?

In [None]:
#Pair every neighborhood with its most popular cuisine (both sequences share the same order)
neighborhoods_mode_cuisine = pd.DataFrame(
    dict(Neighborhoods = neighborhoods, Most_popular_cuisine = neighborhoods_cui_mode)
)
neighborhoods_mode_cuisine

In [None]:
#Attach the coordinates stored in `locations` to every neighborhood.
#The rename is rebound instead of done in place; later cells read the `Name` column.
neighborhoods_mode_cuisine = neighborhoods_mode_cuisine.rename(columns = {'Neighborhoods':'Name'})
df_tags = neighborhoods_mode_cuisine.merge(locations, how = 'left', on = 'Name')

#The left join leaves `Coordenates` missing for any neighborhood that is absent from `locations`, and
#unpacking with zip(*...) raises on such a value. Each component is therefore extracted element-wise and
#missing entries are skipped, so they simply end up as NaN in lat/lon.
#NOTE(review): assumes every present entry is a (lat, lon) pair, as the original zip unpacking implies - confirm
df_tags['lat'] = df_tags.Coordenates.map(lambda pair: pair[0], na_action = 'ignore')
df_tags['lon'] = df_tags.Coordenates.map(lambda pair: pair[1], na_action = 'ignore')
df_tags

In [None]:
#Text shown for each marker: "<neighborhood>: <most popular cuisine>"
hover_labels = df_tags.Name + ': ' + df_tags.Most_popular_cuisine

#One marker per neighborhood, placed at its coordinates
tags = go.Scattermapbox(
    lat = df_tags.lat,
    lon = df_tags.lon,
    mode = 'markers',
    marker = go.scattermapbox.Marker(size = 10, color = '#EC7063'),
    text = hover_labels,
    name = 'Most popular cuisine in each neighborhood',
)

#Map settings; `mapbox_access_token` has to be defined earlier in the notebook
#NOTE(review): the center is presumably Bangalore (see the filename passed to iplot) - confirm
map_view = dict(
    accesstoken = mapbox_access_token,
    style = 'streets',
    center = dict(lat = 12.96, lon = 77.59),
    zoom = 10,
)

layout = go.Layout(
    title = 'Most popular cuisine in each neighborhood (hover to see info)',
    hovermode = 'closest',
    mapbox = map_view,
)

fig = dict(data = tags, layout = layout)
py.iplot(fig, filename = 'Bangalore Mapbox')

In [None]:
#Cuisines with multiple occurrences, i.e. that are the most popular one in more than one neighborhood
popular_cuisine = neighborhoods_mode_cuisine.Most_popular_cuisine
is_repeated = popular_cuisine.duplicated(keep = False)
mul_cui = popular_cuisine[is_repeated].unique()

In [None]:
#Show the cuisines that are the most popular one in more than one neighborhood
mul_cui

In [None]:
#Keep only the neighborhoods whose most popular cuisine is shared with another neighborhood, then build
#one row per cuisine holding the comma-separated names of the neighborhoods that share it
shares_cuisine = neighborhoods_mode_cuisine.Most_popular_cuisine.isin(mul_cui)
cui_groups = (
    neighborhoods_mode_cuisine[shares_cuisine]
    .groupby('Most_popular_cuisine')
    .Name
    .apply(', '.join)
    .reset_index(drop = False)
)
cui_groups

## Neighborhoods grouped by most popular cuisine

In [None]:
#Join the occurrences where the popular cuisine is the combination 'chinese, north indian', whatever the
#order in which the two cuisines are written, and keep a single row for them.
#Rows are located by their content instead of by hard-coded row labels, so the cell does not depend on the
#row order of the previous cell and re-running it (when a single such row is left) changes nothing.
target_combo = {'chinese', 'north indian'}
is_target = cui_groups.Most_popular_cuisine.apply(
    lambda label: {part.strip().lower() for part in label.split(',')} == target_combo
).astype(bool)
target_idx = is_target[is_target].index

if len(target_idx) > 1:
    #The first matching row receives the neighborhoods of all matching rows; the other rows are dropped
    cui_groups.loc[target_idx[0], 'Name'] = cui_groups.loc[target_idx, 'Name'].str.cat(sep = ', ')
    cui_groups = cui_groups.drop(index = target_idx[1:]).reset_index(drop = True)
cui_groups

In [None]:
#See the neighborhoods whose most popular type of food is south indian.
#The row is looked up by its cuisine label instead of by a fixed row position, which silently returns a
#different cuisine as soon as the rows of cui_groups change; the result is an empty string if there is no such row.
is_south_indian = cui_groups.Most_popular_cuisine.str.strip().str.lower() == 'south indian'
cui_groups.loc[is_south_indian, 'Name'].str.cat(sep = ', ')

## Finally, let's take a closer look at the features online_order and book_table

In [None]:
#Peek at five random rows; the fixed seed keeps the displayed sample identical on every re-run
zomato[['online_order', 'book_table']].sample(5, random_state = 42)

In [None]:
#Share of restaurants per value of online_order
#NOTE(review): the labels are hard-coded and applied in value_counts order (most frequent first), so they
#assume 'Yes' is the most frequent value - confirm against the data
online_order_counts = zomato['online_order'].value_counts()
online_order_counts.plot.pie(
    autopct = '%.2f',
    radius = 1,
    colors = ['#E74C3C', '#F1948A'],
    explode = [0,0.1],
    labels = ['Yes', 'No'],
    shadow = True,
    textprops = dict(size=12),
    wedgeprops = dict(linewidth=2),
)
plt.axis('off')
plt.title('Percentage of restaurants that allow online ordering', pad = 10)
plt.show()

In [None]:
#Share of restaurants per value of book_table
#NOTE(review): the labels are hard-coded and applied in value_counts order (most frequent first), so they
#assume 'No' is the most frequent value - confirm against the data
book_table_counts = zomato['book_table'].value_counts()
book_table_counts.plot.pie(
    autopct = '%.2f',
    radius = 1,
    colors = ['#2980B9', '#5DADE2'],
    explode = [0,0.15],
    labels = ['No','Yes'],
    shadow = True,
    textprops = dict(size=12),
    wedgeprops = dict(linewidth=2),
)
plt.axis('off')
plt.title('Percentage of restaurants that allow table booking', pad = 10)
plt.show()

In [None]:
#Rating distribution split by each of the two service features, one panel per feature
plt.figure(figsize = (7,5))

rating_panels = [('online_order', 'Restaurants online ordering', 'Online Order'),
                 ('book_table', 'Restaurants table booking', 'Book Table')]

for position, (feature, panel_title, x_label) in enumerate(rating_panels, start = 1):
    plt.subplot(1, 2, position)
    #The feature is used both as x and as hue, so every category gets its own color and legend entry
    sns.boxplot(x = feature, y = 'rate', data = zomato, hue = feature)
    plt.title(panel_title, fontsize = 15)
    plt.xlabel(x_label)
    plt.ylabel('Rate', fontsize = 13)
    plt.legend(loc = 'center')

plt.tight_layout()
plt.show()

## It can be observed that the possibility of ordering online doesn't make a huge difference in the ratings. This might be because there are only about 10% more restaurants that offer this option, so its impact is not that relevant. On the other hand, far fewer restaurants allow table booking, and those that do generally have a better rating, possibly because this option is much more appreciated.