# Zomato Bangalore Restaurants

#### A trip through restaurants in `Bangalore`
#### Importing Matplotlib inline
This magic command renders plots inline in the Python notebook.

In [None]:
%matplotlib inline

In [None]:
from warnings import simplefilter

# Install a blanket 'ignore' filter so no warning is printed for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting/data libraries.
simplefilter('ignore')

### Importing necessary modules for creating the kernel

* `pandas` for creating and handling DataFrames
* `seaborn` for plotting
* `wordcloud` for creating word cloud

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from wordcloud import WordCloud
from nltk.probability import FreqDist

### Setting a color scheme for the kernel: "Crimson"

In [None]:
# Global seaborn theme: dark grid background, 'Set3' default palette, fonts scaled by 1.1.
sns.set(style='darkgrid', palette='Set3', font_scale=1.1)

# Seven-step dark -> crimson palette (a list of colours, not a colormap) used by the plots below.
my_pal = sns.dark_palette('crimson', n_colors=7, reverse=False, as_cmap=False)

# Preview the palette swatches.
sns.palplot(my_pal)

###### Loading the dataset

In [None]:
# Read the Zomato CSV from the relative input directory into the `data` DataFrame.
data= pd.read_csv('../input/zomato.csv')

##### Exploring the DataFrame

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

#### We'll drop the `url` and `phone` columns as they are irrelevant for this analysis

In [None]:
# Remove the two columns this analysis never uses; `inplace=True` modifies `data` itself.
data.drop(columns=['url', 'phone'], inplace=True)

#### We'll explore the dataset further

We see that there are missing values in `rate`, `location`, `rest_type`, `dish_liked`, `cuisines` and `approx_cost(for two people)`;
we will explore these variables in a little more detail.

In [None]:
# Percentage of missing values per column: the mean of the NaN mask, scaled to 0-100.
data.isna().mean() * 100

In [None]:
# Print the column dtypes and non-null counts of the frame.
data.info()

#### We'll explore the `rate` variable

We see there are 7775 **missing values** and 2208 values named **"NEW"**; this indicates that a few restaurants haven't been rated yet. We will replace the missing values with **-1** and **NEW** with **-2**, so that we can filter them out easily later.

In [None]:
# Ten most frequent raw `rate` values, with NaN counted as its own bucket.
data['rate'].value_counts(dropna=False).head(10)

In [None]:
# Build a `ratings` column from the raw `rate` strings by keeping the part before '/'.
# Sentinel values (still strings here; they are converted with pd.to_numeric afterwards):
#   '-1' -> rating is missing (NaN) or recorded as a bare '-'
#   '-2' -> restaurant is marked 'NEW', i.e. not rated yet
rating_text = data['rate'].str.split('/').str[0].str.strip()

# Exact-match replacement: only a value that is precisely '-' or 'NEW' is mapped.
# A substring replace of '-' would also rewrite any other value that merely contains a dash.
rating_text = rating_text.replace({'-': '-1', 'NEW': '-2'})

# Assign the filled Series back to the frame. Calling `fillna(..., inplace=True)` on
# `data['ratings']` operates on a column selection and is not guaranteed to reach `data`
# (it is a no-op under pandas copy-on-write), and warnings are silenced in this notebook.
data['ratings'] = rating_text.fillna('-1')

data['ratings'].value_counts(dropna=False).head(10)

In [None]:
# Turn the cleaned rating strings into numbers, then discard the raw `rate` column.
data['ratings'] = pd.to_numeric(data['ratings'])
data.drop(columns='rate', inplace=True)

#### We'll now explore the `approx_cost(for two people)` variable

We see that there are **","** separators inside the numbers, so the values are treated as strings. We'll remove the **","** and typecast them to numeric.

In [None]:
data['approx_cost(for two people)'].value_counts().sample(10) # we see that there are commas in the numbers
# we'll replace this separator with an empty string

In [None]:
# Convert the cost column from comma-separated text to numbers and give it a shorter name.
# `regex=False` treats ',' as a literal separator regardless of the pandas default.
# No intermediate `.sample(...)` is evaluated here: an expression in the middle of a cell
# is discarded without being displayed.
cost_text = data['approx_cost(for two people)'].str.replace(',', '', regex=False)
data['approx_cost(for two people)'] = pd.to_numeric(cost_text)

# Rename in place so the column keeps its position in the frame.
data.rename(columns={'approx_cost(for two people)': 'cost for two'}, inplace=True)

### From analysing the ratings based on facilities we see;
Restaurants with just **table booking** facility have higher average rating `4.3`, while restaurants with both **online ordering** and **table booking** capabilities have an average rating of `4.1`.

In [None]:
# Keep only rows with a real rating, i.e. exclude the -1 and -2 sentinel values.
rated = data.query('ratings != -1 and ratings != -2')

# One density panel per (online_order, book_table) combination, sharing both axes.
ratings = sns.FacetGrid(
    rated,
    row='online_order',
    col='book_table',
    sharex=True,
    sharey=True,
    aspect=2.6,
)
ratings.map(sns.kdeplot, 'ratings', shade=True, color='crimson')

# Figure-level title, lifted slightly above the top row of panels.
plt.suptitle(
    'Distribution of Ratings by availability of Online ordering or Table booking facilities',
    y=1.05,
)
plt.show()

### Analysing the patterns of cost for food, we see:

Note: The plots have been filtered to include only values below `3,000 Rs`.

1. Restaurants with both online ordering and table booking and just table booking facilities have higher spread of cost of food. These might be fine dine-in restaurants.

2. While, restaurants with neither of the facilities have much lower spread of cost for food, these might be takeaway restaurants

In [None]:
# Restrict to rows whose cost for two is strictly between 0 and 3000.
cost = data['cost for two']
in_range = data[(cost > 0) & (cost < 3000)]

# Same facet layout as the ratings grid: rows by online ordering, columns by table booking.
ratings = sns.FacetGrid(
    in_range,
    row='online_order',
    col='book_table',
    sharex=True,
    sharey=True,
    aspect=2.6,
)
ratings.map(sns.kdeplot, 'cost for two', shade=True, color='crimson')

plt.suptitle('Distribution of Price by availability Online order or Table bookings', y=1.05)
plt.show()

In [None]:
# Keep only the first entry of each comma-separated `rest_type` value.
# Splitting on ',' already removes every comma from the kept part, so no follow-up
# replace is needed; `.str[0]` avoids building a full expanded DataFrame just to read
# its first column. Missing values stay NaN.
data['rest_type'] = data['rest_type'].str.split(',').str[0]

## From analysing the ratings for Restaurant types we see;

1. Pubs, Bars, Lounges etc. that serve `alcohol` seem to have received the highest average ratings among all the restaurant types
2. `Quick food stalls` or cheaper restaurants seem to have received the lowest average ratings among all the restaurant types

In [None]:
# Mean rating per restaurant type, best first; rows with non-positive (sentinel) ratings are excluded.
avg_rating = (
    data.query('ratings > 0')
        .groupby('rest_type')['ratings']
        .mean()
        .sort_values(ascending=False)
)
best5 = avg_rating.head(5)
worst5 = avg_rating.tail(5)

# Two side-by-side horizontal bar charts that share the x (rating) axis.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 6), sharex=True)

# Left panel: the five best rated types.
sns.barplot(x=best5, y=best5.index, palette=my_pal, edgecolor='k', ax=ax1)
ax1.set_title('Five highest rated Restaurant types', fontdict={'fontsize': 'xx-large'})
ax1.set_xlabel('Ratings', fontdict={'fontsize': 'large'})
ax1.set_ylabel('Restaraunt type', fontdict={'fontsize': 'x-large'})

# Right panel: the five worst rated types.
sns.barplot(x=worst5, y=worst5.index, palette=my_pal, edgecolor='k', ax=ax2)
ax2.set_title("Five lowest rated Restaurant types", fontdict={'fontsize': 'xx-large'})
ax2.set_xlabel('Ratings', fontdict={'fontsize': 'large'})
ax2.set_ylabel('')

fig.tight_layout()
# Ticks every 0.5 from 0 up to (not including) 5; applied via pyplot to the current axes.
plt.xticks(ticks=np.arange(0., 5., 0.5))
fig.suptitle('Average Ratings grouped by "Restaurant" types in Bangalore', y=1.08, fontsize=25)
plt.show()

### From analysing the cost for Restaurant types we see;

1. Some of the most expensive restaurant types also seem to be some of the highest rated restaurant types too!
2. We can see that some of the cheapest restaurant types also seem to have received some of the lowest ratings!

In [None]:
# Mean cost for two per restaurant type, most expensive first; rows without a positive cost are excluded.
avg_cost = (
    data[data['cost for two'] > 0]
    .groupby('rest_type')['cost for two']
    .mean()
    .sort_values(ascending=False)
)
priciest5 = avg_cost.head(5)
cheapest5 = avg_cost.tail(5)

# Two side-by-side horizontal bar charts that share the x (cost) axis.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 6), sharex=True)

# The bars show average cost, so the x axes and the figure title are labelled as cost
# (they were previously labelled 'Ratings').
sns.barplot(y=priciest5.index, x=priciest5, palette=my_pal, ax=ax1, edgecolor='k')
ax1.set_ylabel('Restaraunt type', fontdict={'fontsize': 'x-large'})
ax1.set_xlabel('Average cost for two', fontdict={'fontsize': 'large'})
ax1.set_title('Five most expensive Restaurant types', fontdict={'fontsize': 'xx-large'})

sns.barplot(y=cheapest5.index, x=cheapest5, ax=ax2, palette=my_pal, edgecolor='k')
ax2.set_ylabel('')
ax2.set_xlabel('Average cost for two', fontdict={'fontsize': 'large'})
ax2.set_title("Five cheapest Restaurant types", fontdict={'fontsize': 'xx-large'})

fig.tight_layout()
# Ticks every 500 from 0 up to the highest average cost.
plt.xticks(ticks=np.arange(0, avg_cost.max(), 500))
fig.suptitle('Average cost for two grouped by "Restaurant" types in Bangalore', y=1.08, fontsize=25)
plt.show()

From the violin plot we can again see that some of the highest rated restaurant types were **Pubs**, **Fine Dining**, **Micro Brewery** and **Lounges**

In [None]:
# Only rows with a positive rating (sentinel values are excluded).
rated = data.query('ratings > 0')

# One violin per `listed_in(type)` category showing its rating distribution.
fig, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(
    data=rated,
    x='listed_in(type)',
    y='ratings',
    width=0.75,
    palette=my_pal,
    ax=ax,
)
ax.set_title('Ratings distribution across "Restaurant types" in Banglore', fontdict={'fontsize': 'larger'})
ax.set_xlabel('Restaurant type', fontdict={'fontsize': 'medium'})
ax.set_ylabel('Ratings', fontdict={'fontsize': 'medium'})
plt.show()

### From the violin plot we see;

1. Fine **dine-out**, **pubs**, **bars** etc. cost more on average than other restaurants
2. We see that **desserts** and **take-away/delivery** are cheaper alternatives in `Bangalore`!
3. We also see some outliers in **dine-out**, **buffet** and **drinks & night-life**, these could be five star hotels or part of five star hotels

In [None]:
# Only rows with a positive cost for two.
priced = data[data['cost for two'] > 0]

# One violin per `listed_in(type)` category showing its cost distribution.
fig = plt.figure(figsize=(18, 6))
sns.violinplot(x=priced['listed_in(type)'], y=priced['cost for two'], width=0.75, palette=my_pal)
plt.title('Cost distribution across "Restaurant types" in Banglore', fontdict={'fontsize': 'x-large'})
plt.xlabel('Restaurant type', fontdict={'fontsize': 'medium'})
# The y axis carries the cost for two, so it is labelled as such (it was previously labelled 'Ratings').
plt.ylabel('Cost for two', fontdict={'fontsize': 'medium'})
plt.show()

### From the word-cloud of the most liked dish we see;
**Pasta**, **Burgers**, **Cocktails**, **Pizza** and **Biriyani** seem to be some of the most liked dishes among `Bangaloreans`.

In [None]:
# Word cloud of the most liked dishes.

# Fill missing entries with the placeholder 'Unknown' and assign the result back.
# An in-place fillna on the `data['dish_liked']` selection is not guaranteed to reach
# `data`; if it does not, NaN (a float) would hit `.split` in the loop below.
data['dish_liked'] = data['dish_liked'].fillna('Unknown')

# One entry per dish: split the comma-separated lists, trim whitespace, title-case.
dishes = [
    dish.strip().title()
    for liked in data['dish_liked']
    for dish in liked.split(',')
]

dishes_freq = FreqDist(dishes)

# Remove the placeholder by key before ranking, instead of assuming it is always the
# most frequent entry and slicing off the first element of the ranking.
dishes_freq.pop('Unknown', None)

# (dish, count) pairs ordered from most to least frequent.
dishes_list = dishes_freq.most_common()
most_liked_dishes = [dish for dish, _count in dishes_list]
most_liked_dishes_count = [count for _dish, count in dishes_list]

# Light -> crimson colormap (reversed) used to colour the words.
my_pal_wrd_cld = sns.light_palette(color='crimson', n_colors=7, reverse=True, as_cmap=True)
wd = WordCloud(scale=10, margin=False, max_words=50, colormap=my_pal_wrd_cld).generate_from_frequencies(dishes_freq)

plt.figure(figsize=(24, 6))
plt.imshow(wd)
plt.axis('off')
plt.show()

### From the wordcloud on most served cuisines we see;
**North Indian** cuisine seems to be the most served and liked cuisine in `Bangalore`. This could hint at a higher concentration of North Indians in `Bangalore`.

In [None]:
# Word cloud of the most served cuisines.

# Fill missing entries with the placeholder 'unknown' and assign the result back.
# An in-place fillna on the `data['cuisines']` selection is not guaranteed to reach
# `data`; if it does not, NaN (a float) would hit `.split` in the loop below.
data['cuisines'] = data['cuisines'].fillna('unknown')

# One entry per cuisine: split the comma-separated lists, trim whitespace, title-case.
cuisines = [
    cuisine.strip().title()
    for listed in data['cuisines']
    for cuisine in listed.split(',')
]

cuisine_freq = FreqDist(cuisines)

# The placeholder becomes 'Unknown' after title-casing; drop it so it cannot be drawn
# in the cloud as if it were a cuisine.
cuisine_freq.pop('Unknown', None)

# Light -> crimson colormap (reversed) used to colour the words.
my_pal_wrd_cld = sns.light_palette(color='crimson', n_colors=7, reverse=True, as_cmap=True)
wd = WordCloud(scale=10, margin=False, max_words=50, colormap=my_pal_wrd_cld).generate_from_frequencies(cuisine_freq)

plt.figure(figsize=(24, 6))
plt.imshow(wd)
plt.axis('off')
plt.show()

### From analysing the most expensive and cheapest restaurants we see;

1. That most expensive restaurants seem to be 5 star rated or similar
2. Some of the cheapest restaurants in Bangalore serve food for as low as 40 Rs.

In [None]:
# Mean cost for two per restaurant name, most expensive first; rows without a positive cost are excluded.
avg_cost = (
    data[data['cost for two'] > 0]
    .groupby('name')['cost for two']
    .mean()
    .sort_values(ascending=False)
)
priciest = avg_cost.head()
cheapest = avg_cost.dropna().tail(5)

# Shorten each name to the part before the first ' - '.
# `.str.split(...).str[0]` always yields the first piece of every name. Splitting with
# `expand=True` and indexing each element with [0] returns only the first *character*
# whenever none of the names contains ' - ' (the result is then a flat Index of strings).
exp_names = priciest.index.str.split(' - ').str[0].tolist()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 6))

# Left panel: the five most expensive restaurants.
sns.barplot(x=priciest, y=exp_names, palette=my_pal, edgecolor='k', ax=ax1, ci=None)
ax1.set_title('Five Most Expensive Restaurants', fontdict={'fontsize': 'xx-large'})
ax1.set_ylabel('Restaurant Name', fontdict={'fontsize': 'x-large'})
ax1.set_xlabel('Cost for Two People', fontdict={'fontsize': 'x-large'})

# Right panel: the five cheapest restaurants.
sns.barplot(x=cheapest, y=cheapest.index, palette=my_pal, edgecolor='k', ax=ax2, ci=None)
ax2.set_title('Five Cheapest Restaurants', fontdict={'fontsize': 'xx-large'})
ax2.set_xlabel('Cost for Two People', fontdict={'fontsize': 'x-large'})
ax2.set_ylabel('')

fig.tight_layout()
fig.suptitle('Top five Most Expensive and Cheapest Restaurants for in Bangalore', y=1.09, fontsize=25)
plt.show()

### From the barplot we see;

1. "BTM" has more than 1,000 restaurants (that's a busy site!)
2. Koramangala, HSR and other streets are missing from the sites with high number of restaurants

In [None]:
# Count restaurants per listing city, counting each restaurant name only once.
# The pattern strips ordinal fragments such as "4th " so numbered sub-areas collapse into
# one label. It is a raw string (so `\d` is not an invalid escape) and `regex=True` is
# passed explicitly: with a literal match nothing would be replaced.
locs = (
    data.drop_duplicates(subset='name')['listed_in(city)']
        .str.replace(r'\dth ', '', regex=True)
        .value_counts()
)

fig = plt.figure(figsize=(18, 4))
# Ten shades, one per bar of the top-10 chart. This rebinds the notebook-wide `my_pal`.
my_pal = sns.dark_palette(color='crimson', n_colors=10, as_cmap=False, reverse=False)
k = sns.barplot(y=locs.head(10).index, x=locs.head(10), palette=my_pal, edgecolor='k')
fig.suptitle('Restaurant Count grouped by "Location"')
plt.ylabel('Location', fontdict={'fontsize': 'medium'})
plt.xlabel('Count', fontdict={'fontsize': 'medium'})
plt.show()

In [None]:
# Rebuild the dark -> crimson palette with one shade per distinct `listed_in(type)` value.
n_types = data['listed_in(type)'].nunique()
my_pal = sns.dark_palette('crimson', n_colors=n_types, reverse=False)

# Preview the palette swatches.
sns.palplot(my_pal)

### From the scatter plot we see;

1. Most ratings fall in the "crimson" span, indicating that most restaurants got a rating between roughly `3.65` and `4.75`
2. There might be more **Pubs & bars** in `Bangalore` than **Buffets**, **Cafes** and other restaurant types

In [None]:
# Keep rows that have a positive rating, at least one vote and a positive cost for two.
has_rating = data['ratings'] > 0
has_votes = data['votes'] > 0
has_cost = data['cost for two'] > 0
plotted = data[has_rating & has_votes & has_cost]

plt.figure(figsize=(24, 9))
# One point per row, coloured by its `listed_in(type)` category.
sns.scatterplot(
    data=plotted,
    x='ratings',
    y='cost for two',
    hue='listed_in(type)',
    palette=my_pal,
)

plt.title('Cost for Two vs Ratings by "Restaurant Type"', fontdict={'fontsize': 'x-large'})
plt.xlabel('Ratings', fontdict={'fontsize': 'x-large'})
plt.ylabel('Cost for Two', fontdict={'fontsize': 'x-large'})
# Shade the rating band from 3.65 to 4.75 in translucent crimson.
plt.axvspan(3.65, 4.75, color='crimson', alpha=0.15)
plt.show()

### Please fork the kernel and give it an upvote if you like it

#### Tell me what you think of the kernel and what else could have been done

#### Thank you for tuning-in!!