## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

In [None]:
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset location used by the read_csv cell can be confirmed.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Reading the data

In [None]:
# Load the raw Zomato CSV into the notebook-wide frame `df`.
df = pd.read_csv('../input/zomato-bangalore-restaurants/zomato.csv')

## Looking at the data

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

## Checking for NULL values

Number of NULL values in each column

In [None]:
# Absolute number of missing values per column.
df.isnull().sum()

Percentage of NULL values in each column

In [None]:
# Mean of the boolean null mask is the missing fraction; scale it to a percentage.
(df.isnull().mean())*100

In [None]:
# Distinct values per column; dropna=False makes NaN count as a value of its own.
df.nunique(axis=0, dropna = False)

### Dropping a few columns

In [None]:
# Columns that the rest of this notebook never reads.
unused_columns = ['url', 'phone', 'listed_in(city)', 'dish_liked', 'reviews_list', 'menu_item']
df = df.drop(columns=unused_columns)

In [None]:
# Preview the frame after the column drop.
df.head()

## Cleaning and Prepping the Data for EDA

In [None]:
# Missing values per remaining column, to decide what needs cleaning.
df.isnull().sum()

### name


There are 8792 distinct restaurant names in Bangalore, many of them with several outlets spread across the city.

In [None]:
# Number of distinct restaurant names (outlets of one chain share a name).
df['name'].nunique()

### location

#### NULL values on location column

In [None]:
# Show the distinct location labels (NaN included) and how many there are.
distinct_locations = df['location'].unique()
print(distinct_locations)
print(len(distinct_locations))

In [None]:
# Rows whose location is missing.
df[df['location'].isnull()]

#### Dropping the rows with null values in location.

In [None]:
# Keep only rows that have a location.
df = df.dropna(axis=0, subset=['location'], how='any')

#### Clubbing all the blocks of Koramangala under one location

In [None]:
# Every label that mentions Koramangala ("Koramangala 5th Block", ...) is
# collapsed into the single label 'Koramangala'.
is_koramangala = df['location'].str.contains('Koramangala')
df_kor = df[is_koramangala]
df_kor_list = df_kor['location'].unique()
df['location'] = df['location'].mask(df['location'].isin(df_kor_list), 'Koramangala')

### cost(for 2 people)

In [None]:
# Shorten the unwieldy cost column name.
df = df.rename({'approx_cost(for two people)': 'cost'}, axis='columns')

In [None]:
# Costs arrive as text with thousands separators ("1,200"); strip the commas
# and convert to float. Missing values become the text 'nan', which float()
# turns back into NaN.
def _cost_to_float(raw_cost):
    return float(str(raw_cost).replace(",", ""))

df['cost'] = df['cost'].map(_cost_to_float)

### rate

In [None]:
# Inspect the raw rating values before cleaning them.
df['rate'].unique()

In [None]:
# Missing ratings are replaced by the most frequent raw rating value.
most_common_rate = df['rate'].mode()[0]
df['rate'] = df['rate'].fillna(most_common_rate)

In [None]:
# How many rows carry the placeholder rating '-'.
df[df['rate'] == '-'].shape

In [None]:
# Remove the rows whose rating is the placeholder '-'.
rate_hyphen = df.index[df['rate'] == '-']
df = df.drop(index=rate_hyphen)

In [None]:
# Snapshot of the listings still flagged 'NEW', taken before that label is
# overwritten with a numeric rating.
df_rate_new = df.loc[df['rate'] == 'NEW']

In [None]:
# Number of listings rated 'NEW'.
df_rate_new.shape

In [None]:
# Give 'NEW' restaurants a numeric placeholder rating in the same "x /5" text
# format as the other rows, so the parsing step can handle them.
# NOTE(review): 2.55 looks like a chosen mid-scale value - confirm the intent.
# The result is assigned back instead of using inplace=True on df['rate']:
# an in-place call on a selected column is chained assignment, which warns in
# pandas 2.2 and no longer updates `df` once Copy-on-Write is enabled.
df['rate'] = df['rate'].replace({"NEW" : "2.55 /5"})

In [None]:
# Ratings look like "4.1/5" or "4.1 /5": keep the text before the slash, then
# the text before the first space, and convert the column to float.
def _numeric_part(raw_rate):
    before_slash = raw_rate.split('/')[0]
    return before_slash.split(' ')[0]

df['rate'] = df['rate'].apply(_numeric_part)
df = df.astype({'rate' : float})

In [None]:
# Sanity check: the distinct numeric ratings in ascending order.
np.sort(df['rate'].unique())

### online_order

In [None]:
# Encode Yes/No as 1/0. The result is assigned back rather than using
# inplace=True on the selected column, which is chained assignment and stops
# updating `df` under pandas Copy-on-Write. Re-running the cell is harmless,
# because 1/0 are left untouched by the mapping.
df['online_order'] = df['online_order'].replace({'Yes' : 1, 'No' : 0})

### book_table

In [None]:
# Encode Yes/No as 1/0. The result is assigned back rather than using
# inplace=True on the selected column, which is chained assignment and stops
# updating `df` under pandas Copy-on-Write. Re-running the cell is harmless,
# because 1/0 are left untouched by the mapping.
df['book_table'] = df['book_table'].replace({'Yes' : 1, 'No' : 0})

### listed_in (type)

In [None]:
# Distinct listing categories present in the data.
df['listed_in(type)'].unique()

## EDA

### Percentage of new restaurants (location-wise)

In [None]:
# One row per address among the 'NEW' listings (the first occurrence is kept).
df_rate_new = df_rate_new.drop_duplicates(subset=['address'], keep='first')

In [None]:
# Restaurants per location, counting each (address, name, location) once.
df_res_count = (
    df[['address', 'name', 'location']]
    .drop_duplicates()
    .groupby(['location'])['name']
    .count()
    .rename('No of Restaurants')
)
df_res_count

In [None]:
# 'NEW' restaurants per location.
df_new_res = (
    df_rate_new
    .groupby(['location'])['name']
    .count()
    .rename('No of New Restaurants')
)
df_new_res

In [None]:
# Total and new-restaurant counts side by side; a location with no new
# restaurants has no entry in the second series, so that gap becomes 0.
df_loc_new = pd.concat([df_res_count, df_new_res], axis=1).fillna(value=0)
total_per_location = df_loc_new.iloc[:, 0]
new_per_location = df_loc_new.iloc[:, 1]
df_loc_new['percent_new_res'] = new_per_location / total_per_location * 100

In [None]:
# The 30 locations with the highest share of new restaurants.
df_loc_new.sort_values('percent_new_res', ascending = False).head(30)

### Average cost of restaurants and Average cost for new restaurants

It can be observed that the average cost for 2 (approx.) for new restaurants is lower than that for all restaurants in almost all the locations.

In [None]:
# Mean cost for two per location, for the 'NEW' restaurants only ...
df_new_res_cost = (
    df_rate_new
    .groupby(['location'])['cost']
    .mean()
    .rename('Avg cost of New Restaurants')
)

# ... and for all restaurants, counting each (address, name, location, cost) once.
df_res_cost = (
    df[['address', 'name', 'location', 'cost']]
    .drop_duplicates()
    .groupby(['location'])['cost']
    .mean()
    .rename('Avg cost of Restaurants')
)

In [None]:
# Put the two average-cost series side by side, aligned on location, and display.
df_cost = pd.concat([df_res_cost, df_new_res_cost], axis =1)
df_cost

### location and rating

In [None]:
# Mean rating per location, shown as a one-column frame named 'Avg rating'.
# rename() and to_frame() return new objects; the original cell discarded
# both results, so the displayed series was still named 'rate'.
df_rate = (
    df.groupby(['location'])['rate']
    .mean()
    .rename('Avg rating')
    .to_frame()
)
df_rate

### Online order and Rating

The box plot below shows that the median rating of restaurants that accept online orders is higher than that of restaurants that do not.

In [None]:
# One row per (address, name, online_order) so every outlet is counted once.
df_online_order = df[['address', 'name', 'online_order']].drop_duplicates()
print(df_online_order.groupby(['online_order'])['name'].count())

# The column is passed by keyword: seaborn 0.12+ treats the first positional
# argument as `data`, so countplot(series) no longer draws a per-category
# count. The plot uses the same de-duplicated frame as the printed counts so
# the two outputs agree.
plt.figure(figsize=(6, 6))
sns.countplot(x='online_order', data=df_online_order)
plt.show()

In [None]:
# Rating distribution split by the online_order flag.
sns.boxplot(x = 'online_order', y = 'rate', data = df)

### No. of restaurants offering online order (location-wise)

In [None]:
# Listings per (location, online_order) pair.
df4 = df.groupby(['location', 'online_order'])['name'].count()
# The CSV is still written so the notebook produces the same output file.
df4.to_csv('zomatodf4.csv')
# Flatten the MultiIndex in memory instead of re-reading the file just
# written: same columns (location, online_order, name), no dependence on the
# working directory or on CSV type re-inference.
df4 = df4.reset_index()
# aggfunc='sum' is the non-deprecated spelling of np.sum here; the result
# keeps the ('name', <flag>) column layout.
df4 = pd.pivot_table(df4, values=None, index=['location'], columns=['online_order'],
                     fill_value=0, aggfunc='sum')
df4

### Booking Table and Rating

It can be seen that median rating for restaurants with online booking of table is way higher than that without online booking of table option.

In [None]:
# Listings with and without table booking, as numbers and as a bar chart.
print(df.groupby(['book_table'])['name'].count())
plt.figure(figsize=(6, 6))
# Keyword form: seaborn 0.12+ treats the first positional argument as `data`,
# so countplot(series) no longer draws a per-category count.
sns.countplot(x='book_table', data=df)
plt.show()

In [None]:
# Rating distribution split by the book_table flag.
sns.boxplot(x = 'book_table', y = 'rate', data = df)

### No. of restaurants offering table booking (location-wise)

In [None]:
# Listings per (location, book_table) pair.
df5 = df.groupby(['location', 'book_table'])['name'].count()
# The CSV is still written so the notebook produces the same output file.
df5.to_csv('zomatodf5.csv')
# Flatten the MultiIndex in memory instead of re-reading the file just
# written: same columns (location, book_table, name), no dependence on the
# working directory or on CSV type re-inference.
df5 = df5.reset_index()
# aggfunc='sum' is the non-deprecated spelling of np.sum here; the result
# keeps the ('name', <flag>) column layout.
df5 = pd.pivot_table(df5, values=None, index=['location'], columns=['book_table'],
                     fill_value=0, aggfunc='sum')
df5

### Type of Restaurants and Rating

Buffets, Cafes, Pubs and bars, and Drinks & nightlife have a higher median rating and a narrower spread of ratings than Delivery, Desserts and Dine-out.

In [None]:
# Rating distribution for each listing category, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='listed_in(type)', y='rate', data=df, ax=ax)

### Distribution of Types of Restaurants

In [None]:
# Share of listings per category. The wedges carry only percentages; the
# category names are shown in the legend, in the same order as the wedges.
type_count = df['listed_in(type)'].value_counts()
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(x=type_count.values, labels=None, autopct='%1.1f%%',
       startangle=90, textprops={'size': 10, 'color': 'white'},
       pctdistance=0.9, radius=1)
# The title was an empty placeholder string; use the section heading so the
# figure is readable on its own.
fig.suptitle('Distribution of Types of Restaurants', fontsize=20)
fig.legend(type_count.index)

### Different types of restaurants (location-wise)

In [None]:
# Listings per (location, listing category) pair.
df6 = df.groupby(['location', 'listed_in(type)'])['name'].count()
# The CSV is still written so the notebook produces the same output file.
df6.to_csv('zomatodf6.csv')
# Flatten the MultiIndex in memory instead of re-reading the file just
# written: same columns (location, listed_in(type), name), no dependence on
# the working directory or on CSV type re-inference.
df6 = df6.reset_index()
# aggfunc='sum' is the non-deprecated spelling of np.sum here; the result
# keeps the ('name', <category>) column layout.
df6 = pd.pivot_table(df6, values=None, index=['location'], columns=['listed_in(type)'],
                     fill_value=0, aggfunc='sum')
df6

### No of votes (location wise)

In [None]:
# Total votes per location. A restaurant appears once per listing category in
# `df`, so identical (address, name, location, votes) rows are collapsed
# first. The original cell called drop_duplicates() without keeping the
# result, so every duplicate listing added its votes again.
df_votes = df[['address', 'name', 'location', 'votes']].drop_duplicates()
df7 = df_votes.groupby(['location'])['votes'].sum()
df7 = df7.to_frame()
df7.sort_values('votes', ascending=False).head(30)

### Cost Distribution and Rating

In [None]:
# Cost bands for the box plot. Contiguous edges give the right-closed bands
# (0, 500], (500, 1000], (1000, 2000], ... with no holes. The previous
# IntervalIndex started its bands at 501, 1001, ...; because intervals are
# open on the left, a cost of exactly 501 (or 1001, 2001, ...) matched no
# band and became NaN.
bins = [0, 500, 1000, 2000, 3000, 4000, 5000, 6000]
df['cost_cat'] = pd.cut(df['cost'], bins)


In [None]:
# Rating distribution per cost band, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(x="cost_cat", y="rate", data=df, ax=ax)
ax.set_xlabel('Cost', fontsize=16)
ax.set_ylabel('Rating', fontsize=16)
ax.set_title('Price and Distribution')
# The helper column is only needed for this plot, so remove it again.
df = df.drop(columns='cost_cat')

### rest_type and location

In [None]:
# Distinct raw rest_type strings (a row may list several types, comma-separated).
df['rest_type'].unique()

In [None]:
# Working copy with just the columns needed for the rest_type analysis.
df_rest = df.loc[:, ['name', 'location', 'rest_type']]

In [None]:
# Renumber the rows 0..n-1 so the positional loop further down can address
# them with .loc[i]. reset_index is the right tool: reindex(range(n)) looks
# labels up instead of renumbering, so after the earlier row drops it threw
# away every row whose old label was >= n and inserted an all-NaN row for
# every missing label below n.
df_rest = df_rest.reset_index(drop=True)

In [None]:
# Display-only: the de-duplicated frame is shown but not assigned, so
# `df_rest` itself still contains the duplicate rows.
df_rest.drop_duplicates()

In [None]:
# `a` accumulates every distinct rest type found in df_rest['rest_type'].
a = set()


def func(x):
    """Add the stripped items of ``x`` to the global set ``a`` when ``x`` is a list.

    Non-list values (the NaN that ``str.split`` yields for missing entries)
    are ignored.
    """
    if type(x) != list:
        return
    a.update(item.strip() for item in x)


b = df_rest['rest_type'].str.split(',').apply(func)
print(a)

In [None]:
# Empty frame with one column per rest type. The names are sorted because
# iterating a set of strings gives a different order from one kernel run to
# the next, which made the column order of every later table change between
# runs.
column_names = sorted(a)
rest_type = pd.DataFrame(columns=column_names)
rest_type

In [None]:
# One row per location (the group keys of df_rest), then make location the
# index and turn the still-empty cells into 0 so they can be used as counters.
rest_type['location'] = df_rest.groupby('location').groups.keys()
rest_type = rest_type.set_index('location').fillna(0)

In [None]:
# Count, per location, how many rows list each rest type.
# Each row is split into its comma-separated types and matched exactly.
# The previous substring test (`x in text`) counted a type whenever its name
# occurred inside a longer one (e.g. 'Cafe' inside 'Irani Cafee'), and it did
# a .loc read/write for every row x type combination.
pair_counts = {}
for location, raw_types in zip(df_rest['location'], df_rest['rest_type']):
    # Rows without a usable location or rest_type contribute nothing.
    if not isinstance(raw_types, str) or not isinstance(location, str):
        continue
    for x in {part.strip() for part in raw_types.split(',')}:
        if x in a:
            pair_counts[(location, x)] = pair_counts.get((location, x), 0) + 1

# Add the tallies to the existing counters, one write per (location, type).
for (location, x), count in pair_counts.items():
    rest_type.loc[location, x] = rest_type.loc[location, x] + count

#### No of rest_type (location wise)

In [None]:
# Location x rest-type count table.
rest_type

### No of cuisines offered and rating

In [None]:
def fun(x):
    """Return the number of items in ``x`` when it is a list, otherwise 0."""
    return len(x) if type(x) == list else 0


# Number of comma-separated cuisines per row (0 where the value is missing).
df['no_of_cuisine'] = df['cuisines'].str.split(',').apply(fun)

In [None]:
# Mean rating for each cuisine count.
df.groupby(['no_of_cuisine'])['rate'].mean()

In [None]:
# The helper column has served its purpose; remove it from the main frame.
df = df.drop(columns='no_of_cuisine')

### No of different cuisines available location wise

In [None]:
# Working copy with just the columns needed for the cuisine analysis.
df_cuisines = df.loc[:, ['name', 'location', 'cuisines']]

In [None]:
# Renumber the rows 0..n-1. reindex(range(n)) looks labels up instead of
# renumbering: after the earlier row drops it discarded rows whose old label
# was >= n and inserted all-NaN rows for the missing labels.
df_cuisines = df_cuisines.reset_index(drop=True)

In [None]:
# Collapse identical (name, location, cuisines) rows, then renumber 0..n-1 so
# the positional loop further down can address every remaining row.
# reset_index replaces reindex(range(n)), which kept only the rows whose old
# label happened to be below the new length and filled the gaps with NaN rows.
df_cuisines = df_cuisines.drop_duplicates().reset_index(drop=True)

In [None]:
# `a` accumulates every distinct cuisine found in df_cuisines['cuisines'].
a = set()


def func(x):
    """Add the stripped items of ``x`` to the global set ``a`` when ``x`` is a list.

    Non-list values (the NaN that ``str.split`` yields for missing entries)
    are ignored.
    """
    if type(x) != list:
        return
    a.update(item.strip() for item in x)


b = df_cuisines['cuisines'].str.split(',').apply(func)
print(a)

In [None]:
# Empty frame with one column per cuisine. The names are sorted because
# iterating a set of strings gives a different order from one kernel run to
# the next, which made the column order of every later table change between
# runs.
column_names = sorted(a)
cuisines_type = pd.DataFrame(columns=column_names)
cuisines_type

In [None]:
# One row per location (the group keys of df_cuisines), then make location the
# index and turn the still-empty cells into 0 so they can be used as counters.
cuisines_type['location'] = df_cuisines.groupby('location').groups.keys()
cuisines_type = cuisines_type.set_index('location').fillna(0)

In [None]:
# Count, per location, how many rows offer each cuisine.
# Each row is split into its comma-separated cuisines and matched exactly.
# The previous substring test (`x in text`) credited 'Indian' for every
# 'North Indian' or 'South Indian' row (likewise 'American', 'Asian', ...),
# inflating those totals, and it did a .loc read/write for every
# row x cuisine combination.
pair_counts = {}
for location, raw_cuisines in zip(df_cuisines['location'], df_cuisines['cuisines']):
    # Rows without a usable location or cuisine list contribute nothing.
    if not isinstance(raw_cuisines, str) or not isinstance(location, str):
        continue
    for x in {part.strip() for part in raw_cuisines.split(',')}:
        if x in a:
            pair_counts[(location, x)] = pair_counts.get((location, x), 0) + 1

# Add the tallies to the existing counters, one write per (location, cuisine).
for (location, x), count in pair_counts.items():
    cuisines_type.loc[location, x] = cuisines_type.loc[location, x] + count

In [None]:
# Location x cuisine count table.
cuisines_type

### cuisines and rate

In [None]:
# Rebuild `a` as the set of distinct cuisines in df_cuisines['cuisines'].
a = set()


def func(x):
    """Add the stripped items of ``x`` to the global set ``a`` when ``x`` is a list.

    Non-list values (the NaN that ``str.split`` yields for missing entries)
    are ignored.
    """
    if type(x) != list:
        return
    a.update(item.strip() for item in x)


b = df_cuisines['cuisines'].str.split(',').apply(func)
print(a)

In [None]:
# One row per distinct (name, location, rate, cuisines), numbered 0..n-1, plus
# one zero-initialised indicator column per cuisine.
# Fixes relative to the previous cell:
#   * drop_duplicates() was called without keeping its result;
#   * reindex(range(n)) was used to renumber rows, which instead discarded
#     rows with labels >= n and inserted all-NaN rows;
#   * fillna(0) over the whole frame also wrote 0 into missing text cells.
cuisines_rate = (
    df[['name', 'location', 'rate', 'cuisines']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Sorted so the column order is the same on every run (set order is not).
column_names = sorted(a)
cuisine_flags = pd.DataFrame(0, index=cuisines_rate.index, columns=column_names)
cuisines_rate = pd.concat([cuisines_rate, cuisine_flags], axis=1)

In [None]:
# Increment the indicator column of every cuisine a row actually lists.
# Each row is split into its comma-separated cuisines and matched exactly.
# The previous substring test (`x in text`) also flagged 'Indian' for a
# 'North Indian' row, and it probed every row x cuisine combination through
# .loc, which is very slow on a frame of this size.
for row_label, raw_cuisines in cuisines_rate['cuisines'].items():
    # Missing cuisine entries are not strings and are skipped.
    if not isinstance(raw_cuisines, str):
        continue
    for x in {part.strip() for part in raw_cuisines.split(',')}:
        if x in a:
            cuisines_rate.at[row_label, x] = cuisines_rate.at[row_label, x] + 1

In [None]:
# Per rating value, how many rows offer each cuisine; keep the 10 highest
# ratings and transpose so cuisines are rows and ratings are columns.
# The text columns are dropped before summing: on pandas >= 2, sum() no longer
# skips them silently, so they either raise or get string-concatenated and
# then break the numeric sort in the next cell.
cui_rate = (
    cuisines_rate
    .drop(columns=['name', 'location', 'cuisines'])
    .groupby('rate')
    .sum()
    .tail(10)
    .T
)

In [None]:
# Cuisines ordered by their count in the rating-4.0 column; this needs 4.0 to
# be among the ratings kept in cui_rate.
cui_rate.sort_values(4.0, ascending = False).head(50)

### listed_in (type) and location

In [None]:
# Distinct (name, location, category) rows, counted per (location, category).
df_listed_in = df[['name', 'location', 'listed_in(type)']].drop_duplicates()
df9 = df_listed_in.groupby(['location', 'listed_in(type)'])['name'].count()
# The CSV is still written so the notebook produces the same output file.
df9.to_csv('zomatodf9.csv')
# Flatten the MultiIndex in memory instead of re-reading the file just
# written: same columns (location, listed_in(type), name), no dependence on
# the working directory or on CSV type re-inference.
df9 = df9.reset_index()
# aggfunc='sum' is the non-deprecated spelling of np.sum here; the result
# keeps the ('name', <category>) column layout that later cells index into.
df9 = pd.pivot_table(df9, values=None, index=['location'], columns=['listed_in(type)'],
                     fill_value=0, aggfunc='sum')
df9

In [None]:
# For every listing category, the 10 locations with the most restaurants of
# that category, placed side by side (one column per category; NaN where a
# location is not in that category's top 10).
# The columns are collected in a list and concatenated once, instead of
# growing a frame with pd.concat inside the loop from an empty DataFrame.
top_locations_per_type = [
    df9[('name', rest)].sort_values(ascending=False).head(10).rename(rest)
    for rest in df['listed_in(type)'].unique()
]
df_loc_res_type = pd.concat(top_locations_per_type, axis=1)
df_loc_res_type

In [None]:
# Ten locations with the most 'Pubs and bars' listings.
df9[('name','Pubs and bars')].sort_values(ascending=False).head(10)


### Top Rated Restaurants

In [None]:
# Mean rating per restaurant name, highest first (top 101 shown).
df_top_rate = df.loc[:, ['name', 'location', 'rate', 'listed_in(type)']]
df_top_rate_avg = df_top_rate.groupby(['name'])['rate'].mean()
df_top_rate_avg.sort_values(ascending=False).iloc[:101]

In [None]:
# Mean rating per (name, listing category), held in a frame whose MultiIndex
# is sorted so that cross-sections can be taken from it.
df_top_rate = df[['name', 'location', 'rate', 'listed_in(type)']]
df_top_rate = df_top_rate.groupby(['name','listed_in(type)'])['rate'].mean()
df_top_rate=df_top_rate.to_frame()
df_top_rate_indexed = df_top_rate.sort_index()
# For each category, print the five best-rated names within that category.
for rest in df['listed_in(type)'].unique():
      print('--------', rest, '----------')
      # The key (slice(None), rest) selects every name at the first index level
      # and the current category at the second.
      print(df_top_rate_indexed.xs((slice(None), rest)).sort_values("rate", ascending = False).head())

### Top 20 Cuisines in Bangalore

In [None]:
# City-wide total per cuisine (column sums of the location x cuisine table),
# reshaped into a two-column frame.
cuisine_data = pd.DataFrame(cuisines_type.sum(axis=0))
cuisine_data.reset_index(inplace=True)
cuisine_data.columns = ['Cuisines', 'Number of Resturants']

# Later cells read Top20_cuisines['Cuisines'], so the frame and its column
# names are kept as they were.
Top20_cuisines= (cuisine_data.sort_values(['Number of Resturants'],ascending=False)).head(20)

# x and y are passed by keyword: since seaborn 0.12 only `data` may be
# positional, and barplot(x_series, y_series) raises a TypeError.
sns.barplot(x=Top20_cuisines['Cuisines'], y=Top20_cuisines['Number of Resturants'])
plt.xlabel('Cuisines', fontsize=20)
plt.ylabel('Number of Resturants', fontsize=20)
plt.title('Top 20 Cuisines', fontsize=30)
plt.xticks(rotation = 90)
plt.show()

### Top 20 Rest Type

In [None]:
# City-wide total per rest type (column sums of the location x rest-type
# table), reshaped into a two-column frame.
rest_data = pd.DataFrame(rest_type.sum(axis=0))
rest_data.reset_index(inplace=True)
rest_data.columns = ['rest_type', 'Number of Resturants']

# Later cells read Top20_rest['rest_type'], so the frame and its column names
# are kept as they were.
Top20_rest= (rest_data.sort_values(['Number of Resturants'],ascending=False)).head(20)

# x and y are passed by keyword: since seaborn 0.12 only `data` may be
# positional, and barplot(x_series, y_series) raises a TypeError.
sns.barplot(x=Top20_rest['rest_type'], y=Top20_rest['Number of Resturants'])
plt.xlabel('rest_type', fontsize=20)
plt.ylabel('Number of Resturants', fontsize=20)
plt.title('Top 20 rest_type on Zomato', fontsize=30)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Names of the 20 most common rest types.
Top20_rest['rest_type']

### Table with Cost, Votes and New Restaurant Percentage (location wise)

In [None]:
# Join the new-restaurant share, the vote totals and the average costs on
# location, then show the 20 locations with the most votes.
df8 = pd.concat([df_loc_new, df7,df_cost], axis = 1)
df8.sort_values('votes', ascending= False).head(20)

We can see that Church Street, St. Marks Road and Cunningham Road have the lowest share of new restaurants among the locations with the most votes (used here as a proxy for footfall).

## More on St. Marks Road

In [None]:
# All listings located on St. Marks Road.
on_st_marks_road = df['location'] == 'St. Marks Road'
df_st_marks_road = df.loc[on_st_marks_road]
df_st_marks_road

In [None]:
# The cell used to define `func` twice (the second silently replacing the
# first) and ran Series.apply only for its side effect on global sets. The
# two sets are now built directly; missing (non-string) entries are skipped
# exactly as before.

# Cuisines offered by at least one St. Marks Road listing.
c = {
    item.strip()
    for entry in df_st_marks_road['cuisines']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(c)

# Cuisines offered anywhere in the city-wide cuisine table.
a = {
    item.strip()
    for entry in df_cuisines['cuisines']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(a)

# Cuisines absent from St. Marks Road, narrowed to the city's top 20.
smr_missing_cuisines = a.difference(c)
smr_opp = smr_missing_cuisines.intersection(Top20_cuisines['Cuisines'])
print(smr_opp)

Pizza, Bakery and Indian are missing from St Marks Road.

In [None]:
# The cell used to define `func` twice (the second silently replacing the
# first) and ran Series.apply only for its side effect on global sets. The
# two sets are now built directly; missing (non-string) entries are skipped
# exactly as before.

# Rest types present on St. Marks Road.
c = {
    item.strip()
    for entry in df_st_marks_road['rest_type']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(c)

# Rest types present anywhere in the city-wide rest_type table.
a = {
    item.strip()
    for entry in df_rest['rest_type']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(a)

# Rest types absent from St. Marks Road, narrowed to the city's top 20.
smr_missing_rest = a.difference(c)
smr_opp_rest = smr_missing_rest.intersection(Top20_rest['rest_type'])
print(smr_opp_rest)

'Beverage Shop', 'Takeaway', 'Kiosk', 'Sweet Shop', 'Microbrewery', 'Club', 'Fine Dining', 'Food Truck', 'Food Court', 'Confectionery', 'Bakery', 'Delivery', 'Mess' are not present on St. Marks Road.

## More about Church Street

In [None]:
# All listings located on Church Street.
on_church_street = df['location'] == 'Church Street'
df_church_street = df.loc[on_church_street]
df_church_street

In [None]:
# The cell used to define `func` twice (the second silently replacing the
# first) and ran Series.apply only for its side effect on global sets. The
# two sets are now built directly; missing (non-string) entries are skipped
# exactly as before.

# Cuisines offered by at least one Church Street listing.
c = {
    item.strip()
    for entry in df_church_street['cuisines']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(c)

# Cuisines offered anywhere in the city-wide cuisine table.
a = {
    item.strip()
    for entry in df_cuisines['cuisines']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(a)

# Cuisines absent from Church Street, narrowed to the city's top 20.
cs_missing_cuisines = a.difference(c)
cs_opp = cs_missing_cuisines.intersection(Top20_cuisines['Cuisines'])
print(cs_opp)

Bakery and Indian are missing from Church Street.

In [None]:
# The cell used to define `func` twice (the second silently replacing the
# first) and ran Series.apply only for its side effect on global sets. The
# two sets are now built directly; missing (non-string) entries are skipped
# exactly as before.

# Rest types present on Church Street.
c = {
    item.strip()
    for entry in df_church_street['rest_type']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(c)

# Rest types present anywhere in the city-wide rest_type table.
a = {
    item.strip()
    for entry in df_rest['rest_type']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(a)

# Rest types absent from Church Street, narrowed to the city's top 20.
cs_missing_rest = a.difference(c)
cs_opp_rest = cs_missing_rest.intersection(Top20_rest['rest_type'])
print(cs_opp_rest)

'Takeaway', 'Kiosk', 'Sweet Shop', 'Microbrewery', 'Club', 'Fine Dining', 'Food Truck', 'Food Court', 'Confectionery', 'Bakery', 'Delivery', 'Mess' are not present on Church Street.

## More on Cunningham Road

In [None]:
# All listings located on Cunningham Road.
on_cunningham_road = df['location'] == 'Cunningham Road'
df_cun_road = df.loc[on_cunningham_road]
df_cun_road

In [None]:
# The cell used to define `func` twice (the second silently replacing the
# first) and ran Series.apply only for its side effect on global sets. The
# two sets are now built directly; missing (non-string) entries are skipped
# exactly as before.

# Cuisines offered by at least one Cunningham Road listing.
c = {
    item.strip()
    for entry in df_cun_road['cuisines']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(c)

# Cuisines offered anywhere in the city-wide cuisine table.
a = {
    item.strip()
    for entry in df_cuisines['cuisines']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(a)

# Cuisines absent from Cunningham Road, narrowed to the city's top 20.
cr_missing_cuisines = a.difference(c)
cr_opp = cr_missing_cuisines.intersection(Top20_cuisines['Cuisines'])
print(cr_opp)

Ice Cream and Indian are missing from Cunningham Road.

In [None]:
# The cell used to define `func` twice (the second silently replacing the
# first) and ran Series.apply only for its side effect on global sets. The
# two sets are now built directly; missing (non-string) entries are skipped
# exactly as before.

# Rest types present on Cunningham Road.
c = {
    item.strip()
    for entry in df_cun_road['rest_type']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(c)

# Rest types present anywhere in the city-wide rest_type table.
a = {
    item.strip()
    for entry in df_rest['rest_type']
    if isinstance(entry, str)
    for item in entry.split(',')
}
print(a)

# Rest types absent from Cunningham Road, narrowed to the city's top 20.
cr_missing_rest = a.difference(c)
cr_opp_rest = cr_missing_rest.intersection(Top20_rest['rest_type'])
print(cr_opp_rest)

'Takeaway', 'Kiosk', 'Sweet Shop', 'Microbrewery', 'Club', 'Food Truck', 'Food Court', 'Confectionery', 'Mess' are not present on Cunningham Road.

## More on Kiosk Rest Type

There are 192 kiosks in Bangalore with the median cost for 2(approx) of rupees 255.

In [None]:
# Flag the rows whose rest_type text mentions 'Kiosk' (NaN where rest_type is
# missing), then keep the flagged rows with exact duplicate rows removed.
df['kiosk'] = df['rest_type'].str.contains(pat='Kiosk')
is_kiosk = df['kiosk'] == True
df_kiosk = df[is_kiosk].drop_duplicates()

In [None]:
# Locations that have at least one kiosk, with their kiosk count and the
# boolean helper column 'TF'.
rest_kiosk = rest_type['Kiosk'].to_frame()
rest_kiosk['TF'] = rest_kiosk['Kiosk'] > 0
has_kiosk = rest_kiosk['TF'] == True
rest_kiosk = rest_kiosk[has_kiosk]
rest_kiosk

In [None]:
# Summary statistics of the cost for two across kiosk listings.
kiosk_cost = df_kiosk.cost
kiosk_cost_summary = (
    ('Average Cost for 2 in Kiosks in Bangalore: ', kiosk_cost.mean()),
    ('Median Cost for 2 in Kiosks in Bangalore: ', kiosk_cost.median()),
    ('Max. Cost for 2 in Kiosks in Bangalore: ', kiosk_cost.max()),
    ('Min. Cost for 2 in Kiosks in Bangalore: ', kiosk_cost.min()),
)
for label, value in kiosk_cost_summary:
    print(label, value)

### Please Up Vote if you liked!!!!!