In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Standard library
import os

# Third-party analysis and plotting stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# List what is available under the input directory.
print(os.listdir("../input"))

# Use seaborn's dark grid theme for the plots that follow.
sns.set_style('darkgrid')

In [None]:
# Read the Zomato CSV, decoding it as latin-1, and preview the first rows.
df = pd.read_csv(
    r'../input/zomato-restaurants-data/zomato.csv',
    encoding="latin-1",
)
df.head()

In [None]:
# Print each column label of the loaded frame on its own line.
for column_name in df.columns:
    print(column_name)

In [None]:
# Report the (rows, columns) tuple of the loaded frame.
print(f"Shape of the dataset {df.shape}")

# Exploratory Data Analysis


In [None]:
# Our dataset has 21 columns and 9551 rows.
# Keep only the rows whose 'Country Code' equals 1
# (presumably India, going by the variable name).
india_mask = df['Country Code'] == 1
zomato_india = df[india_mask]
zomato_india.head()

In [None]:
# Number of rows per city in the country-filtered frame, most frequent first.
zomato_india['City'].value_counts()

The dataset is highly skewed toward the cities that make up Delhi-NCR. We therefore group New Delhi, Ghaziabad, Noida, Gurgaon and Faridabad together as Delhi-NCR and treat all remaining cities as Rest of India. This turns the analysis into Delhi-NCR vs. Rest of India.

In [None]:
# Cities grouped together as Delhi-NCR. Ghaziabad is included to match the
# grouping described in the notebook text; the original filter left it out.
ncr_cities = ['New Delhi', 'Gurgaon', 'Faridabad', 'Noida', 'Ghaziabad']

# .copy() gives an independent frame, so later column drops and assignments
# on zomato_ncr do not trigger pandas' SettingWithCopyWarning.
zomato_ncr = zomato_india[zomato_india['City'].isin(ncr_cities)].copy()
zomato_ncr.head()

In [None]:
# Drop the columns that are not needed for the NCR analysis.
# Rebinding the name (instead of inplace=True) avoids mutating a filtered slice,
# and errors='ignore' lets the cell be re-run after the columns are already gone.
zomato_ncr = zomato_ncr.drop(columns=['Country Code', 'Currency'], errors='ignore')

In [None]:
# Display (rows, columns) of the NCR frame after the column drop.
zomato_ncr.shape # now our data has 19 columns

In [None]:
# Column names, non-null counts and dtypes of the NCR frame.
zomato_ncr.info()

In [None]:
# Summary statistics for the NCR frame.
zomato_ncr.describe()

In [None]:
# Count the missing entries in every column
zomato_ncr.isna().sum()

In [None]:
# Visual check for missing values: render the null mask as a heatmap
null_mask = zomato_ncr.isnull()
sns.heatmap(
    null_mask,
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)
# No null values in the dataset

In [None]:
# Encode the Yes/No flag columns as integers
boolean_columns = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']

# Mapping from the raw labels to their numeric codes
encoding = {"Yes" : 1,
            "No" : 0}

# Values missing from the mapping (e.g. the 0/1 left by an earlier run of this
# cell) pass through unchanged, so re-running the cell no longer raises KeyError.
for col in boolean_columns:
    zomato_ncr[col] = zomato_ncr[col].map(lambda value: encoding.get(value, value))

In [None]:
# Display the flag columns listed in boolean_columns to inspect the encoded values.
zomato_ncr[boolean_columns]

In [None]:
# Restaurants with at least 7 outlets (the threshold is inclusive).
# value_counts() is computed once and filtered directly; the resulting dict maps
# restaurant name -> outlet count, ordered from most to fewest outlets.
outlet_counts = zomato_ncr['Restaurant Name'].value_counts()
more_than_7 = outlet_counts[outlet_counts >= 7].to_dict()
print("Restaurants with 7 or more outlets in NCR: {}".format(len(more_than_7)))

In [None]:
# Remove the row limit for displayed frames/series. This is a global pandas
# option, so it also affects every later output in the notebook.
pd.set_option('display.max_rows', None)
# Preview the first three rows of the NCR frame.
zomato_ncr.head(3)

# Data Visualisation

In [None]:
# Visualisation of top 5 restaurants in ncr
top_outlet_counts = zomato_ncr['Restaurant Name'].value_counts().head(5)
top_outlet_counts.plot(kind='bar', color='black', figsize=(10,10))
plt.xlabel('Name of restaurant', color='g', fontsize = 18)
plt.ylabel('Amount of each restaurant', color='g', fontsize = 18)
plt.title('Top 5 Restaurants with most outlets in Delhi NCR', color = 'g', fontsize = 25)
# plt.show() renders this chart; the original plt.figure() opened a second,
# empty figure instead.
plt.show()

In [None]:
# Histogram of the mean aggregate rating of each distinct 'Cuisines' value
mean_rating_by_cuisine = zomato_ncr.groupby('Cuisines')['Aggregate rating'].mean()
mean_rating_by_cuisine.plot.hist(orientation='vertical', color='pink', figsize=(7,7))
plt.xlabel('Mean aggregate rating')
plt.title('Distribution of mean aggregate rating per cuisine')
# plt.show() renders this chart; the original plt.figure() opened a second,
# empty figure instead.
plt.show()

In [None]:
# 1 means the restaurant offers table booking, 0 means it does not
plt.figure(figsize = (5,5))
# Name the column through x= with data=: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positionally passed Series is no longer
# counted per category.
sns.countplot(x='Has Table booking', data=zomato_ncr, palette="Set3")
plt.show()

In [None]:
# 1 means the restaurant offers online delivery, 0 means it does not
plt.figure(figsize = (5,5))
# Name the column through x= with data=: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positionally passed Series is no longer
# counted per category.
sns.countplot(x='Has Online delivery', data=zomato_ncr, palette="pastel")
plt.show()

# Restaurants with more than 7 outlets

In [None]:
# Bar chart of outlet counts for the restaurants collected in more_than_7
outlet_names = more_than_7.keys()
outlet_totals = more_than_7.values()

plt.figure(figsize=(22, 11))
plt.bar(x=outlet_names, height=outlet_totals, color='royalblue')
plt.title('Restaurants having more than 7 outlets in Delhi NCR', fontsize=25)
plt.ylabel('Number of outlets', fontsize=19)
plt.xlabel('Restaurants', fontsize=18)
# Fixed tick positions on the count axis; restaurant names are rotated to stay legible
plt.yticks(np.arange(0, 90, 2))
plt.xticks(rotation=-90)
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.7)
plt.show()

In [None]:
def seven_or_more(data, column, source=None):
    """Append to ``data`` every row of ``source`` for the given restaurants.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame the matching rows are appended to (typically an empty frame).
    column : iterable of str
        Restaurant names to collect. Rows are appended one restaurant at a
        time, in the order the names are given.
    source : pandas.DataFrame, optional
        Frame to search; it must have a 'Restaurant Name' column. Defaults to
        the notebook-level ``zomato_ncr`` frame, which is what the original
        implementation always used.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by the matching rows. ``data`` itself is returned
        when ``column`` is empty.
    """
    if source is None:
        source = zomato_ncr

    # Collect the per-restaurant slices first and concatenate once, instead of
    # re-copying the growing frame on every loop iteration.
    matches = [source[source['Restaurant Name'] == name] for name in column]
    if not matches:
        return data
    return pd.concat([data, *matches], axis=0)

In [None]:
# Gather the rows of every restaurant listed in more_than_7, starting from an empty frame
chain_names = list(more_than_7.keys())
seven_or_more_outlets = seven_or_more(pd.DataFrame(None), chain_names)
seven_or_more_outlets.shape

In [None]:
# Number of outlets per city among the restaurants in seven_or_more_outlets
plt.figure(figsize = (12,10))
# Name the column through x= with data=: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positionally passed Series is no longer
# counted per category.
sns.countplot(x='City', data=seven_or_more_outlets, palette="Set3")
plt.yticks(np.arange(0,950,50))
plt.show()

# New Delhi Data

In [None]:
# analysis on new delhi data
# A boolean mask replaces groupby(...).get_group(...) for this single-city
# filter, and .copy() makes delhi_data an independent frame so it can be
# modified later without a SettingWithCopyWarning.
delhi_data = seven_or_more_outlets[seven_or_more_outlets['City'] == 'New Delhi'].copy()
delhi_data.head()

In [None]:
# Compare the 'Locality' and 'Locality Verbose' columns side by side.
delhi_data[['Locality','Locality Verbose']].head()

In [None]:
# Drop 'Locality Verbose'. Rebinding the name (instead of inplace=True) avoids
# mutating a filtered slice, and errors='ignore' lets the cell be re-run after
# the column is already gone.
delhi_data = delhi_data.drop(columns='Locality Verbose', errors='ignore')

In [None]:
# Ten localities with the most rows in delhi_data, sorted by count in descending order.
delhi_data['Locality'].value_counts().sort_values(ascending=False).head(10)

In [None]:
# Count restaurants per locality once, sorted ascending for the horizontal bars
locality_counts = delhi_data['Locality'].value_counts().sort_values()

plt.figure(figsize=(12, 30))
plt.barh(locality_counts.index, locality_counts, color='royalblue')
plt.title('Distribution of Restaurants in New Delhi Localities', fontsize=30)
plt.xlabel('Count', fontsize=18)
plt.ylabel('New Delhi Localities', fontsize=18)
# One tick per unit on the count axis
plt.xticks(np.arange(0, 26, 1))
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.7)
plt.show()

1. Posh areas have a higher number of outlets.
2. South Delhi areas have more outlets than other parts of Delhi.

In [None]:
# Count rows per 'Cuisines' value once, sorted ascending for the horizontal bars
cuisine_counts = delhi_data['Cuisines'].value_counts().sort_values()

plt.figure(figsize=(15, 23))
plt.barh(cuisine_counts.index, cuisine_counts, color='royalblue')
plt.title('Visualising Popularity of various cuisines in Delhi', fontsize=40)
plt.xlabel("Count", fontsize=30)
plt.ylabel("Cuisines", fontsize=30)
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.7)
plt.show()

Fast Food is the most popular cuisine.

In [None]:
# Creating competitor data: names of restaurants whose 'Cuisines' text
# mentions at least one of the cuisines below
competitor_cuisines = ('Mughlai', 'Fast Food', 'American', 'Pizza', 'Burger', 'Biryani')
competitor = {
    name
    for name, served in zip(delhi_data['Restaurant Name'], delhi_data['Cuisines'])
    if any(tag in served for tag in competitor_cuisines)
}
competitor

In [None]:
# Select every row belonging to a competitor restaurant in one vectorised pass.
# The original concatenated one slice per name while iterating a set, which
# made the row order vary between runs and re-copied the frame on every step.
competitor_data = delhi_data[delhi_data['Restaurant Name'].isin(competitor)]

In [None]:
# Group the competitor data by restaurant name and then visualise the mean of average cost
# Select the numeric columns before aggregating: on pandas >= 2.0, calling
# .mean() on the whole grouped frame raises TypeError because of the text columns.
competitor_metrics = ['Average Cost for two', 'Price range', 'Aggregate rating', 'Votes']
mean_competitor_data = competitor_data.groupby('Restaurant Name')[competitor_metrics].mean()
mean_competitor_data.hist(figsize=(12,9))
plt.show()

In [None]:
# Let's see if there is any correlation in the competitor data
# Restrict to numeric/boolean columns first: on pandas >= 2.0, corr() no longer
# drops text columns silently and raises when it meets them.
numeric_competitor_data = (
    competitor_data
    .drop('Switch to order menu', axis = 1)
    .select_dtypes(include=['number', 'bool'])
)
sns.heatmap(numeric_competitor_data.corr())
plt.show()

1. The average cost for two at my competitors is mostly between INR 200 and 800.
2. The price range is between 1 and 3, with most of them at 1.
3. There is a positive correlation between Average Cost for two and both Has Online delivery and Has Table booking.