In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Zomato Data Set Analysis & Visualization

In [28]:
# Importing Libraries

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib to its built-in dark theme for the figures drawn below.
plt.style.use('dark_background')

In [30]:
#Reading CSV

In [31]:
# Load the Zomato CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/dataset/zomato.csv")
# Preview the first rows of the raw frame.
df.head()

In [32]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

In [33]:
# List the column labels of the loaded frame.
df.columns

In [34]:
# Discard the columns that none of the cleaning or plotting cells below use.
df = df.drop(
    columns=[
        'url',
        'address',
        'phone',
        'menu_item',
        'dish_liked',
        'reviews_list',
    ]
)
df.head()

In [35]:
# Summarise column dtypes and non-null counts after dropping the unused columns.
df.info()

### Dropping Duplicates

In [36]:
# Remove exact duplicate rows, then report the resulting (rows, columns).
df = df.drop_duplicates()
df.shape

### Cleaning Rate Column

In [37]:
# Inspect the distinct raw values of the rate column before cleaning it.
df['rate'].unique()

### Removing "NEW", "-" and "/5" from the Rate Column

In [38]:
def handlerate(value):
    """Convert a raw rating entry into a float score.

    The text before the first ``'/'`` is parsed as the score, so an entry
    such as ``'4.1/5'`` becomes ``4.1``. The placeholders ``'NEW'`` and
    ``'-'`` carry no score and map to NaN.

    Parameters
    ----------
    value : object
        Raw cell from the ``rate`` column (string, number, or NaN).

    Returns
    -------
    float
        The parsed score, or ``np.nan`` when the entry is missing, a
        placeholder, empty, or otherwise not numeric.
    """
    # Keep only the part before the first '/', ignoring surrounding whitespace.
    score = str(value).split('/')[0].strip()
    # Match placeholders case-insensitively so ' new ' and 'NEW' agree.
    if score.upper() in ('NEW', '-', ''):
        return np.nan
    try:
        return float(score)
    except ValueError:
        # Any other non-numeric text is treated as a missing rating instead of
        # aborting the whole column conversion.
        return np.nan
    
# Run the cleaner on every entry so the column holds float scores (NaN where
# handlerate found no rating), then preview the result.
df['rate'] = df['rate'].apply(handlerate)
df['rate'].head()

In [39]:
# Count how many ratings are missing after cleaning.
df.rate.isnull().sum()

### Filling Null in Rate Column with Mean

In [43]:
# Replace missing ratings with the column mean.
# The filled column is assigned back to `df` instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form
# triggers a warning in newer pandas and does not modify `df` at all once
# Copy-on-Write is enabled.
df['rate'] = df['rate'].fillna(df['rate'].mean())
# Confirm how many missing ratings remain.
df['rate'].isnull().sum()

In [44]:
# Re-check dtypes and non-null counts now that the rate column has been filled.
df.info()

### Dropping Null  Values

In [45]:
# Keep only the rows that have no missing value in any remaining column.
df = df.dropna()
df.head()

In [46]:
# Give the two long column labels short names that are easier to reference.
df = df.rename(
    columns={
        'approx_cost(for two people)': 'Cost2plates',
        'listed_in(type)': 'Type',
    }
)
df.head()

In [47]:
# Distinct values of the location column.
df['location'].unique()

In [48]:
# Distinct values of the listed_in(city) column, to compare against location.
df['listed_in(city)'].unique()

### Both listed_in(city) and location are present; let's keep only one.

In [50]:
# location is kept as the geographic column, so listed_in(city) is removed.
df = df.drop(columns=['listed_in(city)'])

In [52]:
# Distinct raw cost values, shown before the column is converted to numbers.
df['Cost2plates'].unique()

### Removing "," from the Cost2plates Column

In [53]:
def handlecomma(value):
    """Return ``value`` as a float after removing every comma from its text.

    The input is converted to a string first, so non-string inputs are
    accepted as long as their text form parses as a float once commas are
    stripped (e.g. ``'1,200'`` -> ``1200.0``, ``300`` -> ``300.0``).
    """
    # str.replace leaves text without commas untouched, so no branch is needed.
    digits = str(value).replace(',', '')
    return float(digits)
# Convert every cost entry to float, then list the distinct values to confirm.
df['Cost2plates'] = df['Cost2plates'].apply(handlecomma)
df['Cost2plates'].unique()

In [54]:
# Preview the frame after the cost conversion.
df.head()

### Cleaning Rest Type Column

In [60]:
# Number of rows per restaurant type (value_counts sorts most frequent first).
df['rest_type'].value_counts()

In [56]:
# Keep the per-type row counts, sorted from most to least frequent.
rest_types = df['rest_type'].value_counts(ascending = False)
rest_types

In [58]:
# Restaurant types that occur in fewer than 1000 rows (indexed by type label).
rest_types_lessthan1000 = rest_types[rest_types<1000]
rest_types_lessthan1000

### Grouping Rest Types with Frequency Below 1000 as "Others"

In [63]:
def handle_rest_type(value):
    """Map a restaurant type to ``'Others'`` when it is one of the rare types.

    A type is rare when it is a label of the module-level
    ``rest_types_lessthan1000`` Series; any other value is returned unchanged.
    """
    # `in` on a pandas Series tests membership in its index (the type labels),
    # not in its values (the counts).
    return 'Others' if value in rest_types_lessthan1000 else value
# Collapse the rare types into 'Others', then re-count to check the grouping.
df['rest_type'] = df['rest_type'].apply(handle_rest_type)
df['rest_type'].value_counts()

### Cleaning Location Column

In [65]:
# Number of rows per location (value_counts sorts most frequent first).
df['location'].value_counts()

In [68]:
# Row counts per location, sorted from most to least frequent.
location = df['location'].value_counts(ascending = False)
# Locations that occur in fewer than 300 rows (indexed by location label).
location_lessthan300 = location[location<300]

def handle_location(value):
    """Map a location to ``'others'`` when it is one of the rare locations.

    A location is rare when it is a label of the module-level
    ``location_lessthan300`` Series; any other value is returned unchanged.
    """
    # Membership on a pandas Series is checked against its index labels.
    if value not in location_lessthan300:
        return value
    return 'others'
# Collapse the rare locations into 'others', then re-count to check the grouping.
df['location'] = df['location'].apply(handle_location)
df['location'].value_counts()

### Cleaning Cuisines Column

In [73]:
# Row counts per distinct cuisines value, sorted from most to least frequent.
cuisines = df['cuisines'].value_counts(ascending = False)

# Cuisines values that occur in fewer than 100 rows (indexed by label).
cuisines_lessthan100 = cuisines[cuisines<100]

def handle_cuisines(value):
    """Map a cuisines entry to ``'others'`` when it is one of the rare entries.

    An entry is rare when it is a label of the module-level
    ``cuisines_lessthan100`` Series; any other value is returned unchanged.
    """
    # Membership on a pandas Series is checked against its index labels.
    is_rare = value in cuisines_lessthan100
    return 'others' if is_rare else value
    
# Collapse the rare cuisines entries into 'others', then re-count to check the grouping.
df['cuisines'] = df['cuisines'].apply(handle_cuisines) 
df['cuisines'].value_counts()
        

In [74]:
# Preview the frame once all cleaning steps have run.
df.head()

## Data cleaning is done; let's work on data visualization

### Count Plot of Various Locations

In [75]:
# Bar chart of the number of restaurants in each (grouped) location.
plt.figure(figsize=(16, 10))
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
ax = sns.countplot(x='location', data=df)
# Rotate the location labels so they do not overlap.
plt.xticks(rotation=90)

### Visualizing Online Order

In [76]:
# Count of restaurants for each online_order value.
plt.figure(figsize=(6, 6))
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x='online_order', data=df, palette='inferno')

### Visualizing Book Table

In [77]:
# Count of restaurants for each book_table value.
plt.figure(figsize=(6, 6))
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x='book_table', data=df, palette='rainbow')

### Visualizing Online Order Vs Rate

In [85]:
# Rating distribution split by the online_order value.
plt.figure(figsize=(6, 6))
sns.boxplot(data=df, x='online_order', y='rate')

### Visualizing Book Table Vs Rate

In [87]:
# Rating distribution split by the book_table value.
plt.figure(figsize=(6, 6))
sns.boxplot(data=df, x='book_table', y='rate')

### Visualizing Online Order Facility, Location Wise

In [90]:
# Count restaurants for every (location, online_order) pair.
online_counts = df.groupby(['location', 'online_order'])['name'].count()
# The CSV export is an existing side effect of this cell and is kept, but the
# file is no longer read back to build the pivot input.
online_counts.to_csv('location_online.csv')
# reset_index() turns the grouped Series into location / online_order / name
# columns in memory. Re-parsing the CSV could silently alter labels, because
# read_csv interprets strings such as "NA" as missing values.
df1 = pd.pivot_table(
    online_counts.reset_index(),
    values=None,
    index=['location'],
    columns=['online_order'],
    fill_value=0,
    # String alias: pandas 2.1+ warns when np.sum is passed and recommends "sum".
    aggfunc='sum',
)
df1

In [91]:
# Grouped bar chart: one group per location, one bar per online_order value.
df1.plot(kind = 'bar', figsize = (15,8))

### Visualizing Book Table Facility, Location Wise

In [92]:
# Count restaurants for every (location, book_table) pair.
book_table_counts = df.groupby(['location', 'book_table'])['name'].count()
# The CSV export is an existing side effect of this cell and is kept, but the
# file is no longer read back to build the pivot input.
book_table_counts.to_csv('location_book_table.csv')
# reset_index() turns the grouped Series into location / book_table / name
# columns in memory. Re-parsing the CSV could silently alter labels, because
# read_csv interprets strings such as "NA" as missing values.
df2 = pd.pivot_table(
    book_table_counts.reset_index(),
    values=None,
    index=['location'],
    columns=['book_table'],
    fill_value=0,
    # String alias: pandas 2.1+ warns when np.sum is passed and recommends "sum".
    aggfunc='sum',
)
df2

In [93]:
# Grouped bar chart: one group per location, one bar per book_table value.
df2.plot(kind = 'bar', figsize = (15,8))

### Visualizing Types of Restaurants Vs Rate

In [94]:
# Rating distribution for each value of the Type column.
plt.figure(figsize=(14, 8))
sns.boxplot(data=df, x='Type', y='rate', palette='inferno')

### Grouping Types of Restaurants, Location Wise

In [97]:
# Count restaurants for every (location, Type) pair.
type_counts = df.groupby(['location', 'Type'])['name'].count()
# The CSV export is an existing side effect of this cell and is kept, but the
# file is no longer read back to build the pivot input.
type_counts.to_csv('location_Type.csv')
# reset_index() turns the grouped Series into location / Type / name columns
# in memory. Re-parsing the CSV could silently alter labels, because read_csv
# interprets strings such as "NA" as missing values.
df3 = pd.pivot_table(
    type_counts.reset_index(),
    values=None,
    index=['location'],
    columns=['Type'],
    fill_value=0,
    # String alias: pandas 2.1+ warns when np.sum is passed and recommends "sum".
    aggfunc='sum',
)
df3

In [98]:
# Grouped bar chart: one group per location, one bar per Type value.
df3.plot(kind = 'bar', figsize = (15,8))

### No. of Votes, Location Wise

In [101]:
# Total votes per location, largest first.
df4 = df[['location', 'votes']]
# No de-duplication is performed here: dropping repeated (location, votes)
# pairs would discard distinct restaurants that merely share a vote count and
# understate the totals.
df5 = (
    df4.groupby(['location'])['votes']
    .sum()
    .to_frame()
    .sort_values('votes', ascending=False)
)
df5.head()

In [102]:
# Bar chart of total votes per location, in the sorted order of df5.
plt.figure(figsize=(15, 8))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally and reject two positional data vectors.
sns.barplot(x=df5.index, y=df5['votes'])
# Rotate the location labels so they do not overlap.
plt.xticks(rotation=90)


In [103]:
# Preview the cleaned frame again before the cuisine analysis.
df.head()

### Visualizing Top Cuisines

In [105]:
# Total votes per cuisines value, largest first.
df6 = df[['cuisines', 'votes']]
# No de-duplication is performed here: dropping repeated (cuisines, votes)
# pairs would discard distinct restaurants that merely share a vote count and
# understate the totals.
df7 = (
    df6.groupby(['cuisines'])['votes']
    .sum()
    .to_frame()
    .sort_values('votes', ascending=False)
)
df7.head()

In [106]:
# Remove the catch-all 'others' bucket so only real cuisine labels are ranked.
# Dropping by label keeps this cell idempotent; slicing off the first row by
# position would remove one more cuisine every time the cell is re-run.
# NOTE(review): assumes the top-voted row that was previously removed by
# position is the 'others' bucket; confirm against the df7.head() output.
df7 = df7.drop(index='others', errors='ignore')
df7.head()

In [107]:
# Bar chart of total votes per cuisines value, in the sorted order of df7.
plt.figure(figsize=(15, 8))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally and reject two positional data vectors.
sns.barplot(x=df7.index, y=df7['votes'])
# Rotate the cuisine labels so they do not overlap.
plt.xticks(rotation=90)

### END 