In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "svg"


In [2]:
raw_df = pd.read_csv('Zomato Chennai Listing 2020.csv', delimiter=',')

In [3]:
print(raw_df.head())

                                          Zomato URL     Name of Restaurant  \
0  https://www.zomato.com/chennai/yaa-mohaideen-b...  Yaa Mohaideen Briyani   
1  https://www.zomato.com/chennai/sukkubhai-biriy...     Sukkubhai Biriyani   
2  https://www.zomato.com/chennai/ss-hyderabad-bi...   SS Hyderabad Biryani   
3        https://www.zomato.com/chennai/kfc-perambur                    KFC   
4  https://www.zomato.com/chennai/tasty-kitchen-p...          Tasty Kitchen   

                                             Address     Location  \
0          336 & 338, Main Road, Pallavaram, Chennai   Pallavaram   
1   New 14, Old 11/3Q, Railway Station Road, MKN ...      Alandur   
2   98/339, Arcot Road, Opposite Gokulam Chit Fun...  Kodambakkam   
3   10, Periyar Nagar, 70 Feet Road, Near Sheeba ...     Perambur   
4   135B, SRP Colony, Peravallur, Near Perambur, ...     Perambur   

                                             Cuisine  \
0                                        ['Biryani']  

In [4]:
print(raw_df.shape)

(12032, 12)


In [5]:
print(raw_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12032 entries, 0 to 12031
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Zomato URL             12032 non-null  object 
 1   Name of Restaurant     12032 non-null  object 
 2   Address                12032 non-null  object 
 3   Location               12032 non-null  object 
 4   Cuisine                12032 non-null  object 
 5   Top Dishes             12032 non-null  object 
 6   Price for 2            12032 non-null  float64
 7   Dining Rating          12032 non-null  object 
 8   Dining Rating Count    12032 non-null  object 
 9   Delivery Rating        12032 non-null  object 
 10  Delivery Rating Count  12032 non-null  object 
 11  Features               12032 non-null  object 
dtypes: float64(1), object(11)
memory usage: 1.1+ MB
None


In [6]:
print(raw_df.describe())

        Price for 2
count  12032.000000
mean     397.611370
std      332.045938
min       40.000000
25%      200.000000
50%      300.000000
75%      450.000000
max     5000.000000


In [7]:
wrong_data = raw_df['Delivery Rating'] == 'Delivery Rating'
print(raw_df[wrong_data])

Empty DataFrame
Columns: [Zomato URL, Name of Restaurant, Address, Location, Cuisine, Top Dishes, Price for 2, Dining Rating, Dining Rating Count, Delivery Rating, Delivery Rating Count, Features]
Index: []


In [8]:
raw_df = raw_df[~wrong_data]

In [9]:
raw_df.drop(['Zomato URL', 'Delivery Rating', 'Address'], axis = 1, inplace=True)

In [10]:
raw_df.head()

Unnamed: 0,Name of Restaurant,Location,Cuisine,Top Dishes,Price for 2,Dining Rating,Dining Rating Count,Delivery Rating Count,Features
0,Yaa Mohaideen Briyani,Pallavaram,['Biryani'],"['Bread Halwa', ' Chicken 65', ' Mutton Biryan...",500.0,4.3,1500,9306,"['Home Delivery', 'Indoor Seating']"
1,Sukkubhai Biriyani,Alandur,"['Biryani', ' North Indian', ' Mughlai', ' Des...","['Beef Biryani', ' Beef Fry', ' Paratha', ' Pa...",1000.0,4.4,3059,39200,"['Home Delivery', 'Free Parking', 'Table booki..."
2,SS Hyderabad Biryani,Kodambakkam,"['Biryani', ' North Indian', ' Chinese', ' Ara...","['Brinjal Curry', ' Tandoori Chicken', ' Chick...",500.0,4.3,1361,10500,"['Home Delivery', 'Indoor Seating']"
3,KFC,Perambur,"['Burger', ' Fast Food', ' Finger Food', ' Bev...",['Zinger Burger'],500.0,4.0,1101,11200,"['Home Delivery', 'Free Parking', 'Card Upon D..."
4,Tasty Kitchen,Perambur,"['Chinese', ' Biryani', ' North Indian', ' Che...","['Mutton Biryani', ' Chicken Rice', ' Tomato R...",450.0,4.2,617,22400,"['Home Delivery', 'Indoor Seating']"


In [11]:
raw_df.isnull().sum()

Name of Restaurant       0
Location                 0
Cuisine                  0
Top Dishes               0
Price for 2              0
Dining Rating            0
Dining Rating Count      0
Delivery Rating Count    0
Features                 0
dtype: int64

In [12]:
raw_df[raw_df['Price for 2'].isnull()]

Unnamed: 0,Name of Restaurant,Location,Cuisine,Top Dishes,Price for 2,Dining Rating,Dining Rating Count,Delivery Rating Count,Features


In [13]:
raw_df = raw_df.drop(labels=12031, axis=0)

In [14]:
raw_df.fillna('NA', inplace=True)

In [15]:
raw_df.isnull().sum()

Name of Restaurant       0
Location                 0
Cuisine                  0
Top Dishes               0
Price for 2              0
Dining Rating            0
Dining Rating Count      0
Delivery Rating Count    0
Features                 0
dtype: int64

In [16]:
raw_df['Delivery Rating Count'].value_counts()

Not enough Delivery Reviews    3379
Does not offer Delivery        2252
None                            219
27                               34
37                               30
                               ... 
3530                              1
7705                              1
6374                              1
1773                              1
1617                              1
Name: Delivery Rating Count, Length: 2282, dtype: int64

In [17]:
raw_df['Dining Rating'].replace(to_replace=['-','NEW','Opening'], value='0', inplace=True)

In [18]:
raw_df['Features'].value_counts()

['Home Delivery', 'Indoor Seating']                                                                                                                                                                                               2447
['Delivery Only']                                                                                                                                                                                                                  961
['Home Delivery']                                                                                                                                                                                                                  759
['Indoor Seating']                                                                                                                                                                                                                 569
['Breakfast', 'Home Delivery', 'Indoor Seating']                            

In [19]:
raw_df['Features'].replace(to_replace=['-','NEW','Opening'], value='0', inplace=True)


In [20]:
raw_df['Price for 2'] = raw_df['Price for 2'].astype('int64')
raw_df['Dining Rating'] = raw_df['Dining Rating'].astype('float64')
raw_df['Features'] = raw_df['Features'].astype('int64')

ValueError: could not convert string to float: 'None'