In [1]:
import pandas as pd
import numpy as np
import re
import googlemaps
import json

In [2]:
# Load the geocoded restaurant data from the CSV file and preview the first rows.
dataset = pd.read_csv('RestaurantGeocoded.csv')
dataset.head()

Unnamed: 0,Locality,RestaurantName,Address,Category,CostForTwo,Cuisines,Ratings,votes,Latitude,Longitude
0,C Scheme,Kanha,"E 62, Bhagat Singh Marg, C Scheme, Jaipur","['Sweet Shop', 'Quick Bites']",₹500,"['North Indian', 'South Indian', 'Fast Food', ...",4.2,995,26.912258,75.80059
1,C Scheme,Stepout Cafe,"P 14, Sehdev Marg, Ashok Nagar, C Scheme, Jaipur","['Café', 'Casual Dining']",₹800,"['Cafe', 'Italian', 'Lebanese', 'Mexican', 'Co...",4.5,1385,26.905899,75.802078
2,C Scheme,Burger Farm,"3 & 4, Trimurty, V-Jai City Point Building, Ah...",['Quick Bites'],₹400,"['Burger', 'Fast Food', 'Beverages']",4.1,1427,26.914246,75.80493
3,C Scheme,Domino's Pizza,"C 18, Silver Square Mall, Bhagwandas Marg, C S...",['Quick Bites'],₹700,"['Pizza', 'Fast Food']",3.3,1258,26.915043,75.809926
4,C Scheme,Thali and More,"C-46 B, Sarojini Marg, Above Punjab National B...",['Casual Dining'],"₹1,100","['North Indian', 'South Indian', 'Chinese', 'F...",4.4,2339,26.910926,75.805679


In [3]:
# Print each column's dtype and non-null count to see what needs cleaning.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4354 entries, 0 to 4353
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Locality        4354 non-null   object 
 1   RestaurantName  4354 non-null   object 
 2   Address         4354 non-null   object 
 3   Category        4354 non-null   object 
 4   CostForTwo      4354 non-null   object 
 5   Cuisines        4354 non-null   object 
 6   Ratings         4342 non-null   object 
 7   votes           4354 non-null   object 
 8   Latitude        4279 non-null   float64
 9   Longitude       4279 non-null   float64
dtypes: float64(2), object(8)
memory usage: 340.3+ KB


We need to change the data types of CostForTwo, Ratings, and votes from object to int, float, and int respectively.  
Also, Category and Cuisines are stored as strings that look like Python lists, so we need to extract the individual values from them.

First, let's check different values of cost.

In [4]:
# Count how often each distinct raw cost string occurs.
dataset['CostForTwo'].value_counts()

₹300              734
₹200              532
₹400              512
₹500              415
₹250              345
₹150              296
₹350              251
₹600              220
₹100              167
₹450              152
₹700              113
₹800              110
₹1,000             75
₹550               67
₹1,200             51
₹900               43
₹650               42
₹750               32
₹1,500             27
₹1,100             26
₹1,400             20
₹1,600             16
₹850               15
₹1,300             13
₹2,000             13
₹1,800             11
₹3,000              9
₹1,700              7
₹2,500              6
₹950                4
₹2,100              3
₹4,000              3
₹2,200              3
₹3,500              3
₹2,400              3
₹5,000              2
₹2,600              2
₹6,000              2
₹2,800              1
₹4,100              1
₹8,000              1
No Cost given.      1
₹1,250              1
₹3,200              1
₹7,000              1
₹2,300    

'No Cost given.' is not a valid cost, so we need to get rid of it.

In [5]:
# Display the row(s) whose cost is the 'No Cost given.' placeholder.
dataset[dataset['CostForTwo'].eq('No Cost given.')]

Unnamed: 0,Locality,RestaurantName,Address,Category,CostForTwo,Cuisines,Ratings,votes,Latitude,Longitude
2231,Mansarovar,Burgs & Fries,"Shop 8, Ridhi Sidhi Link Road, Gopal Pura, Man...",[],No Cost given.,"['Fast Food', 'Burger', 'Wraps']",-,No-Votes,26.83875,75.764087


It seems we don't have a category, cost, rating, or votes for this restaurant. With so little information, this row is of no use to us.

In [6]:
# Remove every row whose cost is the 'No Cost given.' placeholder and renumber
# the index. Filtering on the value instead of dropping the hard-coded label
# 2231 keeps this cell safe to re-run: once the index has been reset, label 2231
# refers to a different, valid restaurant that a second run would silently drop.
dataset = dataset[dataset['CostForTwo'] != 'No Cost given.'].reset_index(drop=True)

With that out of the way, there are no other anomalies in the costs. Let's remove the Rupee symbol and the commas from the cost and typecast the values to integers.  
We will use a regular expression to accomplish that.

In [7]:
# Strip the rupee sign and the thousands separators (e.g. '₹1,100' -> 1100) and
# store the cost as an integer. The characters are removed by pattern rather
# than by position, so a value without a leading '₹' is not truncated.
# Casting to str first keeps the cell re-runnable once the column holds integers.
# Any value that is still non-numeric after stripping raises a ValueError.
cost = (
    dataset['CostForTwo']
    .astype(str)
    .str.replace(r'[₹,]', '', regex=True)
    .astype(int)
    .tolist()
)
dataset['CostForTwo'] = cost

Now, we will work on the Categories and Cuisines, since both require the same type of operation.  
While we are at it, let's also extract all the unique Categories and Cuisines.

In [8]:
# Plain Python lists holding the raw (still stringified) Category and Cuisines values.
category = dataset['Category'].tolist()
cuisines = dataset['Cuisines'].tolist()

In [9]:
# Parse each stringified list, e.g. "['Pizza', 'Fast Food']", into a list of
# cuisine names: drop the quote characters, strip the brackets, split on ', '.
# Empty fragments are skipped because "[]" would otherwise split into [''] and
# put an empty string into the set of unique cuisines.
# Reading from the DataFrame column (instead of rewriting the list in place)
# keeps the cell re-runnable.
cuisines = [
    [name for name in value.replace("'", '').strip('[]').split(', ') if name]
    for value in dataset['Cuisines']
]
# All distinct cuisine names across the dataset.
RES_CUISINES = {name for names in cuisines for name in names}

In [10]:
# Parse each stringified list, e.g. "['Sweet Shop', 'Quick Bites']", into a list
# of category names: drop the quote characters, strip the brackets, split on ', '.
# Empty fragments are skipped because an empty "[]" category would otherwise
# split into [''] and put an empty string into the set of unique categories.
# Reading from the DataFrame column (instead of rewriting the list in place)
# keeps the cell re-runnable.
category = [
    [name for name in value.replace("'", '').strip('[]').split(', ') if name]
    for value in dataset['Category']
]
# All distinct category names across the dataset.
RES_CATEGORIES = {name for names in category for name in names}

Now strip the list syntax (brackets and quotes) from the Category and Cuisines columns themselves.

In [11]:
# Turn "['Café', 'Casual Dining']" into "Café, Casual Dining": remove the quote
# characters, then strip the surrounding brackets. The vectorised .str methods
# replace the Python-level loop; the pattern is a literal quote, so regex=False.
dataset['Category'] = dataset['Category'].str.replace("'", '', regex=False).str.strip('[]')
dataset['Cuisines'] = dataset['Cuisines'].str.replace("'", '', regex=False).str.strip('[]')

In [12]:
# Preview the first rows to check the cleaned Category, CostForTwo and Cuisines columns.
dataset.head()

Unnamed: 0,Locality,RestaurantName,Address,Category,CostForTwo,Cuisines,Ratings,votes,Latitude,Longitude
0,C Scheme,Kanha,"E 62, Bhagat Singh Marg, C Scheme, Jaipur","Sweet Shop, Quick Bites",500,"North Indian, South Indian, Fast Food, Chinese...",4.2,995,26.912258,75.80059
1,C Scheme,Stepout Cafe,"P 14, Sehdev Marg, Ashok Nagar, C Scheme, Jaipur","Café, Casual Dining",800,"Cafe, Italian, Lebanese, Mexican, Continental,...",4.5,1385,26.905899,75.802078
2,C Scheme,Burger Farm,"3 & 4, Trimurty, V-Jai City Point Building, Ah...",Quick Bites,400,"Burger, Fast Food, Beverages",4.1,1427,26.914246,75.80493
3,C Scheme,Domino's Pizza,"C 18, Silver Square Mall, Bhagwandas Marg, C S...",Quick Bites,700,"Pizza, Fast Food",3.3,1258,26.915043,75.809926
4,C Scheme,Thali and More,"C-46 B, Sarojini Marg, Above Punjab National B...",Casual Dining,1100,"North Indian, South Indian, Chinese, Fast Food",4.4,2339,26.910926,75.805679


Now, we will take care of the Ratings.

In [13]:
# Count how often each distinct rating value occurs, including non-numeric placeholders.
dataset['Ratings'].value_counts()

-      730
3.3    318
3.4    313
3.2    303
3.5    298
3.1    281
3.6    254
3.7    252
3.8    225
3.0    216
NEW    203
3.9    177
2.9    162
4.0    134
2.8    117
4.1     82
2.7     66
4.2     56
2.6     34
4.3     29
4.4     17
2.5     17
4.5     17
2.4     13
2.3      7
4.6      7
4.7      7
2.2      2
2.0      1
4.8      1
4.9      1
2.1      1
Name: Ratings, dtype: int64

In [14]:
# '-' and 'NEW' are placeholders rather than numeric ratings; mark them as missing.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that form is chained assignment, which warns in recent
# pandas 2.x releases and leaves `dataset` unchanged under Copy-on-Write.
dataset['Ratings'] = dataset['Ratings'].replace(['-', 'NEW'], np.nan)

In [15]:
# Count how often each distinct votes value occurs, including the 'No-Votes' placeholder.
dataset['votes'].value_counts()

No-Votes    949
4           128
8           112
7           102
5            99
           ... 
517           1
2287          1
1408          1
3694          1
727           1
Name: votes, Length: 556, dtype: int64

In [16]:
# 'No-Votes' is a placeholder rather than a count; mark it as missing.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that form is chained assignment, which warns in recent
# pandas 2.x releases and leaves `dataset` unchanged under Copy-on-Write.
dataset['votes'] = dataset['votes'].replace('No-Votes', np.nan)

In [17]:
# Number of missing values per column after the placeholders were replaced with NaN.
dataset.isnull().sum()

Locality            0
RestaurantName      0
Address             0
Category            0
CostForTwo          0
Cuisines            0
Ratings           945
votes             949
Latitude           75
Longitude          75
dtype: int64