In [1]:
#import libraries
import os

import pandas as pd

In [2]:
# Load the Zomato restaurant CSV into a DataFrame.
# The file location can be overridden with the ZOMATO_CSV environment variable so
# the notebook also runs on machines other than the author's; the original local
# path is kept as the fallback so existing runs behave exactly as before.
ZOMATO_CSV = os.environ.get(
    "ZOMATO_CSV", r"C:\Users\Samir\OneDrive\Desktop\pyzomato.csv"
)
# NOTE(review): 'unicode_escape' interprets backslash escape sequences rather than
# being a regular text encoding; if the file is a Windows export, 'cp1252' or
# 'latin-1' is probably the intended codec -- confirm against the data before changing.
df_data = pd.read_csv(ZOMATO_CSV, encoding='unicode_escape')
# pandas truncates the printed frame to the first/last rows and wraps wide columns.
print(df_data)

      RestaurantID                 RestaurantName  CountryCode       City  \
0          7402935                           Skye           94    Jakarta   
1          7410290       Satoo - Hotel Shangri-La           94    Jakarta   
2          7420899                     Sushi Masa           94    Jakarta   
3          7421967                 3 Wise Monkeys           94    Jakarta   
4          7422489    Avec Moi Restaurant and Bar           94    Jakarta   
...            ...                            ...          ...        ...   
6253      17330155                       Deorio's          216   Columbus   
6254      17582499                    Royal Hotel          216  Pocatello   
6255      17284302  El Vaquero Mexican Restaurant          216     Albany   
6256      17342548  Happy Joe's Pizza & Ice Cream          216    Dubuque   
6257      17284105                  Cookie Shoppe          216     Albany   

                           Locality   Longitude   Latitude  \
0     Grand I

In [3]:
# Data cleaning (Zomato data)
# Inspect the schema first: info() prints every column with its non-null count
# and dtype, which shows where values are missing and which types need fixing.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6258 entries, 0 to 6257
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   RestaurantID           6258 non-null   int64  
 1   RestaurantName         6258 non-null   object 
 2   CountryCode            6258 non-null   int64  
 3   City                   6258 non-null   object 
 4   Locality               6258 non-null   object 
 5   Longitude              6258 non-null   float64
 6   Latitude               6258 non-null   float64
 7   Country.Countryname    6258 non-null   object 
 8   Cuisines               6249 non-null   object 
 9   Currency               6258 non-null   object 
 10  Has_Table_booking      6258 non-null   object 
 11  Has_Online_delivery    6258 non-null   object 
 12  Is_delivering_now      6258 non-null   object 
 13  Switch_to_order_menu   6258 non-null   object 
 14  Price_range            6258 non-null   int64  
 15  Vote

In [4]:
# Count the missing values in each column and print the per-column totals
null_counts = df_data.isna().sum()
print(null_counts)

RestaurantID             0
RestaurantName           0
CountryCode              0
City                     0
Locality                 0
Longitude                0
Latitude                 0
Country.Countryname      0
Cuisines                 9
Currency                 0
Has_Table_booking        0
Has_Online_delivery      0
Is_delivering_now        0
Switch_to_order_menu     0
Price_range              0
Votes                    0
Average_Cost_for_two     0
Rating                   0
Datekey_Opening          0
Year                     0
Day                      0
Quarter                  0
Months Name              0
Week of Month            0
Financial Quarters       0
USD Rate                 0
Indian rupees            0
RestaurantName - Copy    0
Day of Week              0
USD_Dollars              0
dtype: int64


In [5]:
# Look at the Cuisines column: it is the only column with missing values in the
# null check, and each entry is a comma-separated list of cuisines.
df_data['Cuisines']

0             Italian, Continental
1       Asian, Indonesian, Western
2                  Sushi, Japanese
3                         Japanese
4                  French, Western
                   ...            
6253                Italian, Pizza
6254               Pizza, Bar Food
6255                       Mexican
6256    Desserts, Pizza, Ice Cream
6257                           NaN
Name: Cuisines, Length: 6258, dtype: object

In [6]:
# Small hand-made frame with comma-separated cuisine strings, used to try out
# the column split on a few rows.
example_cuisines = [
    'Italian, Chinese, Indian',
    'Mexican, Thai',
    'Indian, American',
]
data = {'Cuisines': example_cuisines}
df = pd.DataFrame(data)



In [7]:
# Split the comma-separated Cuisines string into one column per cuisine.
# Splitting on a bare ',' keeps the space that follows each comma in the values
# (' Chinese' instead of 'Chinese'), so the separator pattern also consumes the
# whitespace around the comma and the whole string is stripped first.
# str.split treats a pattern longer than one character as a regular expression.
# Rows with fewer cuisines than the longest row are padded with missing values.
df_split = df['Cuisines'].str.strip().str.split(r'\s*,\s*', expand=True)

In [8]:
# Show the split result: one positional column (0, 1, 2, ...) per cuisine.
print(df_split)

         0          1        2
0  Italian    Chinese   Indian
1  Mexican       Thai     None
2   Indian   American     None


In [9]:
# Review the full column list again to decide which columns are unnecessary
# and can be dropped.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6258 entries, 0 to 6257
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   RestaurantID           6258 non-null   int64  
 1   RestaurantName         6258 non-null   object 
 2   CountryCode            6258 non-null   int64  
 3   City                   6258 non-null   object 
 4   Locality               6258 non-null   object 
 5   Longitude              6258 non-null   float64
 6   Latitude               6258 non-null   float64
 7   Country.Countryname    6258 non-null   object 
 8   Cuisines               6249 non-null   object 
 9   Currency               6258 non-null   object 
 10  Has_Table_booking      6258 non-null   object 
 11  Has_Online_delivery    6258 non-null   object 
 12  Is_delivering_now      6258 non-null   object 
 13  Switch_to_order_menu   6258 non-null   object 
 14  Price_range            6258 non-null   int64  
 15  Vote

In [10]:
# Drop the columns judged unnecessary.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell is a no-op
# (the original inplace drop raised KeyError on a second run).
UNUSED_COLUMNS = [
    'Locality',
    'Longitude',
    'Latitude',
    'Switch_to_order_menu',
    'RestaurantName - Copy',
]
df_data = df_data.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [11]:
# Confirm the drop: the column count reported by info() should have gone down by five.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6258 entries, 0 to 6257
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   RestaurantID          6258 non-null   int64  
 1   RestaurantName        6258 non-null   object 
 2   CountryCode           6258 non-null   int64  
 3   City                  6258 non-null   object 
 4   Country.Countryname   6258 non-null   object 
 5   Cuisines              6249 non-null   object 
 6   Currency              6258 non-null   object 
 7   Has_Table_booking     6258 non-null   object 
 8   Has_Online_delivery   6258 non-null   object 
 9   Is_delivering_now     6258 non-null   object 
 10  Price_range           6258 non-null   int64  
 11  Votes                 6258 non-null   int64  
 12  Average_Cost_for_two  6258 non-null   int64  
 13  Rating                6258 non-null   float64
 14  Datekey_Opening       6258 non-null   object 
 15  Year                 

In [12]:
# First step towards changing the column's datatype: swap every '_' in the
# date strings for '-'.
opening_dates = df_data['Datekey_Opening'].str.replace('_','-')
df_data['Datekey_Opening'] = opening_dates
# Display the column to check the new separator
df_data['Datekey_Opening']

0       02-05-2014
1       23-10-2014
2       19-08-2013
3       17-11-2014
4       14-11-2017
           ...    
6253    26-03-2012
6254    15-03-2017
6255    23-01-2016
6256    04-01-2017
6257    20-10-2011
Name: Datekey_Opening, Length: 6258, dtype: object