# Swiggy Dataset: Data Pre-Processing and Cleaning

## Importing Libraries

In [2]:
# Importing Numpy
import numpy as np
# Importing Matplotlib
import matplotlib.pyplot as plt
# plt is the alias name for pyplot
import pandas as pd
# pd is the alias for pandas

## Loading Data into Dataframe

In [3]:
# Reading the raw Swiggy CSV file into a pandas DataFrame
swiggy_df = pd.read_csv(filepath_or_buffer="Swiggy_Dataset.csv")

## Exploring data and Performing Data Cleaning and Pre-Processing 

In [4]:
# Previewing the first five rows of the raw dataset
swiggy_df.head(n=5)

Unnamed: 0,Restaurant Name,City,Locality,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Rating Stars out of 5,Rating in text,Price range,Votes
0,Sultans of Spice,Bangalore,"BluPetal Hotel, Koramangala",North Indian,1300,Yes,Yes,4.1,Very Good,3,314
1,The Fatty Bao - Asian Gastro Bar,Bangalore,Indiranagar,Asian,2400,Yes,Yes,4.7,Excellent,4,591
2,Toit,Bangalore,Indiranagar,Italian,2000,No,No,4.8,Excellent,4,270
3,Three Dots & A Dash,Bangalore,Indiranagar,European,1300,Yes,No,3.9,Good,3,365
4,Bombay Brasserie,Bangalore,Indiranagar,Modern Indian,1500,No,Yes,4.2,Very Good,3,229


In [5]:
# Listing the distinct values present in the 'City' column
swiggy_df.loc[:, 'City'].unique()

array(['Bangalore', 'Chandigarh', 'Chennai', 'Goa', 'Guwahati',
       'Hyderabad', 'Jaipur', 'Kolkata', 'Lucknow', 'Mumbai', 'Patna',
       'Pune', 'Surat'], dtype=object)

In [7]:
# Listing the distinct values present in the 'Locality' column
locality_column = swiggy_df['Locality']
locality_column.unique()

array(['BluPetal Hotel, Koramangala', 'Indiranagar', 'JP Nagar',
       'Koramangala 5th Block', 'Koramangala 6th Block',
       'Koramangala 7th Block', 'Marathahalli', 'New BEL Road',
       'Residency Road', 'Sarjapur Road', 'UB City',
       'Chandigarh Industrial Area',
       'Elante Mall, Chandigarh Industrial Area', 'Sector 26',
       'Sector 28', 'Sector 35', 'Sector 7', 'Sector 8', 'Adyar',
       'Anna Nagar East', 'Ashok Nagar',
       'Express Avenue Mall,  Royapettah', 'Gopalapuram', 'Kilpauk',
       'Kotturpuram', 'Mylapore', 'Nungambakkam', 'Perungudi', 'RA Puram',
       'Ramapuram', 'Santhome', 'T. Nagar', 'Velachery', 'Anjuna',
       'Anjuna Beach, Anjuna', 'Arambol', 'Baga', 'Betalbatim',
       'Calangute', 'Calangute Beach, Calangute', 'Candolim',
       'Cavelossim', 'Nerul', 'Panaji', 'Vagator',
       'Anil Plaza, Christian Basti', 'Chandmari', 'Christian Basti',
       'Dispur', 'Six Mile', 'Ulubari', 'Uzan Bazaar', 'Zoo Tiniali',
       '12th Square Buildi

In [8]:
# Listing the distinct values present in the 'Cuisines' column
swiggy_df.Cuisines.unique()

array(['North Indian', 'Asian', 'Italian', 'European', 'Modern Indian',
       'Bakery', 'Fast Food', 'Continental', 'American', 'Mexican',
       'Japanese', 'Cafe', 'Mughlai', 'Desserts', 'South Indian',
       'Biryani', 'Seafood', 'Goan', 'Chinese', 'Mediterranean',
       'Hyderabadi', 'Rajasthani', 'Bengali', 'Healthy Food',
       'Street Food', 'Charcoal Grill', 'Lebanese', 'Gujarati'],
      dtype=object)

In [6]:
# Showing the shape of the Dataset as a (number of rows, number of columns) tuple
swiggy_df.shape

(258, 11)

*From above we can see that the data contains 258 rows and 11 columns*

In [10]:
# Checking the datatype of every column (object columns hold text and are encoded later)
swiggy_df.dtypes

Restaurant Name            object
City                       object
Locality                   object
Cuisines                   object
Average Cost for two        int64
Has Table booking          object
Has Online delivery        object
 Rating Stars out of 5    float64
Rating in text             object
Price range                 int64
Votes                       int64
dtype: object

In [13]:
# Summarising the whole DataFrame: column names, non-null counts, dtypes and memory usage
swiggy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Restaurant Name         258 non-null    object 
 1   City                    258 non-null    object 
 2   Locality                258 non-null    object 
 3   Cuisines                258 non-null    object 
 4   Average Cost for two    258 non-null    int64  
 5   Has Table booking       258 non-null    object 
 6   Has Online delivery     258 non-null    object 
 7    Rating Stars out of 5  258 non-null    float64
 8   Rating in text          258 non-null    object 
 9   Price range             258 non-null    int64  
 10  Votes                   258 non-null    int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 22.3+ KB


In [14]:
# Counting the rows that are exact duplicates of an earlier row
duplicate_mask = swiggy_df.duplicated()
duplicate_mask.sum()

0

In [11]:
# Listing the column labels of the DataFrame.
# NOTE: the ' Rating Stars out of 5' label carries a leading space (visible in
# the output below), so it has to be typed exactly like that when selecting it.
swiggy_df.columns

Index(['Restaurant Name', 'City', 'Locality', 'Cuisines',
       'Average Cost for two', 'Has Table booking', 'Has Online delivery',
       ' Rating Stars out of 5', 'Rating in text', 'Price range', 'Votes'],
      dtype='object')

In [15]:
# Counting the missing (NaN/None) entries in each column
swiggy_df.isna().sum()

Restaurant Name           0
City                      0
Locality                  0
Cuisines                  0
Average Cost for two      0
Has Table booking         0
Has Online delivery       0
 Rating Stars out of 5    0
Rating in text            0
Price range               0
Votes                     0
dtype: int64

There are no null values in the dataset

- The 'City', 'Locality', 'Cuisines' and 'Rating in text' columns are categorical with multiple distinct values, so we apply LabelEncoder() to each of them.
- The 'Has Table booking' and 'Has Online delivery' columns hold only two values, 'Yes' and 'No', so we could use get_dummies(); however, to keep the processed data small we use LabelEncoder() on these two columns as well.

In [12]:
# Importing LabelEncoder from sklearn.preprocessing
from sklearn.preprocessing import LabelEncoder
# Creating a LabelEncoder instance; it converts the distinct labels of a column into integer codes
lb = LabelEncoder()

In [16]:
# Applying LabelEncoder to several categorical columns using a for loop.
#
# A separate encoder is fitted for every column and stored in `label_encoders`
# (column name -> fitted LabelEncoder). A single shared encoder would be refit
# on each iteration and only remember the classes of the last column, which
# makes it impossible to translate the integer codes back to the original
# labels later (e.g. label_encoders['City'].inverse_transform(codes)).
#
# NOTE: LabelEncoder numbers the labels in sorted (alphabetical) order, so the
# codes of 'Rating in text' do not follow rating quality (in the output below
# 'Excellent' is 1, 'Good' is 2 and 'Very Good' is 3). Treat those codes as
# nominal identifiers, not as an ordered scale.
enc_list = ['City', 'Locality', 'Cuisines', 'Rating in text', 'Has Table booking', 'Has Online delivery']
label_encoders = {}
for column in enc_list:
    encoder = LabelEncoder()
    # Replace the text labels with their integer codes, exactly as before
    swiggy_df[column] = encoder.fit_transform(swiggy_df[column])
    # Keep the fitted encoder so encoder.classes_ documents the mapping
    label_encoders[column] = encoder

In [17]:
# Previewing the first five rows of the pre-processed dataset
swiggy_df.head(n=5)

Unnamed: 0,Restaurant Name,City,Locality,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Rating Stars out of 5,Rating in text,Price range,Votes
0,Sultans of Spice,0,20,23,1300,1,1,4.1,3,3,314
1,The Fatty Bao - Asian Gastro Bar,0,56,1,2400,1,1,4.7,1,4,591
2,Toit,0,56,16,2000,0,0,4.8,1,4,270
3,Three Dots & A Dash,0,56,10,1300,1,0,3.9,2,3,365
4,Bombay Brasserie,0,56,21,1500,0,1,4.2,3,3,229


## Saving the Pre-Processed and cleaned data to a csv file

In [18]:
# Writing the pre-processed DataFrame to a CSV file, leaving out the row index
swiggy_df.to_csv(path_or_buf='Swiggy_Pre_Processed_dataset.csv', index=False)

Now we can use the saved CSV file to perform further analysis and to apply various machine learning models, depending on the data and the desired output.