# Breakdown of this Notebook:

1. Loading the data and import the libraries.
2. Data Cleading:
    * Deleting redundant columns.
    * Renaming the columns.
    * Dropping the duplicates.
    * Cleaning individual columns
    * Removing the NaN value from the dataset
    * Some transformation

In [1]:

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Loading the dataset
zomato = pd.read_csv("C:/Sanket.f/Hacker Earth/zomato.csv", encoding = "ISO-8859-1")
zomato.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [3]:
zomato.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
Restaurant ID           9551 non-null int64
Restaurant Name         9551 non-null object
Country Code            9551 non-null int64
City                    9551 non-null object
Address                 9551 non-null object
Locality                9551 non-null object
Locality Verbose        9551 non-null object
Longitude               9551 non-null float64
Latitude                9551 non-null float64
Cuisines                9542 non-null object
Average Cost for two    9551 non-null int64
Currency                9551 non-null object
Has Table booking       9551 non-null object
Has Online delivery     9551 non-null object
Is delivering now       9551 non-null object
Switch to order menu    9551 non-null object
Price range             9551 non-null int64
Aggregate rating        9551 non-null float64
Rating color            9551 non-null object
Rating text             9551 non-null o

In [4]:
zomato["City"].value_counts()

New Delhi    5473
Gurgaon      1118
Noida        1080
Faridabad     251
Ghaziabad      25
             ... 
Huskisson       1
Lorn            1
Mc Millan       1
Weirton         1
Palm Cove       1
Name: City, Length: 141, dtype: int64

In [5]:
zomato["Switch to order menu"].value_counts()

No    9551
Name: Switch to order menu, dtype: int64

In [6]:
zomato["Rating color"].value_counts()

Orange        3737
White         2148
Yellow        2100
Green         1079
Dark Green     301
Red            186
Name: Rating color, dtype: int64

In [7]:
zomato["Rating text"].value_counts()

Average      3737
Not rated    2148
Good         2100
Very Good    1079
Excellent     301
Poor          186
Name: Rating text, dtype: int64

### Data Cleaning and Feature Engineering

In [8]:
#It seems no columns need to be deleted
# Removing the duplicates
zomato.duplicated().sum()


0

In [9]:
# Checking whether it has any null values
zomato.isnull().sum()

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64

In [10]:
# Reading the column Names
zomato.columns

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')

In [11]:
# All the values in columns are having correct datatypes so no conversion needed
zomato.dtypes

Restaurant ID             int64
Restaurant Name          object
Country Code              int64
City                     object
Address                  object
Locality                 object
Locality Verbose         object
Longitude               float64
Latitude                float64
Cuisines                 object
Average Cost for two      int64
Currency                 object
Has Table booking        object
Has Online delivery      object
Is delivering now        object
Switch to order menu     object
Price range               int64
Aggregate rating        float64
Rating color             object
Rating text              object
Votes                     int64
dtype: object

In [12]:
# Reading ratings of dataset
zomato["Aggregate rating"]

0       4.8
1       4.5
2       4.4
3       4.9
4       4.8
       ... 
9546    4.1
9547    4.2
9548    3.7
9549    4.0
9550    4.0
Name: Aggregate rating, Length: 9551, dtype: float64

In [13]:
zomato['Locality Verbose']

0       Century City Mall, Poblacion, Makati City, Mak...
1       Little Tokyo, Legaspi Village, Makati City, Ma...
2       Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...
3       SM Megamall, Ortigas, Mandaluyong City, Mandal...
4       SM Megamall, Ortigas, Mandaluyong City, Mandal...
                              ...                        
9546                                  Karakí_y, ÛÁstanbul
9547                                 Koôuyolu, ÛÁstanbul
9548                               Kuruí_eôme, ÛÁstanbul
9549                               Kuruí_eôme, ÛÁstanbul
9550                                      Moda, ÛÁstanbul
Name: Locality Verbose, Length: 9551, dtype: object

In [14]:
zomato["Locality"].value_counts()

Connaught Place                 122
Rajouri Garden                   99
Shahdara                         87
Defence Colony                   86
Malviya Nagar                    85
                               ... 
Grovetown                         1
Maddilapalem                      1
Bogor Timur                       1
Calangute Beach, Calangute        1
Majaz Waterfront, Al Majaz 3      1
Name: Locality, Length: 1208, dtype: int64

In [15]:
zomato["Cuisines"].sample(5)

5286                       Bakery, Desserts
6675     Italian, North Indian, Continental
5301    North Indian, South Indian, Chinese
266                     American, Breakfast
2897                                 Mithai
Name: Cuisines, dtype: object

In [16]:
zomato["Restaurant Name"].value_counts()

Cafe Coffee Day     83
Domino's Pizza      79
Subway              63
Green Chick Chop    51
McDonald's          48
                    ..
Binge Restaurant     1
The Chop House       1
Omazoni              1
The Roll Hut         1
Mad Cowes Cafe       1
Name: Restaurant Name, Length: 7446, dtype: int64

In [17]:
# Lower Casing
zomato["Locality"] = zomato["Locality"].str.lower()

In [18]:
zomato["Locality"].head()

0     century city mall, poblacion, makati city
1    little tokyo, legaspi village, makati city
2    edsa shangri-la, ortigas, mandaluyong city
3        sm megamall, ortigas, mandaluyong city
4        sm megamall, ortigas, mandaluyong city
Name: Locality, dtype: object

In [19]:
## Removal of Punctuations
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """This is custom function to remove the punctuation"""
    return text.translate(str.maketrans('','', PUNCT_TO_REMOVE))

zomato["Locality"] = zomato["Locality"].apply(lambda text: remove_punctuation(text))

## Removal of stopwords
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

zomato["Locality"] = zomato["Locality"].apply(lambda text: remove_stopwords(text))

zomato[["Locality", "Cuisines"]].sample(5)

Unnamed: 0,Locality,Cuisines
3974,hauz khas,"Desserts, Fast Food"
7432,sunder nagar,"Continental, North Indian, European, Asian"
1610,old railway road,"Pizza, Fast Food"
2372,swaroop nagar,"Bakery, Fast Food"
2827,chanakyapuri,"North Indian, Mughlai"


In [20]:
# Restuarants Names
restuarant_names = list(zomato['Restaurant Name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

zomato = zomato.drop(['Votes','Longitude','Latitude', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu'], axis =1)

df_percent = zomato.sample(frac=0.5)

In [21]:
df_percent

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Cuisines,Average Cost for two,Currency,Price range,Aggregate rating,Rating color,Rating text
598,210134,Grand Barbeque Buffet Restaurant,214,Dubai,"Al Mina Road, Next to Ibis Styles Jumeirah Hot...",satwa,"Satwa, Dubai","Indian, Asian",150,Emirati Diram(AED),3,4.4,Green,Very Good
2915,304018,Spicy Food Court,1,New Delhi,"Khasra 361, Near Bank Of India, Sultanpur, Nea...",chhatarpur,"Chhatarpur, New Delhi","North Indian, Mughlai, Chinese",500,Indian Rupees(Rs.),2,2.7,Orange,Average
9352,6901231,Tipu Sultan,215,Birmingham,"43 Alcester Road, Moseley, Birmingham B13 8AA",moseley,"Moseley, Birmingham","Indian, Pakistani",45,Pounds(£),3,4.0,Green,Very Good
3913,807,Cosy Restaurant,1,New Delhi,"B-1/30, Hauz Khas, New Delhi",hauz khas,"Hauz Khas, New Delhi","North Indian, Mughlai, Chinese",1100,Indian Rupees(Rs.),3,3.4,Orange,Average
5894,18361244,Utsav,1,New Delhi,"Shop G-17, Aggarwal Millenium Tower-I, Netaji ...",netaji subhash place,"Netaji Subhash Place, New Delhi",Bakery,250,Indian Rupees(Rs.),1,3.5,Yellow,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1717,18368009,Maa Bhagwati Tiffins,1,Gurgaon,"Subhash Nagar, Sector 12, Gurgaon",sector 12,"Sector 12, Gurgaon",North Indian,200,Indian Rupees(Rs.),1,3.0,Orange,Average
707,2500076,Balbeer's Kitchen & Bar,1,Aurangabad,"Shendra, Near Cambridge School, Jalna Road, Ch...",chicalthana,"Chicalthana, Aurangabad","Italian, Continental, Chinese, North Indian",850,Indian Rupees(Rs.),3,3.3,Orange,Average
9178,2700010,Yo China Restaurant,1,Ranchi,"2nd Floor, Citadel Building, Main Road, Ranchi",hindpiri,"Hindpiri, Ranchi",Chinese,650,Indian Rupees(Rs.),2,3.4,Orange,Average
2063,306883,Subway,1,Gurgaon,"7, Ground Floor, Welldone Tech Park, Sohna Roa...",sohna road,"Sohna Road, Gurgaon","American, Fast Food, Salad, Healthy Food",500,Indian Rupees(Rs.),2,2.9,Orange,Average


In [29]:
zomato["City"].value_counts()

New Delhi    5473
Gurgaon      1118
Noida        1080
Faridabad     251
Ghaziabad      25
             ... 
Huskisson       1
Lorn            1
Mc Millan       1
Weirton         1
Palm Cove       1
Name: City, Length: 141, dtype: int64

In [22]:
zomato.columns

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Cuisines', 'Average Cost for two',
       'Currency', 'Price range', 'Aggregate rating', 'Rating color',
       'Rating text'],
      dtype='object')

# TF-IDF Vectorization
TF-IDF (Term Frequency-Inverse Document Frequency) vectors for each document. This will give you a matrix where each column represents a word in the general vocabulary (all words that appear in at least one document) and each column represents a restaurant, as before.

TF-IDF is the statistical method of assessing the meaning of a word in a given document. Now, I will use the TF-IDF vectorization on the dataset:

In [23]:
df_percent.set_index('Restaurant Name', inplace=True)
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['Locality'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
# here name in function is restuarant_name
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['Cuisines', 'Aggregate rating', 'Average Cost for two'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Aggregate rating', 'Average Cost for two']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['Cuisines','Aggregate rating', 'Average Cost for two'], keep=False)
    df_new = df_new.sort_values(by='Aggregate rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s In GIVEN LOCALITY: ' % (str(len(df_new)), name))
    
    return df_new

In [26]:
recommend('Spicy Food Court')

TOP 10 RESTAURANTS LIKE Spicy Food Court In GIVEN LOCALITY: 


Unnamed: 0,Cuisines,Aggregate rating,Average Cost for two
The Bento Cafe,"Asian, Chinese, Japanese",4.2,750
The Tea Place by Manjushree,"Cafe, Beverages",4.1,700
Al Arabian Express,Mughlai,3.7,500
Happy Hakka,"Chinese, Asian, Thai",3.6,650
Kay's Chicken Corner,"Mughlai, North Indian, Chinese",3.6,700
Cafe Red,Cafe,3.6,1100
Woodfire Grille,"American, Seafood, Steak",3.6,70
Nirmal Vada Pav,Street Food,3.5,200
Le Marche Sugar & Spice Cafe,"Cafe, Italian, Fast Food",3.5,700
Sharma Bakers,Bakery,3.5,200
