In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import seaborn as sns
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
zomato_data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/zomato.csv")
zomato_df=zomato_data.copy()
zomato_df.head(2)

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari


In [4]:
zomato_df.shape

(51717, 17)

In [5]:
zomato_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          51717 non-null  object
 1   address                      51717 non-null  object
 2   name                         51717 non-null  object
 3   online_order                 51717 non-null  object
 4   book_table                   51717 non-null  object
 5   rate                         43942 non-null  object
 6   votes                        51717 non-null  int64 
 7   phone                        50509 non-null  object
 8   location                     51696 non-null  object
 9   rest_type                    51490 non-null  object
 10  dish_liked                   23639 non-null  object
 11  cuisines                     51672 non-null  object
 12  approx_cost(for two people)  51371 non-null  object
 13  reviews_list                 51

In [6]:
zomato_df.isnull().sum()

url                                0
address                            0
name                               0
online_order                       0
book_table                         0
rate                            7775
votes                              0
phone                           1208
location                          21
rest_type                        227
dish_liked                     28078
cuisines                          45
approx_cost(for two people)      346
reviews_list                       0
menu_item                          0
listed_in(type)                    0
listed_in(city)                    0
dtype: int64

In [7]:

#Dropping the column "dish_liked", "phone", "url"
zomato_df=zomato_df.drop(['phone','dish_liked'],axis=1)

#Remove the NaN values from the dataset
zomato_df.dropna(how='any',inplace=True)

#Removing the Duplicates
zomato_df.duplicated().sum()
zomato_df.drop_duplicates(inplace=True)

#Changing the column names
zomato_df = zomato_df.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type', 'listed_in(city)':'city'})

#Removing '/5' from Rates
zomato_df = zomato_df.loc[zomato_df.rate !='NEW']
zomato_df = zomato_df.loc[zomato_df.rate !='-'].reset_index(drop=True)
remove_slash = lambda x: x.replace('/5', '') if type(x) == np.str else x
zomato_df.rate = zomato_df.rate.apply(remove_slash).str.strip().astype('float')

#Changing the cost to string
zomato_df['cost'] = zomato_df['cost'].astype(str)
zomato_df['cost'] = zomato_df['cost'].apply(lambda x: x.replace(',','.'))
zomato_df['cost'] = zomato_df['cost'].astype(float)


In [8]:
zomato_df.shape

(41263, 15)

In [9]:
zomato_df.isnull().sum()

url             0
address         0
name            0
online_order    0
book_table      0
rate            0
votes           0
location        0
rest_type       0
cuisines        0
cost            0
reviews_list    0
menu_item       0
type            0
city            0
dtype: int64

In [10]:
## Computing Mean Rating
restaurants = list(zomato_df['name'].unique())
zomato_df['Mean Rating'] = 0
for i in range(len(restaurants)):
    zomato_df['Mean Rating'][zomato_df['name'] == restaurants[i]] = zomato_df['rate'][zomato_df['name'] == restaurants[i]].mean()
#Scaling the mean rating values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
zomato_df[['Mean Rating']] = scaler.fit_transform(zomato_df[['Mean Rating']]).round(2)


In [11]:
zomato_df[['name','rate','Mean Rating']].head()

Unnamed: 0,name,rate,Mean Rating
0,Jalsa,4.1,3.99
1,Spice Elephant,4.1,3.97
2,San Churro Cafe,3.8,3.58
3,Addhuri Udupi Bhojana,3.7,3.45
4,Grand Village,3.8,3.58


In [12]:
## Lower Casing
zomato_df["reviews_list"] = zomato_df["reviews_list"].str.lower()

## Removal of Puctuations
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
zomato_df["reviews_list"] = zomato_df["reviews_list"].apply(lambda text: remove_punctuation(text))



In [13]:
zomato_df[['reviews_list', 'cuisines','url']].sample(5)

Unnamed: 0,reviews_list,cuisines,url
2565,rated 20 ratedn bad quality of puff and bomba...,Bakery,https://www.zomato.com/bangalore/cake-art-basa...
32714,rated 40 ratedn limited options in the menuth...,"Continental, North Indian, Chinese, Arabian",https://www.zomato.com/bangalore/high-sky-whit...
12842,rated 30 ratedn rude behavior by the staff ve...,"North Indian, Mughlai, Mediterranean, Iranian",https://www.zomato.com/bangalore/ruh-bellandur...
30607,rated 30 ratedn been there on several occasio...,"North Indian, Mithai",https://www.zomato.com/bangalore/bhaiyaji-food...
29166,rated 10 ratedn service was very disappointin...,"Chinese, Thai, Asian",https://www.zomato.com/bangalore/magnolia-kora...


In [14]:
def get_top_words(column, top_nu_of_words, nu_of_word):

    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')

    bag_of_words = vec.fit_transform(column)

    sum_words = bag_of_words.sum(axis=0)

    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]

    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

    return words_freq[:top_nu_of_words]


In [15]:

# RESTAURANT NAMES:
restaurant_names = list(zomato_df['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

zomato_df=zomato_df.drop(['address','rest_type', 'type', 'menu_item', 'votes'],axis=1)


# Randomly sample 60% of your dataframe
df_percent = zomato_df.sample(frac=0.5)


In [16]:
zomato_df.head()

Unnamed: 0,url,name,online_order,book_table,rate,location,cuisines,cost,reviews_list,city,Mean Rating
0,https://www.zomato.com/bangalore/jalsa-banasha...,Jalsa,Yes,Yes,4.1,Banashankari,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn a beautiful place to dine int...,Banashankari,3.99
1,https://www.zomato.com/bangalore/spice-elephan...,Spice Elephant,Yes,No,4.1,Banashankari,"Chinese, North Indian, Thai",800.0,rated 40 ratedn had been here for dinner with...,Banashankari,3.97
2,https://www.zomato.com/SanchurroBangalore?cont...,San Churro Cafe,Yes,No,3.8,Banashankari,"Cafe, Mexican, Italian",800.0,rated 30 ratedn ambience is not that good eno...,Banashankari,3.58
3,https://www.zomato.com/bangalore/addhuri-udupi...,Addhuri Udupi Bhojana,No,No,3.7,Banashankari,"South Indian, North Indian",300.0,rated 40 ratedn great food and proper karnata...,Banashankari,3.45
4,https://www.zomato.com/bangalore/grand-village...,Grand Village,No,No,3.8,Basavanagudi,"North Indian, Rajasthani",600.0,rated 40 ratedn very good restaurant in neigh...,Banashankari,3.58


In [17]:
zomato_df.to_csv("restaurant1.csv")

In [18]:
zomato_df.to_csv("restaurant2.csv")

In [19]:
df_percent.head()

Unnamed: 0,url,name,online_order,book_table,rate,location,cuisines,cost,reviews_list,city,Mean Rating
5676,https://www.zomato.com/bangalore/kfc-3-whitefi...,KFC,Yes,No,2.8,"ITPL Main Road, Whitefield","Burger, Fast Food",400.0,rated 40 ratedn in banglore there are many kf...,Brookefield,3.38
31391,https://www.zomato.com/bangalore/corner-house-...,Corner House Ice Cream,Yes,No,4.4,Seshadripuram,"Ice Cream, Desserts",400.0,rated 40 ratedn very close to my work place w...,Malleshwaram,4.44
5181,https://www.zomato.com/bangalore/just-bake-sha...,Just Bake,Yes,No,3.5,Shanti Nagar,"Bakery, Desserts",400.0,rated 50 ratedn just bake cake is just awesom...,Brigade Road,3.07
31359,https://www.zomato.com/bangalore/tea-samakruth...,Tea Samskruthi,No,No,3.9,Malleshwaram,Cafe,200.0,rated 50 ratedn tea samskruti is the best pla...,Malleshwaram,3.71
11609,https://www.zomato.com/bangalore/taco-bell-ind...,Taco Bell,Yes,No,4.1,Indiranagar,"Mexican, American, Fast Food",600.0,rated 40 ratedn good place for mexican food t...,Frazer Town,3.81


In [20]:
df_percent['reviews_list'].isnull().sum()

0

In [21]:
df_percent['url'].isnull().sum()

0

In [22]:
df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [23]:
def recommend(name, cosine_similarities = cosine_similarities):

    # Create a list to put top restaurants
    recommend_restaurant = []

    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]

    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)

    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)

    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])

    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost','url'])

    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating', 'cost','url']][df_percent.index == each].sample()))

    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'cost','url'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)

    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))

    return df_new

In [24]:
recommend('Pai Vihar')

TOP 10 RESTAURANTS LIKE Pai Vihar WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost,url
Cinnamon,"North Indian, Asian, Continental",3.62,1.0,https://www.zomato.com/bangalore/cinnamon-sesh...
Samosa Singh,"Street Food, Fast Food, Rolls, Desserts",3.6,200.0,https://www.zomato.com/bangalore/samosa-singh-...
Samosa Singh,"Street Food, Beverages",3.6,150.0,https://www.zomato.com/bangalore/samosa-singh-...
Kadai Crust - Amma Veetu Samayal,"Chettinad, South Indian, Biryani",3.58,700.0,https://www.zomato.com/bangalore/kadai-crust-a...
Pallavi Restaurant,"Biryani, Chinese, Andhra",3.58,500.0,https://www.zomato.com/bangalore/pallavi-resta...
Upahar Sagar,"South Indian, Chinese, North Indian",3.58,350.0,https://www.zomato.com/bangalore/upahar-sagar-...
Magix's Parattha Roll,"Fast Food, North Indian, Chinese, Mughlai, Rolls",3.52,400.0,https://www.zomato.com/bangalore/magixs-paratt...
Magix's Parattha Roll,"Fast Food, North Indian, Chinese, Mughlai, Rolls",3.52,400.0,https://www.zomato.com/bangalore/magixs-paratt...
Magix's Parattha Roll,"Fast Food, North Indian, Chinese, Mughlai, Rolls",3.52,400.0,https://www.zomato.com/bangalore/magixs-paratt...
Prasiddhi Food Corner,"Fast Food, North Indian, South Indian",3.45,200.0,https://www.zomato.com/bangalore/prasiddhi-foo...


In [25]:
recommend('Canopy')

TOP 10 RESTAURANTS LIKE Canopy WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost,url
Atithi,"North Indian, Chinese, Street Food",3.63,800.0,https://www.zomato.com/bangalore/atithi-hsr?co...
Atithi,"North Indian, Chinese, Street Food",3.63,800.0,https://www.zomato.com/bangalore/atithi-hsr?co...
Cinnamon,"North Indian, Chinese, Biryani",3.62,550.0,https://www.zomato.com/bangalore/cinnamon-hsr?...
Cafe @ Elanza,"Chinese, North Indian, Cafe",3.45,1.0,https://www.zomato.com/bangalore/cafe-elanza-r...
Cafe @ Elanza,"Chinese, North Indian, Cafe",3.45,1.0,https://www.zomato.com/bangalore/cafe-elanza-r...
Nouvelle Garden,"North Indian, Continental, Italian",3.45,900.0,https://www.zomato.com/bangalore/nouvelle-gard...
Sri Sai Mango Tree Restaurant,"North Indian, Biryani, Chinese",3.32,600.0,https://www.zomato.com/bangalore/sri-sai-mango...
The Onyx - The HHI Select Bengaluru,"North Indian, Chinese, Continental",2.97,950.0,https://www.zomato.com/bangalore/the-onyx-the-...
Wazir's,"North Indian, Chinese",2.94,500.0,https://www.zomato.com/bangalore/wazirs-shanti...
Melange - Hotel Ekaa,"North Indian, Chinese, Continental, Mangalorean",2.81,900.0,https://www.zomato.com/bangalore/melange-hotel...


In [26]:
recommend('Cinnamon')

TOP 10 RESTAURANTS LIKE Cinnamon WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost,url
Chianti,Italian,4.59,1.5,https://www.zomato.com/bangalore/chianti-koram...
Chianti,Italian,4.59,1.5,https://www.zomato.com/bangalore/chianti-mg-ro...
Chinita Real Mexican Food,Mexican,4.47,1.2,https://www.zomato.com/bangalore/chinita-real-...
Oh! Calcutta,"Bengali, Seafood",4.39,1.2,https://www.zomato.com/bangalore/oh-calcutta-c...
Oh! Calcutta,"Bengali, Seafood",4.39,1.2,https://www.zomato.com/bangalore/oh-calcutta-c...
Soda Bottle Opener Wala,"Parsi, North Indian",4.36,1.3,https://www.zomato.com/bangalore/soda-bottle-o...
CafÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ© Felix,"American, Cafe, Continental",4.35,1.7,https://www.zomato.com/bangalore/caf%C3%A9-fel...
CafÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ© Felix,"American, Cafe, Continental",4.35,1.7,https://www.zomato.com/bangalore/caf%C3%A9-fel...
CafÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ© Felix,"American, Cafe, Continental",4.35,1.7,https://www.zomato.com/bangalore/caf%C3%A9-fel...
Foxtrot - House of Subculture,"Cafe, American, Asian, North Indian",4.35,1.0,https://www.zomato.com/bangalore/foxtrot-house...


In [27]:
import pickle
pickle.dump(tfidf, open('restaurant2.pkl', 'wb'))