In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
path = '/content/gdrive/My Drive/Colab Notebooks/Projects/Zomato_restaurant_recommendation_system'

# Content based recommendation system

# Importing necessary libraries

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import warnings
warnings.filterwarnings('always')      # Always print matching warnings
warnings.filterwarnings('ignore')      # Never print matching warnings

# Load and read the dataset

In [10]:
zomato_real = pd.read_csv(path + "/zomato.csv")
zomato_real.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


#### Data Cleaning and Feature Engineering

#### Analyse the dataset

In [11]:
zomato_real.info()

"""
1) The dataset is of form rows and columns
2) The dataset contains categorical columns(We will have to perform encoding to convert the categorical data to integer data)
3) The data is of form pandas dataframe
4) The dataset has NaN values which need to be treated
5) After converting the data to integer type we need to check whether data is scaled or not.
"""

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          51717 non-null  object
 1   address                      51717 non-null  object
 2   name                         51717 non-null  object
 3   online_order                 51717 non-null  object
 4   book_table                   51717 non-null  object
 5   rate                         43942 non-null  object
 6   votes                        51717 non-null  int64 
 7   phone                        50509 non-null  object
 8   location                     51696 non-null  object
 9   rest_type                    51490 non-null  object
 10  dish_liked                   23639 non-null  object
 11  cuisines                     51672 non-null  object
 12  approx_cost(for two people)  51371 non-null  object
 13  reviews_list                 51

'\n1) The dataset is of form rows and columns\n2) The dataset contains categorical columns(We will have to perform encoding to convert the categorical data to integer data)\n3) The data is of form pandas dataframe\n4) The dataset has NaN values which need to be treated\n5) After converting the data to integer type we need to check whether data is scaled or not.\n'

#### Deleting unnecessary columns

In [12]:
# Dropping the url, dish_liked and phone from the dataset and saving the dataset into new variable
zomato = zomato_real.drop(['url', 'dish_liked', 'phone'], axis = 1)

In [13]:
zomato.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,Banashankari,Quick Bites,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


#### Removing the duplicates

In [14]:
# Checking for number of duplicates
zomato.duplicated().sum()
# Removing the duplicate rows
zomato.drop_duplicates(inplace = True)

#### Removing the null values

In [15]:
# Since there are null values in the dataset we will be removing them

# Checking total null values in each column
zomato.isnull().sum()

# Removing the null values
zomato.dropna(how = 'any', inplace = True)     # How = any specifies that if atleast one null value is present in the row 

In [16]:
# Checking whether we have removed the null values or not.
zomato.isnull().sum()

address                        0
name                           0
online_order                   0
book_table                     0
rate                           0
votes                          0
location                       0
rest_type                      0
cuisines                       0
approx_cost(for two people)    0
reviews_list                   0
menu_item                      0
listed_in(type)                0
listed_in(city)                0
dtype: int64

#### Changing the column names if necessary

In [17]:
zomato = zomato.rename(columns = {'approx_cost(for two people)' : 'cost', 'listed_in(type)' : 'type', 
                                  'listed_in(city)' : 'city'})

In [18]:
zomato.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,Banashankari,Quick Bites,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


#### Some Data Transformations

In [19]:
# Changing the 'cost' to string
zomato['cost'] = zomato['cost'].astype('str')

# Using lambda function to remove ',' from the 'cost' column
zomato['cost'] = zomato['cost'].apply(lambda x : x.replace(',', '.'))

# Changing the 'cost' to float
zomato['cost'] = zomato['cost'].astype('float')

In [20]:
# Removing '/5' from the rate column
zomato = zomato.loc[zomato.rate != 'NEW']                 # .loc -> Access a group of rows and columns
zomato = zomato.loc[zomato.rate != '-'].reset_index(drop = True)
remove_slash = lambda x : x.replace('/5', '') if type(x) == np.str else x

zomato['rate'] = zomato['rate'].apply(remove_slash).str.strip().astype('float')    # str.strip removes whitespaces around the string

In [21]:
# Adjust the column names

zomato.name = zomato.name.apply(lambda x : x.title())

zomato[['online_order', 'book_table']].replace(('Yes', 'No'), (True, False), inplace = True)

In [22]:
# Computing Mean Rating

restaurants = list(zomato['name'].unique())
zomato['Mean Rating'] = 0

for i in range(len(restaurants)):
    zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()
    

# Scaling the Mean Rating column
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1, 5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

In [23]:
zomato.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean Rating
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,3.99
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,3.97
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari,3.58
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari,3.45
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari,3.58


#### Performing text pre-processing

In [24]:
# Lower Casing the letters
zomato['reviews_list'] = zomato['reviews_list'].str.lower()

In [25]:
# Removing the punctuations
import string
PUNC_TO_REMOVE = string.punctuation               # String of punctuation marks

def remove_punctuation(text):
    """ Custom function to remove punctuation"""
    return text.translate(str.maketrans('', '', PUNC_TO_REMOVE))
    
zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text : remove_punctuation(text))

In [30]:
# Removal of Stopwords
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))      # list of common stopwords in english language

def remove_stopwords(text):
    """ Custom function to remove stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text : remove_stopwords(text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [31]:
# Removal of urls
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')              # Setting the url pattern
    return url_pattern.sub(r'', text)

zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text : remove_urls(text))

In [32]:
zomato[['reviews_list', 'cuisines']].sample(5)

Unnamed: 0,reviews_list,cuisines
1959,rated 50 ratedn busy lanes jayanagar find plac...,"Healthy Food, Cafe, Beverages"
10711,rated 50 ratedn peacefull place friendly staff...,"Cafe, Mughlai"
22969,rated 40 ratedn good location sub road n mid s...,"North Indian, Kerala"
16143,rated 40 ratedn tasty biryani personally liked...,"Biryani, South Indian"
29249,rated 40 ratedn small cute place tucked lanes ...,"Cafe, Desserts, Beverages"


#### Count-Vectorisation

In [33]:
restaurant_names = list(zomato['name'].unique())

def get_top_words(column, top_num_of_words, num_of_word):
    
    vec = CountVectorizer(ngram_range = num_of_word, stop_words = 'english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis = 0)
    words_freq = [(words, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x : x[1], reverse = True)
    
    return words_freq[: top_num_of_words]

In [34]:
zomato = zomato.drop(['address', 'rest_type', 'type', 'menu_item', 'votes'], axis = 1)

In [35]:
zomato.shape

(41237, 10)

In [36]:
zomato.isnull().sum()

name            0
online_order    0
book_table      0
rate            0
location        0
cuisines        0
cost            0
reviews_list    0
city            0
Mean Rating     0
dtype: int64

In [37]:
# Randomly sample 50% of your dataframe
df_percent = zomato.sample(frac = 0.5)

#### TF-IDX Vectorizer (Term Frequency - Inverse Document Frequency)

In [38]:
# This would give us a matrix where each column represents a word in a general vocabulary(all words that appears at 
# least once in each document) and each column represents a restaurant

In [39]:
df_percent.set_index('name', inplace = True)
indices = pd.Series(df_percent.index)

# Creating a Tf-Idf matrix
tfidf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

#### Creating a function to recommend restaurants

In [40]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find index of hotel entered
    idx = indices[indices == name].index[0]
    
    # Find hotels with similar cosine similarity value and order them in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)
    
    # Extract top 30 restaurant indices with similar cosine-similarity value
    top30_indices = list(score_series[0:31].index)
    
    # Names of top 30 restaurants
    for each in top30_indices:
        recommend_restaurant.append(list(df_percent.index)[each])
        
    # Creating a new dataset to show similar restaurants
    df_new = pd.DataFrame(columns = ['cuisines', 'Mean Rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines', 'Mean Rating', 'cost']][df_percent.index == each].sample()))
        
    # Drop the same named restaurants and sort only the top 10 by highest rating
    df_new = df_new.drop_duplicates(subset = ['cuisines', 'Mean Rating', 'cost'], keep = False)
    df_new = df_new.sort_values(by = 'Mean Rating', ascending = False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new

In [41]:
recommendtion = recommend("Wazir'S")
recommendtion

TOP 10 RESTAURANTS LIKE Wazir'S WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Pallavi Restaurant,"Biryani, Chinese, Andhra",3.58,500.0
Donne Biriyani Angadi Mane,"Biryani, Chinese",3.47,250.0
B.M.W - Bhookh Mitaane Wala,"North Indian, South Indian, Chinese",3.42,500.0
Agarwal Food Service,"North Indian, Chinese, Biryani",3.39,400.0
Hotel New Karavali,"Mangalorean, South Indian, North Indian",3.34,300.0
Desi Dhaba,"North Indian, Chinese, Rolls",3.19,300.0
Taza Khaana,"Chinese, North Indian",2.63,450.0
Sri Lakshmi Dhaba,"North Indian, Chinese",2.5,250.0
Foodiction,"North Indian, Fast Food, Chinese, Burger",2.35,500.0
Night Food Joint,"North Indian, Chinese",2.35,500.0
