In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
pd.options.mode.chained_assignment = None

In [2]:
df=pd.read_csv('zomato.csv')

In [3]:
df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


# Data Preprocessing

### Removing unnecessary columns

In [4]:
df=df.drop(['url','phone','dish_liked'],axis=1)

### Removing duplicate rows

In [5]:
df.duplicated().sum()

43

In [6]:
df=df.drop_duplicates()

### Removing Nan values

In [7]:
df.isnull().sum()

address                           0
name                              0
online_order                      0
book_table                        0
rate                           7767
votes                             0
location                         21
rest_type                       227
cuisines                         45
approx_cost(for two people)     345
reviews_list                      0
menu_item                         0
listed_in(type)                   0
listed_in(city)                   0
dtype: int64

In [8]:
df=df.dropna()

### Renaming columns

In [9]:
df=df.rename(columns={'rate':'rating','approx_cost(for two people)':'cost','listed_in(type)':'type','listed_in(city)':'city'})

### Data cleaning & transformation

In [10]:
df['rating'].unique()

array(['4.1/5', '3.8/5', '3.7/5', '3.6/5', '4.6/5', '4.0/5', '4.2/5',
       '3.9/5', '3.1/5', '3.0/5', '3.2/5', '3.3/5', '2.8/5', '4.4/5',
       '4.3/5', 'NEW', '2.9/5', '3.5/5', '2.6/5', '3.8 /5', '3.4/5',
       '4.5/5', '2.5/5', '2.7/5', '4.7/5', '2.4/5', '2.2/5', '2.3/5',
       '3.4 /5', '-', '3.6 /5', '4.8/5', '3.9 /5', '4.2 /5', '4.0 /5',
       '4.1 /5', '3.7 /5', '3.1 /5', '2.9 /5', '3.3 /5', '2.8 /5',
       '3.5 /5', '2.7 /5', '2.5 /5', '3.2 /5', '2.6 /5', '4.5 /5',
       '4.3 /5', '4.4 /5', '4.9/5', '2.1/5', '2.0/5', '1.8/5', '4.6 /5',
       '4.9 /5', '3.0 /5', '4.8 /5', '2.3 /5', '4.7 /5', '2.4 /5',
       '2.1 /5', '2.2 /5', '2.0 /5', '1.8 /5'], dtype=object)

In [11]:
df=df[(df['rating']!='NEW') & (df['rating']!='-')]

In [12]:
df['rating']=df['rating'].str.split('/').str[0].astype(float)

In [13]:
df['cost'].unique()

array(['800', '300', '600', '700', '550', '500', '450', '650', '400',
       '900', '200', '750', '150', '850', '100', '1,200', '350', '250',
       '950', '1,000', '1,500', '1,300', '199', '1,100', '1,600', '230',
       '130', '1,700', '1,350', '2,200', '1,400', '2,000', '1,800',
       '1,900', '180', '330', '2,500', '2,100', '3,000', '2,800', '3,400',
       '50', '40', '1,250', '3,500', '4,000', '2,400', '2,600', '1,450',
       '70', '3,200', '240', '6,000', '1,050', '2,300', '4,100', '120',
       '5,000', '3,700', '1,650', '2,700', '4,500', '80'], dtype=object)

In [14]:
df['cost']=df['cost'].astype(str)

In [15]:
df['cost']=df['cost'].apply(lambda x:x.replace(',','')).astype(float)

In [16]:
df['book_table']=df['book_table'].replace(['Yes','No'],[True,False])

In [17]:
df['online_order']=df['online_order'].replace(['Yes','No'],[True,False])

### Creating a column for Mean rating of each restaurant

In [18]:
restaurants=list(df['name'].unique())

In [19]:
df['Mean rating']=0

In [20]:
for i in range(len(restaurants)):
    df['Mean rating'][df['name']==restaurants[i]] = df['rating'][df['name']==restaurants[i]].mean()

In [21]:
scaler = MinMaxScaler(feature_range = (1,5))
df[['Mean rating']] = scaler.fit_transform(df[['Mean rating']]).round(2)

# Text (reviews) processing

### Lower casing

In [22]:
df['reviews_list']=df['reviews_list'].str.lower()

### Removing punctuations, numbers and special characters

In [23]:
df['reviews_list']=df['reviews_list'].str.replace('.',"",regex=True)

In [24]:
df['reviews_list']=df['reviews_list'].str.replace(r'[^A-Za-z0-9]'," ",regex=True)

### Removing stopwords

In [25]:
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(sentence):
    return " ".join([word for word in str(sentence).split() if word not in STOPWORDS])

df['reviews_list']=df['reviews_list'].apply(remove_stopwords)

### Removing URL's

In [26]:
pattern=r"https?://\S+|www\.\S+"

def remove(text):
    match=re.findall(pattern,text)
    
    for i in match:
        text=text.replace(i,"")
    return text

In [27]:
df['reviews_list']=df['reviews_list'].apply(remove)

# TF-IDF Vectorization

In [28]:
df.head()

Unnamed: 0,address,name,online_order,book_table,rating,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean rating
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,rated 40 rated n beautiful place dine inthe in...,[],Buffet,Banashankari,3.99
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,rated 40 rated n dinner family turned good cho...,[],Buffet,Banashankari,3.97
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,True,False,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,rated 30 rated n ambience good enough pocket f...,[],Buffet,Banashankari,3.58
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,False,False,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,rated 40 rated n great food proper karnataka s...,[],Buffet,Banashankari,3.45
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,False,False,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,rated 40 rated n good restaurant neighbourhood...,[],Buffet,Banashankari,3.58


In [29]:
features=df.drop(['address','rest_type','type','menu_item'],axis=1)

In [30]:
features_percent=features.sample(frac=0.5,random_state=101)

features_percent=features_percent.set_index('name')

indices=pd.Series(features_percent.index)

In [31]:
tfidf=TfidfVectorizer(ngram_range=(1, 2),min_df=0,stop_words='english')

In [32]:
tfidf_matrix=tfidf.fit_transform(features_percent['reviews_list'])

In [33]:
cosine_similarities = linear_kernel(tfidf_matrix,tfidf_matrix)

## Function that recommends restaurants

In [34]:
def recommend_restaurant(name,cosine_similarities=cosine_similarities):
    
    restaurants=[]
    
    index=indices[indices == name].index[0]
    
    res_series=pd.Series(cosine_similarities[index]).sort_values(ascending=False)
    
    top50_indices=list(res_series.iloc[0:51].index)
    
    for i in top50_indices:
        restaurants.append(list(features_percent.index)[i])
    
    df_result = pd.DataFrame(columns=['Mean rating','cuisines','cost'])
    
    for each in restaurants:
        df_result=df_result.append(features_percent.loc[each][['Mean rating','cuisines','cost']])
    

    df_result=df_result.drop_duplicates()
    df_result=df_result.sort_values('Mean rating',ascending=False).head(10)
    
    return df_result

In [35]:
recommend_restaurant('Jalsa')

Unnamed: 0,Mean rating,cuisines,cost
Asia Kitchen By Mainland China,5.0,"Asian, Chinese, Thai, Momos",1500.0
AB's - Absolute Barbecues,4.86,"European, Mediterranean, North Indian, BBQ",1600.0
AB's - Absolute Barbecues,4.86,"European, Mediterranean, North Indian, BBQ",1400.0
Biergarten,4.83,"Continental, North Indian, Chinese, European, ...",2100.0
Biergarten,4.83,"Continental, European, BBQ, Chinese, Asian",2400.0
The Black Pearl,4.78,"North Indian, European, Mediterranean, BBQ",1500.0
The Black Pearl,4.78,"North Indian, European, Mediterranean",1400.0
House Of Commons,4.77,"Continental, Asian, North Indian",1000.0
Big Pitcher,4.68,"American, Continental, North Indian, Mediterra...",1800.0
Communiti,4.67,"Continental, BBQ, Salad",1500.0
