In [1]:
import numpy as np
import pandas as pd
zomato = pd.read_csv("zomato.csv")
zomato.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [3]:
zomato.describe()

Unnamed: 0,votes
count,51717.0
mean,283.697527
std,803.838853
min,0.0
25%,7.0
50%,41.0
75%,198.0
max,16832.0


In [6]:
zomato.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)'],
      dtype='object')

In [7]:
# Let's delete unnecessay columns
# here I will drop the column "dish_liked", "phone", "url"
zomato2 = zomato.drop(['url','dish_liked','phone'], axis=1)

In [8]:
# let's check for duplicate values
print(zomato2.duplicated().sum())

43


In [9]:
# now let's drop the duplicate values
zomato2.drop_duplicates(inplace=True)

In [10]:
# Now let's have a look at the null values in the dataset
print(zomato2.isnull().sum())

address                           0
name                              0
online_order                      0
book_table                        0
rate                           7767
votes                             0
location                         21
rest_type                       227
cuisines                         45
approx_cost(for two people)     345
reviews_list                      0
menu_item                         0
listed_in(type)                   0
listed_in(city)                   0
dtype: int64


In [11]:
# let's drop all the null values
zomato2.dropna(how='any', inplace=True)

In [12]:
# Now I will change the names of some columns to make the columns easier to use in the process
zomato = zomato2.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type', 'listed_in(city)':'city'})

In [13]:
#Some Transformations
zomato['cost'] = zomato['cost'].astype(str) #Changing the cost to string
zomato['cost'] = zomato['cost'].apply(lambda x: x.replace(',','.')) #Using lambda function to replace ',' from cost
zomato['cost'] = zomato['cost'].astype(float)

In [14]:
#Removing '/5' from Rates
zomato = zomato.loc[zomato.rate !='NEW']
zomato = zomato.loc[zomato.rate !='-'].reset_index(drop=True)
remove_slash = lambda x: x.replace('/5', '') if type(x) == np.str else x
zomato.rate = zomato.rate.apply(remove_slash).str.strip().astype('float')

In [15]:
# Adjust the column names
zomato.name = zomato.name.apply(lambda x:x.title())
zomato.online_order.replace(('Yes','No'),(True, False),inplace=True)
zomato.book_table.replace(('Yes','No'),(True, False),inplace=True)

In [16]:
## Computing Mean Rating
restaurants = list(zomato['name'].unique())
zomato['Mean Rating'] = 0

for i in range(len(restaurants)):
    zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

In [18]:
## Lower Casing
zomato["reviews_list"] = zomato["reviews_list"].str.lower()

In [19]:
# now I will define a function to remove punctuation from the reviews
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
zomato["reviews_list"] = zomato["reviews_list"].apply(lambda text: remove_punctuation(text))

In [20]:
# Now let's remove the stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
zomato["reviews_list"] = zomato["reviews_list"].apply(lambda text: remove_stopwords(text))

In [23]:
# Now let's remove the urls from the reviews
import re
def removeurls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

zomato["reviews_list"] = zomato["reviews_list"].apply(lambda text: removeurls(text))

zomato[['reviews_list', 'cuisines']].sample(10)

Unnamed: 0,reviews_list,cuisines
36960,rated 50 ratedn doesnt dhaba appearance small ...,"Fast Food, North Indian"
9483,rated 10 ratedn today ordered mushakalfood 130...,"Mediterranean, Arabian, Lebanese"
473,rated 10 ratedn ordered something arrived thin...,"Desserts, Mithai"
38503,rated 40 ratedn ambience give 5ngood 5nservice...,"Continental, Mediterranean, European"
11309,rated 50 ratedn ordered gulab jamun rasgulla b...,"North Indian, Street Food, Mithai"
40275,rated 40 ratedn innerchef recently become one ...,"North Indian, Continental"
32016,rated 50 ratedn people craving authentic seafo...,Mangalorean
35351,rated 10 ratedn worst food chicken roast roll ...,"Fast Food, Beverages"
10433,rated 30 ratedn went blue bar saturday night a...,Continental
31970,rated 40 ratedn serve varieties dishes famous ...,"South Indian, North Indian, Chinese"


In [24]:
# let's process of the names of reataurants
restaurant_names = list(zomato['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

In [25]:
# Now let's drop the unnecessary columns
zomato=zomato.drop(['address','rest_type', 'type', 'menu_item', 'votes'],axis=1)

In [26]:
# Now I will randomly sample the data
df_percent = zomato.sample(frac=0.5)

df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

In [27]:
# Now Let's build the restaurant recommendation system
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#Now I will perform tf-idf vectorization on the dataset
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [33]:
# Now the final step is to create a function to recommend restaurants
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommendations = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommendations.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommendations:
        df = df.append(pd.DataFrame(df_percent[['cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))

    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df = df.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
    df = df.sort_values(by='Mean Rating', ascending=False).head(10)
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df)), name))   
    return df
recommend('Keto Kitchen')

TOP 10 RESTAURANTS LIKE Keto Kitchen WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Banjara Melting Pot,"North Indian, Chinese, Seafood",4.1,1.7
Brahmin Tiffins & Coffee,South Indian,4.1,100.0
Shake It Off,"Beverages, Fast Food",3.96,600.0
Bowring Kulfi,"Ice Cream, Desserts",3.84,150.0
Mc Donald'S,"Burger, Fast Food",3.71,500.0
Rayalaseema Chefs,"North Indian, Biryani, Andhra, Chinese",3.71,800.0
Mountain Spice,"Chinese, Nepalese, Tibetan, Momos",3.71,400.0
Sulaimani Chai,"Fast Food, Beverages",3.58,100.0
Punjab Mail,"North Indian, Chinese, Beverages",3.58,650.0
Barbeque Delight,"Arabian, Kerala, North Indian, Chinese, BBQ",3.48,800.0


In [34]:
recommend('Pai Vihar')

TOP 10 RESTAURANTS LIKE Pai Vihar WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Burma Burma,"Asian, Burmese",4.74,1.5
Cravy Wings,"American, Burger, Fast Food",4.11,400.0
Royal Biryani Kitchen,"Biryani, North Indian, Kebab",3.72,600.0
Cafe Aladdin,"Cafe, Chinese",3.71,500.0
3 Leafs,"North Indian, South Indian, Chinese",3.45,600.0
Karavali Grand,"Mangalorean, Seafood, North Indian, Chinese",3.45,600.0
Chowpatty,"North Indian, Fast Food, Street Food",3.43,300.0
Chicken Corner,"Biryani, North Indian, Fast Food, Chinese",3.35,400.0
Hotel Durga Shree Grand,"South Indian, North Indian, Chinese, Street Food",3.32,200.0
Karavali Family Restaurant,"Seafood, Biryani, South Indian, North Indian, ...",3.19,500.0


In [35]:
recommend('The Grill House')

TOP 10 RESTAURANTS LIKE The Grill House WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Communiti,"Continental, BBQ, Salad",4.67,1.5
Hammered,"North Indian, Thai, Japanese, Continental, Cafe",4.65,1.3
Brew And Barbeque - A Microbrewery Pub,"Continental, North Indian, BBQ, Steak",4.64,1.4
Smoke House Deli,"European, Italian, Desserts, Salad, Juices, Steak",4.61,1.6
Millers 46,"American, Continental, Steak",4.35,1.1
Portland Steakhouse & Cafe,"Steak, Italian, Continental, American, Burger,...",4.23,1.5
Cafe Azzure,"Cafe, Continental, Italian, Burger",4.15,1.2
Plan B,"Continental, BBQ, Steak",4.02,1.1
Plan B,"American, Continental, BBQ, Steak",4.02,1.2
Cinnamon,"North Indian, Asian, Continental",3.62,1.0
