In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [2]:
zomato_real=pd.read_csv(r"C:\Users\hp\Downloads\zomato.csv (1).zip")
zomato_real.head(2) 

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari


In [3]:
#Deleting Unnnecessary Columns
zomato=zomato_real.drop(['url','dish_liked','phone'],axis=1) #Dropping the column "dish_liked", "phone", "url" and saving the new dataset as "zomato"

In [4]:
#Removing the Duplicates
zomato.duplicated().sum()
zomato.drop_duplicates(inplace=True)

In [5]:
#Remove the NaN values from the dataset
zomato.isnull().sum()
zomato.dropna(how='any',inplace=True)

In [6]:
#Changing the column names
zomato = zomato.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type', 'listed_in(city)':'city'})

In [7]:
#Some Transformations
zomato['cost'] = zomato['cost'].astype(str) #Changing the cost to string
zomato['cost'] = zomato['cost'].apply(lambda x: x.replace(',','.')) #Using lambda function to replace ',' from cost
zomato['cost'] = zomato['cost'].astype(float)

In [8]:
#Removing '/5' from Rates
zomato = zomato.loc[zomato.rate !='NEW']
zomato = zomato.loc[zomato.rate !='-'].reset_index(drop=True)
remove_slash = lambda x: x.replace('/5', '') if type(x) == str else x
zomato.rate = zomato.rate.apply(remove_slash).str.strip().astype('float')

In [9]:
# Adjust the column names
zomato.name = zomato.name.apply(lambda x:x.title())
zomato.online_order.replace(('Yes','No'),(True, False),inplace=True)
zomato.book_table.replace(('Yes','No'),(True, False),inplace=True)

In [10]:
## Computing Mean Rating
restaurants = list(zomato['name'].unique())
zomato['Mean Rating'] = 0

In [11]:
for i in range(len(restaurants)):
    zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()
    
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

In [12]:
zomato.columns

Index(['address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'location', 'rest_type', 'cuisines', 'cost', 'reviews_list',
       'menu_item', 'type', 'city', 'Mean Rating'],
      dtype='object')

In [13]:
zomato.shape

(41237, 15)

In [14]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
## Lower Casing
zomato["reviews_list"] = zomato["reviews_list"].str.lower()

In [16]:
## Removal of Puctuations
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

zomato["reviews_list"] = zomato["reviews_list"].apply(lambda text: remove_punctuation(text))

In [17]:
## Removal of Stopwords
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

zomato["reviews_list"] = zomato["reviews_list"].apply(lambda text: remove_stopwords(text))

In [18]:
## Removal of URLS
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

zomato["reviews_list"] = zomato["reviews_list"].apply(lambda text: remove_urls(text))

In [19]:
#zomato[['reviews_list', 'cuisines']].sample(5)

In [20]:
# RESTAURANT NAMES:
restaurant_names = list(zomato['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

In [21]:
#print(restaurant_names)

In [22]:
zomato=zomato.drop(['address','rest_type', 'type', 'menu_item', 'votes', 'city', 'location'],axis=1)
zomato.drop_duplicates(inplace=True)
import pandas

In [23]:
zomato.head()

Unnamed: 0,name,online_order,book_table,rate,cuisines,cost,reviews_list,Mean Rating
0,Jalsa,True,True,4.1,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn beautiful place dine inthe int...,3.99
1,Spice Elephant,True,False,4.1,"Chinese, North Indian, Thai",800.0,rated 40 ratedn dinner family turned good choo...,3.97
2,San Churro Cafe,True,False,3.8,"Cafe, Mexican, Italian",800.0,rated 30 ratedn ambience good enough pocket fr...,3.58
3,Addhuri Udupi Bhojana,False,False,3.7,"South Indian, North Indian",300.0,rated 40 ratedn great food proper karnataka st...,3.45
4,Grand Village,False,False,3.8,"North Indian, Rajasthani",600.0,rated 40 ratedn good restaurant neighbourhood ...,3.58


In [24]:
zomato.loc[(zomato['name']=='Jalsa')]

Unnamed: 0,name,online_order,book_table,rate,cuisines,cost,reviews_list,Mean Rating
0,Jalsa,True,True,4.1,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn beautiful place dine inthe int...,3.99
400,Jalsa,True,True,4.1,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn beautiful place dine inthe int...,3.99
485,Jalsa,True,True,4.1,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn beautiful place dine inthe int...,3.99
1942,Jalsa,True,True,4.1,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn beautiful place dine inthe int...,3.99
2775,Jalsa,True,True,4.1,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn super ambiencengreat food spic...,3.99
6104,Jalsa,True,True,4.2,"North Indian, Mughlai",1.5,rated 40 ratedn pretty decent place eat mughal...,3.99
15334,Jalsa,True,True,4.1,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn beautiful place dine inthe int...,3.99
16883,Jalsa,True,True,4.1,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn beautiful place dine inthe int...,3.99
32920,Jalsa,True,True,4.2,"North Indian, Mughlai",1.5,rated 40 ratedn friendly staff average ambienc...,3.99


In [25]:
 zomato = (zomato.groupby(['name','online_order','book_table'])
      .agg({'cuisines': lambda x: ",".join(x),'reviews_list': lambda x: ",".join(x),'rate':'mean','cost':'mean','Mean Rating':'mean'})
      .rename({'rate' : 'mean_rate','cost' : 'mean_cost', 'Mean Rating': 'Mean Rating'},axis=1)
      .reset_index())

In [26]:
zomato.shape

(7184, 8)

In [27]:
zomato.set_index('name', inplace=True)
indices = pd.Series(zomato.index)
zomato.reset_index(inplace=True)

In [28]:
# # Randomly sample 60% of your dataframe
# df_percent = zomato.sample(frac=0.5)
# df_percent.set_index('name', inplace=True)
# indices = pd.Series(df_percent.index)

In [29]:
# df_percent.reset_index(inplace=True)

In [30]:
zomato.columns

Index(['name', 'online_order', 'book_table', 'cuisines', 'reviews_list',
       'mean_rate', 'mean_cost', 'Mean Rating'],
      dtype='object')

In [31]:
zomato.loc[(zomato['name']=='Jalsa')]

Unnamed: 0,name,online_order,book_table,cuisines,reviews_list,mean_rate,mean_cost,Mean Rating
3034,Jalsa,True,True,"North Indian, Mughlai, Chinese,North Indian, M...",rated 40 ratedn beautiful place dine inthe int...,4.122222,622.555556,3.99


In [32]:
### links
## https://stackoverflow.com/questions/64235312/how-to-implodereverse-of-pandas-explode-based-on-a-column

In [33]:
zomato.head(2)

Unnamed: 0,name,online_order,book_table,cuisines,reviews_list,mean_rate,mean_cost,Mean Rating
0,#Feeltheroll,False,False,"Fast Food,Fast Food",rated 50 ratedn egg chicken roll paneer roll r...,3.4,200.0,3.06
1,#L-81 Cafe,True,False,"Fast Food, Beverages,Fast Food, Beverages,Fast...",rated 40 ratedn little cafe set beautiful loca...,3.9,400.0,3.71


In [34]:
# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(zomato['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [35]:
cosine_similarities.shape

(7184, 7184)

In [36]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(zomato.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'mean_cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(zomato[['cuisines','Mean Rating', 'mean_cost']][zomato.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'mean_cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head()
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new

In [37]:
df_new = recommend('Pai Vihar')

TOP 5 RESTAURANTS LIKE Pai Vihar WITH SIMILAR REVIEWS: 


In [38]:
df_new

Unnamed: 0,cuisines,Mean Rating,mean_cost
1171,"American, Cafe, Continental,American, Cafe, Co...",4.35,1.7
1167,"American, Cafe, Continental",4.35,1.7
3602,"Cafe, Desserts,Cafe, Desserts,Cafe, Desserts,C...",4.35,800.0
6487,"Chinese,Chinese,Chinese,Chinese,Chinese",4.28,3.5
5694,"North Eastern, Asian, Naga, Steak, Momos,North...",4.23,800.0


In [39]:
pickle.dump(zomato.to_dict(),open('rest_name.pkl','wb'))
#df_new.to_pickle('restaurant_dict.pkl')

In [40]:
pickle.dump(cosine_similarities,open('similarity.pkl', 'wb'))

In [41]:
## Pycharm code test

In [42]:
restaurant_dict = pickle.load(open('rest_name.pkl','rb'))
restaurant = pd.DataFrame(restaurant_dict)

In [43]:
rest_index = restaurant[restaurant['name'] == 'Pai Vihar'].index[0]

In [44]:
print(rest_index)

4636


In [45]:
similarity = pickle.load(open('similarity.pkl','rb'))
distances = similarity[rest_index]

In [46]:
print(distances)

[0.75916596 0.35771326 0.00944374 ... 0.01181826 0.0064631  0.00507652]


In [47]:
restaurant.reset_index(inplace=True)

In [48]:
restaurant_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[0])[1:6]
print(restaurant_list)

[(7182, 0.006463103863318846), (7181, 0.01181825664336817), (7180, 0.0018822417030589093), (7179, 0.38336960237586726), (7178, 0.007351103652282357)]


In [49]:
recommended_rest = []
for i in restaurant_list:
    recommended_rest.append(restaurant.iloc[i[0],1])

In [50]:
print(recommended_rest)

['Zyara', "Zu'S Doner Kebaps", 'Zoroy Luxury Chocolate', "Zoey'S", 'Zodiac Grills']
