In [None]:
import numpy as np
import pandas as pd
import warnings
import re
import nltk
from tabulate import tabulate
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from geopy.geocoders import Nominatim
from geopy.distance import great_circle 

### Loading the dataset

In [None]:
import os

# Print the full path of every file below the Kaggle input mount so the
# dataset location can be checked before loading it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Read the raw Zomato Bangalore restaurants CSV into `zomato_real`.
# NOTE(review): the path is hard-coded to the Kaggle input mount; adjust it when
# running the notebook anywhere else.
zomato_real=pd.read_csv("/kaggle/input/zomato-bangalore-restaurants/zomato.csv")
# Preview the first rows as the cell output.
zomato_real.head()

In [None]:
# Print column names, non-null counts and dtypes of the raw frame.
zomato_real.info()

### Data Cleaning and Feature Engineering

In [None]:
# Deleting unnecessary columns
# Drop the "url", "dish_liked" and "phone" columns and save the result as "zomato"
# (zomato_real itself is left untouched).
zomato=zomato_real.drop(['url','dish_liked','phone'],axis=1) 

In [None]:
# Removing the duplicates
# Print the number of fully duplicated rows explicitly: a bare expression that is
# not the last statement of a cell is evaluated but never displayed.
print('Duplicate rows:', zomato.duplicated().sum())
# Rebind instead of mutating in place so the step reads as "new frame from old".
zomato = zomato.drop_duplicates()

In [None]:
# Remove the rows containing NaN values from the dataset
# Print the per-column null counts first: a bare expression in the middle of a
# cell is evaluated but never displayed.
print(zomato.isnull().sum())
# Drop every row that has at least one missing value.
zomato = zomato.dropna(how='any')
zomato.info()

In [None]:
# Display the column names that remain after cleaning
zomato.columns

In [None]:
# Replace the verbose source column names with short ones
zomato = zomato.rename(
    columns={
        'approx_cost(for two people)': 'cost',
        'listed_in(type)': 'type',
        'listed_in(city)': 'city',
    }
)
zomato.columns

In [None]:
# Convert the cost column from text to a number
zomato['cost'] = (
    zomato['cost']
    .astype(str)                        # make sure every value is text
    .str.replace(',', '', regex=False)  # drop the thousands separator
    .astype(float)                      # parse the remaining digits
)
zomato.info()

In [None]:
# Clean the rating column: drop unrated rows, strip the '/5' suffix, convert to float
zomato = zomato.loc[zomato.rate !='NEW']
zomato = zomato.loc[zomato.rate !='-'].reset_index(drop=True)
# Use the builtin `str`: the `np.str` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where referencing it raises AttributeError.
remove_slash = lambda x: x.replace('/5', '') if isinstance(x, str) else x
zomato['rate'] = zomato['rate'].apply(remove_slash).str.strip().astype('float')
zomato['rate'].head()

In [None]:
# Title-case the restaurant names and turn the Yes/No flags of
# online_order / book_table into booleans
zomato['name'] = zomato['name'].str.title()

# Assign the result back to the column. Calling `.replace(..., inplace=True)` on
# `zomato.online_order` mutates an intermediate Series and, depending on the
# pandas version / copy-on-write mode, never reaches the DataFrame.
yes_no_to_bool = {'Yes': True, 'No': False}
zomato['online_order'] = zomato['online_order'].replace(yes_no_to_bool)
zomato['book_table'] = zomato['book_table'].replace(yes_no_to_bool)

In [None]:
# Checking for null values (per-column count) after the cleaning steps
zomato.isnull().sum()

In [None]:
# Computing Mean Rating
# `restaurants` keeps the list of distinct restaurant names available to later cells.
restaurants = list(zomato['name'].unique())

# Every row receives the mean `rate` of all rows sharing its restaurant name.
# A single grouped transform replaces the per-restaurant loop, which used
# chained indexing (`zomato[col][mask] = ...`): that form triggers
# SettingWithCopyWarning, does not write through under copy-on-write, and
# rescans the whole frame once per restaurant.
zomato['Mean Rating'] = zomato.groupby('name')['rate'].transform('mean')

In [None]:
# Preview the frame with the new 'Mean Rating' column.
zomato.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale 'Mean Rating' with a min-max scaler configured for the range (1, 5),
# then round the result to two decimals. The column is overwritten in place, so
# re-running this cell rescales the already-scaled values.
scaler = MinMaxScaler(feature_range = (1,5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

### Text Preprocessing

In [None]:
# Five random rows of the two text columns, shown before any text processing.
# No random_state is passed, so the rows differ between runs.
zomato[['reviews_list', 'cuisines']].sample(5)

In [None]:
# Lower-casing: normalise the review text so later token comparisons
# (e.g. against the stop-word set) are case-insensitive.
zomato["reviews_list"] = zomato["reviews_list"].str.lower()

In [None]:
# Removal of punctuation
import string
PUNCT_TO_REMOVE = string.punctuation
# Build the deletion table once instead of on every call; the function is
# applied to each row of the reviews column.
_PUNCT_DELETE_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)

def remove_punctuation(text):
    """Return `text` with every character of PUNCT_TO_REMOVE deleted.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` without ASCII punctuation; all other characters are unchanged.
    """
    return text.translate(_PUNCT_DELETE_TABLE)

# Strip punctuation from every review (the function is passed directly, no wrapper lambda).
zomato["reviews_list"] = zomato["reviews_list"].apply(remove_punctuation)

In [None]:
# Removal of stop words
from nltk.corpus import stopwords
# Set of English stop words used for membership tests in remove_stopwords.
# NOTE(review): stopwords.words() presumably needs the NLTK 'stopwords' corpus to
# be present locally — confirm for the target environment.
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` with every whitespace-separated word found in STOPWORDS removed.

    The input is converted with str() first; the surviving words are re-joined
    with single spaces.
    """
    kept_words = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_words)

zomato["reviews_list"] = zomato["reviews_list"].apply(remove_stopwords)

In [None]:
# Removal of URLs
import re
def remove_urls(text):
    """Return `text` with http(s):// and www. links deleted.

    A link is matched up to the next whitespace character; surrounding
    whitespace is left as it is.
    """
    # NOTE(review): in this notebook the punctuation-removal step runs before this
    # one and already deletes ':', '/' and '.', so the pattern can no longer match
    # anything in reviews_list. Moving URL removal ahead of punctuation removal
    # would make this step effective.
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Run the URL filter over every review (the function is passed directly, no wrapper lambda).
zomato["reviews_list"] = zomato["reviews_list"].apply(remove_urls)

In [None]:
# zomato[['reviews_list', 'cuisines']].sample(5)

In [None]:
# Restaurant names: the distinct values of the 'name' column, in order of first appearance.
restaurant_names = list(zomato['name'].unique())
# restaurant_names

In [None]:
def get_top_words(column, top_nu_of_words, nu_of_word):
    """Return the most frequent n-grams of a text column.

    Parameters
    ----------
    column : iterable of str
        The documents to count terms in.
    top_nu_of_words : int
        How many (term, count) pairs to return.
    nu_of_word : tuple
        Passed to CountVectorizer as `ngram_range`, e.g. (2, 2) for bigrams.

    Returns
    -------
    list of (term, count) tuples, most frequent first. English stop words
    are excluded by the vectorizer.
    """
    vectorizer = CountVectorizer(ngram_range=nu_of_word, stop_words='english')
    # Column-wise sum of the document-term matrix = total count per term.
    term_totals = vectorizer.fit_transform(column).sum(axis=0)
    ranked = sorted(
        ((term, term_totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:top_nu_of_words]

In [None]:
# Column names before the columns not needed by the recommender are removed.
zomato.columns

In [None]:
# Drop the columns that the later cells do not use.
# Not idempotent: re-running the cell raises KeyError because the columns are already gone.
zomato=zomato.drop(['address', 'type', 'menu_item', 'votes'],axis=1)

In [None]:
# Column names after the drop above.
zomato.columns

In [None]:
# Preview the cleaned frame.
zomato.head()

In [None]:
# Sampling 30% of the dataframe (frac=0.30); random_state=1 makes the sample reproducible
df_percent = zomato.sample(frac=0.30, random_state=1)

In [None]:
# (rows, columns) of the sampled frame.
df_percent.shape

### Term Frequency-Inverse Document Frequency


In [None]:
# Use the restaurant name as the row index (labels are not guaranteed to be unique).
# Not idempotent: re-running the cell raises KeyError because 'name' is no longer a column.
df_percent.set_index('name', inplace=True)

In [None]:
# Series whose position i holds the restaurant name of row i of df_percent.
indices = pd.Series(df_percent.index)

In [None]:
# Creating the TF-IDF matrix of the cleaned reviews (unigrams and bigrams,
# English stop words removed).
# min_df=1 keeps every term that occurs in at least one document, which is what
# the former min_df=0 amounted to; an integer 0 is rejected by the parameter
# validation of recent scikit-learn releases.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

In [None]:
# Pairwise dot products between all review vectors.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# L2-normalised (presumably the vectorizer default — confirm). The result is a
# dense rows x rows matrix, so memory grows quadratically with the sample size.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_similarities.shape

In [None]:
# Persist the sampled frame to 'df.pkl' in the working directory.
df_percent.to_pickle('df.pkl')

In [None]:
import pickle
# Persist the review-similarity matrix to 'cosine_similarities.pkl'
# using pickle protocol 4.
with open('cosine_similarities.pkl','wb') as f:
    pickle.dump(cosine_similarities, f, protocol = 4)

<a href="./cosine_similarities.pkl"> Download File </a>

In [None]:
# Great-circle distance between a restaurant locality and a user locality
def Get_distance(r_loc,u_loc):
    """Return the great-circle distance in km between two Bangalore localities.

    Parameters
    ----------
    r_loc : str
        Locality of the restaurant.
    u_loc : str
        Locality of the user.

    Returns
    -------
    The distance in kilometres rounded to one decimal, or 0 when either
    locality cannot be geocoded. Note that this 0 is indistinguishable from
    "same place" for callers that filter on a maximum distance.
    """
    geocoder = Nominatim(user_agent="app")
    # Prefix the city so that bare locality names resolve inside Bangalore.
    user_query = "Bangalore " + u_loc
    restaurant_query = "Bangalore " + r_loc

    # The restaurant is looked up first; the user lookup is skipped if it fails.
    restaurant_hit = geocoder.geocode(restaurant_query)
    if restaurant_hit is None:
        return 0

    user_hit = geocoder.geocode(user_query)
    if user_hit is None:
        return 0

    restaurant_point = (restaurant_hit.latitude, restaurant_hit.longitude)
    user_point = (user_hit.latitude, user_hit.longitude)
    return round(great_circle(restaurant_point, user_point).km, 1)

# Get_distance('Shivajinagar','Banashankari')

In [None]:
def recommend(name, cosine_similarities, df_percent,user_location):
    """Return up to 10 restaurants most similar to `name`, best rated first.

    Parameters
    ----------
    name : str
        Restaurant name; must be one of the index labels of `df_percent`.
    cosine_similarities : 2-D array-like
        Square similarity matrix whose row/column i corresponds to row i of
        `df_percent`.
    df_percent : pandas.DataFrame
        Frame indexed by restaurant name with at least the columns
        'cuisines', 'Mean Rating', 'cost', 'rest_type' and 'location'.
    user_location : str
        Only used by the optional distance filter that is commented out below.

    Returns
    -------
    pandas.DataFrame
        Columns: name, cuisines, Mean Rating, cost, rest_type, location.
        The queried restaurant itself is part of the candidate set, because
        its self-similarity is the highest score in its row.

    Raises
    ------
    IndexError
        If `name` is not present in the index of `df_percent`.
    """
    # Position of the first row carrying this name. It is looked up in the frame
    # that was passed in, so the function does not depend on a global lookup table.
    positions = np.flatnonzero(df_percent.index == name)
    if positions.size == 0:
        raise IndexError('Restaurant %r not found in the data' % (name,))
    idx = positions[0]

    # Similarity of every row to the queried one, highest first
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)

    # Row positions of the 31 best matches (this includes the queried row itself)
    top_positions = list(score_series.iloc[0:31].index)

    # Take exactly the rows that matched. Selecting by position replaces the
    # former row-by-row DataFrame.append (removed in pandas 2.0) and the lookup
    # by name followed by a random .sample(), which could return a different
    # same-named row on every run.
    shown_columns = ['cuisines', 'Mean Rating', 'cost', 'rest_type', 'location']
    df_new = (
        df_percent.iloc[top_positions][shown_columns]
        .rename_axis('name')
        .reset_index()
    )

    df_new = df_new.drop_duplicates(subset = ['name','Mean Rating','location','cost'])
    # Please uncomment the following 5 lines to get distance from customer location
#     l_l = []    
#     for l in df_new.location:
#         l_l.append(Get_distance(l,user_location))
#     df_new.insert(2, "Distance(km)",l_l, True) 
#     df_new = df_new[df_new['Distance(km)'] < 20]

    # Of the similar restaurants keep the 10 with the best mean rating
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)

    return df_new.reset_index(drop=True)

In [None]:
# Example: restaurants whose reviews are most similar to those of 'Jalsa'.
recommend('Jalsa', cosine_similarities, df_percent,'Banashankari')

In [None]:
# One-hot encoding of cuisines
# Every distinct cuisine becomes a column holding 1.0 for the rows that list it
# and 0.0 for all other rows.

df_percent_copy_sample = df_percent.copy()
df_percent_copy_sample['cuisines'] = df_percent_copy_sample.cuisines.str.split(',')

# Build one {cuisine: 1} record per row, in row order. Working positionally
# matters because the index (restaurant name) contains repeated labels: writing
# with `.at[name, cuisine]` would set the flag on every row sharing that name.
# Rows whose 'cuisines' value is not a list (e.g. missing) get no flags.
cuisine_records = [
    {token.strip(): 1 for token in tokens if token.strip()} if isinstance(tokens, list) else {}
    for tokens in df_percent_copy_sample['cuisines']
]
cuisine_flags = pd.DataFrame(cuisine_records, index=df_percent_copy_sample.index)

# A missing flag means the restaurant does not offer that cuisine
cuisine_flags = cuisine_flags.fillna(0).astype(float)

# Attach all cuisine columns in one step instead of growing the frame cell by cell
df_percent_copy_sample = pd.concat([df_percent_copy_sample, cuisine_flags], axis=1)

In [None]:
# Keep only the one-hot cuisine columns. The columns to remove are passed by
# keyword: the positional `axis` argument of DataFrame.drop was removed in
# pandas 2.0.
mat = df_percent_copy_sample.drop(columns=['cuisines','rest_type', 'online_order', 'book_table', 'rate', 'location', 'cost', 'reviews_list', 'city', 'Mean Rating'])
# df_percent_copy_sample.head()

### Computing the cosine similarities

In [None]:
# Pairwise cosine similarity between the cuisine vectors of all rows of `mat`,
# computed with sklearn.metrics.pairwise.cosine_similarity.
cosine_sim = cosine_similarity(mat)
cosine_sim.shape

In [None]:
# Example: restaurants whose cuisine vectors are most similar to that of 'Jalsa'.
recommend('Jalsa', cosine_sim, df_percent_copy_sample,'BTM Layout')

In [None]:
# Preview the cuisine matrix.
mat.head()

In [None]:
# Persist the cuisine matrix to 'restaurant_cuisines.pkl' in the working directory.
mat.to_pickle('restaurant_cuisines.pkl')

### List of cuisines

In [None]:
# Cuisine names in the column order of `mat`; position i corresponds to column i.
cuisine_list = list(mat.columns)

In [None]:
def get_cosine_sim_with_all_restaurant(user_cuisine_list,cuisine_list,mat):
    """Rank every row of `mat` by cosine similarity to the user's cuisine choice.

    Parameters
    ----------
    user_cuisine_list : list of str
        Cuisines picked by the user; each must occur in `cuisine_list`.
    cuisine_list : list of str
        Cuisine names, position i corresponding to column i of `mat`.
    mat : array-like
        One row per restaurant, one column per cuisine.

    Returns
    -------
    list of (row_position, similarity) tuples sorted by similarity, highest
    first; rows with equal similarity keep their original relative order.

    Raises
    ------
    ValueError
        If a requested cuisine is not in `cuisine_list`.
    """
    # Fail with a message naming the offending entries instead of the generic
    # "x is not in list" raised by list.index.
    unknown = [cuisine for cuisine in user_cuisine_list if cuisine not in cuisine_list]
    if unknown:
        raise ValueError('Unknown cuisine(s): %s' % ', '.join(map(str, unknown)))

    # Binary query vector with a 1 at the position of every requested cuisine
    user_vector = [0] * len(cuisine_list)
    for cuisine in user_cuisine_list:
        user_vector[cuisine_list.index(cuisine)] = 1

    # Shape (number of rows in mat, 1): similarity of each row to the query vector
    similarities = cosine_similarity(np.array(mat), np.array([user_vector]))

    sim_list = [(row, float(score)) for row, score in enumerate(similarities[:, 0])]
    sim_list.sort(key = lambda pair: pair[1], reverse = True)
    return sim_list


In [None]:
def recommend_based_on_cuisine_cosine_sim(user_cuisine_list,cuisine_list,mat,df_percent,user_location):
    """Return up to 10 restaurants whose cuisines best match the user's choice.

    Parameters
    ----------
    user_cuisine_list : list of str
        Cuisines picked by the user.
    cuisine_list : list of str
        Cuisine names in the column order of `mat`.
    mat : array-like
        One-hot cuisine matrix, row i describing row i of `df_percent`.
    df_percent : pandas.DataFrame
        Restaurant frame indexed by name; must contain the columns dropped
        below plus 'Mean Rating' and 'cost'.
    user_location : str
        Only used by the optional distance filter that is commented out below.

    Returns
    -------
    pandas.DataFrame
        The best-rated 10 of the 31 closest matches, with a fresh 0..n-1 index
        and the restaurant name in a 'name' column.
    """
    sim_list = get_cosine_sim_with_all_restaurant(user_cuisine_list,cuisine_list,mat)

    # Row positions of the 31 closest matches
    top_positions = [position for position, _score in sim_list[0:31]]

    # Each step returns a new frame, so the slice taken from df_percent is never
    # mutated in place.
    df_n = (
        df_percent.iloc[top_positions,:]
        .drop(['online_order', 'book_table', 'rate','reviews_list', 'city'], axis = 1)
        .rename_axis('name')
        .reset_index()
    )
    # Keep the first copy of a repeated restaurant, as `recommend` does.
    # keep=False would discard every copy, removing a restaurant from the
    # result precisely because it matched more than once.
    df_n = df_n.drop_duplicates(subset = ['name','Mean Rating','cost'])
    
    # Please uncomment the following 5 lines to get distance from customer location
#     l_l = []
#     for l in df_n.location:
#         l_l.append(Get_distance(l,user_location))
#     df_n.insert(2, "Distance(km)",l_l, True)
#     df_n = df_n[df_n['Distance(km)'] < 20]
    
    df_n = df_n.sort_values(by='Mean Rating', ascending=False).head(10)
    return df_n.reset_index(drop=True)

In [None]:
# Example call for a single cuisine. The returned frame is not displayed because
# this is not the last expression of the cell.
recommend_based_on_cuisine_cosine_sim(['North Indian'],cuisine_list,mat,df_percent,'Banashankari')

# Interactive driver: asks for a location and a search mode, then prints either
# a blend of review-based and cuisine-based recommendations for a restaurant, or
# cuisine-based recommendations for a list of cuisines.
print('Welcome to Yottabytes Restaurant Recommender!\n')
print('Please enter your location')
t_location = input().strip()

# Ask again until one of the two supported modes is chosen, so a typo does not
# abort the cell with ValueError or silently select the cuisine search.
t_option = None
while t_option not in (1, 2):
    print('\nWould you like to search by restaurant or cuisine? press 1 for Restaurant 2 for cuisine')
    try:
        t_option = int(input())
    except ValueError:
        print('Please type 1 or 2')

if t_option == 1:
    print('Enter Restaurant Name')
    # Restaurant names were title-cased during cleaning; normalise the input the
    # same way so that e.g. "jalsa" finds "Jalsa".
    t_rest = input().strip().title()
    try:
        t_rest_review_recommendation = recommend(t_rest, cosine_similarities, df_percent,t_location)
        t_rest_cui_recommendation = recommend(t_rest, cosine_sim, df_percent_copy_sample,t_location)
    except IndexError:
        # recommend raises IndexError when the name is not in the data
        print('\nSorry, no restaurant named %s was found.' % t_rest)
    else:
        print('\nRestaurants with similar reviews and cuisines to this are:')
        # Blend: the top 60% of the review-based list and the top 40% of the cuisine-based list
        t_n_review = int(0.6*len(t_rest_review_recommendation))
        t_n_cuisine = int(0.4*len(t_rest_cui_recommendation))
        t_combined = pd.concat([t_rest_review_recommendation.iloc[0:t_n_review,:], t_rest_cui_recommendation.iloc[0:t_n_cuisine,:]],ignore_index=True)
        t_combined = t_combined.sort_values(by='Mean Rating', ascending=False)
        t_combined = t_combined.drop_duplicates(subset = ['name','Mean Rating','location','cost'])
        print('TOP %s RESTAURANTS LIKE %s: ' % (str(len(t_combined)), t_rest))
        print(tabulate(t_combined, headers = 'keys', tablefmt = 'github')) 

else:
    print('\nThe available cuisines across all the restaurants are \n:')
    for i, t_available in enumerate(cuisine_list):
        print(i,' ',t_available)
    print('Enter number of cuisines')
    try:
        t_n = int(input())
    except ValueError:
        # Treat a non-numeric count as "no cuisines"
        t_n = 0
    t_cuisine_list  =[]
    print('Enter cuisines of your choice:')
    for j in range(t_n):
        t_c = input().strip()
        # Only names from the list above can be turned into a query vector
        if t_c in cuisine_list:
            t_cuisine_list.append(t_c)
        else:
            print('Skipping unknown cuisine: %s' % t_c)
    if t_cuisine_list:
        t_cuisine_recommendation = recommend_based_on_cuisine_cosine_sim(t_cuisine_list,cuisine_list,mat,df_percent,t_location)
        print('Restaurants with similar cuisines are:')
        print(tabulate(t_cuisine_recommendation, headers = 'keys', tablefmt = 'github'))
    else:
        print('No valid cuisines were entered.')
