# TF-IDF Cosine Similarty (Approach 2)

In [12]:
# import all the required libararies for this appraoch including TF-IDF, Cosine_Similarity fucntion
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs

# import the csv into a dataframe model for training
df = pd.read_csv('1.csv')

In [13]:
# print all the column that exist in the dataframe
df.columns

Index(['Unnamed: 0', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'location', 'rest_type', 'dish_liked', 'cuisines', 'approx_cost',
       'reviews_list', 'menu_item', 'type', 'neighbourhood'],
      dtype='object')

Upon initial Analyisis we found that the rating of the restaurant i.e. popularity of the restaurant is highly correlated to the cuisine and the number of cuisines available in a restaurant 
Also, the rating of the restaurant is dependent on the location of the restaurant which indirectly affects the average cost of the restaurant 

Hence, we consider the following features for our recommendation

In [14]:
df = df[['name','rate','location','cuisines','approx_cost','neighbourhood']]

# Data Cleaning and Preprocessing

In [15]:
# drop any duplicate resturants in the dataset with respect to restaurant name
df.drop_duplicates('name', inplace=True)

# drop the resturants with no cuisines
df.dropna(subset=['cuisines'], inplace=True)

# ouptut the number of null values in all columns after the above preprocessing
df.isnull().sum()

name                0
rate             2454
location            0
cuisines            0
approx_cost        36
neighbourhood       0
dtype: int64

In [16]:
# custom function to remove the punctuation from the columns containing text
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

# apply the remove punctutation fucntion on the cuisine column to remove the commas
df["cuisines"] = df["cuisines"].apply(lambda text: remove_punctuation(text))

In [17]:
# for comparing the location of restarants we combine the 
# location and neighbourhood column into a single column named addr

df['addr'] = df['neighbourhood'] +' '+ df['location']
df.head()

Unnamed: 0,name,rate,location,cuisines,approx_cost,neighbourhood,addr
0,Jalsa,4.1,Banashankari,North Indian Mughlai Chinese,800.0,Banashankari,Banashankari Banashankari
1,Spice Elephant,4.1,Banashankari,Chinese North Indian Thai,800.0,Banashankari,Banashankari Banashankari
2,San Churro Cafe,3.8,Banashankari,Cafe Mexican Italian,800.0,Banashankari,Banashankari Banashankari
3,Addhuri Udupi Bhojana,3.7,Banashankari,South Indian North Indian,300.0,Banashankari,Banashankari Banashankari
4,Grand Village,3.8,Basavanagudi,North Indian Rajasthani,600.0,Banashankari,Banashankari Basavanagudi


In [18]:
# Set the index of the dataframe to the name column
df.set_index('name', inplace=True)

# Obtain a series of all restaurants in the dataset
indices = pd.Series(df.index)

# Training 

In [19]:
# Create a new column in the dataframe which contains the tf-idf values of the restaurant's cuisine and address
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_cuisine_matrix = tfidf.fit_transform(df['cuisines'])
tfidf_location_matrix = tfidf.fit_transform(df['addr'])

In [20]:
# Create two new matrices which contains the cosine similarity values of the restaurant's cuisine and address
cuisine_similarity = cs(tfidf_cuisine_matrix,tfidf_cuisine_matrix)
location_similarity = cs(tfidf_location_matrix,tfidf_location_matrix)

# Recommendation

In [21]:
# define a custon recommend fucntion 
def recommend(name):
    # initialize the recommend list to empty
    recommendations = []

    # obatin the index of the restaurant entered by the user
    idx = indices[indices == name].index[0]

    # obatin the top 2000 similar restaurants with respect to the cuisine from the cuisine similarity matrix
    cuisine_series = pd.Series(cuisine_similarity[idx]).sort_values(ascending=False)
    top_cuisine = list(cuisine_series.iloc[0:2000].index) 

    # obatin the top 1000 similar restaurants with respect to the location from the location similarity matrix
    location_series = pd.Series(location_similarity[idx]).sort_values(ascending=False)
    top_location = list(location_series.iloc[0:1000].index)

    # combine the top 2000 similar restaurants with respect to the cuisine and the top 1000 similar restaurants with respect to the location
    for i in top_cuisine:
        for j in top_location:
            if i == j:
                # append the common restaurant to the recommend list
                recommendations.append(list(df.index)[i])

    # create a new dataframe with the restaurants in recommend list
    df_new = pd.DataFrame(columns=['cuisines','rate', 'approx_cost', 'addr'])
    # obatin the rating and average cost of the restarants in the recommend list
    for each in recommendations:
        df_new = df_new.append(pd.DataFrame(df[['cuisines','rate', 'approx_cost', 'addr']][df.index == each].sample()))
    
    # drop any duplicate resturants in the dataset with respect to restaurant name
    df_new = df_new.drop_duplicates()

    # compare the average cost of the restarants in the recommend list with the average cost of the restarant input by the user
    df_new['cost_diff'] = df_new['approx_cost'] - (df[df.index == name]['approx_cost'].values[0])
    
    # drop all the restarants with the cost difference more than 200
    drop_names = df_new[(df_new['cost_diff'] > 201) | (df_new['cost_diff'] < -201)].index
    df_new.drop(drop_names, inplace=True)
    df_new.drop(['cost_diff'], axis=1, inplace=True)

    # sort the restarants in the recommend list by their rating
    df_new = df_new.sort_values(by='rate', ascending=False)

    # return the top 10 recommendations from the recommend list
    return df_new.head(10)

# Output

In [22]:
recommend('Jalsa')

Unnamed: 0,cuisines,rate,approx_cost,addr
Empire Restaurant,North Indian Mughlai South Indian Chinese,4.4,750.0,Banashankari Jayanagar
Meghana Foods,Biryani North Indian Chinese,4.4,600.0,Banashankari Jayanagar
The Royal Corner - Pai Viceroy,North Indian Chinese,4.2,900.0,Banashankari Jayanagar
Kapoor's Cafe,North Indian,4.2,800.0,Banashankari Jayanagar
Jalsa,North Indian Mughlai Chinese,4.1,800.0,Banashankari Banashankari
Hara Fine Dine,North Indian Chinese,4.0,800.0,Banashankari Banashankari
The Kebab Room - Restaurant & Brewery,Mughlai North Indian,4.0,1000.0,Indiranagar Indiranagar
Mint and Mustard,North Indian Chinese,4.0,750.0,Banashankari Basavanagudi
1947,North Indian Chinese,4.0,850.0,Banashankari Banashankari
Silbatti,North Indian,4.0,750.0,HSR HSR
