# Cosine_similairty, Pearson Correlation and Euclidian Distance (Appraoch 2)

In [1]:
#importing the required libraries
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
df = pd.read_csv('data\processed\zomato_final.csv')

# Data Cleaning and Preprocessing

In [2]:
# pre processing data for the implementation of the model (dropping irrelevant features)
#uncomment following line if performing evaluation
#df = df.sample(frac = 1)
df = df[['name','rate','location','dish_liked','cuisines','approx_cost','neighbourhood']]
df.drop_duplicates('name', inplace=True)
df.dropna(subset=['cuisines'], inplace=True)

In [3]:
#cleaning text data
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["cuisines"] = df["cuisines"].apply(lambda text: remove_punctuation(text))

# Training

In [4]:
#creating new column to combine 'neighbourhood' and 'location'
df['addr'] = df['neighbourhood'] +' '+ df['location']

#splitting into train and test
train_df = df.iloc[:5000,:]
test_df = df.iloc[5000:,:]

In [5]:
# Setting 'name' as the index for convenience
df.set_index('name', inplace=True)
train_df.set_index('name', inplace=True)
test_df.set_index('name', inplace=True)
indices = pd.Series(df.index)

In [6]:
#Creating new column with the text columns used for recommendation
train_df['combined'] = train_df['addr']  + train_df['cuisines']
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')

# Recommendation

In [7]:
# function to return similarity/distance matrices for the test instance
def get_matrices(name):
    global train_df
    row = df[df.index == name]
    row['combined'] = row['addr']  + row['cuisines']
    temp_df = train_df
    temp_df=temp_df.append(row)
    indices = pd.Series(temp_df.index)
    tfidf_combined_matrix = tfidf.fit_transform(temp_df['combined'])
    combined_similarity = cs(tfidf_combined_matrix,tfidf_combined_matrix)
    tfidf_combined_array = tfidf_combined_matrix.toarray()
    D = euclidean_distances(tfidf_combined_matrix)
    idx = indices[indices == name].index[0]
    
    return tfidf_combined_matrix,combined_similarity,tfidf_combined_array,D,idx

#function the implements the recommendation model for all 3 distance measures. When 'method=All'
#it returns the recommendation for all methods. This is used for evaluation of the model.
def recommend_2(name, method):
    tfidf_combined_matrix,combined_similarity,tfidf_combined_array,D,idx = get_matrices(name)
    if method == 'Cosine':
        similarity_scores = list(enumerate(combined_similarity[idx]))
        similarity_scores = list(filter(lambda x : x[1] > 0.7,similarity_scores)) #similarity threshold
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        books_index = [i[0] for i in similarity_scores if i[0]!=5000]
        return df.iloc[books_index][1:]
    
    if method == 'Pearson':
        correlation = []
        for i in range(len(tfidf_combined_array)):
            correlation.append(pearsonr(tfidf_combined_array[idx], tfidf_combined_array[i])[0])
        correlation = list(enumerate(correlation))
        correlation = list(filter(lambda x : x[1] > 0.7,correlation))#similarity threshold
        sorted_corr = sorted(correlation, reverse=True, key=lambda x: x[1])
        books_index = [i[0] for i in sorted_corr if i[0]!=5000]
        return df.iloc[books_index][1:]
    
    if method == 'Euclidian':
        distance = list(enumerate(D[idx]))
        distance = list(filter(lambda x : x[1] <1,distance))#similarity threshold
        distance = sorted(distance, key=lambda x: x[1])
        books_index = [i[0] for i in distance if i[0]!=5000]
        return df.iloc[books_index][1:]
    
    if method == 'All':
        similarity_scores = list(enumerate(combined_similarity[idx]))
        similarity_scores = list(filter(lambda x : x[1] > 0.7,similarity_scores))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        cosine = [i[0] for i in similarity_scores if i[0]!=5000]
        cosine = df.iloc[cosine][1:]
        
        correlation = []
        for i in range(len(tfidf_combined_array)):
            correlation.append(pearsonr(tfidf_combined_array[idx], tfidf_combined_array[i])[0])
        correlation = list(enumerate(correlation))
        correlation = list(filter(lambda x : x[1] > 0.7,correlation))
        sorted_corr = sorted(correlation, reverse=True, key=lambda x: x[1])
        pearson = [i[0] for i in sorted_corr if i[0]!=5000]
        pearson = df.iloc[pearson][1:]
        
        
        distance = list(enumerate(D[idx]))
        distance = list(filter(lambda x : x[1] <1,distance))
        distance = sorted(distance, key=lambda x: x[1])
        euclidian = [i[0] for i in distance if i[0]!=5000]
        euclidian = df.iloc[euclidian][1:]
        
        return cosine,pearson,euclidian
    

# Output

In [8]:
recommend_2('Meghana Foods','Euclidian')

Unnamed: 0_level_0,rate,location,dish_liked,cuisines,approx_cost,neighbourhood,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Anand Donne Biriyani,3.6,Jayanagar,,Biryani,200.0,Banashankari,Banashankari Jayanagar
Biryanis And More,4.0,Jayanagar,"Prawn Biryani, Dragon Chicken, Chicken Boneles...",Biryani North Indian Chinese Andhra South Indian,750.0,Banashankari,Banashankari Jayanagar
Vindu Andhra Ruchulu,3.8,Jayanagar,"Raita, Chicken Curry, Fish, Mutton Biryani, Be...",Biryani North Indian Andhra,800.0,Basavanagudi,Basavanagudi Jayanagar
Desi Rasoi,3.8,Jayanagar,"Hara Bhara Kebab, Lunch Buffet, Naan, Babycorn...",Biryani North Indian Chinese Rajasthani,600.0,Basavanagudi,Basavanagudi Jayanagar


# Evaluation

In [None]:
#Used to evaluate the model, takes alot of time to run. DataFrame is sampled(shuffled) before running.
psum =0
csum = 0
esum = 0
cset = set()
pset = set()
eset = set()
count=0
for ind in test_df.index[:1000]:
    print(count)
    count+=1
    cosine,pearson,euclidian = recommend_2(ind,'All')
    csum += len(cosine)
    psum += len(pearson)
    esum += len(euclidian)
    for i in cosine.index:
        cset.add(i)
    for i in pearson.index:
        pset.add(i)
    for i in euclidian.index:
        eset.add(i)
print("Cosine Quality")
print(csum/1000)
print("Pearson Quality")
print(psum/1000)
print("Euclidian Quality")
print(esum/1000)
print()
print("Cosine Coverage")
print(len(cset)/len(train_df))
print("Pearson Coverage")
print(len(pset)/len(train_df))
print("Euclidian Coverage")
print(len(eset)/len(train_df))

In [None]:
"""Output Obtained
Cosine Quality
8.52
Pearson Quality
8.511
Euclidian Quality
15.848

Cosine Coverage
0.4212
Pearson Coverage
0.4208
Euclidian Coverage
0.6122
"""