In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
df = pd.read_csv('1.csv')

In [2]:
df = df[['name','rate','location','dish_liked','cuisines','approx_cost','neighbourhood']]
df.drop_duplicates('name', inplace=True)
df.dropna(subset=['cuisines'], inplace=True)

In [3]:
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["cuisines"] = df["cuisines"].apply(lambda text: remove_punctuation(text))

In [4]:
df['addr'] = df['neighbourhood'] +' '+ df['location']
#df.head()
train_df = df.iloc[:5000,:]
test_df = df.iloc[5000:,:]

In [5]:
df.set_index('name', inplace=True)
train_df.set_index('name', inplace=True)
test_df.set_index('name', inplace=True)
indices = pd.Series(df.index)

In [6]:
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_cuisine_matrix = tfidf.fit_transform(df['cuisines'])
tfidf_location_matrix = tfidf.fit_transform(df['addr'])

In [7]:
cuisine_similarity = cs(tfidf_cuisine_matrix,tfidf_cuisine_matrix)
location_similarity = cs(tfidf_location_matrix,tfidf_location_matrix)

In [8]:
print(df[df.index == '7 Hills Garden Restaurant']['approx_cost'].values)

[400.]


In [9]:
def recommend(name):
    recommendations = []

    idx = indices[indices == name].index[0]

    cuisine_series = pd.Series(cuisine_similarity[idx]).sort_values(ascending=False)
    #print(cuisine_series)
    top_cuisine = list(cuisine_series.iloc[0:2000].index)

    location_series = pd.Series(location_similarity[idx]).sort_values(ascending=False)
    top_location = list(location_series.iloc[0:1000].index)

    for i in top_cuisine:
        for j in top_location:
            if i == j:
                recommendations.append(list(df.index)[i])

    df_new = pd.DataFrame(columns=['cuisines','rate', 'approx_cost', 'addr'])
    
    for each in recommendations:
        df_new = df_new.append(pd.DataFrame(df[['cuisines','rate', 'approx_cost', 'addr']][df.index == each].sample()))
    
    df_new = df_new.drop_duplicates()

    df_new['cost_diff'] = df_new['approx_cost'] - (df[df.index == name]['approx_cost'].values[0])
    drop_names = df_new[(df_new['cost_diff'] > 201) | (df_new['cost_diff'] < -201)].index
    df_new.drop(drop_names, inplace=True)

    df_new = df_new.sort_values(by='rate', ascending=False)

    df_new.to_csv('temp.csv')
    df_new.drop(['cost_diff'], axis=1, inplace=True)
    return df_new.head(10)

In [10]:
recommend('Jalsa')

Unnamed: 0,cuisines,rate,approx_cost,addr
Empire Restaurant,North Indian Mughlai South Indian Chinese,4.4,750.0,Banashankari Jayanagar
Meghana Foods,Biryani North Indian Chinese,4.4,600.0,Banashankari Jayanagar
The Royal Corner - Pai Viceroy,North Indian Chinese,4.2,900.0,Banashankari Jayanagar
Kapoor's Cafe,North Indian,4.2,800.0,Banashankari Jayanagar
Jalsa,North Indian Mughlai Chinese,4.1,800.0,Banashankari Banashankari
Hara Fine Dine,North Indian Chinese,4.0,800.0,Banashankari Banashankari
1947,North Indian Chinese,4.0,850.0,Banashankari Banashankari
Mint and Mustard,North Indian Chinese,4.0,750.0,Banashankari Basavanagudi
Pramukh Family Restaurant,North Indian Chinese Mughlai,3.9,700.0,Banashankari Basavanagudi
Thamboola,North Indian Chinese Continental,3.9,800.0,Banashankari Banashankari


In [11]:
train_df['combined'] = train_df['addr']  + train_df['cuisines']

Unnamed: 0_level_0,rate,location,dish_liked,cuisines,approx_cost,neighbourhood,addr,combined
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Jalsa,4.1,Banashankari,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",North Indian Mughlai Chinese,800.0,Banashankari,Banashankari Banashankari,Banashankari BanashankariNorth Indian Mughlai ...
Spice Elephant,4.1,Banashankari,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",Chinese North Indian Thai,800.0,Banashankari,Banashankari Banashankari,Banashankari BanashankariChinese North Indian ...
San Churro Cafe,3.8,Banashankari,"Churros, Cannelloni, Minestrone Soup, Hot Choc...",Cafe Mexican Italian,800.0,Banashankari,Banashankari Banashankari,Banashankari BanashankariCafe Mexican Italian
Addhuri Udupi Bhojana,3.7,Banashankari,Masala Dosa,South Indian North Indian,300.0,Banashankari,Banashankari Banashankari,Banashankari BanashankariSouth Indian North In...
Grand Village,3.8,Basavanagudi,"Panipuri, Gol Gappe",North Indian Rajasthani,600.0,Banashankari,Banashankari Basavanagudi,Banashankari BasavanagudiNorth Indian Rajasthani


In [12]:
def get_matrices(name):
    global train_df
    row = df[df.index == name]
    row['combined'] = row['addr']  + row['cuisines']
    temp_df = train_df
    temp_df=temp_df.append(row)
    indices = pd.Series(temp_df.index)
    tfidf_combined_matrix = tfidf.fit_transform(temp_df['combined'])
    combined_similarity = cs(tfidf_combined_matrix,tfidf_combined_matrix)
    tfidf_combined_array = tfidf_combined_matrix.toarray()
    D = euclidean_distances(tfidf_combined_matrix)
    idx = indices[indices == name].index[0]
    
    return tfidf_combined_matrix,combined_similarity,tfidf_combined_array,D,idx

def recommend_2(name, method):
    tfidf_combined_matrix,combined_similarity,tfidf_combined_array,D,idx = get_matrices(name)
    if method == 'Cosine':
        similarity_scores = list(enumerate(combined_similarity[idx]))
        similarity_scores = list(filter(lambda x : x[1] > 0.7,similarity_scores))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        books_index = [i[0] for i in similarity_scores if i[0]!=5000]
        return df.iloc[books_index][1:]
    if method == 'Pearson':
        correlation = []
        for i in range(len(tfidf_combined_array)):
            correlation.append(pearsonr(tfidf_combined_array[idx], tfidf_combined_array[i])[0])
        correlation = list(enumerate(correlation))
        correlation = list(filter(lambda x : x[1] > 0.7,correlation))
        sorted_corr = sorted(correlation, reverse=True, key=lambda x: x[1])
        books_index = [i[0] for i in sorted_corr if i[0]!=5000]
        return df.iloc[books_index][1:]
    if method == 'Euclidian':
        distance = list(enumerate(D[idx]))
        distance = list(filter(lambda x : x[1] <1,distance))
        distance = sorted(distance, key=lambda x: x[1])
        books_index = [i[0] for i in distance if i[0]!=5000]
        return df.iloc[books_index][1:]
    if method == 'All':
        similarity_scores = list(enumerate(combined_similarity[idx]))
        similarity_scores = list(filter(lambda x : x[1] > 0.7,similarity_scores))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        cosine = [i[0] for i in similarity_scores if i[0]!=5000]
        cosine = df.iloc[cosine][1:]
        
        correlation = []
        for i in range(len(tfidf_combined_array)):
            correlation.append(pearsonr(tfidf_combined_array[idx], tfidf_combined_array[i])[0])
        correlation = list(enumerate(correlation))
        correlation = list(filter(lambda x : x[1] > 0.7,correlation))
        sorted_corr = sorted(correlation, reverse=True, key=lambda x: x[1])
        pearson = [i[0] for i in sorted_corr if i[0]!=5000]
        pearson = df.iloc[pearson][1:]
        
        
        distance = list(enumerate(D[idx]))
        distance = list(filter(lambda x : x[1] <1,distance))
        distance = sorted(distance, key=lambda x: x[1])
        euclidian = [i[0] for i in distance if i[0]!=5000]
        euclidian = df.iloc[euclidian][1:]
        
        return cosine,pearson,euclidian
    

In [13]:
recommend_2('Meghana Foods','Euclidian')

Unnamed: 0_level_0,rate,location,dish_liked,cuisines,approx_cost,neighbourhood,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Anand Donne Biriyani,3.6,Jayanagar,,Biryani,200.0,Banashankari,Banashankari Jayanagar
Biryanis And More,4.0,Jayanagar,"Prawn Biryani, Dragon Chicken, Chicken Boneles...",Biryani North Indian Chinese Andhra South Indian,750.0,Banashankari,Banashankari Jayanagar
Vindu Andhra Ruchulu,3.8,Jayanagar,"Raita, Chicken Curry, Fish, Mutton Biryani, Be...",Biryani North Indian Andhra,800.0,Basavanagudi,Basavanagudi Jayanagar
Desi Rasoi,3.8,Jayanagar,"Hara Bhara Kebab, Lunch Buffet, Naan, Babycorn...",Biryani North Indian Chinese Rajasthani,600.0,Basavanagudi,Basavanagudi Jayanagar


In [None]:
psum =0
csum = 0
esum = 0
cset = set()
pset = set()
eset = set()
count=0
for ind in test_df.index[:1000]:
    print(count)
    count+=1
    cosine,pearson,euclidian = recommend_2(ind,'All')
    csum += len(cosine)
    psum += len(pearson)
    esum += len(euclidian)
    for i in cosine.index:
        cset.add(i)
    for i in pearson.index:
        pset.add(i)
    for i in euclidian.index:
        eset.add(i)
print("Cosine Quality")
print(csum/1000)
print("Pearson Quality")
print(psum/1000)
print("Euclidian Quality")
print(esum/1000)
print()
print("Cosine Coverage")
print(len(cset)/len(train_df))
print("Pearson Coverage")
print(len(pset)/len(train_df))
print("Euclidian Coverage")
print(len(eset)/len(train_df))

0
1
2
3
4
5
6


In [None]:
"""Output Obtained
Cosine Quality
8.52
Pearson Quality
8.511
Euclidian Quality
15.848

Cosine Coverage
0.4212
Pearson Coverage
0.4208
Euclidian Coverage
0.6122
"""