In [2]:
import numpy as np
import pandas as pd

In [3]:
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import one_hot
from sklearn.metrics.pairwise import cosine_similarity

Using TensorFlow backend.


In [4]:
# Load the restaurant table, keep only the columns used below, retain one row
# per restaurant name, and discard rows that have no cuisine text.
wanted_columns = ['name','rate','location','dish_liked','cuisines','approx_cost','neighbourhood']
df = (
    pd.read_csv('1.csv')[wanted_columns]
    .drop_duplicates('name')
    .dropna(subset=['cuisines'])
)

In [5]:
import string

# Characters deleted from free-text fields before they are tokenised.
PUNCT_TO_REMOVE = string.punctuation

# Translation table built once here instead of on every call: the function is
# applied row-by-row to a DataFrame column, and the table never changes.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text):
    """Return `text` with every character of PUNCT_TO_REMOVE deleted.

    Parameters
    ----------
    text : str
        Input string. Characters are removed, not replaced, so words separated
        only by punctuation (e.g. "a,b") are fused together ("ab").

    Returns
    -------
    str
        A copy of `text` without punctuation characters.
    """
    return text.translate(_PUNCT_TABLE)

# Strip punctuation from the cuisine text.
df["cuisines"] = df["cuisines"].apply(remove_punctuation)

# Combined address text: "<neighbourhood> <location>".
neighbourhood_text = df['neighbourhood']
location_text = df['location']
df['addr'] = neighbourhood_text + ' ' + location_text

# Key the frame by restaurant name; `indices` keeps the names in row order so
# a name can be mapped back to its integer row position.
df = df.set_index('name')
indices = pd.Series(df.index)

In [6]:
# Display the prepared frame: indexed by restaurant name, with the new `addr` column.
df

Unnamed: 0_level_0,rate,location,dish_liked,cuisines,approx_cost,neighbourhood,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Jalsa,4.1,Banashankari,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",North Indian Mughlai Chinese,800.0,Banashankari,Banashankari Banashankari
Spice Elephant,4.1,Banashankari,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",Chinese North Indian Thai,800.0,Banashankari,Banashankari Banashankari
San Churro Cafe,3.8,Banashankari,"Churros, Cannelloni, Minestrone Soup, Hot Choc...",Cafe Mexican Italian,800.0,Banashankari,Banashankari Banashankari
Addhuri Udupi Bhojana,3.7,Banashankari,Masala Dosa,South Indian North Indian,300.0,Banashankari,Banashankari Banashankari
Grand Village,3.8,Basavanagudi,"Panipuri, Gol Gappe",North Indian Rajasthani,600.0,Banashankari,Banashankari Basavanagudi
...,...,...,...,...,...,...,...
Chime - Sheraton Grand Bengaluru Whitefield Hotel &...,4.3,"ITPL Main Road, Whitefield","Cocktails, Pizza, Buttermilk",Finger Food,2500.0,Whitefield,"Whitefield ITPL Main Road, Whitefield"
The Nest - The Den Bengaluru,3.4,"ITPL Main Road, Whitefield",,Finger Food North Indian Continental,1500.0,Whitefield,"Whitefield ITPL Main Road, Whitefield"
Nawabs Empire,3.2,"ITPL Main Road, Whitefield",,North Indian Chinese Arabian Momos,300.0,Whitefield,"Whitefield ITPL Main Road, Whitefield"
SeeYa Restaurant,3.3,KR Puram,,North Indian Kerala Chinese,350.0,Whitefield,Whitefield KR Puram


In [7]:
# Pull the three text columns out as plain Python lists, in row order:
# l -> address text, l2 -> cuisine text, l3 -> liked dishes (may contain NaN).
l = list(df['addr'])
l2 = list(df['cuisines'])
l3 = list(df['dish_liked'])

In [8]:
# One text document per restaurant built from the liked-dishes column:
# a missing value becomes the empty string, otherwise the commas are dropped.
dish = ['' if pd.isna(entry) else entry.replace(',', '') for entry in l3]

In [9]:
# Integer-encode the documents with the hashing trick, then pad to a fixed length.
#
# `one_hot` hashes words with Python's built-in `hash`, which is salted per
# process, so the same word gets a different id after every kernel restart.
# `hashing_trick(..., hash_function='md5')` is the same encoding with a stable
# hash, which makes the encoded documents reproducible across runs.
vocab_size = 50   # number of hash buckets; small, so distinct words will collide
max_length = 10   # every document is padded/truncated to this many tokens


def _encode_and_pad(docs):
    """Hash-encode each text in `docs` and pad the result to `max_length`.

    Parameters
    ----------
    docs : list of str
        One text document per restaurant.

    Returns
    -------
    (encoded, padded)
        `encoded` is the list of per-document integer id lists; `padded` is the
        2-D array produced by `pad_sequences` with zeros appended at the end
        (padding='post').
    """
    encoded = [hashing_trick(d, vocab_size, hash_function='md5') for d in docs]
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return encoded, padded


encoded_docs, padded_docs = _encode_and_pad(l)
encoded_docs2, padded_docs2 = _encode_and_pad(l2)
encoded_docs3, padded_docs3 = _encode_and_pad(dish)


In [10]:
# Define the model: embedding (8 dims per token) -> flatten -> one sigmoid unit.
# Each padded document of `max_length` token ids is mapped to a single value.
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 8)             400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 80)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 81        
Total params: 481
Trainable params: 481
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
# Run each padded document matrix through the model to get one scalar per
# restaurant and text field. This only calls predict(); no training step is
# visible in this notebook, so the outputs presumably come from the initial weights.
embd, embd2, embd3 = [
    model.predict(batch)
    for batch in (padded_docs, padded_docs2, padded_docs3)
]

In [12]:
# Inspect the three model-output arrays (address, cuisines, liked dishes).
embd,embd2,embd3

(array([[0.5022929 ],
        [0.5022929 ],
        [0.5022929 ],
        ...,
        [0.5126716 ],
        [0.51264906],
        [0.51373714]], dtype=float32),
 array([[0.50852466],
        [0.50823224],
        [0.50693524],
        ...,
        [0.5053636 ],
        [0.5060385 ],
        [0.50904024]], dtype=float32),
 array([[0.5217099 ],
        [0.50056547],
        [0.50743616],
        ...,
        [0.512577  ],
        [0.512577  ],
        [0.512577  ]], dtype=float32))

In [41]:
# One two-component feature vector per restaurant: the model output for the
# address text and for the cuisine text. The liked-dishes output (embd3) is
# left out of the vector.
sim = [[embd[row][0], embd2[row][0]] for row in range(indices.shape[0])]

In [42]:
# Feature vector of the first restaurant: [address output, cuisine output].
sim[0]

[0.5022929, 0.50852466]

In [49]:
def rec(name, top_n=11, min_similarity=1):
    """Recommend restaurants whose feature vector matches that of `name`.

    The query's row in `sim` is compared by cosine similarity with every row,
    and rows scoring at least `min_similarity` are kept. The share of kept rows
    (including the query itself, when it passes the cut) is printed as
    "Coverage".

    Parameters
    ----------
    name : str
        Restaurant name; must be present in `indices`.
    top_n : int, optional
        Maximum number of rows to return (default 11, the original fixed value).
    min_similarity : float, optional
        Inclusive similarity cut-off (default 1, the original fixed value).
        The scores are float32, so near-identical vectors can land just below
        1 and be excluded by the default.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `df`, ordered by decreasing similarity. Ties keep
        their original row order because the sort is stable.

    Raises
    ------
    IndexError
        If `name` is not found. The exception type is the one the original
        lookup raised; it now carries an explanatory message.
    """
    matches = indices[indices == name]
    if matches.empty:
        raise IndexError('restaurant name not found: {!r}'.format(name))
    idx = matches.index[0]

    query = [sim[idx]]
    corr = []
    for i in range(len(sim)):
        cs = cosine_similarity(query, [sim[i]])[0][0]
        if cs >= min_similarity:
            corr.append((cs, i))

    corr = sorted(corr, key=lambda pair: pair[0], reverse=True)

    # Leave the query itself out by filtering instead of list.remove(): rounding
    # can push the query's self-similarity below the cut-off, in which case it
    # is absent from the candidates and remove() would raise ValueError.
    reco = [i for _, i in corr if i != idx]
    print('Coverage = ',(len(corr)*100/df.shape[0]),'Percent')

    return df.iloc[reco[:top_n]]
    

    
# Example query: print the coverage and show the recommendations for one restaurant.
rec("Furry Fairy Paw Paradise")

Coverage =  5.441712204007286 Percent


Unnamed: 0_level_0,rate,location,dish_liked,cuisines,approx_cost,neighbourhood,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Spice Elephant,4.1,Banashankari,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",Chinese North Indian Thai,800.0,Banashankari,Banashankari Banashankari
Burger King,3.2,Basavanagudi,,Burger Fast Food Beverages,600.0,Banashankari,Banashankari Basavanagudi
Ande Ka Funda,3.8,Banashankari,,North Indian Fast Food Rolls,250.0,Banashankari,Banashankari Banashankari
Crave Bit Cafe,,JP Nagar,,Cafe,600.0,Bannerghatta Road,Bannerghatta Road JP Nagar
Starbucks,3.9,Bannerghatta Road,,Cafe Desserts,700.0,Bannerghatta Road,Bannerghatta Road Bannerghatta Road
Kolkata Kathi Rolls,3.8,JP Nagar,,Rolls,150.0,Bannerghatta Road,Bannerghatta Road JP Nagar
Berrylicious,3.8,JP Nagar,"Macaroon, Brownie Sundae, Waffles",Ice Cream Desserts Bakery,400.0,Bannerghatta Road,Bannerghatta Road JP Nagar
Punjabi Dhaba,3.5,Ejipura,"Butter Chicken, Paratha",North Indian Chinese,350.0,Brigade Road,Brigade Road Ejipura
Hunger House 32,,Ejipura,,North Indian Chinese,300.0,Brigade Road,Brigade Road Ejipura
Palmgrove,4.0,Residency Road,"Masala Dosa, Naan, Pongal, Grape Juice, Kulcha...",North Indian South Indian Chinese Juices,700.0,Brigade Road,Brigade Road Residency Road


In [44]:
# Spot-check: cosine similarity between the feature vectors of rows 11 and 0.
cosine_similarity([sim[11]],[sim[0]])

array([[0.99999857]], dtype=float32)