In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('food_250.csv', usecols=['name', 'ingredients'])
df.head()

Unnamed: 0,name,ingredients
0,Balu shahi,"Maida flour, yogurt, oil, sugar"
1,Boondi,"Gram flour, ghee, sugar"
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins"
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su..."
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,..."


In [3]:
items = set()
for x in df.ingredients:
    for val in x.split(', '):
        items.add(val.lower().strip())

items = list(items)

In [4]:
new_df = pd.DataFrame(data=np.zeros((255, 367), dtype=int), columns=['name', 'ingredients'] + items)

In [5]:
for i, d in df.iterrows():
    new_df.loc[i, ['name', 'ingredients']] = d[:2]

    for val in d[1].split(', '):
        item = val.lower().strip()
        new_df.loc[i, item] = 1

In [6]:
new_df.head()

Unnamed: 0,name,ingredients,turmeric,split urad dal,avocado oil,citric acid,green chili,chillies,khaman,chana daal,...,tomato,glutinous rice,chia seed,elachi,coconut flakes,sauce,litre milk,skimmed milk powder,rice,edible gum
0,Balu shahi,"Maida flour, yogurt, oil, sugar",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Boondi,"Gram flour, ghee, sugar",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
query = ['bread crumbs', 'paneer', 'garam masala']

In [16]:
def embed_query(q, items):
    embedding = np.zeros((365,), dtype=int)
    it = np.array(items)
    
    for q in query:
        idx = np.where(q == it)
        embedding[idx] = 1

    return embedding

embed_query(query, items)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [23]:
emb_qy = embed_query(query, items)
sim = cosine_similarity(new_df.iloc[:, 2:], emb_qy.reshape(1, -1)).ravel()
# sim

In [24]:
idx_sorted = np.argsort(sim)[::-1]
# print(len(idx_sorted))
# print(sim[idx_sorted])

for val, idx in np.column_stack((sim[idx_sorted], idx_sorted)):
#     print(val, idx)
    if val > 0:
        print(new_df.iloc[int(idx), 0])

Kofta
Aloo tikki
Paneer butter masala
Bhindi masala
Dal makhani 
Kachori
Kadai paneer
Chole bhature
Chicken Tikka masala
Lauki ki subji
Makki di roti sarson da saag
Bilahi Maas
Mushroom do pyaza
Mushroom matar
Dum aloo
Paneer tikka masala
Palak paneer
Dal tadka
Aloo shimla mirch
Tandoori Chicken
Vegetable jalfrezi
Aloo gobi
Samosa
Dahi vada
Shahi paneer
Beef Fry
Aloo matar
Chana masala


In [37]:
new_query = 'Shahi paneer'
new_embed = new_df.loc[new_df['name'] == new_query].values.ravel()[2:]
new_embed

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [38]:
new_sim = cosine_similarity(new_df.iloc[:, 2:], new_embed.reshape(1, -1)).ravel()

In [39]:
new_sim

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.25819889, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.36514837,
       0.        , 0.        , 0.        , 0.2       , 0.2       ,
       0.36514837, 0.        , 0.2       , 0.        , 0.2    

In [44]:
new_idx_sorted = np.argsort(new_sim)[::-1]

count = 10
for val, idx in np.column_stack((new_sim[new_idx_sorted], new_idx_sorted)):
#     print(val, idx)
    if count == 0:
        break
    if 0 < val < 0.99:
        print(new_df.iloc[int(idx), 0], val)
        count -= 1

Mushroom do pyaza 0.6
Sev tameta 0.4472135954999579
Khichdi 0.39999999999999997
Vegetable jalfrezi 0.39999999999999997
Dal tadka 0.39999999999999997
Makki di roti sarson da saag 0.39999999999999997
Palak paneer 0.39999999999999997
Samosa 0.39999999999999997
Lauki ki subji 0.39999999999999997
Rajma chaval 0.39999999999999997


In [83]:
queries = ['Balu shahi', 'Gajar ka halwa', 'Gulab jamun']

embedds = []
for q in queries:
#     print(q)
    food = new_df.loc[new_df['name'] == q].values[0][2:]
    embedds.append(food)
#     print(food)
    
# print(len(embs))
embedds = np.logical_or.reduce(embedds)

In [84]:
new_sims = cosine_similarity(new_df.iloc[:, 2:], embedds.reshape(1, -1)).ravel()

In [85]:
new_sims

array([0.53452248, 0.3086067 , 0.65465367, 0.25354628, 0.75592895,
       0.        , 0.08908708, 0.40089186, 0.3086067 , 0.26726124,
       0.3086067 , 0.40089186, 0.10910895, 0.13363062, 0.15430335,
       0.13363062, 0.26726124, 0.        , 0.15430335, 0.35856858,
       0.3086067 , 0.3086067 , 0.18898224, 0.10910895, 0.11952286,
       0.3086067 , 0.        , 0.13363062, 0.3086067 , 0.18898224,
       0.26726124, 0.        , 0.        , 0.        , 0.15430335,
       0.26726124, 0.11952286, 0.15430335, 0.20203051, 0.        ,
       0.3086067 , 0.18898224, 0.23904572, 0.18898224, 0.        ,
       0.18898224, 0.20203051, 0.18898224, 0.        , 0.26726124,
       0.15430335, 0.        , 0.18898224, 0.        , 0.        ,
       0.        , 0.3086067 , 0.        , 0.26726124, 0.18898224,
       0.        , 0.40089186, 0.13363062, 0.3086067 , 0.        ,
       0.        , 0.26726124, 0.10910895, 0.        , 0.        ,
       0.        , 0.11952286, 0.        , 0.23904572, 0.     

In [86]:
idx_sorts = np.argsort(new_sims)[::-1]

count = 10
for val, idx in np.column_stack((new_sims[idx_sorts], idx_sorts)):
#     print(val, idx)
    if count == 0:
        break
    if 0 < val < 0.99:
        food = new_df.iloc[int(idx), 0]
        if food not in queries:
            count -= 1
            print(food)

Shankarpali
Lassi
Kaju katli
Payasam
Sohan papdi
Dharwad pedha
Kalakand
Laddu
Basundi
Mihidana


In [93]:
df.loc[df['name'] == 'Rasam', 'ingredients'].values

array(['Tomato, curry leaves, garlic, mustard seeds, hot water'],
      dtype=object)

In [None]:
.useless = ['axone', 'yogurt', 'wine vinegar', 'whole red', 'whole wheat flour', 'watercress', 'vermicelli pudding',
           'vegetable oil', 'tomato', 'tomato paste', 'thick poha', ]
useful = ['mushrooms', 'vermicelli', 'poha', ]


In [None]:
items

In [None]:
'''
250 x 366

name potato tomato almonsd
halwa 0         0       1



q = [sugar, cheese, bread] (365, )
= [0, 0, 0, 1, 0, 0, 1, 0, 1]


remove = { teaspoon, teaspoons, to, taste }
'''

In [None]:
import pandas as pd
import numpy as np

raw = pd.read_excel('/content/IndianFoodDatasetXLS_6000.xlsx')
df = raw.copy()

columns_to_drop = ['RecipeName', 'Ingredients', 'URL', 'PrepTimeInMins' , 'CookTimeInMins', 'Course',
                   'TotalTimeInMins','TranslatedInstructions', 'Instructions', 'Servings', 'Srno', 'Diet']

df = df.drop(columns = columns_to_drop).dropna()

# data seems to contain more than just indian cuisines, so I drop these
cuisines_to_drop = ['Mexican', 'Italian Recipes', 'Thai', 'Chinese', 'Asian', 'Middle Eastern', 'European',
                   'Arab', 'Japanese', 'Vietnamese', 'British', 'Greek', 'French', 'Mediterranean', 'Sri Lankan',
                   'Indonesian', 'African', 'Korean', 'American', 'Carribbean', 'World Breakfast', 'Malaysian', 'Dessert',
                   'Afghan', 'Snack', 'Jewish', 'Brunch', 'Lunch', 'Continental', 'Fusion']

# need to drop desserts and breakfasts, as these are much less likely to contain spices
# courses_to_drop = ['South Indian Breakfast', 'Snack', 'Appetizer', 'Indian Breakfast', 'Dessert', 'North Indian Breakfast',
                #   'World Breakfast', 'Brunch', 'Side Dish']

df = df[~df['Cuisine'].isin(cuisines_to_drop)]
# df = df[~df['Course'].isin(courses_to_drop)]
df.shape

In [None]:
df.head()

In [None]:
# df = df['TranslatedIngredients']

def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

#create boolean mask
mask = df.loc[:, 'TranslatedIngredients'].apply(isEnglish)
df = df[mask].dropna().reset_index(drop=True)

df.shape #I see we dropped about 350 entries.

In [None]:
df.head()

In [None]:
ing_df = df.copy()

In [None]:
for i, row in ing_df.iterrows():
    name, ing, _ = row
    
    print(ing)
    ing_df.iloc[i, 1] = clean_ingredients(ing)
    break


In [None]:
def clean_ingredients(s):
    items = ing.split(', ')
    filt_items = []
    for words in items:
        filt_word = ''
        for letter in word:
            if letter.isalpha() or letter.isspace():
                filt_word

    return 'Hello'

In [None]:
import re

pattern = r'^[a-zA-Z\s-]+$'

print(re.match(pattern, 'Karela Bitter Gourd Pavakkai - deseeded'))