In [None]:
import collections
import json
import os

import folium
import matplotlib.pyplot as plt
import nltk
import numpy as np
import re
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import umap
from wordcloud import WordCloud

In [None]:
# Apply the "Solarize_Light2" matplotlib style sheet to every figure drawn below.
plt.style.use("Solarize_Light2")

In [None]:
# Walk the Kaggle input directory tree and print the full path of each file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
# IPython shell escape: list the contents of the dataset directory.
!ls ../input/indian-food-101

In [None]:
# Load the dataset CSV into the DataFrame `df` that the following cells analyse.
df = pd.read_csv("/kaggle/input/indian-food-101/indian_food.csv")

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct dish names; nunique() ignores missing names, so a NaN
# entry is not counted as a dish.
n_dishes = df["name"].nunique()
# A single f-string avoids the doubled space that print() produced when it
# joined "There are " (trailing space) with its own separator.
print(f"There are {n_dishes} dishes")

In [None]:
# Number of dishes per diet category: a frame indexed by "diet" with a single
# "count" column (group sizes).
df_diet = df.groupby("diet").size().to_frame("count")
df_diet.head()

In [None]:
# Two panels describing the `diet` column: bar counts (left) and a pie chart (right).
# Only wspace is set: with a single row of axes there is no vertical gap to tune.
fig, axes = plt.subplots(1, 2, figsize=(10, 6), gridspec_kw=dict(wspace=0.1))
fig.suptitle("Analysis of diet", fontsize=15)

# Bars are ordered from the most to the least frequent diet.
g_diet = sns.countplot(data=df, x="diet", order=df["diet"].value_counts().index,
                       ax=axes[0])
g_diet.set_title("diet countplot")

# The pie chart reads the per-diet counts stored in the `df_diet` frame.
g_diet_pie = df_diet.plot.pie(y="count", ax=axes[1])
g_diet_pie.set_title("diet pie plot");

In [None]:
# Two panels describing the `flavor_profile` column: bar counts (left) and a
# pie chart (right).
# Only wspace is set: with a single row of axes there is no vertical gap to tune.
fig, axes = plt.subplots(1, 2, figsize=(10, 6), gridspec_kw=dict(wspace=0.1))
fig.suptitle("Analysis of flavor profile", fontsize=15)

# Bars are ordered from the most to the least frequent flavor profile.
g_flavor_profile = sns.countplot(data=df, x="flavor_profile", ax=axes[0],
                                 order=df["flavor_profile"].value_counts().index)
g_flavor_profile.set_title("flavor profile countplot")

# Per-profile dish counts (index: flavor_profile, column: "count") for the pie chart.
df_flavor_profile = df.groupby("flavor_profile").size().to_frame("count")
g_flavor_profile_pie = df_flavor_profile.plot.pie(y="count", ax=axes[1])
g_flavor_profile_pie.set_title("flavor profile pie plot");

In [None]:
# Dishes per course, with bars sorted from the most to the least common course.
course_order = df['course'].value_counts().index
g_course = sns.countplot(x="course", data=df, order=course_order)
g_course.set_title("course countplot")

In [None]:
# Two panels describing the `region` column: plain counts (left) and counts
# split by flavor profile (right).
# Only wspace is set: with a single row of axes there is no vertical gap to tune.
fig, axes = plt.subplots(1, 2, figsize=(10, 6), gridspec_kw=dict(wspace=0.1))
fig.suptitle("Analysis of region", fontsize=15)

# Both panels share the same bar order: most to least frequent region.
region_order = df["region"].value_counts().index

g_region = sns.countplot(data=df, x="region", order=region_order, ax=axes[0])
g_region.set_title("region countplot")

# `g_region` is rebound here and refers to the right-hand axes from now on.
g_region = sns.countplot(data=df, x="region", hue="flavor_profile", ax=axes[1],
                         order=region_order)
# Reposition the legend through seaborn's public API rather than by writing to
# matplotlib's private Legend._loc attribute ("upper right" is location code 1).
sns.move_legend(g_region, "upper right")
g_region.set_title("taste countplot per region");

In [None]:
# Wide canvas so that the bar of every state fits side by side.
state_fig = plt.figure(figsize=(20, 10))
state_order = df['state'].value_counts().index
g_state = sns.countplot(x="state", data=df, order=state_order,
                        ax=state_fig.add_subplot(111))
# Tilt the state names so that neighbouring labels do not overlap.
g_state.tick_params(axis="x", labelrotation=45)
g_state.set_title("state countplot")

In [None]:
# Distribution of preparation time (left) and cooking time (right).
# Only wspace is set: with a single row of axes there is no vertical gap to tune.
fig, axes = plt.subplots(1, 2, figsize=(10, 6), gridspec_kw=dict(wspace=0.1))
fig.suptitle("Analysis of prep&cook time", fontsize=15)

# sns.distplot is deprecated; a density-normalised histogram with a KDE overlay
# (cut=3) is the replacement that seaborn documents for it.
g_prep_time = sns.histplot(df["prep_time"], kde=True, stat="density",
                           kde_kws=dict(cut=3), ax=axes[0])
g_prep_time.set_title("prep_time distribution")

g_cook_time = sns.histplot(df["cook_time"], kde=True, stat="density",
                           kde_kws=dict(cut=3), ax=axes[1])
g_cook_time.set_title("cook_time distribution");

In [None]:
# Flatten every ingredient list into lower-cased tokens, dropping the "." and
# "," separators produced by the tokenizer.
all_words = [
    token.lower()
    for txt in df["ingredients"]
    for token in nltk.word_tokenize(txt)
    if token not in ['.', ',']
]

# Token -> number of occurrences; the word cloud is sized from these frequencies.
word_freq = collections.Counter(all_words)
W = WordCloud().fit_words(word_freq)

In [None]:
# Render the ingredient word cloud on a square canvas without axis decorations.
cloud_fig = plt.figure(figsize=(8, 8), facecolor=None)
cloud_ax = cloud_fig.add_subplot(111)
cloud_ax.imshow(W)
cloud_ax.axis('off')
plt.show()

In [None]:
# Vocabulary: the distinct ingredient tokens, in the order the counter first saw them.
words = np.array([*word_freq])
words

In [None]:
def gen_ingredients_vector(ingredients, vocabulary=None):
    """Encode an ingredient string as a binary bag-of-words vector.

    The text is tokenized with ``nltk.word_tokenize``, lower-cased, and the
    "." and "," tokens are discarded. Position ``k`` of the result is 1.0 when
    the k-th vocabulary entry occurs among the tokens and 0.0 otherwise.

    Parameters
    ----------
    ingredients : str
        Free-text ingredient list, e.g. ``"sugar, ghee"``.
    vocabulary : array-like of str, optional
        Tokens defining the vector positions. Defaults to the notebook-level
        ``words`` array.

    Returns
    -------
    list of float
        One 0.0/1.0 entry per vocabulary token.
    """
    vocab = words if vocabulary is None else np.asarray(vocabulary)
    tokens = {
        token.lower()
        for token in nltk.word_tokenize(ingredients)
        if token not in ('.', ',')
    }
    if not tokens:
        # Nothing to look up: every position stays at zero.
        return np.zeros(vocab.shape).tolist()

    # One membership test over the whole vocabulary instead of a full scan per token.
    return np.isin(vocab, list(tokens)).astype(float).tolist()

# Store, for every dish, the binary encoding of its ingredient text in a new column.
df["ingredients_vec"] = df["ingredients"].apply(gen_ingredients_vector)

df.head()

In [None]:
# Stack the per-dish ingredient vectors into one array with a row per dish.
ingredients_vecs = np.array(df["ingredients_vec"].tolist())

In [None]:
# Pairwise cosine similarity between all dishes; called with a single matrix,
# scikit-learn compares its rows against themselves.
cos_matrix = cosine_similarity(ingredients_vecs)

In [None]:
# Large square canvas for the dish-by-dish similarity matrix.
heatmap_fig = plt.figure(figsize=(20, 20))
ax = sns.heatmap(cos_matrix, ax=heatmap_fig.add_subplot(111))
ax.set_title("cosine_similarity of ingredients_vectors")

In [None]:
# Query: a free-text ingredient list for which the most similar dish is looked up.
my_dish = "sugar, ghee"
my_dish_vec = np.array(gen_ingredients_vector(my_dish))

# Score every dish against the query in one call; the (n_dishes, 1) result is
# flattened to one similarity value per dish.
similarities = cosine_similarity(ingredients_vecs, my_dish_vec.reshape(1, -1)).ravel()

# argmax picks the first maximum, so ties resolve to the lowest-index dish and
# an all-zero score vector selects row 0.
max_id = int(np.argmax(similarities))
# Plain float so the value prints as a number rather than as a nested 1x1 array.
max_prob = float(similarities[max_id])

print("max_prob = " + str(max_prob))
df.iloc[[max_id]]
