# **Libs & Data**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px

In [None]:
# Load the dataset CSV from the Kaggle "indian-food-101" input directory into df.
df = pd.read_csv('/kaggle/input/indian-food-101/indian_food.csv')

# **Exploratory Data Analysis**

In [None]:
def eda(dfA, all=False, desc='Exploratory Data Analysis'):
    """Print a quick exploratory summary of a DataFrame.

    The report covers the shape, the numeric and object column names, the
    total null count with the per-column null ratio, the shape of the
    duplicated rows, and up to four sample rows taken from the duplicate
    groups.

    Args:
        dfA: DataFrame to inspect. It is not modified.
        all: When True, also print ``describe()`` for the numeric and the
            object columns. The name shadows the builtin ``all`` but is kept
            so existing keyword callers keep working.
        desc: Title printed at the top of the report.

    Returns:
        None. Everything is written to stdout.
    """
    numeric_cols = dfA.select_dtypes(include=np.number).columns.tolist()
    object_cols = dfA.select_dtypes(include='object').columns.tolist()

    print(desc)
    print(f'\nShape:\n{dfA.shape}')
    print('\nDTypes - Numerics')
    print(numeric_cols)
    print('\nDTypes - Categoricals')
    print(object_cols)
    print(f'\nIs Null: {dfA.isnull().sum().sum()}')
    print(f'{dfA.isnull().mean().sort_values(ascending=False)}')

    print(f'\nDuplicated: \n{dfA[dfA.duplicated()].shape}\n')
    # keep=False selects every member of a duplicate group, not just the repeats.
    dup_groups = dfA[dfA.duplicated(keep=False)]
    if not dup_groups.empty:
        # sample() raises ValueError when n exceeds the number of rows, so cap
        # n instead of swallowing the error (which used to hide small groups).
        print(dup_groups.sample(min(4, len(dup_groups))))

    if all:  # put your preferred, more detailed analysis of the dataset here
        print('\nDTypes - Numerics')
        # describe(include=...) raises when no column matches, so guard it.
        if numeric_cols:
            print(dfA.describe(include=[np.number]))
        print('\nDTypes - Categoricals')
        if object_cols:
            print(dfA.describe(include=['object']))

def cleanNaN(dfA):
    """Fill missing values in place and return the same DataFrame.

    Text (object/string) columns get the placeholder ``'unknow'``; every
    other column gets ``0``. The placeholder spelling is kept as-is because
    other cells in this notebook compare against that exact string.

    Args:
        dfA: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, for convenience.
    """
    for col in dfA:
        column = dfA[col]
        # The dtype of the column decides the fill value. Comparing
        # type(Series) to the string 'object' is never true and used to send
        # every column down the numeric branch.
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        dfA[col] = column.fillna('unknow' if is_text else 0)
    return dfA

In [None]:
# Print the summary report for the dataset as loaded (before any cleaning).
eda(df)

In [None]:
# Inspect the distinct values of the region column.
df.region.unique() # author's note: the only column with null values

In [None]:
# Fill missing values (cleanNaN modifies df in place), then re-run the report.
cleanNaN(df)
eda(df)

In [None]:
# Do not truncate long cell contents when displaying frames.
pd.set_option('display.max_colwidth', None)
# Show two random rows.
df.sample(2)

**Ingredients**

In [None]:
# Flatten every dish's comma-separated ingredient string into one list of
# whitespace-trimmed ingredient names (duplicates kept for counting).
ingredientsAll = [
    item.strip()
    for row in df.ingredients.values.tolist()
    for item in row.split(',')
]

In [None]:
# Count how often each ingredient appears, most frequent first.
# The top-level pd.value_counts() is deprecated (pandas 2.1+);
# Series.value_counts() is the supported equivalent.
ingredients = pd.Series(ingredientsAll).value_counts()
ingredients

In [None]:
# Keep the first 20 entries of the ingredient counts.
ing20 = ingredients.head(20)
ing20

In [None]:
# Bar chart of the top-20 ingredient counts, coloured by ingredient name.
fig = px.bar(ing20, color=ing20.index, title='Top 20 - Ingredients')
fig.show()

**Diet**

In [None]:
# Distinct values of the diet column.
diets = df.diet.unique()
diets

In [None]:
# Pie chart of the number of rows per diet value.
xd = df.diet.value_counts()
fig = px.pie(xd, values=xd.values, names=xd.index, title='Diets', 
             color=xd.index, color_discrete_sequence=px.colors.sequential.Greens_r)
fig.show()

**Flavors**

In [None]:
# Distinct values of the flavor_profile column (before relabelling placeholders).
flavors = df['flavor_profile'].unique()
flavors

In [None]:
# Relabel the '-1' placeholder in flavor_profile as 'unknow'.
flavor_is_placeholder = df['flavor_profile'] == '-1'
df.loc[flavor_is_placeholder, 'flavor_profile'] = 'unknow'

In [None]:
# Re-check the distinct flavor_profile values after relabelling.
flavors = df['flavor_profile'].unique()
flavors

In [None]:
# Pie chart of the number of rows per flavor_profile value.
xd = df['flavor_profile'].value_counts()
fig = px.pie(xd, values=xd.values, names=xd.index, title='Flavors', 
             color=xd.index, color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()

**Courses**

In [None]:
# Distinct values of the course column.
courses = df.course.unique()
courses

In [None]:
# Pie chart of the number of rows per course value.
xd = df.course.value_counts()
fig = px.pie(xd, values=xd.values, names=xd.index, title='Courses', 
             color=xd.index, color_discrete_sequence=px.colors.sequential.YlGnBu)
fig.show()

**States**

In [None]:
# Distinct values of the state column.
states = df.state.unique()
states

In [None]:
# Bar chart of the number of rows per state, coloured by state.
st = df.state.value_counts()
fig = px.bar(st, color=st.index, title='States')
fig.show()

**Regions**

In [None]:
# Distinct values of the region column (before relabelling placeholders).
regions = df.region.unique()
regions

In [None]:
# Relabel placeholder regions as 'unknow': the '-1' string marker ...
df.loc[df.region=='-1','region'] = 'unknow'
# ... and the integer 0, which cleanNaN() may have written in place of NaN.
df.loc[df.region==0,'region'] = 'unknow'
# Re-check the distinct values after relabelling.
regions = df.region.unique()
regions

In [None]:
# Pie chart of the number of rows per region value.
xd = df.region.value_counts()
fig = px.pie(xd, values=xd.values, names=xd.index, title='Regions', 
             color=xd.index, color_discrete_sequence=px.colors.sequential.Electric_r)
fig.show()

**Preparation Time**

In [None]:
# Distinct values of the prep_time column.
df.prep_time.unique()

In [None]:
# Replace the -1 placeholder in prep_time with the (truncated) mean prep time.
# The mean is taken over the non-placeholder rows only: averaging the -1
# markers in as well would pull the replacement value down.
prep_mean = df.loc[df.prep_time != -1, 'prep_time'].mean()
if pd.notna(prep_mean):  # NaN when no usable value exists; leave the data untouched
    pt = int(prep_mean)
    df.loc[df.prep_time == -1, 'prep_time'] = pt

In [None]:
# First five entries of the prep_time value counts.
prep_counts = df.prep_time.value_counts()
xd = prep_counts.head(5)
xd

In [None]:
# Bar chart: x is the prep_time value, y is the number of rows with that value.
fig = px.bar(x=xd.index, y=xd.values, color=xd.index, title='Preparation Time (min)',
            labels=dict(x='minutes', y='qty of plates'))
fig.show()

**Cooking time**

In [None]:
# Distinct values of the cook_time column.
df.cook_time.unique()

In [None]:
# Replace the -1 placeholder in cook_time with the (truncated) mean cooking time.
# The mean is taken over the non-placeholder rows only: averaging the -1
# markers in as well would pull the replacement value down.
cook_mean = df.loc[df.cook_time != -1, 'cook_time'].mean()
if pd.notna(cook_mean):  # NaN when no usable value exists; leave the data untouched
    ct = int(cook_mean)
    df.loc[df.cook_time == -1, 'cook_time'] = ct

In [None]:
# First five entries of the cook_time value counts.
cook_counts = df.cook_time.value_counts()
xd = cook_counts.head(5)
xd

In [None]:
# Bar chart: x is the cook_time value, y is the number of rows with that value.
fig = px.bar(x=xd.index, y=xd.values, color=xd.index, title='Cooking Time (min)',
            labels=dict(x='minutes', y='qty of plates'))
fig.show()

# **Similarity:**

**based on list of ingredients.**

In [None]:
# Build the list of unique ingredient names from the index of the counts Series;
# an ingredient's position in this list is used as its numeric id below.
listIngredients = ingredients.index.tolist()

In [None]:
def similarityArrays(t1, t2):
    """Return the Jaccard similarity of two collections as a percentage.

    Both inputs are reduced to sets, so duplicates and ordering are ignored.
    The result is ``|t1 & t2| / |t1 | t2| * 100``.

    Args:
        t1: First iterable of hashable items.
        t2: Second iterable of hashable items.

    Returns:
        A float between 0 and 100. Returns 0.0 when both inputs are empty
        (the ratio is undefined there and used to raise ZeroDivisionError).
    """
    first, second = set(t1), set(t2)
    union = first | second
    if not union:
        return 0.0
    return len(first & second) / len(union) * 100

In [None]:
def convertIngredients(listIng):
    """Convert a comma-separated ingredient string into a list of numeric ids.

    Each piece is stripped of surrounding whitespace and replaced by its
    position in the module-level ``listIngredients`` list. ``list.index``
    raises ValueError for a name that is not in that list.
    """
    return [listIngredients.index(piece.strip()) for piece in listIng.split(',')]

In [None]:
# Take the ingredient string of the first row as a sample.
ingSample = df.ingredients.iloc[0]
ingSample

In [None]:
# Double-check the function: map each returned id back to its name and print
# the names on one line, separated by spaces.
for teste in convertIngredients(ingSample):
    print(listIngredients[teste], end=' ')

In [None]:
# Try the conversion through Series.apply on the first row only.
df.ingredients.head(1).apply(convertIngredients)

In [None]:
# Store each row's ingredient ids in a new ingredientsList column.
df['ingredientsList'] = df.ingredients.apply(convertIngredients)

In [None]:
# Compare the raw ingredient strings with their id lists for the first rows
# (author's note: look at sugar and ghee).
df[['ingredients','ingredientsList']].head()

In [None]:
# Ingredient similarity for every ordered pair of distinct rows (both A->B and
# B->A are kept). Rows are collected in a plain list and the frame is built
# once: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
dish_names = df['name'].tolist()
dish_ingredients = df['ingredientsList'].tolist()
pair_rows = []
for a, (dishA, dishAlist) in enumerate(zip(dish_names, dish_ingredients)):
    for b, (dishB, dishBlist) in enumerate(zip(dish_names, dish_ingredients)):
        if a != b:
            s = similarityArrays(dishAlist, dishBlist)
            pair_rows.append({'plate A': dishA, 'plate B': dishB, 'similarity': s})
# Explicit columns keep dfs.similarity usable even when there are no pairs.
dfs = pd.DataFrame(pair_rows, columns=['plate A', 'plate B', 'similarity'])


In [None]:
# Pairs with more than 50% similarity, most similar first.
dfs[dfs.similarity >50].sort_values(by='similarity', ascending=False)

In [None]:
# Show the row(s) whose name is 'Pattor'.
df[df.name=='Pattor']

In [None]:
# Show the row(s) whose name is 'Patra'.
df[df.name=='Patra']