In [None]:
# Loading libraries.

import numpy as np
import pandas as pd

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Set the plotting theme.
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so use whichever one is installed.
style_name = "seaborn-whitegrid"
if "seaborn-v0_8-whitegrid" in plt.style.available:
    style_name = "seaborn-v0_8-whitegrid"
plt.style.use(style_name)

# Colours used, in this order, for plotted series.
colors = ["#54436B", "#50CB93", "#ED8E7C", "#FFD523", '#00C1D4', '#F8485E']

# Register the list above as seaborn's active colour palette.
sns.set_palette(sns.color_palette(colors))

In [None]:
# Load the dataset. Cells holding -1 (as a number or as text) are read as NaN.
na_val = [-1, '-1']
data = pd.read_csv(
    "../input/indian-food-101/indian_food.csv",
    na_values=na_val,
)

# Preview the first rows.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Print the frequency of every distinct value for each object-typed column.
object_columns = data.select_dtypes(include=[object]).columns
for col in object_columns:
    print("--" * 10)
    print(f"Values counts for '{col}'")
    print(data[col].value_counts())

In [None]:
# Summary of the object-typed (non-numeric) columns.
data.describe(include=object)

In [None]:
# Remove exact duplicate rows, if any.
data = data.drop_duplicates()

In [None]:
# Box plot of preparation time per course, to spot outliers.
fig, ax = plt.subplots(figsize=(14, 10))
sns.boxplot(data=data, x='course', y='prep_time', ax=ax)
ax.set_title("Outliers in Preparation time")

plt.show()

In [None]:
# Box plot of cooking time over all dishes, to spot outliers.
fig, ax = plt.subplots(figsize=(14, 10))
sns.boxplot(data=data, y='cook_time', ax=ax)
ax.set_title("Outliers in Cooking time")

plt.show()

There are some outliers, but in this case they should not be treated as outliers, because preparation time and cooking time can legitimately vary from recipe to recipe.

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
def _split_ingredients(raw):
    """Turn a comma-separated ingredient string into a list of clean names.

    Surrounding whitespace is stripped from every name so that "sugar" and
    " sugar" count as the same ingredient, and empty fragments are dropped.
    Anything that is not a string (a missing value, or a list left by an
    earlier run of this cell) is returned unchanged, which makes the cell
    safe to re-run.
    """
    if not isinstance(raw, str):
        return raw
    return [item.strip() for item in raw.split(',') if item.strip()]


# Replace each ingredient string with a list of individual ingredient names.
data['ingredients'] = data['ingredients'].apply(_split_ingredients)

In [None]:
# Preview the frame again; 'ingredients' was reassigned in the cell above.
data.head()

In [None]:
# Ingredient vocabulary per flavour, starting with sweet dishes.
from collections import Counter

# Ingredient lists of every dish labelled 'sweet'.
df = data.loc[data['flavor_profile'] == 'sweet', 'ingredients']

# Tally every ingredient across those dishes.
ingre_word_frq = Counter(item for recipe in df.values for item in recipe)

# Distinct ingredients, in the order they were first seen.
sweetwords = list(ingre_word_frq)
sweetwords[:5]

In [None]:
# Ingredient lists of every dish labelled 'spicy'.
df = data.loc[data['flavor_profile'] == 'spicy', 'ingredients']

# Tally every ingredient across those dishes.
ingre_word_frq = Counter(item for recipe in df.values for item in recipe)

# Distinct ingredients, in the order they were first seen.
spicywords = list(ingre_word_frq)
spicywords[0:5]

In [None]:
# Ingredient lists of every dish labelled 'bitter'.
df = data.loc[data['flavor_profile'] == 'bitter', 'ingredients']

# Tally every ingredient across those dishes.
ingre_word_frq = Counter(item for recipe in df.values for item in recipe)

# Distinct ingredients, in the order they were first seen.
bitterwords = list(ingre_word_frq)
bitterwords[:5]

In [None]:
# Ingredient lists of every dish labelled 'sour'.
df = data.loc[data['flavor_profile'] == 'sour', 'ingredients']

# Tally every ingredient across those dishes.
ingre_word_frq = Counter(item for recipe in df.values for item in recipe)

# Distinct ingredients, in the order they were first seen.
sourwords = list(ingre_word_frq)
sourwords[:5]

In [None]:
# Discard rows in which 'flavor_profile' and 'ingredients' are both missing.
data = data.dropna(
    subset=['flavor_profile', 'ingredients'],
    how='all',
    axis='index',
)

In [None]:
def CountFlavor(ingredientList):
    """
    Guess the flavour of a dish from its ingredients.

    Each ingredient is looked up in the four notebook-level vocabularies
    (`sweetwords`, `spicywords`, `bitterwords`, `sourwords`); one ingredient
    may score for several flavours. The flavour with the most hits wins, and
    ties go to the name that sorts last alphabetically. When no ingredient is
    recognised the function returns 'sweet'.
    """
    vocab_by_flavor = (
        ('sweet', sweetwords),
        ('spicy', spicywords),
        ('bitter', bitterwords),
        ('sour', sourwords),
    )

    # Only flavours with at least one matching ingredient get an entry.
    hits = {}
    for word in ingredientList:
        for flavor, vocab in vocab_by_flavor:
            if word in vocab:
                hits[flavor] = hits.get(flavor, 0) + 1

    if not hits:
        return 'sweet'

    # Highest count first; equal counts are broken by the flavour name.
    best_flavor, _ = max(hits.items(), key=lambda kv: (kv[1], kv[0]))
    return best_flavor

In [None]:
# Fill each missing flavor_profile with the flavour CountFlavor derives from
# that dish's ingredient list.
null_df = data.loc[data['flavor_profile'].isna(), 'ingredients']

for row_label, recipe in null_df.items():
    data.loc[row_label, 'flavor_profile'] = CountFlavor(recipe)

In [None]:
# Rows in which both 'state' and 'region' are missing.
missing_both = data['state'].isna() & data['region'].isna()
null_df = data[missing_both]
null_df

These dishes are common all over India.

In [None]:
# Set 'state' and 'region' to 'India' on every row listed in null_df.
data.loc[null_df.index, ['state', 'region']] = 'India'

In [None]:
# Show the row whose index label is 248.
# NOTE(review): presumably a spot check of one of the rows just filled with
# 'India'; the label is hard-coded, so confirm it still exists after the
# earlier row drops.
data.loc[248]

In [None]:
# Rows that still have no region.
data[data['region'].isna()]

In [None]:
# Distinct region values recorded for each state.
data.groupby(['state'])['region'].unique().to_frame()

In [None]:
# Fill every remaining missing region with 'North' (the notebook's stated
# target is the 'UttarPradesh' row).
# The column is assigned back explicitly: a chained
# `data.region.fillna(..., inplace=True)` acts on a temporary object and does
# not update `data` when pandas copy-on-write is active.
data['region'] = data['region'].fillna('North')

In [None]:
# Rows that still have no state.
null_states_df = data[data['state'].isna()]
null_states_df

In [None]:
# Distinct state values recorded for each region.
a = data.groupby(['region'])['state'].unique().to_frame()
a

In [None]:
# Fill the missing state of the North-region dish at index 115.
data.loc[115, 'state'] = 'Punjab'

# Fill every state that is STILL missing with 'South India'.
# The mask is computed after the assignment above; looping over the earlier
# `null_states_df.index` would include row 115 and overwrite 'Punjab'.
still_missing_state = data['state'].isnull()
data.loc[still_missing_state, 'state'] = 'South India'

In [None]:
# Preview the frame after the state/region assignments above.
data.head()

In [None]:
# data visualization

# Dishes whose preparation time exceeds 60, longest first, indexed by name.
filt = data['prep_time'] > 60
a = (
    data.loc[filt, ['name', 'prep_time', 'cook_time']]
    .sort_values(by='prep_time', ascending=False)
    .set_index('name')
)
a

In [None]:
# Preparation and cooking time per dish, drawn as two bar series on one axes.
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=a, x=a.index, y='prep_time', color="#54436B",
            label="Preparation Time", ax=ax)
sns.barplot(data=a, x=a.index, y='cook_time', color="#50CB93",
            label="Cooking Time", ax=ax)

ax.set_xlabel("Name of Dish")
ax.set_ylabel("Time in minutes")
ax.set_title("Total prepartion and cooking time taken by dish.")

# Dish names are long, so turn the tick labels vertical.
plt.xticks(rotation=90)

ax.legend()
plt.show()

<b style="font-size:22px;
          font-weight:700;
          color:#F69E7B">Pindi Chana </b> takes the most time for preparation.

In [None]:
# The ten dishes with the longest cooking time, indexed by name.
a = (
    data[['name', 'prep_time', 'cook_time']]
    .sort_values(by='cook_time', ascending=False)
    .head(10)
    .set_index('name')
)
a

In [None]:
# Grouped bars: preparation and cooking time for each dish selected above.
ax = a.plot.bar(figsize=(16, 8))

ax.set_xlabel("Name of Dish")
ax.set_ylabel("Time in minutes")
ax.set_title("Total prepartion and cooking time taken by dish.")

plt.xticks(rotation=90)

ax.legend()
plt.show()

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Total time per dish = preparation time + cooking time.
data['total_time'] = data['prep_time'].add(data['cook_time'])

In [None]:
# Preview the frame with the new 'total_time' column.
data.head()

In [None]:
# Flavour-wise average time, slowest flavour first.
# Only the numeric time columns are averaged: since pandas 2.0, calling
# .mean() on a groupby that still contains text or list columns raises.
time_columns = ['prep_time', 'cook_time', 'total_time']
a = data.groupby(by=['flavor_profile'])[time_columns].mean()
a = a.sort_values(by=['total_time'], ascending=False)

# visualization for the same.
plt.figure(figsize=(10, 8))

sns.barplot(x=a.index, y=a.total_time, data=a)

plt.title("Average time taken by flavors.")

# Write the rounded average on top of each bar. Positions come from the data,
# so this works for any number of flavours and any bar height.
for position, average in enumerate(a['total_time']):
    if pd.isna(average):
        # No time data for this flavour: nothing to label.
        continue
    plt.text(position, average, round(average),
             fontsize=20, ha='center', va='bottom')

plt.show()

**Sweet dishes take the most time.**

In [None]:
from collections import Counter

# Ingredient list of every dish.
a = list(data.ingredients)

# Occurrences of each ingredient across all dishes.
tally = Counter(item for recipe in a for item in recipe)
words = dict(tally)

# Same counts, ordered from most to least frequent.
d = dict(tally.most_common())

In [None]:
# First ten entries of the frequency dict as a one-column frame.
a = pd.DataFrame({'Count': list(d.values())}, index=list(d.keys())).head(10)

# visualization for most used ingredients in recipes
ax = a.plot.bar(figsize=(14, 7))

ax.set_xlabel("Ingredient")
ax.set_ylabel("Count")
ax.set_title("Most Common Ingredient used in Indian food.")

plt.show()

**Sugar** is the most used ingredient in Indian cuisine.

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# vegetarian / non-vegetarian dishes .

# Number of dishes (non-null names) per diet.
a = data.groupby(by=['diet'])[['name']].count()

plt.figure(figsize=(10, 7))

plt.barh(a.index, a['name'], label='count')

# Horizontal bars: the counts run along the x-axis and the diet categories
# along the y-axis, so the labels are assigned accordingly.
plt.xlabel("Counts")
plt.ylabel("Diet")
plt.title("Total counts of Vegetartian and Non-vegetarian dishes.")

plt.legend()
plt.show()

In [None]:
# Distinct values in the 'course' column.
data.course.unique()

In [None]:
# Number of dishes per course type, most common first.
a = (
    data.groupby(by=['course'])[['name']].count()
    .rename(columns={'name': 'Count'})
    .sort_values(by='Count', ascending=False)
)

# visualization for the same
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=a, x=a.index, y='Count', ax=ax)

ax.set_title("Count of each course type")
plt.show()

**Most dishes are main course.**

In [None]:
# Number of dessert dishes per state, highest first.
a = (
    data.loc[data['course'] == 'dessert']
    .groupby(by=['state'])[['name']].count()
    .rename(columns={'name': 'Dessert Dishes'})
    .sort_values(by='Dessert Dishes', ascending=False)
)

In [None]:
# Bar chart of the per-state dessert counts computed above.
ax = a.plot.bar(figsize=(16, 8))

ax.set_xlabel('States')
ax.set_ylabel("Counts")
ax.set_title("Statewise Number of Dessert")

plt.show()

**West Bengal has the most dessert dishes.**

In [None]:
# Distinct values in the 'flavor_profile' column.
data.flavor_profile.unique()

In [None]:
# Number of sweet-flavoured dishes per state, highest first, to compare with
# the dessert ranking.
a = (
    data.loc[data['flavor_profile'] == 'sweet']
    .groupby(by=['state'])[['name']].count()
    .rename(columns={'name': 'sweets'})
    .sort_values(by='sweets', ascending=False)
)
a.head()

In [None]:
# Bar chart of the per-state sweet-dish counts computed above.
ax = a.plot.bar(figsize=(16, 8))

ax.set_xlabel('States')
ax.set_ylabel("Counts")
ax.set_title("Total count of sweet flavour dish by state")

plt.show()

**The result is the same as for desserts.**

In [None]:
# Number of dishes for every (flavour, region) pair.
a = (
    data.groupby(by=['flavor_profile', 'region'])[['name']].count()
    .rename(columns={'name': 'counts'})
)

# One group of bars per flavour, one bar per region.
ax = a.unstack().plot.bar(figsize=(18, 8))

ax.set_xlabel("Flavours")
ax.set_ylabel('counts')
ax.set_title("counts of dishes based on region")

plt.show()

**We can see that spicy and sweet are the most common flavors in all regions.**

In [None]:
# Keep only the spicy dishes and renumber the rows from zero.
is_spicy = data['flavor_profile'] == 'spicy'
spicy_df = data[is_spicy].reset_index(drop=True)
spicy_df.head()

In [None]:
# The ten states with the most spicy dishes.
a = (
    spicy_df.groupby('state')[['name']].count()
    .sort_values(by='name', ascending=False)
    .head(10)
)

# visualization
ax = a.plot.bar(figsize=(16, 8))

ax.set_xlabel("States")
ax.set_ylabel("counts")
ax.set_title("State wise count of spicy dishes.")

plt.show()

**Most spicy dishes are from Punjab**

In [None]:
# Number of dishes for every (state, diet) pair.
a = (
    data.groupby(by=['state', 'diet'])[['name']].count()
    .rename(columns={'name': 'count'})
)

# visualization for the same: one group of bars per state, one bar per diet.
ax = a.unstack().plot.bar(figsize=(16, 8))

# Fix the y-axis range to 0-40.
ax.set_ylim(0, 40)

ax.set_xlabel("States")
ax.set_ylabel("Counts")
ax.set_title("State-wise count of Vegetarian and Non-vegetarian dishes.")

plt.show()

In [None]:
# Number of dishes for every (region, diet) pair.
a = (
    data.groupby(by=['region', 'diet'])[['name']].count()
    .rename(columns={'name': 'count'})
)

# visualization for the same: one group of bars per region, one bar per diet.
ax = a.unstack().plot.bar(figsize=(16, 8))

ax.set_xlabel("Regions")
ax.set_ylabel("Counts")
ax.set_title("Region-wise count of Vegetarian and Non-vegetarian dishes.")

plt.show()

## Conclusion
* Most of the dishes are vegetarian and from the West region.
* The most common ingredient used in Indian cuisine is sugar.
* Spicy food is common in all regions.
* Sweets take more time compared to other dishes.