# Flavours of India - *Unity in Diversity!*
> Indian cuisine has always been appreciated and known all over the world for its traditional food, be it for the use of a wide variety of spices and herbs or the huge assortment of dishes whose style of cooking varies from region to region. In all terms, it is indeed *Unity in Diversity*!

In [None]:
import pandas as pd
import numpy as np
import collections
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the data

In [None]:
# Load the "indian-food-101" CSV from the Kaggle input directory.
indianFood_df = pd.read_csv(
    '/kaggle/input/indian-food-101/indian_food.csv'
)
# Trailing expression: the notebook displays (number of rows, number of columns).
indianFood_df.shape

This dataset has ***255*** food items with associated details: the ingredients used, the diet they belong to (vegetarian or non vegetarian), their preparation and cooking times, their flavor profile, and lastly their state and region of origin. <br>
There were multiple instances with -1 as a value across columns; therefore, as part of preprocessing the data, the -1 values are substituted with *'Not Known'*.

In [None]:
# Split the rows into those with at least one missing (NaN) value and those
# that are fully populated.
has_null_mask = indianFood_df.isnull().any(axis=1)
indianFood_df_nonNull = indianFood_df[~has_null_mask]
indianFood_df_withNull = indianFood_df[has_null_mask]

# Fill NaN regions using a state -> region lookup built from the complete rows.
# fillna() only touches the missing entries, so a region that is already
# present is never overwritten (mapping the whole column would replace every
# value, and turn regions into NaN for any state absent from the lookup).
state_to_region_dict = dict(zip(indianFood_df_nonNull.state, indianFood_df_nonNull.region))
indianFood_df['region'] = indianFood_df['region'].fillna(
    indianFood_df['state'].map(state_to_region_dict)
)

# -1 is the dataset's placeholder for "unknown".
# For the numeric time columns use np.nan explicitly: depending on the pandas
# version, replace(-1, None) either forward-fills the matches (value=None falls
# back to method='pad') or inserts Python None and turns the columns into
# object dtype. np.nan keeps them numeric and genuinely missing.
time_columns = ['prep_time', 'cook_time']
indianFood_df[time_columns] = indianFood_df[time_columns].replace(-1, np.nan)

# For the text columns the placeholder is the string '-1'.
text_columns = ['flavor_profile', 'state', 'region']
indianFood_df[text_columns] = indianFood_df[text_columns].replace('-1', 'Not Known')

# Trailing expression: the notebook displays the cleaned DataFrame.
indianFood_df

## Let's get into exploring: *The Taste of India* ;)

## Which state is leading the board with most dishes?
**We see the clear winners to be:**
* Gujarat
* Punjab
* Maharashtra

In [None]:
# Horizontal bar chart with the number of dishes recorded for each state.
plt.figure(figsize=(12, 10))
sns.set_style('whitegrid')

# Order the bars by dish count (largest first) so the leading states are
# immediately visible instead of appearing in row order.
dishes_per_state = indianFood_df['state'].value_counts()
ax = sns.countplot(y="state", data=indianFood_df, palette="Set3",
                   order=dishes_per_state.index)
ax.set(ylabel='State', xlabel='Number of Dishes')
ax.set_title('Number of dishes across the different states of India', fontsize=20)
sns.despine(offset=10, trim=True)

# tight_layout() has to run after the axes and labels exist; calling it on the
# still-empty figure has no effect on the final layout.
plt.tight_layout()
plt.show()

## Let's see what the flavor profile looks like for the different regions in India
> The people of ***West*** seem to have a vivid profile of flavours

In [None]:
# One count plot of flavor profiles per region (rows with an unknown region
# are excluded), laid out three facets per row.
sns.set(style="whitegrid", font_scale=1)

known_region_df = indianFood_df[indianFood_df['region'] != 'Not Known']
regions_list = known_region_df['region'].dropna().unique().tolist()

# Pass the facet order explicitly so the titles set below are guaranteed to
# match the facets, rather than relying on seaborn inferring the same order.
ax = sns.catplot(x="flavor_profile", col="region", col_wrap=3, palette="Set2",
                 col_order=regions_list,
                 data=known_region_df,
                 kind="count", height=5, aspect=1)
plt.tight_layout()
sns.despine(offset=10, trim=True)
ax.set(ylabel='Number of Dishes', xlabel='Flavor profile')
ax.fig.subplots_adjust(hspace=.3, wspace=.2)

# Replace the default "region = <name>" facet titles with the bare region name.
axes = ax.axes.flatten()
for facet_axis, region_name in zip(axes, regions_list):
    facet_axis.set_title(region_name, fontsize=12)
plt.show()

## What is the percentage distribution of Vegetarian and Non-Vegetarian dishes?
> We see that the majority of the dishes are ***Vegetarian*** dishes, i.e., about *88.6 percent*.

In [None]:
# Horizontal count plot of dishes per diet, with each bar annotated by its
# share of all dishes.
plt.figure(figsize=(6, 5))
sns.set(style="whitegrid", font_scale=1)

total = len(indianFood_df['diet'])
ax = sns.countplot(y="diet", data=indianFood_df, palette="Set2")
ax.set(ylabel='Diet', xlabel='Number of Dishes')

for bar in ax.patches:
    # For horizontal bars the width is the count.
    percentage = '{:.1f}%'.format(100 * bar.get_width() / total)
    # Place the label just past the end of the bar, vertically centred on it.
    label_x = bar.get_x() + bar.get_width() + 0.02
    label_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(percentage, (label_x, label_y))

sns.despine()
# tight_layout() must run once the axes and labels exist to have any effect.
plt.tight_layout()
plt.show()

## Let's see how the diet of the dishes varies across the regions
> Non Vegetarian food is more prevalent in the *North Eastern* region of India, while *Western* India is abundant with Vegetarian food

In [None]:
# Two pie charts (vegetarian / non vegetarian) showing how the dishes of each
# diet are distributed over the regions.

# Count dishes per (diet, region). size() + reset_index(name=...) names the
# count column explicitly, so it does not depend on how a given pandas version
# names the result of value_counts() ('region' vs. 'count').
diet_to_region_df = (
    indianFood_df.groupby(['diet', 'region'])
    .size()
    .reset_index(name='Number of Dishes')
)

# One fixed colour per region, shared by both pies. Requesting exactly as many
# colours as there are regions avoids zip() silently dropping regions when the
# default palette is shorter than the region list.
regions = sorted(diet_to_region_df['region'].unique())
colors = sns.color_palette("Set2", n_colors=len(regions))
region_to_color_dict = dict(zip(regions, colors))

veg_df = diet_to_region_df[diet_to_region_df.diet == 'vegetarian'].sort_values('region')
nonVeg_df = diet_to_region_df[diet_to_region_df.diet == 'non vegetarian'].sort_values('region')

plt.figure(figsize=(15, 7.5))

pie_panels = ((veg_df, 'Vegeterian'), (nonVeg_df, 'Non Vegeterian'))
for panel_number, (panel_df, panel_title) in enumerate(pie_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.pie(x=panel_df['Number of Dishes'].tolist(),
            colors=[region_to_color_dict.get(key) for key in panel_df['region']])
    plt.axis('equal')
    plt.legend(labels=panel_df['region'].tolist(), bbox_to_anchor=(1.05, 1),
               loc='upper left', fontsize=10, title='Region')
    plt.title(panel_title, fontweight="bold", fontsize=20)

# tight_layout() only has an effect once the subplots exist; the explicit
# wspace afterwards keeps room between the pies for the first legend.
plt.tight_layout()
plt.subplots_adjust(wspace=0.5)
plt.show()

## Let's look at the time required to cook the Vegetarian and Non-Vegetarian food items

In [None]:
# Three side-by-side violin plots (sharing the y axis) comparing preparation,
# cooking and total time between the diets.
sns.set(style="dark", font_scale=1)
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, sharey=True)

# Total time is derived here and stored as a new column on the DataFrame.
indianFood_df['total_time'] = indianFood_df['prep_time'] + indianFood_df['cook_time']

time_panels = (
    (ax1, 'prep_time', 'Preparation Time'),
    (ax2, 'cook_time', 'Cooking Time'),
    (ax3, 'total_time', 'Total Time'),
)
for panel_ax, time_column, y_label in time_panels:
    # x/y are passed by keyword: recent seaborn releases accept only `data`
    # positionally and reject positional x/y.
    sns.violinplot(x="diet", y=time_column, data=indianFood_df,
                   palette='Set2', ax=panel_ax)
    panel_ax.set_xlabel('Diet')
    panel_ax.set_ylabel(y_label)
    # Rotate the tick labels seaborn already assigned instead of overwriting
    # them, so a label can never be attached to the wrong violin.
    panel_ax.tick_params(axis='x', labelrotation=90)

# tight_layout() needs the drawn axes; wspace is then widened explicitly.
plt.tight_layout()
fig.subplots_adjust(wspace=0.5)
plt.show()

## Comparing the prepping and cooking time of the dishes
> Considering the Vegetarian Diet, ***Shrikhand*** has the maximum cooking time but a very low prep time, while ***Pindi chana*** takes the maximum prep time but has a relatively low cooking time<br>
> On the other hand for Non Vegetarian Diet, ***Biryani*** has the maximum cooking time, while ***Tandoori Chicken*** and ***Tandoori Fish Tikka*** have the maximum prep time as compared to cooking time.

In [None]:
# Prep time vs. cooking time scatter plots, one panel per diet, with the
# outlying dishes highlighted in a second colour and labelled by name.
plt.figure(figsize=(15, 7.5))
sns.set_style('whitegrid')

vegIndianFood_df = indianFood_df[indianFood_df['diet'] == 'vegetarian']
nonVegIndianFood_df = indianFood_df[indianFood_df['diet'] == 'non vegetarian']

highlight_color = "#CD5C5C"
# (horizontalalignment, verticalalignment) used when a dish has no override.
default_alignment = ('right', 'bottom')

# ---- Vegetarian panel ----
plt.subplot(1, 2, 1)
# x/y are passed by keyword (recent seaborn rejects positional x/y); `palette`
# is omitted because it is ignored when no `hue` is given.
sns.scatterplot(x="prep_time", y="cook_time", alpha=0.7, s=200,
                data=vegIndianFood_df)
vegAnnotate_df = vegIndianFood_df[(vegIndianFood_df['prep_time'] == 500) | (vegIndianFood_df['cook_time'] > 700)]
vegAnnotate_dict = dict(zip(vegAnnotate_df.name,
                            zip(vegAnnotate_df.prep_time, vegAnnotate_df.cook_time)))
if not vegAnnotate_df.empty:
    # Draw the highlighted points once, not once per annotated dish.
    sns.scatterplot(x="prep_time", y="cook_time", data=vegAnnotate_df,
                    s=200, color=highlight_color)
veg_alignment = {'Shrikhand': ('left', 'top')}
for dish, point in vegAnnotate_dict.items():
    h_align, v_align = veg_alignment.get(dish, default_alignment)
    plt.annotate(dish, point, fontsize=15, family='sans-serif',
                 horizontalalignment=h_align, verticalalignment=v_align)
# Labels are set after all scatter calls so seaborn cannot replace them.
plt.xlabel('Prep Time')
plt.ylabel('Cooking Time')
plt.title('Vegeterian', fontweight="bold", fontsize=20)

# ---- Non vegetarian panel ----
plt.subplot(1, 2, 2)
sns.scatterplot(x="prep_time", y="cook_time", alpha=0.7, s=200,
                data=nonVegIndianFood_df)
nonVegAnnotate_df = nonVegIndianFood_df[(nonVegIndianFood_df['prep_time'] > 200) | (nonVegIndianFood_df['cook_time'] > 100)]
nonVegAnnotate_dict = dict(zip(nonVegAnnotate_df.name,
                               zip(nonVegAnnotate_df.prep_time, nonVegAnnotate_df.cook_time)))
if not nonVegAnnotate_df.empty:
    sns.scatterplot(x="prep_time", y="cook_time", data=nonVegAnnotate_df,
                    s=200, color=highlight_color)
nonVeg_alignment = {
    'Biryani': ('left', 'top'),
    'Tandoori Chicken': ('right', 'top'),
}
for dish, point in nonVegAnnotate_dict.items():
    h_align, v_align = nonVeg_alignment.get(dish, default_alignment)
    plt.annotate(dish, point, fontsize=15, family='sans-serif',
                 horizontalalignment=h_align, verticalalignment=v_align)
plt.xlabel('Prep Time')
plt.ylabel('Cooking Time')
plt.title('Non Vegeterian', fontweight="bold", fontsize=20)

# tight_layout() only has an effect once the subplots exist.
plt.tight_layout()
plt.subplots_adjust(wspace=0.5)
plt.show()

## Which course categories do the food items belong to?
> Clearly, we have most food items falling into the ***Main Course*** category.

In [None]:
# Pie chart of the number of dishes per course.
plt.figure(figsize=(15, 7.5))

course_dict = indianFood_df['course'].value_counts().to_dict()
# Materialise the dict views as lists: matplotlib expects array-like
# sequences for the wedge sizes and the legend labels.
course_names = list(course_dict.keys())
course_counts = list(course_dict.values())

# Ask for exactly one colour per course instead of a fixed five-colour slice.
plt.pie(x=course_counts,
        colors=sns.color_palette('Set2', n_colors=len(course_counts)))
plt.axis('equal')
plt.legend(labels=course_names, bbox_to_anchor=(1, 1), loc='upper left',
           fontsize=10, title='Course')

# tight_layout() only has an effect once the axes exist.
plt.tight_layout()
plt.show()

## Which courses do the food items belong to across the different states?
> For *Desserts*, **West Bengal** sweet-ly wins it, while **Punjab** stands out for their *Main Course* dishes. <br>
All the namkeens: Khakra, Fafda, Murmura make **Gujarat** a leader in terms of *Snacks*

In [None]:
# Stacked horizontal bar chart: one bar per state, one segment per course.
sns.set_style('whitegrid')
sns.set_palette("Set2")

# Count dishes per (state, course) pair, then pivot the course level into
# columns so that each row is a state and each column a course.
dishes_per_state_course = indianFood_df.groupby(['state', 'course']).size()
state_to_course_df = dishes_per_state_course.unstack()

ax = state_to_course_df.plot.barh(stacked=True, figsize=(10, 6))
ax.set_ylabel('State', fontsize=20)
ax.set_xlabel('Number of Dishes', fontsize=20)
# Anchor the legend to the right of the axes.
plt.legend(title='Course', bbox_to_anchor=(1.0, 1), loc='upper left')
plt.show()

## Which dishes have the most ingredients?
> ***Ghevar*** has the highest number of ingredients

In [None]:
# Bar chart of the six dishes whose ingredient strings list the most entries.
ingredients_df = indianFood_df[['name', 'ingredients']].set_index('name')

# Dish name -> number of ', '-separated entries in its ingredient string.
ingredients_dict = {
    dish: len(ingredient_text.split(', '))
    for dish, ingredient_text in ingredients_df['ingredients'].items()
}

# Rank by ingredient count, highest first, and keep the first six.
ranked_items = sorted(ingredients_dict.items(), key=lambda item: item[1], reverse=True)
ingredients_dict = dict(ranked_items)
most_ingredients_dict = dict(ranked_items[:6])

top_dishes = list(most_ingredients_dict)
top_counts = list(most_ingredients_dict.values())

plt.bar(top_dishes, top_counts)
plt.ylabel('Number of ingredients used')
plt.xlabel('Dishes')
plt.title("Top 6 dishes with most number of ingredients", fontweight="bold", fontsize=15)
plt.xticks(top_dishes, rotation=90)

plt.show()

## Let's look at the most used ingredient across the different dishes
> ***Sugar***, ***garam masala***, ***coconut***, ***milk***, ***ginger*** are some of the most used ingredients

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Word cloud built from the ingredient strings of all dishes.
# Every dish's ingredient string is concatenated into one space-separated text.
text = " ".join(indianFood_df.ingredients)
stopwords = set(STOPWORDS)

cloud_builder = WordCloud(
    stopwords=stopwords,
    background_color="white",
    width=800,
    height=400,
)
wordcloud = cloud_builder.generate(text)

plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")  # hide the axes around the rendered image
plt.show()

## Feel free to upvote and give feedback!