**Table of contents**

- [Distribution of Vegetarian and Non Vegetarian Foods](#one)
- [Distribution of Diet by Region](#two)
- [Food Preparation and Cooking Time Analysis](#three)
- [Flavor profile distribution](#four)
- [Course Distribution](#five)
- [Geographical representation of state-wise distribution of sweets in India](#six)
- [Look at ingredients involved](#seven)
- [Most used ingredients by state](#eight)


In [1]:
import warnings
from collections import Counter

import cufflinks as cf
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from wordcloud import WordCloud, ImageColorGenerator

# Global notebook configuration: everything below mutates process-wide state,
# so it has to run before any plotting cell.

# Use matplotlib's built-in 'ggplot' style sheet for all static figures.
plt.style.use('ggplot')



# Default size for figures that do not pass their own figsize.
plt.rcParams['figure.figsize'] = (15,15)
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/usage warnings from pandas and seaborn.
warnings.filterwarnings("ignore")
# Enable plotly rendering inside the notebook.
init_notebook_mode(connected=True)
# Switch cufflinks to offline mode.
cf.go_offline()

%matplotlib inline

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

[**Pipeline**](https://github.com/SaiKrishna1908/ML-Checklist/blob/main/Pipeline.md)

In [1]:
# Load the dataset CSV from the Kaggle input directory into a DataFrame.
food_df = pd.read_csv(filepath_or_buffer='/kaggle/input/indian-food-101/indian_food.csv')

In [1]:
# Preview the first five rows to check the columns and value formats.
food_df.head(n=5)

In [1]:
# Per-state averages of the numeric columns only. Without numeric_only=True,
# pandas >= 2.0 raises a TypeError because the frame also holds string columns
# (older versions silently dropped them).
# NOTE(review): rows carrying the -1 placeholder in prep_time/cook_time (removed
# in a later cell) are still included in these averages.
food_df.groupby(by='state').mean(numeric_only=True)

In [1]:
def split_labels(series):
    """Turn one comma-separated label string into a clean list of labels.

    Parameters
    ----------
    series : str | list | tuple | Any
        A single cell value (despite the name, not a pandas Series). A string is
        split on commas. A list/tuple is treated as already split, so mapping
        this function over a column twice is harmless. Anything else
        (e.g. NaN or None) is treated as "no labels".

    Returns
    -------
    list[str]
        The labels with surrounding whitespace removed, without empty entries
        and without any "Unknown" placeholder.
    """
    if isinstance(series, str):
        raw_labels = series.split(",")
    elif isinstance(series, (list, tuple)):
        raw_labels = list(series)
    else:
        # Missing value: there is nothing to split.
        return []

    # Strip first: after splitting "a, Unknown" the placeholder is " Unknown",
    # which an exact comparison against "Unknown" would miss.
    stripped = (str(label).strip() for label in raw_labels)
    return [label for label in stripped if label and label != "Unknown"]

# Replace each value of the 'ingredients' column with the list returned by
# split_labels (the column is overwritten in place on food_df).
food_df['ingredients'] = food_df['ingredients'].map(split_labels)

<a id="one"></a>
# Distribution of Vegetarian and Non Vegetarian Foods

In [1]:
# Count of dishes per diet category; shrink narrows the bars for readability.
sns.displot(data=food_df, x='diet', shrink=0.8)


There are not many non-vegetarian items. Let's condition on region.

<a id="two"></a>
# Distribution of Diet by Region

In [1]:
# Positions of rows whose region is missing, either as the '-1' placeholder or as NaN.
spec_del = np.where((food_df['region'] == '-1') | (food_df['region'].isna()))
# np.where yields *positions*, while DataFrame.drop expects index *labels*.
# Translate through food_df.index so the right rows are removed even when the
# frame does not have a default 0..n-1 index.
regions_cleaned_df = food_df.drop(index=food_df.index[spec_del[0]])


In [1]:
# Side-by-side (dodged) bars of dish counts per region, split by diet type.
figure = plt.figure(figsize=(8, 8))
ax = figure.add_axes((0, 0, 0.9, 0.9))

sns.histplot(
    data=regions_cleaned_df,
    x='region',
    hue='diet',
    multiple='dodge',
    shrink=0.8,
    ax=ax,
)
ax.set(title='Diet Distribution By Region');

Most of the Non-vegetarian food items in the dataset come from North East States.

In [1]:
# Dodged bars of dish counts per state, split by diet type.
figure = plt.figure(figsize=(12,8))

ax = figure.add_axes([0,0,0.9,0.9])
sns.histplot(food_df, x= 'state', shrink=0.8, hue='diet',multiple='dodge', ax= ax)
ax.set_title('Diet Type per State')
ax.set_xlabel('State')
# Rotate the labels seaborn already attached to the ticks instead of overwriting
# them with a hand-built list: a manual list can end up misaligned with the bars
# (or raise) if its length/order differs from the plotted categories.
ax.tick_params(axis='x', labelrotation=60);

<a id="three"></a>
# Food Preparation and Cooking Time Analysis

In [1]:
# Row positions where either timing column holds the -1 placeholder.
spec_time_del = np.where(food_df[['prep_time', 'cook_time']].eq(-1).any(axis=1))

In [1]:
# spec_time_del holds row *positions*; DataFrame.drop works on index *labels*.
# Map positions to labels so the correct rows are dropped for any index.
time_cleaned_df = food_df.drop(index=food_df.index[spec_time_del[0]])

In [1]:
# Combined preparation + cooking time, divided by 60
# (i.e. hours, assuming the source columns are in minutes — TODO confirm).
time_cleaned_df['total_time'] = (
    time_cleaned_df['prep_time'].add(time_cleaned_df['cook_time']).div(60)
)

In [1]:
# Total time per state, one point per dish (jitter disabled so points stack vertically).
# catplot is a figure-level function that builds its own figure, so no `ax` is
# passed: the previous `ax=ax` pointed at an Axes left over from an earlier cell
# and was not used as the drawing target.
cat = sns.catplot(x='state', y='total_time', data=time_cleaned_df, height=10, jitter=False)
plt.xticks(rotation=60);

<a id="four"></a>
# Flavor profile distribution

In [1]:
# Positions of rows without a usable flavor profile: the '-1' placeholder or NaN
# (NaN is included for consistency with how 'region' is cleaned).
profile_spec_delete = np.where(
    (food_df['flavor_profile'] == '-1') | food_df['flavor_profile'].isna()
)
# np.where returns positions; convert them to index labels before dropping.
falvor_profile_cleaned = food_df.drop(index=food_df.index[profile_spec_delete[0]])

In [1]:
# Percentage of dishes per flavor profile, in order of first appearance.
labels = falvor_profile_cleaned['flavor_profile'].unique()
total_dishes = falvor_profile_cleaned.shape[0]
profile_counts = [
    (falvor_profile_cleaned['flavor_profile'] == profile).sum() * 100 / total_dishes
    for profile in labels
]

pie, ax = plt.subplots(figsize=(8, 8))
# One explode offset per wedge: a hard-coded length of 4 raises a ValueError as
# soon as the number of distinct profiles is anything other than four.
patches, text = ax.pie(
    x=profile_counts,
    explode=[0.05] * len(labels),
    labels=labels,
    pctdistance=0.5,
)

legend_labels = ['{0} - {1:1.2f} %'.format(i, j) for i, j in zip(labels, profile_counts)]

ax.set_title("Flavor Profile", fontsize=14)
ax.legend(patches, legend_labels, bbox_to_anchor=(-0.1, 1.), fontsize=8, loc=1);

<a id="five"></a>
# Course Distribution

In [1]:
# Distinct course categories present in the dataset.
pd.unique(food_df['course'])

In [1]:
# Percentage of dishes per course, in order of first appearance.
labels = food_df['course'].unique()
total_dishes = food_df.shape[0]
course_counts = [
    (food_df['course'] == course).sum() * 100 / total_dishes
    for course in labels
]

pie, ax = plt.subplots(figsize=(8, 8))
# Size explode from the data instead of assuming exactly four courses.
patches, text = ax.pie(
    x=course_counts,
    explode=[0.05] * len(labels),
    labels=labels,
    pctdistance=0.5,
)

legend_labels = ['{0} - {1:1.2f} %'.format(i, j) for i, j in zip(labels, course_counts)]

# The title previously read "Flavor Profile" (copied from the flavor chart).
ax.set_title("Course Distribution", fontsize=14)
ax.legend(patches, legend_labels, bbox_to_anchor=(-0.1, 1.), fontsize=8, loc=1);

In [1]:
# Number of dishes per state, most frequent first; kept as two parallel lists.
state_counts = food_df['state'].value_counts()
states = state_counts.index.tolist()
counts = list(state_counts.to_numpy())

<a id="six"></a>
# Geographical representation of state-wise distribution of sweets in India

In [1]:
# Choropleth trace: one GeoJSON feature per Indian state, matched to `states`
# through the feature's `properties.ST_NM` field and coloured by `counts`.
# NOTE(review): `states`/`counts` are built from value_counts over every row of
# food_df, i.e. all dishes; the "Sweets" wording in the labels may not match
# the data — confirm whether a dessert-only filter was intended.
data = dict(
    type='choropleth',
    geojson="https://gist.githubusercontent.com/jbrobst/56c13bbbf9d97d187fea01ca62ea5112/raw/e388c4cae20aa53cb5090210a42ebb9b765c0a36/india_states.geojson",
    featureidkey='properties.ST_NM',
    locationmode='geojson-id',
    locations=states,
    z=counts,

    autocolorscale=False,
    colorscale='purples',
    marker_line_color='rgb(255,255,255)',

    colorbar=dict(
        title={'text': "Sweets Counts"},
        thickness=15,
        len=0.35,
        bgcolor='rgba(255,255,255,0.6)',

        # The fixed tick0=0 / dtick=20000 spacing was removed: with per-state
        # dish counts it leaves the colorbar with a single tick at 0
        # (presumably a leftover from a dataset with much larger values).
        # Plotly now picks the tick spacing automatically.

        xanchor='left',
        x=0.01,
        yanchor='bottom',
        y=0.05,
    ),
)

In [1]:
# Figure layout for the state choropleth: centred title above the plot area,
# hidden base map, and a conic-conformal projection restricted to the
# longitude/latitude window given below.
layout = {
    'title': {
        'text': 'Sweets Distribution By State',
        'xanchor': 'center',
        'x': 0.5,
        'yref': 'paper',
        'yanchor': 'bottom',
        'y': 1,
        'pad': dict(b=10),
    },
    'geo': dict(
        visible=False,
        projection={
            'type': 'conic conformal',
            'parallels': [12.472944444, 35.172805555556],
            'rotation': dict(lat=24, lon=80),
        },
        lonaxis=dict(range=[68, 98]),
        lataxis=dict(range=[6, 38]),
    ),
    'margin': dict(r=0, t=30, l=0, b=0),
    'height': 550,
    'width': 550,
}

In [1]:
# Combine the single choropleth trace with its layout into one plotly figure.
choromap = go.Figure(layout=layout, data=[data])

In [1]:
# Render the map inline without the plotly export link.
iplot(figure_or_data=choromap, show_link=False)

<a id="seven"></a>
# Let's take a look at the ingredients involved

In [1]:
# Flatten every dish's ingredient list into one space-separated string.
# Spaces inside an ingredient name become '-' and each dish's chunk is
# followed by a single space.
ingredients_string = ''.join(
    ' '.join(name.strip().replace(' ', '-') for name in dish_ingredients) + ' '
    for dish_ingredients in food_df['ingredients']
)



In [1]:
# Word cloud of ingredient frequencies.
# - regexp=r"\S+": WordCloud's default tokenizer only matches word characters,
#   so it splits "rice-flour" back into "rice" and "flour", undoing the
#   hyphen-joining done when ingredients_string was built. Tokenising on
#   whitespace keeps each ingredient as a single token.
# - collocations=False: adjacent tokens are unrelated ingredients, so the
#   default bigram detection only adds spurious pairs and makes single
#   ingredients appear more than once.
wordcloud = WordCloud(
    width=400,
    height=400,
    colormap='icefire',
    background_color='black',
    min_font_size=8,
    regexp=r"\S+",
    collocations=False,
).generate(ingredients_string)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

We do have a data duplication issue here. The ingredient names may need to be sanitized for typos.

<a id="eight"></a>
# Most used ingredients by state

**It makes no sense to find the most used ingredients for a state that has only 2 dishes, so let's focus only on states that contribute enough dishes to this dataset by applying a threshold. Maybe >= 10?**

In [1]:
def get_ingredients_dict(ingredients):
    """Count how many times each ingredient occurs in ``ingredients``.

    Parameters
    ----------
    ingredients : iterable of hashable
        Ingredient names; duplicates are expected and are what gets counted.

    Returns
    -------
    dict
        Maps each distinct ingredient to its number of occurrences. Keys keep
        the order in which ingredients first appear. If every dish lists an
        ingredient at most once, the value equals the number of dishes using it.
    """
    # Counter does the tallying; convert back to a plain dict so callers keep
    # receiving the same type as before.
    return dict(Counter(ingredients))

In [1]:
# Minimum number of dishes a state needs before its ingredient tally is computed.
MIN_DISHES_PER_STATE = 10

# state -> {ingredient: occurrences} for every sufficiently represented state.
state_ingredients_dict = dict()

for state, count in zip(states, counts):

    # Skip sparsely represented states and the '-1' placeholder for an unknown state.
    if count < MIN_DISHES_PER_STATE or state == '-1':
        continue

    state_dishes = food_df.loc[food_df['state'] == state, 'ingredients']

    # Build one flat list of normalised ingredient names across all dishes.
    # The earlier string-concatenation approach added no separator between
    # dishes, so the last ingredient of one dish was fused with the first of the
    # next; it also recomputed the tally on every dish and overwrote the global
    # `ingredients_string` that the word-cloud cell reads.
    state_ingredients = [
        item.strip().replace(' ', '-')
        for dish_ingredients in state_dishes
        for item in dish_ingredients
    ]
    state_ingredients_dict[state] = get_ingredients_dict(state_ingredients)

In [1]:
# Show which states passed the dish-count threshold and received an ingredient tally.
state_ingredients_dict.keys()

In [1]:
# Grid of bar charts: the most used ingredients for each state in
# state_ingredients_dict, one panel per state.
NCOL = 3
TOP_N = 6  # number of ingredients shown per state (the original loop kept indices 0..5)

n_states = len(state_ingredients_dict)
# Ceiling division so every state gets a panel; at least one row so that
# plt.subplots is never asked for an empty grid. A fixed 3x3 grid raised an
# IndexError as soon as more than nine states qualified.
NROW = max(1, -(-n_states // NCOL))

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(nrows=NROW, ncols=NCOL, figsize=(12, 4 * NROW), squeeze=False)

for panel, (state, d) in zip(axes.flat, state_ingredients_dict.items()):

    # Ingredients ordered by descending count; ties keep first-seen order.
    top_names = sorted(d, key=d.get, reverse=True)[:TOP_N]
    temp = pd.DataFrame({
        'ingredient': top_names,
        'count': [d[name] for name in top_names],
    })

    sns.barplot(x='ingredient', y='count', ax=panel, data=temp)
    panel.set_title(state)
    panel.set_xlabel('Ingredient')
    panel.set_ylabel('Count of Ingredient Used')
    # Rotate the labels seaborn generated rather than overwriting them by hand.
    panel.tick_params(axis='x', labelrotation=60)

# Hide any panels left over when the number of states is not a multiple of NCOL.
for unused_panel in axes.flat[n_states:]:
    unused_panel.set_visible(False)

plt.tight_layout()