# <font color='darkorange'>INDIAN </font><font color='dodgerblue'>FOOD </font><font color='darkgreen'>ANALYSIS</font>

![](http://media.giphy.com/media/NI8fT7ugMo9eU/giphy.gif)

In [None]:
import pandas as pd
import numpy as np
# visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from wordcloud import WordCloud
# text data
import re

# read url and open image
import requests
from PIL import Image
from io import BytesIO

# value percentage in pie chart
def make_autopct(values):
    """Build an ``autopct`` callback that shows both share and absolute count.

    Parameters
    ----------
    values : iterable of numbers
        The wedge sizes passed to the pie chart; their sum is used to turn a
        percentage back into an absolute count.

    Returns
    -------
    callable
        A function taking a percentage and returning a two-line label of the
        form ``'12.34%\\n(56)'``.
    """
    def my_autopct(pct):
        # Recover the (rounded) absolute count behind this percentage.
        absolute = int(round(pct * sum(values) / 100.0))
        return '{p:.2f}%\n({v:d})'.format(v=absolute, p=pct)

    return my_autopct

# missing values in a dataset
def missing_data(dataset):
    """Summarise the columns of ``dataset`` that contain missing values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.io.formats.style.Styler
        A styled table indexed by column name with a ``missing%`` column
        (share of rows that are null, rounded to 2 decimals) and a ``count``
        column (number of nulls), sorted by ``missing%`` in descending order
        and shaded with the ``PuBuGn`` colour map. Columns without any
        missing value are left out.
    """
    null_counts = dataset.isnull().sum()
    # Keep only the columns that actually have gaps.
    null_counts = null_counts[null_counts > 0]
    summary = pd.DataFrame({
        'missing%': (null_counts / len(dataset) * 100).round(2),
        'count': null_counts,
    })
    summary = summary.sort_values(by='missing%', ascending=False)
    return summary.style.background_gradient(cmap='PuBuGn')

def categorical_count(data, column, palette, xlabel, ylabel):
    """Plot how often each distinct value of ``data[column]`` occurs.

    Draws a bar chart (``fivethirtyeight`` style) with one bar per distinct
    value, annotated with the count and its percentage of all counted rows,
    and shows it immediately.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table.
    column : str
        Column whose values are counted.
    palette : str
        Seaborn palette name; 8 colours are drawn from it for the bars.
    xlabel, ylabel : str
        Axis labels.
    """
    frequencies = data[column].value_counts()
    labels, heights = frequencies.index, frequencies.values
    total = sum(heights)
    bar_colors = sns.color_palette(palette, 8)
    with plt.style.context('fivethirtyeight'):
        plt.figure(figsize=(10, 6))
        plt.bar(x=labels, height=heights, color=bar_colors)
        # Annotate every bar just above its top with "count (share%)".
        for label, height in zip(labels, heights):
            share = round(height / total * 100, 2)
            plt.text(label, height + 1, f'{height}\n({share}%)', ha='center')
        # y ticks every 20 units, reaching past the tallest bar.
        plt.yticks(np.arange(0, max(heights) + 20, 20))
        plt.xticks(labels)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.show()
        
def bivariate_categorical_count(data, col1, col2, measure):
    """Show a grouped bar chart of counts of ``col1`` split by ``col2``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. Only this frame is read (no module-level state).
    col1 : str
        Column whose values become the x-axis categories.
    col2 : str
        Column whose values become the bar groups (one trace per value).
    measure : str
        Column whose non-null entries are counted in each (col1, col2) cell.

    Notes
    -----
    Each trace is named after the column of the count table it is drawn
    from, so a legend entry always matches its bars. Combinations that do not
    occur in ``data`` are shown as 0.
    """
    # Rows: col1 values, columns: col2 values, cells: number of non-null `measure`.
    counts = data.groupby([col1, col2])[measure].count().unstack(fill_value=0)
    categories = counts.index

    # Colours are reused cyclically if col2 has more than two distinct values.
    palette = ['gold', 'teal']
    fig = go.Figure()
    for position, level in enumerate(counts.columns):
        heights = counts[level].values
        fig.add_trace(go.Bar(x=categories, y=heights, name=str(level),
                             marker_color=palette[position % len(palette)],
                             text=heights))
    fig.update_traces(textposition='outside')

    # Title lists the col2 values in order of first appearance in `data`.
    group_names = ' and '.join(str(value) for value in data[col2].dropna().unique())
    fig.update_layout(barmode='group',
                      title=f"Number of {group_names} items across {col1}",
                      template='ggplot2')
    fig.show()

def read_img_from_url(url, timeout=30):
    """Download the image at ``url`` and return its pixels as a NumPy array.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot block the notebook indefinitely.

    Returns
    -------
    numpy.ndarray
        The decoded image converted with ``np.array``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here rather than passing an error page on to PIL.
    response.raise_for_status()
    with Image.open(BytesIO(response.content)) as img:
        # Convert while the image is still open; the array owns its own data.
        return np.array(img)

def custom_wc(col, data=None):
    """Draw one ingredient word cloud per distinct value of ``col``.

    Parameters
    ----------
    col : str
        Column whose distinct values define the groups (one cloud each).
    data : pandas.DataFrame, optional
        Table with ``col`` and an ``ingredients`` column of comma-separated
        strings. Defaults to the notebook-level ``df``.

    Notes
    -----
    Clouds are laid out two per row. The grid has at least 4 rows, so the
    layout for up to 8 groups is a 4x2 grid on a 12x20 inch figure, and it
    grows by 5 inches per extra row when there are more groups.
    """
    frame = df if data is None else data
    categories = frame[col].unique()
    n_rows = max(4, (len(categories) + 1) // 2)
    plt.figure(figsize=(12, 5 * n_rows))
    for i, category in enumerate(categories):
        # ingredients corpus for a specific category, lower-cased
        ingredient_list = ",".join(frame[frame[col] == category].ingredients).lower()
        # wordcloud instantiate
        wc = WordCloud(width=900, height=600, colormap='Dark2',
                       max_words=50, background_color='white').generate(ingredient_list)
        # plot wordcloud
        plt.subplot(n_rows, 2, i + 1)
        plt.imshow(wc, interpolation='bilinear')
        plt.title(str(category).upper(), fontweight='bold')
        plt.axis("off")
    # One figure-level title; it does not depend on the category.
    plt.suptitle(f'50 most common ingredients across all {col}s', fontsize=15)

# <font color='mediumvioletred'>Let's look at the data</font>

In [None]:
# Load the dishes table; values given as '-1' in the file are read as missing (NaN).
df = pd.read_csv('../input/indian-food-101/indian_food.csv', na_values='-1')
# Preview the first rows.
df.head()

## Dataset shape
 - the sample dataset consists of 255 Indian dishes

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

## Missing Values

In [None]:
# Styled table of the columns that contain missing values (percentage and count).
missing_data(df)

### Fill missing values with 'Unknown'

In [None]:
# handle missing values
# Replace every missing value, in every column, with the string 'Unknown' (in place).
# NOTE(review): later cells filter prep_time / cook_time with != 'Unknown', so
# those columns presumably end up mixing numbers and strings — confirm this is intended.
df.fillna('Unknown', inplace=True)

## Additional Features

In [None]:
# Number of comma-separated entries in each dish's ingredient string.
df['ingredients_count'] = df['ingredients'].apply(lambda entry: len(entry.split(',')))

# <font color='darkorchid'>Data Exploration</font>

## 1. 50 most common ingredients used in Indian cuisine

In [None]:
# Image whose shape is used as the mask of the word cloud.
url = "https://i.etsystatic.com/11743934/r/il/f30302/2167761089/il_794xN.2167761089_i2nd.jpg"
image_mask = read_img_from_url(url)

# All dishes' ingredients as one lower-cased, comma-separated string.
ingredient_list = ",".join(df.ingredients).lower()

wc = WordCloud(
    mask=image_mask,
    width=900,
    height=600,
    colormap='Dark2',
    max_words=50,
    background_color='white',
    contour_width=1,
    contour_color='black',
)
wc = wc.generate(ingredient_list)

plt.figure(figsize=(12, 15))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

## 2. Number of vegetarian and non vegetarian dishes (Types of diet)

In [None]:
# Donut chart of the diet split: a pie chart with a white disc over its centre.
diet_count = df.diet.value_counts()
diet_colors = ['teal', 'gold']
fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(
    x=diet_count.values,
    labels=diet_count.index,
    autopct=make_autopct(diet_count.values),
    textprops={'size': 15},
    pctdistance=0.4,
    colors=diet_colors,
)
center_circle = plt.Circle((0, 0), 0.7, color='white')
ax.add_artist(center_circle)
ax.legend()
plt.show()

In [None]:
# One ingredient word cloud per diet type.
custom_wc('diet')

## 3. Number of dishes in each course type

In [None]:
# Bar chart of the number of dishes in each course, with percentages.
categorical_count(df, 'course', 'husl', 'Type of course', 'Number of dishes')

In [None]:
# One ingredient word cloud per course.
custom_wc('course')

## 4. Flavour profile of dishes and respective count
- The missing flavor profiles of dishes are labeled as `Unknown`. 
- In the given sample dataset of 255 Indian dishes, 52.2 percent of the dishes are spicy, 34.5 percent are sweet, 11.4 percent are `Unknown`, 1.6 percent are bitter and only 0.4 percent are sour, i.e. just a single dish is sour.

In [None]:
# Bar chart of the number of dishes per flavor profile, with percentages.
categorical_count(df, 'flavor_profile', 'Set2', 'Flavor profile', 'Number of dishes')

In [None]:
# One ingredient word cloud per flavor profile.
custom_wc('flavor_profile')

## 5. Number of dishes from all regions
- The missing regions of dishes are labeled as `Unknown`. 

In [None]:
# Bar chart of the number of dishes per region, with percentages.
categorical_count(df, 'region', 'tab10', 'Region', 'Number of dishes')

In [None]:
# One ingredient word cloud per region.
custom_wc('region')

## 6. Number of vegetarian and non-vegetarian dishes across regions

In [None]:
# Grouped bar chart: number of dishes (counted via 'name') per region, split by diet.
bivariate_categorical_count(df, 'region', 'diet', 'name')

In [None]:
# The same counts as a diet x region table; empty cells are shown as 0 and
# row/column totals are added under the label 'Total'.
pd.pivot_table(df, values='name', index=['diet'], columns=['region'], 
               aggfunc='count', fill_value=0, dropna=False, margins=True, margins_name='Total')

## 7. Number of vegetarian and non-vegetarian dishes across courses

In [None]:
# Grouped bar chart: number of dishes (counted via 'name') per course, split by diet.
bivariate_categorical_count(df, 'course', 'diet', 'name')

In [None]:
# The same counts as a diet x course table; empty cells are shown as 0 and
# row/column totals are added under the label 'Total'.
pd.pivot_table(df, values='name', index=['diet'], columns=['course'], 
               aggfunc='count', fill_value=0, dropna=False, margins=True, margins_name='Total')

## 8. Number of vegetarian and non-vegetarian dishes across flavor profiles

In [None]:
# Grouped bar chart: number of dishes (counted via 'name') per flavor profile, split by diet.
bivariate_categorical_count(df, 'flavor_profile', 'diet', 'name')

In [None]:
# The same counts as a diet x flavor-profile table; empty cells are shown as 0
# and row/column totals are added under the label 'Total'.
pd.pivot_table(df, values='name', index=['diet'], columns=['flavor_profile'], 
               aggfunc='count', fill_value=0, dropna=False, margins=True, margins_name='Total')

## 9. Top 10 dishes with the highest preparation and cooking time
- excluding all the missing values in prep and cook time

In [None]:
# Drop the rows whose time is 'Unknown', then keep the ten largest values.
known_prep = df.loc[df.prep_time != 'Unknown', ['name', 'prep_time']]
known_cook = df.loc[df.cook_time != 'Unknown', ['name', 'cook_time']]
top_preptime = known_prep.sort_values(by='prep_time', ascending=False).head(10)
top_cooktime = known_cook.sort_values(by='cook_time', ascending=False).head(10)

with plt.style.context('ggplot'):
    fig, ax = plt.subplots(1, 2, figsize=(10, 10))
    plt.subplots_adjust(wspace=0.1)
    prep_ax, cook_ax = ax

    # Left panel: preparation time, x axis inverted so the bars grow leftwards
    # and the dish names sit on the outer (left) edge.
    prep_ax.barh(y=top_preptime['name'], width=top_preptime['prep_time'], color='orchid')
    prep_ax.invert_xaxis()
    prep_ax.yaxis.tick_left()
    prep_ax.set_xlabel('Preparation Time in minutes')
    prep_ax.set_title('Preparation Time')

    # Right panel: cooking time, dish names on the outer (right) edge.
    cook_ax.barh(y=top_cooktime['name'], width=top_cooktime['cook_time'], color='coral')
    cook_ax.yaxis.tick_right()
    cook_ax.set_xlabel('Cooking Time in minutes')
    cook_ax.set_title('Cooking Time')
    cook_ax.set_xticks(np.arange(0, 800, 100))

## 10. Top 5 dishes with the highest cooking and preparation time for all course types
- excluding all the missing values in prep and cook time

In [None]:
# Known times only, ordered by course and then by time (both descending), so
# the first rows of each course are its longest dishes.
course_cooktime = df[df.cook_time != 'Unknown'][['name', 'course', 'cook_time']].sort_values(by=['course','cook_time'], ascending=False)
course_preptime = df[df.prep_time != 'Unknown'][['name', 'course', 'prep_time']].sort_values(by=['course','prep_time'], ascending=False)
courses = df.course.unique()
n_best = 5
with plt.style.context('bmh'):
    # One row per course: cooking time on the left, preparation time on the
    # right. The grid follows the number of courses (3.75 in per row, i.e.
    # 10x15 in for four courses); squeeze=False keeps `axes` two-dimensional
    # even when there is a single course.
    fig, axes = plt.subplots(len(courses), 2, figsize=(10, 3.75 * len(courses)), squeeze=False)
    plt.subplots_adjust(wspace=0.5, hspace=0.5)
    for (cook_ax, prep_ax), c in zip(axes, courses):
        sns.barplot(data=course_preptime[course_preptime.course == c][:n_best],
                    x='prep_time', y='name', ax=prep_ax, orient='h', color='orchid')
        sns.barplot(data=course_cooktime[course_cooktime.course == c][:n_best],
                    x='cook_time', y='name', ax=cook_ax, orient='h', color='coral')

        cook_ax.set_xlabel('cooking time in minutes')
        prep_ax.set_xlabel('preparation time in minutes')
        cook_ax.set_ylabel(None)
        prep_ax.set_ylabel(None)
        cook_ax.set_title(c.upper())
        prep_ax.set_title(c.upper())

## 11. Top 5 dishes with the highest cooking and preparation time for all regions

In [None]:
# Known times only, ordered by region and then by time (both descending), so
# the first rows of each region are its longest dishes.
region_cooktime = df[df.cook_time != 'Unknown'][['name', 'region', 'cook_time']].sort_values(by=['region','cook_time'], ascending=False)
region_preptime = df[df.prep_time != 'Unknown'][['name', 'region', 'prep_time']].sort_values(by=['region','prep_time'], ascending=False)
regions = df.region.unique()
n_best = 5
with plt.style.context('bmh'):
    # One row per region: cooking time on the left, preparation time on the
    # right. The grid follows the number of regions (about 4.3 in per row,
    # roughly 10x30 in for seven regions); squeeze=False keeps `axes`
    # two-dimensional even when there is a single region.
    fig, axes = plt.subplots(len(regions), 2, figsize=(10, 4.3 * len(regions)), squeeze=False)
    plt.subplots_adjust(wspace=0.7, hspace=0.5)
    for (cook_ax, prep_ax), c in zip(axes, regions):
        sns.barplot(data=region_preptime[region_preptime.region == c][:n_best],
                    x='prep_time', y='name', ax=prep_ax, orient='h', color='orchid')
        sns.barplot(data=region_cooktime[region_cooktime.region == c][:n_best],
                    x='cook_time', y='name', ax=cook_ax, orient='h', color='coral')

        cook_ax.set_xlabel('cooking time in minutes')
        prep_ax.set_xlabel('preparation time in minutes')
        cook_ax.set_ylabel(None)
        prep_ax.set_ylabel(None)
        cook_ax.set_title(c.upper())
        prep_ax.set_title(c.upper())

## 12. Distribution of number of ingredients used in dishes

In [None]:
# Bar chart of how many dishes use each number of ingredients.
# (x-axis label spelling corrected: "ingredients".)
categorical_count(df, 'ingredients_count', 'Dark2', 'Number of ingredients used', 'count')

## 13. Distribution of number of ingredients used across types of diet, region, flavor profiles and courses

In [None]:
# 2x2 grid of count plots: ingredient-count distribution, coloured in turn by
# each of the four categorical columns.
group_columns = ['diet', 'region', 'flavor_profile', 'course']
with plt.style.context('bmh'):
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    axes = axes.ravel()
    for position, col in enumerate(group_columns):
        ax = axes[position]
        sns.countplot(data=df, x='ingredients_count', hue=col, ax=ax)
        ax.legend(loc='upper right')
        ax.set_title(col)