### Contents  
#### * Load Data and Libraries  
#### * Data Visualization  
#### * Cleaning Data - arrange Ingredients  
#### * Clustering  
#### * appendix

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### * Load Data and Libraries
  
To help unify the wording of Ingredients, I use FuzzyWuzzy  
[FuzzyWuzzy -GitHub](https://github.com/seatgeek/fuzzywuzzy)

In [None]:
import warnings; warnings.simplefilter('ignore')
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import collections
from wordcloud import WordCloud

from sklearn.cluster import KMeans

!pip install --q fuzzywuzzy
from fuzzywuzzy import fuzz

In [None]:
# Load the dish table and key it by dish name; the trailing expression shows (rows, columns).
food = pd.read_csv('../input/indian-food-101/indian_food.csv', index_col='name')
food.shape

In [None]:
# Preview the first five dishes.
food.head(n=5)

#### * Data Visualization

In [None]:
# Count plots of the four categorical descriptors on a 2x2 grid sharing the y-axis.
fig, ax = plt.subplots(2, 2, sharey=True, figsize=(12, 8))
plt.subplots_adjust(hspace=0.6)
for i, f in enumerate(['diet', 'flavor_profile', 'course', 'region']):
    axy, axx = divmod(i, 2)  # flat position -> (row, column) on the grid
    # The column must be passed as `x=`: recent seaborn releases treat the
    # first positional argument as `data`, not as the x variable.
    sns.countplot(x=food[f], ax=ax[axy, axx])
    ax[axy, axx].tick_params(axis='x', labelrotation=45)

In [None]:
# Number of dishes per value of the 'state' column.
fig, ax = plt.subplots(figsize=(12, 3))
# Pass the column as `x=`: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=food['state'], ax=ax)
ax.tick_params(axis='x', labelrotation=90)

#### * Cleaning Data - arrange Ingredients

In [None]:
# Count how many dishes mention each ingredient (lower-cased, whitespace-trimmed).
ing_dic = collections.defaultdict(int)

for dish in food.index:
    for raw_name in food.at[dish, 'ingredients'].split(', '):
        ing_dic[raw_name.lower().strip()] += 1

# One row per ingredient name, with its occurrence count in column 'count'.
ing_df = pd.DataFrame.from_dict(ing_dic, orient='index').rename(columns={0: 'count'})

Let's check the ingredient names, beginning with those that start with 'red'.

In [None]:
# Label-slice the alphabetically sorted names from 'red' up to 'red0', shown as one wide row.
ing_df.sort_index().loc[slice('red', 'red0')].transpose()

> The ingredient names include 'red chili', 'red chilli' and 'red chillies' - three spellings of the same thing!

Check for similarity in the expression of the ingredients by FuzzyWuzzy  

> First 30 combinations with `fuzz.ratio` > 70

In [None]:
# Ingredient names ordered by ascending occurrence count.
ing_list = ing_df.sort_values('count').index.to_list()

MAX_PAIRS = 30  # only a sample of the matches is printed
n = 0           # number of pairs printed so far

# Print the first MAX_PAIRS name pairs whose fuzzy similarity exceeds 70.
for i in range(len(ing_list)-1):
    # Stop the outer loop as well once the sample is complete; a `break` in
    # the inner loop alone would keep scanning (and scoring) remaining names.
    if n == MAX_PAIRS:
        break
    for j in range(i+1, len(ing_list)):
        if n == MAX_PAIRS:
            break
        ratio = fuzz.ratio(ing_list[i], ing_list[j])
        if ratio > 70:
            print(ing_list[i], ', ', ing_list[j], '\t', ratio)
            n += 1

When the ratio is greater than 80, the two expressions appear to refer to the same content in many cases

> All combinations with ratio > 80 are shown in the hidden cell below

In [None]:
# Print every unordered pair of ingredient names whose fuzzy ratio exceeds 80,
# laid out roughly like a dictionary entry followed by the score.
for pos, left in enumerate(ing_list[:-1]):
    for right in ing_list[pos + 1:]:
        score = fuzz.ratio(left, right)
        if score > 80:
            print('"', left, '": "', right, '"\t', score)

In [None]:
# Hand-curated mapping from a variant spelling (key) to the unified ingredient
# name (value). Keys are matched against lower-cased, stripped ingredient names.
similar_ing_dic = {
    "red chili": "red chilli",
    "greens": "green",
    "drumstick": "drumsticks",
    "thin rice flakes": "beaten rice flakes",
    # Was "chana da " (truncated, with a trailing space), which created a
    # garbled ingredient name instead of merging into the existing spelling.
    "chana daal": "chana dal",
    # NOTE(review): "whole" vs "white" urad dal may be different ingredients - confirm.
    "whole urad dal": "white urad dal",
    "bell pepper": "bell peppers",
    "frozen green peas": "green peas",
    "fresh green peas": "green peas",
    "chilli": "chillies",
    "fish fillets": "fish fillet",
    "mustard seed": "mustard seeds",
    "peanut": "peanuts",
    "red chillies": "red chilli",
    "dried fruits": "dry fruits",
    "almond": "almonds",
    "carrots": "carrot",
    "yoghurt": "yogurt",
    "chenna": "chhena",
    "green chillies": "green chilies",
    "green chilli": "green chilies",
    "green chili": "green chilies",
    "potatoes": "potato",
    "tomatoes": "tomato",
}

In [None]:
# Recount ingredient occurrences after folding variant spellings into one name.
new_ing_dic = collections.defaultdict(int)

for dish in food.index:
    for raw_name in food.at[dish, 'ingredients'].split(', '):
        name = raw_name.lower().strip()
        # Use the unified spelling when one is defined, otherwise the name itself.
        new_ing_dic[similar_ing_dic.get(name, name)] += 1

new_ing_df = pd.DataFrame.from_dict(new_ing_dic, orient='index').rename(columns={0: 'count'})
# Re-inspect the names between 'red' and 'red0' after unification.
new_ing_df.sort_index().loc[slice('red', 'red0')].transpose()

#### * Clustering

> Setting up a 'Bag-of-Ingredients' matrix

In [None]:
# Build the binary "Bag-of-Ingredients" matrix: one row per dish, one column
# per unified ingredient name, 1 where the dish lists that ingredient.
BoI_df = pd.DataFrame(
    np.zeros((len(food), len(new_ing_dic)), dtype=int),
    index=food.index, columns=new_ing_df.index)

for f in food.index:
    tmp_list = food.at[f, 'ingredients'].split(', ')
    for i in tmp_list:
        # Normalise exactly as the counting pass that produced new_ing_df did
        # (lower-case + full strip). Trimming just one space per side could
        # yield a name that is not an existing column, and indexing i[0]
        # raised IndexError on an empty token.
        i = i.lower().strip()
        if i in similar_ing_dic:
            i = similar_ing_dic[i]
        BoI_df.at[f, i] = 1

BoI_df.head()

> Run KMeans

In [None]:
# Cluster the dishes into 5 groups on their ingredient vectors.
# n_init is pinned explicitly: the library default for the number of
# initialisations differs between scikit-learn releases, and the cluster
# numbers are interpreted by label in the text that follows.
km = KMeans(n_clusters=5, n_init=10, random_state=0)
# One row per dish; column 'grp' holds the assigned cluster label.
clust5 = pd.DataFrame(
    km.fit(BoI_df).labels_, index=BoI_df.index).rename(columns={0: 'grp'})

Features by Cluster

In [None]:
# Horizontal bars: number of dishes assigned to each cluster.
sns.countplot(data=clust5, y='grp', orient='h');

In [None]:
# For each categorical attribute, draw the row-normalised composition of every
# cluster as stacked horizontal bars (cluster order reversed on the axis).
fig, ax = plt.subplots(4, figsize=(12, 16))
for panel, col in zip(ax, ['flavor_profile', 'course', 'region', 'diet']):
    shares = pd.crosstab(clust5['grp'], food[col], normalize='index')
    shares.iloc[::-1].plot.barh(stacked=True, ax=panel)
    panel.set_title(col)

In [None]:
# Number of dishes using each ingredient, across the whole data set.
tot_ing_df = BoI_df.sum().to_frame('total')

# The ten most frequently used ingredients.
freq_ing = BoI_df.sum().sort_values(ascending=False).head(10).index.to_list()

# Per-cluster usage counts of those ingredients ...
cluster_counts = pd.merge(
    clust5, BoI_df[freq_ing], how='inner', left_index=True, right_index=True
).groupby('grp').sum()

# ... divided by each ingredient's overall total, giving every cluster's share.
freq_ing_df = cluster_counts.div(
    tot_ing_df.loc[cluster_counts.columns, 'total'], axis='columns')

freq_ing_df.T[::-1].plot.barh(stacked=True, figsize=(12,8),
                              title='Share of frequently used ingredient');

> 0: main course, South region  
1: main course, North region  
2: rice flour and jaggery  
3: Dessert  
4: Spicy, main course and snack, south region

In [None]:
# Side-by-side box plots of preparation and cooking time per cluster, on a log x-axis.
fig, ax = plt.subplots(1, 2, sharey=True, figsize=(14, 4))
for panel, col in zip(ax, ['prep_time', 'cook_time']):
    sns.boxplot(y=clust5['grp'], x=food[col], orient='h', ax=panel)
    panel.set_xscale('log')

I think I've made a reasonably meaningful classification for myself. What do you think?

Ingredient WordCloud by Cluster

In [None]:
# Cluster label ('grp', first column) joined with the ingredient indicator columns.
wc_df = clust5.merge(BoI_df, how='inner', left_index=True, right_index=True)


def wc():
    """Draw one ingredient word cloud per cluster (groups 0 to 4).

    Reads the notebook-level ``wc_df``. Each ingredient is emitted once per
    dish that uses it, with spaces replaced by underscores so multi-word
    ingredient names stay a single token in the cloud.
    """
    ingredient_cols = wc_df.columns[1:]  # every column after 'grp'
    for grp in range(5):
        members = wc_df[wc_df['grp'] == grp]
        tokens = [
            col.replace(' ', '_')
            for dish in members.index
            for col in ingredient_cols
            if members.at[dish, col] == 1
        ]
        words = ' '.join(tokens)
        cloud = WordCloud(
            width=700, height=300, collocations=False, background_color='white',
            max_font_size=100).generate(words)
        plt.figure(figsize=(14, 6))
        plt.title('Group ' + str(grp), fontsize=32)
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis('off')


wc()

#### * appendix

In [None]:
# Names of the dishes assigned to cluster 0.
print(clust5.loc[clust5['grp'].eq(0)].index.tolist())

In [None]:
# Names of the dishes assigned to cluster 1.
print(clust5.loc[clust5['grp'].eq(1)].index.tolist())

In [None]:
# Names of the dishes assigned to cluster 2.
print(clust5.loc[clust5['grp'].eq(2)].index.tolist())

In [None]:
# Names of the dishes assigned to cluster 3.
print(clust5.loc[clust5['grp'].eq(3)].index.tolist())

In [None]:
# Names of the dishes assigned to cluster 4.
print(clust5.loc[clust5['grp'].eq(4)].index.tolist())