In [None]:
import numpy as np 
import pandas as pd

# Data Description | Descrição dos Dados

In [None]:
# Load the Indian Food 101 CSV and preview its first rows.
csv_path = '../input/indian-food-101/indian_food.csv'
raw_data = pd.read_csv(csv_path)
raw_data.head()

In [None]:
# Print column dtypes and non-null counts of the raw data to spot missing values.
raw_data.info()

In [None]:
# Drop every row that contains a null value.
# The index is rebuilt afterwards so it is contiguous (0..n-1): later cells
# attach freshly built lists/frames to `data`, and pandas aligns those on
# index labels, which silently shifts values when dropna() leaves gaps.
data = raw_data.dropna().reset_index(drop=True)
data.info()

In [None]:
# Replace the "-1" placeholder with "Unknown" in every text (object) column.
# Numeric columns are left untouched.
categorical_cols = [name for name, dtype in data.dtypes.items() if dtype == 'object']

# Per-column mapping {column: value_to_replace} understood by DataFrame.replace.
a = dict.fromkeys(categorical_cols, '-1')
data = data.replace(a, "Unknown")

In [None]:
# Summary statistics of the cleaned data.
# NOTE(review): only text columns had their "-1" placeholder replaced; numeric
# columns may still contain -1 values that skew these statistics — confirm.
data.describe()

# Data Visualization | Visualização dos Dados

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Histogram of preparation times (80 bins, no density curve).
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the
# installed version before upgrading the environment.
fig, ax = plt.subplots(figsize=(10, 6))
sns.distplot(data['prep_time'], kde=False, bins=80, ax=ax)
ax.set_title('Preparation time frequency / Frequencia do tempo de preparação')
ax.set_xlabel('Preparation time / Tempo de preparação')

In [None]:
# Histogram of cooking times (80 bins, no density curve).
fig, ax = plt.subplots(figsize=(10, 6))
sns.distplot(data['cook_time'], kde=False, bins=80, ax=ax)
ax.set_title('Cook time frequency / Frequencia do tempo de cozimento')
ax.set_xlabel('Cook time / Tempo de cozimento')

In [None]:
# Total time per recipe = cooking time + preparation time.
total_time_per_recipe = data['cook_time'] + data['prep_time']

# Plain list of the totals, kept for any later cell that reads `total_time`.
total_time = list(total_time_per_recipe)

# Assign the Series directly: it carries data's own index, so every value lands
# on the row it was computed from. Joining a new DataFrame built from the list
# would align on a fresh 0..n-1 index and shift values wherever data's index
# has gaps. assign() also overwrites an existing column, so the cell can be
# re-run safely.
data = data.assign(total_time=total_time_per_recipe)
data.head()

In [None]:
# Histogram of total (prep + cook) times (80 bins, no density curve).
fig, ax = plt.subplots(figsize=(10, 6))
sns.distplot(data['total_time'], kde=False, bins=80, ax=ax)
ax.set_xlabel('Total time / Tempo total')
ax.set_title('Total time frequency / Frequencia de tempo total')

In [None]:
# Ten slowest recipes, and the ten fastest among rows whose total time is not negative.
slowest_recipes = data.nlargest(10, 'total_time')
fastest_recipes = data.query('total_time >= 0').nsmallest(10, 'total_time')

plt.figure(figsize=(14, 6))

# Left panel: longest recipes. The rotation is set before drawing the bars,
# exactly as in the original cell.
plt.subplot(1, 2, 1)
plt.title('Longest time recipes / Receitas mais demoradas')
plt.xticks(rotation=45)
sns.barplot(x='name', y='total_time', data=slowest_recipes)

# Right panel: shortest recipes.
plt.subplot(1, 2, 2)
plt.xticks(rotation=30)
plt.title('Shortest time recipes / Receitas mais rápidas')
sns.barplot(x='name', y='total_time', data=fastest_recipes)

In [None]:
# Share of recipes per diet.
plt.figure(figsize=(10, 6))
diet_counts = data['diet'].value_counts()

# Build the labels from the counted categories instead of hard-coding them by
# position: value_counts() orders by frequency, so fixed labels would be
# attached to the wrong slice if the majority class changed.
diet_labels = [str(diet).title() for diet in diet_counts.index]
diet_counts.plot.pie(labels=diet_labels, autopct='%1.1f%%')
plt.ylabel('')

In [None]:
# Preparation time against cooking time, coloured by diet.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='prep_time', y='cook_time', hue='diet', ax=ax)
ax.set_xlabel('Preparation time / Tempo de preparação')
ax.set_ylabel('Cook time / Tempo de cozimento')

In [None]:
# Number of recipes per state, most frequent first.
recipes_per_state = data['state'].value_counts()

fig, ax = plt.subplots(figsize=(12, 8))
# State names are long, so rotate the x tick labels.
ax.tick_params(axis='x', labelrotation=60)

sns.barplot(x=recipes_per_state.index, y=recipes_per_state.values, ax=ax)
ax.set_title('Recipes per state / Receitas por estado')

In [None]:
# Bilingual display labels keyed by the raw category values.
# NOTE(review): the keys assume the dataset spells the categories exactly as
# below — confirm against data['region'].unique() and
# data['flavor_profile'].unique(). Values without a mapping fall back to their
# raw text, so a mismatch can never put a label on the wrong bar.
region_labels = {
    'East': 'East / Leste',
    'West': 'West / Oeste',
    'North': 'North / Norte',
    'Unknown': 'Unknown / Desconhecido',
    'North East': 'North East / Nordeste',  # "Noroeste" means north-west
    'South': 'South / Sul',
    'Central': 'Center / Centro',
}
flavor_labels = {
    'sweet': 'Sweet / Adocicado',
    'spicy': 'Spicy / Apimentado',
    'bitter': 'Bitter / Amargo',
    'Unknown': 'Unknown / Desconhecido',
    'sour': 'Sour / Azedo',
}

# Fix the category order explicitly (order of first appearance) so the tick and
# legend labels below are built from the same sequence the plot uses.
region_order = list(data['region'].unique())
flavor_order = list(data['flavor_profile'].unique())

plt.figure(figsize=(10, 8))
plt.title('Flavors per region / Sabor por região')
sns.countplot(data=data, x='region', hue='flavor_profile',
              order=region_order, hue_order=flavor_order)
plt.xlabel('Region / Região')
plt.xticks(range(len(region_order)),
           [region_labels.get(region, region) for region in region_order],
           rotation=30)
plt.legend([flavor_labels.get(flavor, flavor) for flavor in flavor_order],
           title='Flavors / Sabores', loc='upper right')
plt.ylabel('Count / Contagem')

In [None]:
def filter_ingredients(column, df=None):
    """Collect the ingredients used by the recipes of each category of `column`.

    Parameters
    ----------
    column : str
        Name of the column whose distinct values define the groups
        (for example ``'diet'``).
    df : pandas.DataFrame, optional
        Frame to read from. It must contain `column` and an ``'ingredients'``
        column of comma-separated strings. Defaults to the notebook-level
        ``data`` frame.

    Returns
    -------
    dict
        Maps each distinct value of `column`, in order of first appearance,
        to a ``pandas.Series`` holding every ingredient (lower-cased and
        stripped, one entry per occurrence) of the recipes in that group only.
    """
    frame = data if df is None else df

    out = {}
    for value in frame[column].unique():
        # Boolean mask instead of a string-built query: works for column names
        # and values that contain spaces or quotes.
        subset = frame[frame[column] == value]

        # A fresh list per group. Sharing one accumulator across groups would
        # make each group also contain the ingredients of every earlier group.
        group_ingredients = [
            ingredient.strip()
            for recipe in subset['ingredients']
            for ingredient in recipe.lower().split(', ')
        ]
        out[value] = pd.Series(group_ingredients, dtype=object)

    return out

In [None]:
# Split every recipe's ingredient string into individual lower-cased names.
recipes = [ing.lower().split(', ') for ing in data['ingredients']]

# Flatten in a single pass. Growing a NumPy array with np.append inside the
# loop copies the whole array on every step (quadratic time).
all_ingredients = pd.Series(
    [ingredient.strip() for ingredients in recipes for ingredient in ingredients],
    dtype=object,
)

# Ten most frequent ingredients and their display labels.
most_used = all_ingredients.value_counts().head(10)
labels = [label.capitalize() for label in most_used.index]

plt.figure(figsize=(10, 6))
plt.title('Most used ingredients / Ingredientes mais usados')

sns.barplot(x=most_used.index, y=most_used.values)
# The tick count follows the data, so the cell also works when fewer than ten
# distinct ingredients exist.
plt.xticks(rotation=30, labels=labels, ticks=range(len(labels)))

In [None]:
# Lower-case the ingredient strings.
# The vectorised .str.lower() result keeps data's own index, so each row
# receives its own text. Series.update() with a plain list aligns on a fresh
# 0..n-1 index instead, which writes the wrong recipe's ingredients wherever
# data's index has gaps. It also mutates through chained access.
data = data.assign(ingredients=data['ingredients'].str.lower())
data.head()

In [None]:
# Ingredient occurrences grouped by diet.
ingredients_per_diet = filter_ingredients('diet')

# Fifteen most frequent ingredients per diet, with capitalised display labels.
veg_most_used = ingredients_per_diet['vegetarian'].value_counts().iloc[:15]
veg_labels = [name.capitalize() for name in veg_most_used.index]

non_veg_most_used = ingredients_per_diet['non vegetarian'].value_counts().iloc[:15]
non_veg_labels = [name.capitalize() for name in non_veg_most_used.index]

# One panel per diet: (subplot position, title, counts, tick labels, label rotation).
panels = [
    (121, 'Vegetarian / Vegetarianas', veg_most_used, veg_labels, 45),
    (122, 'Non vegetarian / Não vegetarianas', non_veg_most_used, non_veg_labels, 50),
]

plt.figure(figsize=(20, 8))
for position, panel_title, counts, tick_labels, angle in panels:
    plt.subplot(position)
    plt.title(panel_title)
    sns.barplot(x=counts.index, y=counts.values)
    plt.xticks(rotation=angle, labels=tick_labels, ticks=range(15))


In [None]:
#Other visualization

from wordcloud import WordCloud

# Build one word cloud per diet from the space-joined ingredient names
# (vegetarian first, then non vegetarian).
cloud_options = dict(background_color='white', width=1600, height=800)
wordcloud_veg = WordCloud(**cloud_options).generate(' '.join(ingredients_per_diet['vegetarian']))
wordcloud_non_veg = WordCloud(**cloud_options).generate(' '.join(ingredients_per_diet['non vegetarian']))

# Show the two clouds side by side without axes.
plt.figure(figsize=(20, 8))
for position, cloud in ((121, wordcloud_veg), (122, wordcloud_non_veg)):
    plt.subplot(position)
    plt.imshow(cloud)
    plt.axis('off')

This is my first public notebook; any tips or comments are welcome. Este é meu primeiro notebook público; quaisquer dicas ou comentários são bem-vindos.

Thank you. Obrigado.