# What do we have for dinner?

## Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns
import itertools
# Newer matplotlib releases renamed the bundled 'seaborn' style sheet to
# 'seaborn-v0_8'; use whichever name this installation actually provides so the
# cell runs on both old and new environments.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

## Some Data

In [None]:
import os, zipfile

# Walk the Kaggle input directory and unpack each archive's payload into the
# current working directory so the later cells can read the files by bare name.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        p = os.path.join(dirname, filename)
        # Anything that is not a zip archive would make ZipFile raise; skip it.
        if not zipfile.is_zipfile(p):
            continue
        with zipfile.ZipFile(p, 'r') as zip_ref:
            # An archive named "<member>.zip" is expected to hold a member
            # called "<member>"; strip only the final extension to get it.
            zip_ref.extract(filename.rsplit('.', 1)[0])
os.listdir()

In [None]:
# Read the two extracted JSON files from the working directory and preview the
# first three training rows.
train, test = (pd.read_json(path) for path in ('train.json', 'test.json'))
train.head(3)

In [None]:
# Gather the per-recipe ingredient lists from both frames (train first, then test).
lists_of_ingredients = train['ingredients'].tolist() + test['ingredients'].tolist()

# Flatten, de-duplicate and sort to obtain the shared ingredient vocabulary.
unique_ingredients = sorted(set(itertools.chain.from_iterable(lists_of_ingredients)))

# Show a small sample together with the vocabulary size.
(unique_ingredients[:3], len(unique_ingredients))

In [None]:
def format_ingredients(full_list,given_list):
    """Encode ``given_list`` as a 0/1 indicator vector over ``full_list``.

    Parameters
    ----------
    full_list : list
        The vocabulary; position ``i`` of the result corresponds to
        ``full_list[i]``. Items must be hashable.
    given_list : iterable
        Items to mark as present.

    Returns
    -------
    list of int
        A list of ``len(full_list)`` values, ``1`` where the vocabulary item
        occurs in ``given_list`` and ``0`` elsewhere.

    Raises
    ------
    ValueError
        If an item of ``given_list`` is not in ``full_list``.
    """
    # Build the lookup once instead of scanning the vocabulary for every item.
    # setdefault keeps the FIRST position of a repeated entry, matching what
    # list.index would return.
    position = {}
    for i, name in enumerate(full_list):
        position.setdefault(name, i)

    encoded = [0] * len(full_list)
    for x in given_list:
        if x not in position:
            raise ValueError(f"{x!r} is not in list")
        encoded[position[x]] = 1
    return encoded

In [None]:
%%time
train[unique_ingredients] = pd.DataFrame(train.ingredients.apply(lambda x: format_ingredients(unique_ingredients,x)).tolist(),columns=unique_ingredients)
train.drop(['ingredients'],axis=1,inplace=True)

In [None]:
%%time
test[unique_ingredients] = pd.DataFrame(test.ingredients.apply(lambda x: format_ingredients(unique_ingredients,x)).tolist(),columns=unique_ingredients)
test.drop(['ingredients'],axis=1,inplace=True)

## Some Quick EDA

**Number of recipes by Cuisine**

In [None]:
# Bar chart of how many training recipes belong to each cuisine (most frequent first).
train['cuisine'].value_counts().plot(kind='bar', rot=0, figsize=(20, 7));

**Top 15 most used Ingredients**

In [None]:
# Sum only the ingredient indicator columns: selecting them by name avoids
# aggregating the id and cuisine columns and does not depend on column order.
train[unique_ingredients].sum().sort_values(ascending=False).head(15).plot.bar(rot=0,figsize=(25,7));

## Modelling

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

**Split data into train/test**

In [None]:
# Hold out 33% of the labelled recipes for validation, stratified on cuisine,
# with a fixed seed so the split is repeatable.
train_, test_ = train_test_split(
    train,
    test_size=0.33,
    random_state=42,
    stratify=train['cuisine'],
)

**Check proportions**

In [None]:
# Per-cuisine share of rows in each partition, displayed side by side.
(
    'train:', train_['cuisine'].value_counts().div(len(train_)),
    'test:', test_['cuisine'].value_counts().div(len(test_)),
)

**Training**

In [None]:
import xgboost as xgb

In [None]:
%%time
xgc = xgb.XGBClassifier(objective='multi:softmax',num_class=train.cuisine.nunique())
xgc.fit(train_[train_.columns[2:]],train_['cuisine'])

**Results**

In [None]:
# Keep id and true label of the held-out rows, then attach the model's
# prediction computed from the ingredient columns (third column onward).
results = test_.loc[:, ['id', 'cuisine']].copy()
results['y_pred'] = xgc.predict(test_.iloc[:, 2:])

**Classification Report**

In [None]:
# Per-cuisine precision / recall / F1 on the held-out rows.
print(metrics.classification_report(results['cuisine'], results['y_pred']))

**Confusion Matrix**

In [None]:
# Use the cuisine names as tick labels so each cell can be read directly;
# without them the axes only show integer positions.
cuisine_labels = sorted(set(results.cuisine) | set(results.y_pred))
# A larger canvas keeps the annotated counts legible.
plt.figure(figsize=(16, 12))
ax = sns.heatmap(
    metrics.confusion_matrix(results.cuisine, results.y_pred, labels=cuisine_labels),
    annot=True,
    fmt='d',
    xticklabels=cuisine_labels,
    yticklabels=cuisine_labels,
)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set(xlabel='Predicted cuisine', ylabel='True cuisine');

Usually I retrain using all train data before submission,

but since this approach takes ages, I will just use the model already trained on the split data

In [None]:
%%time
sub = test[['id']].copy()
sub['cuisine'] = xgc.predict(test[test.columns[1:]])

**Submission**

In [None]:
# Write the id/cuisine pairs to the working directory, leaving out the frame's index.
sub.to_csv(path_or_buf='submission.csv', index=False)