## What's Cooking

### Loading necessary libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
%matplotlib inline
import os
print(os.listdir("../input"))

### Loading data

In [None]:
# Training recipes, indexed by recipe id; the target column is split off into `label`.
train_raw = pd.read_json('../input/train.json').set_index('id')
label = train_raw['cuisine']
train = train_raw.drop(columns='cuisine')

# Test recipes keep `id` as a regular column (it is read again when the submission is filled in).
test = pd.read_json('../input/test.json')

In [None]:
train.head()

In [None]:
test.head()

## EDA

In [None]:
# Report how many recipes each split contains.
n_train, n_test = len(train), len(test)
print('Number of train data ' , n_train)
print('Number of test data ' , n_test)

### Number of cuisines

In [None]:
# Number of distinct cuisine labels (missing values, if any, count as one label).
label.nunique(dropna=False)

There are 20 types of cuisine in this dataset.

### Distribution of cuisines. 

In [None]:
# Horizontal bar chart of recipe counts per cuisine, most frequent first.
cuisine_order = label.value_counts().index
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(y=label, order=cuisine_order, ax=ax)

1. Most of the recipes are Italian or Mexican.
2. The least data is available for Russian and Brazilian cuisine.
3. The dataset is imbalanced across cuisines.

### Number of ingredients per recipe

In [None]:
# `train` is indexed by recipe id, so `train.ingredients[0]` would look up the
# recipe whose id is 0 (and raise KeyError if there is none) rather than the
# first row. Use positional access to inspect the first recipe instead.
type(train['ingredients'].iloc[0])

#### The datatype of values in the ingredients column is a list.

In [None]:
# Count the ingredients of every recipe once, then report both extremes.
ingredient_counts = train['ingredients'].apply(len)
print('Maximum ingredients used in a single cuisine' , ingredient_counts.max())
print('Minimum ingredients used in a single cuisine' , ingredient_counts.min())

## ML

### Let's define a function to convert a list to a string.

In [None]:
def list_to_text(data: list) -> str:
    """Join the strings in `data` with single spaces and lowercase the result.

    Args:
        data: iterable of strings (here, one recipe's ingredient names).

    Returns:
        One lowercase string; an empty iterable yields "".
    """
    joined = " ".join(data)
    return joined.lower()

##### Let's test it

In [None]:
list_to_text(['a' , 'b'])

OK, it's working.

### Converting the ingredients column from lists to strings.

In [None]:
def _ingredients_to_text(value):
    """Return `value` unchanged if it is already a string, else join it with list_to_text.

    The guard makes this cell safe to re-run: calling list_to_text on an
    already-converted string would iterate over its characters and produce
    "s a l t"-style output.
    """
    return value if isinstance(value, str) else list_to_text(value)

# Replace each recipe's ingredient list with a single lowercase string.
train['ingredients'] = train['ingredients'].apply(_ingredients_to_text)
test['ingredients'] = test['ingredients'].apply(_ingredients_to_text)

In [None]:
train.head()

In [None]:
test.head()

### Working with text features

In [None]:

# TF-IDF text vectorizer with default settings; it is fitted on the training recipes in the next cell.
tfidf = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the training recipes only, then reuse the fitted
# vectorizer to transform the test recipes.
X_train = tfidf.fit_transform(train['ingredients'])
X_test = tfidf.transform(test['ingredients'])

In [None]:
# Encode the cuisine labels with a LabelEncoder; `l` is kept so the
# predictions can be mapped back with inverse_transform later.
# NOTE(review): `label` is overwritten with its encoded form, so re-running this
# cell re-fits the encoder on the already-encoded values instead of the cuisine names.
l = LabelEncoder()
label = l.fit_transform(label)

In [None]:
label

In [None]:
# 3-fold cross-validation of an XGBoost classifier with default settings;
# `scores` holds the mean of the per-fold scores.
clf = XGBClassifier()
fold_scores = cross_val_score(clf, X_train, label, cv=3)
scores = fold_scores.mean()
scores

In [None]:
# Train on the full training set, then predict encoded cuisine ids for the test recipes.
pre = clf.fit(X_train, label).predict(X_test)

In [None]:
pre

### Convert the predictions back to cuisine names using LabelEncoder's inverse_transform

In [None]:
# Map the predicted class ids back to the original labels with the fitted encoder.
# NOTE(review): `pre` is overwritten with the decoded labels, so this cell is
# meant to run once per prediction; re-running it would feed decoded labels back in.
pre = l.inverse_transform(pre)
pre

### Prepare the submission file.

In [None]:
submit = pd.read_csv('../input/sample_submission.csv')
submit.head()

In [None]:
# Fail loudly if the template, the test set and the predictions disagree in
# length, instead of silently writing misaligned or missing rows.
assert len(submit) == len(test) == len(pre), "submission template, test set and predictions differ in length"

# Bracket assignment always targets the column (attribute assignment only works
# when the column already exists). The ids are copied positionally so that
# index alignment between `submit` and `test` cannot introduce NaNs.
submit['cuisine'] = pre
submit['id'] = test['id'].to_numpy()

In [None]:
submit.to_csv('submit.csv' , index= False)

In [None]:
!ls