# Load files

In [None]:
import json

# Paths to the training and test JSON inputs.
filename_train = '../input/whats-cooking-kernels-only/train.json'
filename_test  = '../input/whats-cooking-kernels-only/test.json'


def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    # Decode explicitly as UTF-8 (the standard JSON encoding) instead of
    # relying on the platform's locale default, which can mis-decode or
    # reject non-ASCII text on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


dict_train = _load_json(filename_train)
dict_test = _load_json(filename_test)

In [None]:
def _recipe_text(recipe):
    """Return a recipe's ingredients as one lower-cased, space-joined string."""
    joined = ' '.join(recipe['ingredients'])
    return joined.lower()


# One text document per recipe, in the same order as the loaded records.
ingredients_train = list(map(_recipe_text, dict_train))
ingredients_test = list(map(_recipe_text, dict_test))

# Vectorize ingredients

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the ingredient strings; the vocabulary and IDF
# weights are learned from the training text here.
vectorizer = TfidfVectorizer()

# Keep the SciPy sparse matrix returned by fit_transform rather than
# calling .toarray(): a dense copy stores every zero explicitly, while
# LightGBM's Dataset accepts the sparse matrix directly.
X_train = vectorizer.fit_transform(ingredients_train)
X_train

In [None]:
# Transform the test text with the already-fitted vectorizer so the test
# features use the same vocabulary and weights as the training features.
# The result is left as a SciPy sparse matrix (no .toarray()) to avoid
# materialising a dense array; LightGBM's predict accepts sparse input.
X_test = vectorizer.transform(ingredients_test)
X_test

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Collect the target cuisine of every training recipe.
cuisines = [recipe['cuisine'] for recipe in dict_train]

# Learn the cuisine -> integer mapping, then encode the training targets.
# The fitted encoder is kept so predictions can be mapped back to names.
label_encoder = LabelEncoder().fit(cuisines)
y_train = label_encoder.transform(cuisines)
y_train

# Modeling

In [None]:
import numpy as np
import lightgbm as lgb

# Wrap the feature matrix and encoded labels for LightGBM.
lgb_train = lgb.Dataset(X_train, y_train)

params = {
    'objective': 'multiclass',
    # One class per distinct encoded label.
    'num_class': len(np.unique(y_train)),
    'metric': 'multi_logloss',
}

# 5-fold cross-validation with an upper bound of 10000 boosting rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of lgb.cv was removed in LightGBM 4.0
# (passing it raises TypeError there), whereas the `callbacks` argument
# with lgb.early_stopping is accepted by both 3.x and 4.x releases.
# verbose=False keeps the callback from printing progress messages.
# return_cvbooster=True adds the per-fold boosters to the result dict
# under the 'cvbooster' key.
cv_rslt = lgb.cv(params,
                 lgb_train,
                 nfold=5,
                 num_boost_round=10000,
                 callbacks=[lgb.early_stopping(stopping_rounds=20,
                                               verbose=False)],
                 return_cvbooster=True)

# Prediction

In [None]:
import pandas as pd

# Boosters trained during cross-validation, one prediction array per fold.
cvbooster = cv_rslt['cvbooster']
y_preds = cvbooster.predict(X_test, num_iteration=cvbooster.best_iteration)

# Average the fold predictions (axis 0), then pick the highest-scoring
# class index along the last axis for each test row.
mean_scores = np.mean(y_preds, axis=0)
best_class = np.argmax(mean_scores, axis=-1)

# Map the integer class indices back to cuisine names.
y_pred = pd.Series(label_encoder.inverse_transform(best_class), name='cuisine')
y_pred

# Save Output

In [None]:
# Recipe ids of the test records, in the same order as the loaded data.
test_ids = [recipe['id'] for recipe in dict_test]
indices = pd.Series(test_ids, name='id')
indices

In [None]:
# Place the id and predicted-cuisine Series side by side as the two
# columns of the submission table.
submission_columns = [indices, y_pred]
answer = pd.concat(submission_columns, axis='columns')
answer

In [None]:
# Write the submission table to CSV; index=False omits the row index so
# only the table's own columns are written.
filename_output = './submission.csv'
answer.to_csv(path_or_buf=filename_output, index=False)