In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import lightgbm as lgb

import os
# List every file available under the Kaggle input directory, one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Location of the competition files on Kaggle.
DATA_PATH = '/kaggle/input/tabular-playground-series-may-2021/'


def _class_index(label):
    """Turn a label such as "Class_1" into its zero-based class index (0)."""
    return int(label.split('_')[1]) - 1


# Read the three CSVs provided by the competition.
sample = pd.read_csv(DATA_PATH + 'sample_submission.csv')
train = pd.read_csv(DATA_PATH + 'train.csv')
test_x = pd.read_csv(DATA_PATH + 'test.csv')

# Features are every training column except the target; labels become integers.
# NOTE(review): only 'target' is removed here, and the first column of test_x is
# later written out as `id`, so an identifier column presumably stays in the
# feature set — confirm that this is intended.
train_x = train.drop(columns=['target'])
train_y = train['target'].map(_class_index)

# Quick visual sanity check of what was loaded.
display(train.shape, train.head(), train_x.head(), train_y.head(), test_x.head())

In [None]:
# LightGBM settings; the cross-validation below uses them together with num_round.
params = {
    'objective': 'multiclassova',
    'verbose': 0,
    'seed': 71,
    'metrics': 'multi_logloss',
    'num_class': 4
}
num_round = 100

# 4-fold cross-validation: fit on three folds, score log loss on the held-out fold.
scores = []
# Fixed label set so log_loss sees the same classes in every fold.
class_labels = np.arange(params['num_class'])
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    lgb_train = lgb.Dataset(tr_x, tr_y)
    lgb_eval = lgb.Dataset(va_x, va_y)
    model = lgb.train(params, lgb_train, num_boost_round=num_round, valid_sets=[lgb_train, lgb_eval])

    # Test-set prediction is intentionally not done per fold: its result was
    # never used, so it only added run time.
    va_pred = model.predict(va_x)
    # The one-vs-all objective scores each class separately, so a row of
    # predictions need not sum to 1. Rescale rows to proper probabilities
    # before scoring instead of relying on log_loss to do it (recent
    # scikit-learn releases warn about unnormalised input).
    va_pred = va_pred / va_pred.sum(axis=1, keepdims=True)
    score = log_loss(va_y, va_pred, labels=class_labels)
    scores.append(score)

In [None]:
# Report the mean validation log loss across all folds.
cv_logloss = np.mean(scores)
print(f'logloss: {cv_logloss:.4f}')

In [None]:
# Refit on the full training set with the same settings used for cross-validation.
lgb_train = lgb.Dataset(train_x, label=train_y)
model = lgb.train(params, lgb_train, num_boost_round=num_round)
pred = model.predict(test_x)

# One probability column per class, in class-index order.
class_cols = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
df_pred = pd.DataFrame(pred, columns=class_cols)
# The first column of the test frame is used as the row identifier.
df_pred['id'] = test_x.iloc[:, 0]

# Write the submission file with the id column first, then the class columns.
submission = df_pred[['id'] + class_cols]
submission.to_csv('submission.csv', index=False)

In [None]:
# Tabulate the fitted model's per-feature importance, labelled by training column,
# and render it with an in-cell bar for each value.
importance = pd.Series(
    model.feature_importance(),
    index=train_x.columns,
    name='importance',
).to_frame()
importance.style.bar(subset=['importance'], color='#d65f5f')