In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import plotly.express as px
!pip install bhtsne
from bhtsne import tsne
!pip install umap-learn
import umap
import scipy.sparse
from lightgbm import plot_importance

In [None]:
DATA_PATH = '/kaggle/input/tabular-playground-series-may-2021/'


def _class_label_to_index(label):
    """Turn a label such as "Class_1" into its zero-based integer index (0)."""
    return int(label.split('_')[1]) - 1


sample = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
train = pd.read_csv(f'{DATA_PATH}train.csv')

# "id" is excluded from the features as well, because LightGBM otherwise
# picked it up as the most important feature.
# https://www.kaggle.com/mstkmyhr/2021-05-15-tps-baseline-submission-by-lightgbm/edit/run/62983993
train_x = train.drop(columns=['id', 'target'])
# Integer-encoded target: "Class_1" -> 0, "Class_2" -> 1, ...
train_y = train['target'].map(_class_label_to_index)

test_x = pd.read_csv(f'{DATA_PATH}test.csv').drop(columns=['id'])

In [None]:
# Summary statistics with one row per column of `train` (id excluded).
# The mean is rendered as an in-cell bar; std and median get colour gradients.
feature_summary = train.drop(columns=['id']).describe().T
(
    feature_summary.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# t-SNE was tried first but takes far too long on this dataset:
# Y = tsne(train_x.astype(np.float64))
# plt.scatter(Y[:, 0], Y[:, 1], c=train_y)
# plt.show()

# Use UMAP for the 2-D projection instead.
# UMAP is stochastic, so random_state is pinned (same seed as KFold and
# LightGBM below). Without it the umap_x / umap_y features, and therefore the
# CV score and the submission, change on every run. Note that a fixed seed
# makes UMAP run with less parallelism, so fitting is somewhat slower.
reducer = umap.UMAP(
    n_components=2,
    random_state=71,
)

In [None]:
%%time
# Fit the UMAP reducer on the training features and keep the resulting
# low-dimensional coordinates (n_components=2 was requested when the reducer
# was built). The cell is timed because this is one of the slow steps.
embedding = reducer.fit_transform(train_x)
# Display the embedding's shape as the cell output.
embedding.shape

In [None]:
# Scatter of the 2-D UMAP coordinates, coloured by the integer class label.
fig, ax = plt.subplots()
points = ax.scatter(embedding[:, 0], embedding[:, 1], c=train_y, s=.5)
ax.set_aspect('equal', 'datalim')

# Discrete colour bar: one band per class (0..3) with a tick centred on each.
band_edges = np.arange(5) - 0.5
colorbar = fig.colorbar(points, ax=ax, boundaries=band_edges)
colorbar.set_ticks(np.arange(4))

ax.set_title('UMAP projection of the dataset', fontsize=16)

- あまり良い特徴量が得られたような気はしないが、特徴量に追加してみる。

In [None]:
reducer = umap.UMAP(
    n_components=2
)

In [None]:
%%time
embedding_test = reducer.fit_transform(test_x)
embedding_test.shape

In [None]:
# Append the two UMAP coordinates as extra feature columns on both frames.
train_x = train_x.assign(umap_x=embedding[:, 0], umap_y=embedding[:, 1])
test_x = test_x.assign(umap_x=embedding_test[:, 0], umap_y=embedding_test[:, 1])

In [None]:
# LightGBM settings shared by cross-validation and the final fit.
params = {
    'objective': 'multiclassova',
    'verbose': -1,
    'seed': 71,
    'metrics': 'multi_logloss',
    'num_class': 4
}
num_round = 100

# 4-fold cross-validation; `scores` collects the validation log loss per fold.
scores = []
# Passing the full label set keeps log_loss well-defined even if a validation
# fold happens to miss one of the classes.
class_labels = np.arange(params['num_class'])
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    lgb_train = lgb.Dataset(tr_x, tr_y)
    lgb_eval = lgb.Dataset(va_x, va_y)
    model = lgb.train(params, lgb_train, num_boost_round=num_round, valid_sets=[lgb_train, lgb_eval])

    # 'multiclassova' trains one binary model per class, so each row holds
    # independent sigmoid scores that do not sum to 1. Normalise explicitly
    # instead of relying on log_loss doing it (that behaviour differs between
    # scikit-learn versions, and newer ones warn about non-normalised input).
    va_pred = model.predict(va_x)
    va_pred = va_pred / va_pred.sum(axis=1, keepdims=True)
    score = log_loss(va_y, va_pred, labels=class_labels)
    scores.append(score)

In [None]:
# Report the mean validation log loss over all folds.
mean_logloss = np.mean(scores)
print(f'logloss: {mean_logloss:.4f}')

In [None]:
# Refit on the full training set with the same settings used during CV.
lgb_train = lgb.Dataset(train_x, train_y)
model = lgb.train(params, lgb_train, num_boost_round=num_round)
pred = model.predict(test_x)
# 'multiclassova' returns independent per-class sigmoid scores; rescale each
# row so the submitted values form a proper probability distribution.
pred = pred / pred.sum(axis=1, keepdims=True)

class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
df_pred = pd.DataFrame(pred, columns=class_columns)
# Only the id column is needed here, so avoid parsing the whole file again.
df_pred['id'] = pd.read_csv(DATA_PATH + 'test.csv', usecols=['id'])['id']
submission = df_pred[['id'] + class_columns]
submission.to_csv('submission.csv', index=False)

In [None]:
# Feature importance of the final model, measured by split count.
lgb.plot_importance(model, importance_type='split', figsize=(8, 16))

In [None]:
# Feature importance of the final model, measured by total split gain.
lgb.plot_importance(model, importance_type='gain', figsize=(8, 16))