In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import optuna
import xgboost as xgb
from optuna.samplers import TPESampler
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Directory holding the competition CSV files; the trailing slash lets
# file names be appended to it directly.
input_dir = '../input/tabular-playground-series-jun-2021/'

# Training table; later cells read its 'id' and 'target' columns.
train_df = pd.read_csv(filepath_or_buffer=f'{input_dir}train.csv')

In [None]:
# Preview the first five rows of the training table.
train_df.head(n=5)

In [None]:
# Number of distinct values per column, largest first; show the top 20.
train_df.nunique().sort_values(ascending=False).iloc[:20]

In [None]:
# Feature matrix: every column except the row id and the label, as a NumPy array.
X = train_df.drop(columns=['id', 'target']).to_numpy()

# Labels: the 'target' column mapped to integer codes by a throw-away LabelEncoder.
y = LabelEncoder().fit_transform(train_df['target'])

In [None]:
# Module-level hand-off between `objective` and `callback`:
#   clf             - the classifier built by the most recent trial
#   best_classifier - the classifier of the best trial seen so far
best_classifier = clf = None

def objective(trial, split_seed=42):
    """Train one XGBoost candidate and return its hold-out multiclass log loss.

    The module-level ``X`` / ``y`` arrays are split 90/10 (stratified), the
    features are transformed with ``log(1 + x)`` and a ``RobustScaler`` fit on
    the training part only, and an ``XGBClassifier`` is trained with
    hyper-parameters sampled from ``trial``. The fitted model is published in
    the module-level ``clf`` so that ``callback`` can keep the best one.

    Args:
        trial: Optuna trial used to sample hyper-parameters and to report
            intermediate values for pruning.
        split_seed: Seed for the train/validation split. Keeping it fixed makes
            every trial score on the same hold-out rows, so trial values are
            directly comparable.

    Returns:
        The ``mlogloss`` of the fitted model on the validation split after the
        last boosting round (lower is better).

    Raises:
        optuna.TrialPruned: Raised through the pruning callback when Optuna
            decides to stop an unpromising trial early.
    """
    global clf

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, stratify=y, test_size=0.1, random_state=split_seed)

    # log(1 + x) transform; log1p is the numerically preferred spelling.
    X_train = np.log1p(X_train)
    X_valid = np.log1p(X_valid)

    # Fit the scaler on the training rows only and reuse it for validation.
    # Fitting a second scaler on the validation rows would put the two sets on
    # different scales, so split thresholds learned on one would not line up
    # with the other.
    scaler = RobustScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_valid = scaler.transform(X_valid)

    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 6, 15),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 2000, 12000),
        'lambda': trial.suggest_float('lambda', 1e-3, 1e3),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'min_child_weight': trial.suggest_int('min_child_weight', 300, 2000),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.10, 0.5),
    }

    clf = xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', use_label_encoder=False,
                            num_class=9, gpu_id=0, tree_method='gpu_hist', **param_grid)

    # eval_set index 0 is the training data and index 1 the hold-out split, so
    # pruning must watch 'validation_1', not 'validation_0'.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation_1-mlogloss')

    # NOTE(review): `eval_metric` / `callbacks` are passed to fit() to match the
    # XGBoost API style this notebook already uses; newer XGBoost releases
    # expect them on the constructor instead - confirm against the installed version.
    clf.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_metric='mlogloss', verbose=False,
            callbacks=[pruning_callback])

    # Score the model that was actually trained (and that `callback` may keep
    # as the best classifier) instead of re-training clones on the small
    # validation split via cross-validation.
    return clf.evals_result()['validation_1']['mlogloss'][-1]

def callback(study, trial):
    """Optuna post-trial hook that remembers the classifier of the best trial.

    When the trial that just finished is the study's current best trial, the
    module-level ``clf`` (set by ``objective``) is stored in the module-level
    ``best_classifier``. Otherwise nothing changes.

    Args:
        study: The running Optuna study; only ``study.best_trial`` is read.
        trial: The trial that has just finished.
    """
    global best_classifier

    just_became_best = study.best_trial == trial
    if not just_became_best:
        return

    best_classifier = clf

In [None]:
# Wall-clock budget for the whole search.
train_time = 3 * 60 * 60 # 3 h * 60 m * 60 s

# Seed the sampler so the sequence of suggested hyper-parameters can be
# reproduced; an unseeded TPESampler proposes different candidates on every
# run of the notebook.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))

# `callback` runs after every trial and keeps the classifier of the best one.
study.optimize(objective, timeout=train_time, callbacks=[callback])

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

In [None]:
from optuna import visualization

# One row per trial, as returned by Optuna.
hist = study.trials_dataframe()

# Recap of the winning trial before showing the table preview.
print('Best trial: score {},\nparams {}'.format(study.best_trial.value,
                                                study.best_trial.params))
hist.head()

In [None]:
# Slice plot of the study's hyper-parameters.
optuna.visualization.plot_slice(study=study)

In [None]:
# Optimization-history plot of the study.
optuna.visualization.plot_optimization_history(study=study)

In [None]:
# Per-round metrics recorded while the best classifier was fitted.
evals_result = best_classifier.evals_result()

fig, ax = plt.subplots()

# 'validation_0' / 'validation_1' follow the order of the eval_set passed to
# fit(): training data first, hold-out split second.
for eval_name in ('validation_0', 'validation_1'):
    ax.plot(evals_result[eval_name]['mlogloss'])

ax.set_title('Model Multiclass Log Loss')
ax.set_ylabel('Multiclass Log Loss')
ax.set_xlabel('Estimator')
ax.legend(['Train', 'Validation'], loc='upper right')

plt.show()

In [None]:
# Fail with a clear message if the study cell has not produced a model yet.
if best_classifier is None:
    raise RuntimeError('best_classifier is not set; run the Optuna study cell first.')

test_df = pd.read_csv(f'{input_dir}test.csv')

# Same feature preparation as in training: drop the id, then log(1 + x).
X_test = test_df.drop('id', axis=1).values
X_test = np.log1p(X_test)

# Scale with statistics learned from the training features rather than from
# the test set itself: the model was trained on features scaled by a scaler
# fit on training rows (a 90% split of X), so a scaler fit on the test data
# would put the test features on a different scale from the one the model saw.
X_test = RobustScaler().fit(np.log1p(X)).transform(X_test)

# Class probabilities, one column per class.
test_preds = best_classifier.predict_proba(X_test)

In [None]:
# Use the sample submission as a template so ids and column layout are kept.
sub = pd.read_csv(filepath_or_buffer=f'{input_dir}sample_submission.csv')

# Overwrite every column after the first with the predicted probabilities.
sub.iloc[:, 1:] = test_preds

sub.to_csv(path_or_buf='submission.csv', index=False)