This approach uses XGBClassifier and GridSearchCV to find a tuned model. GPU use is recommended.

In [None]:
#bring in data
import numpy as np
from xgboost import XGBClassifier

import pandas as pd
#read the competition train and test tables from the Kaggle input folder
raw_training, raw_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-jun-2021/train.csv",
        "/kaggle/input/tabular-playground-series-jun-2021/test.csv",
    )
)

In [None]:
#preview the first rows of the training table
training_preview = raw_training.head()
display(training_preview)

In [None]:
#features are the columns at positions 1 through 75 of the training table
feature_positions = slice(1, 76)
X = raw_training.iloc[:, feature_positions]

#show the first feature rows
display(X.head())

In [None]:
#the response is the column at position 76 of the training table
response_position = 76
y = raw_training.iloc[:, response_position]

#show the first responses
display(y.head())

The snippet below is the grid search I used to find the tuned model.

tree_method='gpu_hist' runs the training on the GPU that Kaggle provides. It is needed because the grid search trains and validates the model more than 200 times (108 parameter combinations × 2 folds).

cv = 2 (two-fold cross-validation) is used to keep the search affordable, since the data is very large.

from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

xgbc = XGBClassifier(eval_metric = "logloss", objective = "multi:softmax", num_class = 9, tree_method = "gpu_hist")

xgb_pars = {
    "max_depth" : [9, 12],
    "min_child_weight" : [100, 1000],
    "subsample" : [.25, .5, .75],
    "colsample_bytree" : [.25, .5, .75],
    "eta" : [.1, .05, .01],
}

xgbc_gs = GridSearchCV(xgbc, param_grid = xgb_pars, cv = 2, scoring = "neg_log_loss")

xgbc_gs.fit(np.ascontiguousarray(X), np.ascontiguousarray(y))

display(xgbc_gs.best_estimator_)

display(xgbc_gs.best_score_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.25, eta=0.1,
              eval_metric='logloss', gamma=0, gpu_id=0, importance_type='gain',
              interaction_constraints='', learning_rate=0.100000001,
              max_delta_step=0, max_depth=9, min_child_weight=100, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=2,
              num_class=9, num_parallel_tree=1, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.75, tree_method='gpu_hist', validate_parameters=1,
              verbosity=None)

-1.7486742511149402

In [None]:
#best model found by the grid search, rebuilt from the pasted estimator repr
model = XGBClassifier(
    #hyperparameters that were tuned by the grid search
    max_depth=9,
    min_child_weight=100,
    subsample=0.75,
    colsample_bytree=0.25,
    #the pasted repr gave the step size twice (eta=0.1 and
    #learning_rate=0.100000001); eta is an alias of learning_rate, so it is
    #set once here to avoid two competing spellings of the same setting
    learning_rate=0.1,
    #nine-class probability output; 'mlogloss' is the multiclass log loss,
    #whereas the previously pasted 'logloss' is the binary metric
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    #train on the GPU
    tree_method='gpu_hist',
    gpu_id=0,
    #fixed seed so refits are repeatable
    random_state=0,
    #remaining values are copied unchanged from the pasted estimator repr
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    gamma=0,
    importance_type='gain',
    interaction_constraints='',
    max_delta_step=0,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=2,
    num_parallel_tree=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=None,
    validate_parameters=1,
    verbosity=None,
)

In [None]:
#fit best model on contiguous arrays built from the features and responses
train_features = np.ascontiguousarray(X)
train_responses = np.ascontiguousarray(y)
model.fit(train_features, train_responses)

In [None]:
#grab test features: the columns at positions 1 through 75 of the test table
test_feature_positions = slice(1, 76)
X_test = raw_test.iloc[:, test_feature_positions]

In [None]:
#predict probabilities on test data and wrap them in a DataFrame
test_pred = pd.DataFrame(
    model.predict_proba(np.ascontiguousarray(X_test))
)

In [None]:
#prepare output and save
#first column of the test table, kept as a one-column DataFrame
id_frame = raw_test.iloc[:, [0]]

#header row of the submission file: the id followed by one column per class
submission_columns = [
    "id",
    "Class_1", "Class_2", "Class_3",
    "Class_4", "Class_5", "Class_6",
    "Class_7", "Class_8", "Class_9",
]

#line the predicted probabilities up with the ids by row index
output = id_frame.merge(test_pred, left_index = True, right_index = True)
output.columns = submission_columns

output.to_csv('submission.csv', index=False)