In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm.notebook as tqdm

%matplotlib inline

# Load Data

In [None]:
# Read the competition CSVs. The 'id' column is dropped right after loading so
# that it is not carried into the feature matrix built from these frames.
train_csv = "/kaggle/input/tabular-playground-series-may-2021/train.csv"
test_csv = "/kaggle/input/tabular-playground-series-may-2021/test.csv"

df_train = pd.read_csv(train_csv).drop(columns=['id'])
df_test = pd.read_csv(test_csv).drop(columns=['id'])

# XGBoost

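I'm just going to assume that the test data distribution is balanced, so the training and validation splits are each oversampled with SMOTE to equalize the class counts.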

In [None]:
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import classification_report as cr, confusion_matrix as cm
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing

In [None]:
# Separate the feature columns from the target and integer-encode the class
# labels; `le` is kept so the encoded classes can be mapped back to their names.
x = df_train.drop(columns='target')
le = preprocessing.LabelEncoder().fit(df_train.target)
y = le.transform(df_train.target)

# Stratified, shuffled hold-out split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(x, y, stratify=y, shuffle=True, random_state=0)

# Oversample each split on its own, after splitting, so that synthetic rows in
# one split are never interpolated from rows of the other.
# SMOTE is seeded like the split and the model: without a seed the synthetic
# rows (and every metric derived from them) change on each run.
# NOTE(review): the validation split is SMOTE-augmented too, so validation
# metrics are measured partly on synthetic rows — confirm this is intended.
x_train, y_train = SMOTE(random_state=0).fit_resample(x_train, y_train)
x_val, y_val = SMOTE(random_state=0).fit_resample(x_val, y_val)

# Fit the classifier on the resampled training split.
model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=0).fit(x_train, y_train)

def _print_split_metrics(split_name, clf, features, labels):
    """Print the evaluation summary for one data split.

    Prints, in order: the split name, the confusion matrix, the
    classification report (both based on the arg-max class of the predicted
    probabilities) and the log loss of the predicted probabilities.

    Returns the array produced by ``clf.predict_proba(features)``.
    """
    print(split_name)
    proba = clf.predict_proba(features)
    hard_labels = proba.argmax(axis=1)
    print(cm(y_true=labels, y_pred=hard_labels))
    print(cr(y_true=labels, y_pred=hard_labels))
    print(log_loss(y_pred=proba, y_true=labels, labels=np.unique(labels)))
    return proba


_print_split_metrics("Train:", model, x_train, y_train)

# `y_pred` is left holding the validation-split probabilities, which the
# clip-bound search further down reads.
y_pred = _print_split_metrics("Val:", model, x_val, y_val)

# Draw the fitted model's feature-importance chart on a large square canvas.
_fig, ax = plt.subplots(figsize=(18, 18))
plot_importance(model, ax=ax)
# Set the title after plotting so that this is the title that ends up shown.
ax.set_title('Feature Importance')
plt.show()

In [None]:
# Grid-search a pair of clipping bounds for the predicted probabilities that
# minimises log loss. `y_pred` is whatever the preceding evaluation cell left
# behind (the validation-split probabilities) and is scored against `y_val`.
N = 100
lower_grid = np.linspace(0.0, 0.5, num=N)  # candidate lower clip bounds
upper_grid = np.linspace(0.5, 1.0, num=N)  # candidate upper clip bounds
val_labels = np.unique(y_val)              # loop-invariant, computed once

# z[r, c] is the loss with upper bound upper_grid[r] and lower bound
# lower_grid[c]: rows follow the OUTER loop (upper), columns the INNER (lower).
z = np.array([
    [
        log_loss(y_pred=np.clip(y_pred, a_min=lo, a_max=hi), y_true=y_val, labels=val_labels)
        for lo in lower_grid
    ]
    for hi in upper_grid
])

print(z.min())
# unravel_index returns (row, column) == (upper index, lower index). The
# previous code applied them the other way round, so the reported bounds did
# not correspond to the reported minimum loss.
upper_idx, lower_idx = np.unravel_index(z.argmin(), z.shape)
a_min, a_max = lower_grid[lower_idx], upper_grid[upper_idx]
print(a_min, a_max)

# The earlier note here said the a_max cutoff looked "pretty low"; that was
# observed with the swapped indices, so re-check the value printed above.
# The symmetric upper bound is kept as before.
a_max = 1 - a_min

In [None]:
# Retrain on all of the data.
# The resampled data goes into new names instead of overwriting `x` / `y`, so
# re-running this cell always starts from the original frame, and SMOTE is
# seeded (like the classifier) so the final model is reproducible.
x_all, y_all = SMOTE(random_state=0).fit_resample(x, y)
model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=0).fit(x_all, y_all)

In [None]:
# Re-read the test set keeping its 'id' column, which the submission needs;
# the ids are dropped again only for the call to the model.
df_test = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/test.csv")
y_pred = model.predict_proba(df_test.drop(columns='id'))
# Clipping with the bounds from the search cell is currently disabled:
# y_pred = np.clip(y_pred, a_min, a_max)

# Label the probability columns with the encoder's class names, put them in
# the fixed Class_1..Class_4 order, then append the ids. `.assign` builds a new
# frame instead of adding a column to a column-sliced one.
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
submission = (
    pd.DataFrame(y_pred, columns=le.classes_)[class_columns]
    .assign(id=df_test['id'])
)

# Validate BEFORE writing, so a wrong-sized frame never reaches disk.
assert len(submission) == 50000, f"expected 50000 rows, got {len(submission)}"
submission.to_csv('./submission.csv', index=False)