In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm.notebook as tqdm

%matplotlib inline

# Load Data

In [None]:
# Read the competition CSVs; the 'id' column is dropped from both frames right away.
train_csv = "/kaggle/input/tabular-playground-series-may-2021/train.csv"
test_csv = "/kaggle/input/tabular-playground-series-may-2021/test.csv"

df_train = pd.read_csv(train_csv).drop(columns=['id'])
df_test = pd.read_csv(test_csv).drop(columns=['id'])

In [None]:
# Count rows whose features repeat within train, and rows that repeat within test.
train_feature_dups = df_train.drop(columns='target').duplicated(keep=False)
print(train_feature_dups.sum())
print(df_test.duplicated(keep=False).sum())

# With the target included no row is duplicated (the count is 0), so every
# repeated feature row carries conflicting classes -- drop all copies of them.
print(df_train.duplicated(keep=False).sum())
df_train = df_train[~train_feature_dups]
df_train

In [None]:
# Tag every row with the split it came from, stack train on top of test, and
# show the rows whose feature values occur more than once in the combined frame.
_df_train = df_train.assign(split='train')
_df_test = df_test.assign(split='test')
_df = pd.concat([_df_train, _df_test])

_features = _df.drop(columns=['split', 'target'])
_df[_features.duplicated(keep=False)]

In [None]:
# Preview the first five training rows after de-duplication.
df_train.head(n=5)

# EDA

## Train vs Test distributions

In [None]:
# Summary statistics of the training columns, one row per column.
df_train.describe().transpose()

In [None]:
# Summary statistics of the test columns, one row per column.
df_test.describe().transpose()

In [None]:
from scipy.stats import anderson_ksamp, ks_2samp

# Two-sample Kolmogorov-Smirnov test per column: train values vs. test values.
# (anderson_ksamp([df_train[col], df_test[col]]) is an alternative test.)
for col in df_test:
    ks_result = ks_2samp(df_train[col], df_test[col])
    print(f'{col}: {ks_result}')

The KS test implies that the train and test distributions are pretty similar.

## Target counts

In [None]:
# Bar chart of how many training rows fall into each target class.
df_train['target'].value_counts().plot.bar()

## Outliers

In [None]:
# One box per column of df_train, with rotated tick labels so the names stay readable.
boxplot_options = {'figsize': (20, 10), 'rot': 90}
df_train.boxplot(**boxplot_options)
plt.show()

### Variation of features split by target

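Keep the class imbalance in mind when reading these plots: by default seaborn scales each class's density by its number of observations, so rarer classes appear smaller.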

In [None]:
# One KDE panel per feature, coloured by target class.
# The 17 x 3 grid gives 51 panels; any panel without a feature is blanked below.
fig, axes = plt.subplots(17, 3, figsize=(18, 54))
flat_axes = axes.flatten()

feature_cols = df_train.columns.drop('target')
target_order = sorted(df_train.target.unique())

for col, ax in tqdm.tqdm(zip(feature_cols, flat_axes), total=len(feature_cols)):
    sns.kdeplot(x=col, hue='target', hue_order=target_order, data=df_train, fill=True, legend=True, ax=ax)

    # nunique() gives the same count as len(value_counts()) without building the table.
    ax.set_title(f'{col}, Unique Values: {df_train[col].nunique()}', loc='right', fontsize=12)
    # Hides the axis lines, ticks and axis labels in one go; the title stays visible.
    ax.axis('off')

# Blank the grid cells that did not receive a feature.
for ax in flat_axes[len(feature_cols):]:
    ax.axis('off')

fig.tight_layout()
plt.show()

## Correlations

In [None]:
# Pairwise correlations between the numeric columns only.
# 'target' holds class-name strings, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 (older versions silently dropped them), so select the
# numeric columns explicitly to get the same matrix on every version.
_ = plt.figure(figsize=(10, 10))
corr = df_train.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

# Feature selection

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Boolean mask of the features whose variance exceeds 0.1.
vt = VarianceThreshold(threshold=0.1)
vt.fit(df_train.drop(columns='target'))
vt.get_support()

# XGBoost

In [None]:
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import classification_report as cr, confusion_matrix as cm
from sklearn.metrics import log_loss
from sklearn.utils import compute_sample_weight
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
def _report_split(title, features, labels):
    """Print the confusion matrix, classification report and log-loss of the
    notebook-level `model` on one split, and return its class probabilities."""
    print(title)
    proba = model.predict_proba(features)
    hard_pred = proba.argmax(axis=1)
    print(cm(y_true=labels, y_pred=hard_pred))
    print(cr(y_true=labels, y_pred=hard_pred))
    print(log_loss(y_pred=proba, y_true=labels, labels=np.unique(labels)))
    return proba


# Features and integer-encoded target.
x = df_train.drop(columns='target')
le = preprocessing.LabelEncoder()
le.fit(df_train.target)
y = le.transform(df_train.target)

# Stratified hold-out split (default test size), fixed seed.
x_train, x_val, y_train, y_val = train_test_split(x, y, stratify=y, shuffle=True, random_state=0)

# Class-balanced sample weights (compute_sample_weight('balanced', y_train)) are not used here.
model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=0)
model.fit(x_train, y_train)

y_pred = _report_split("Train:", x_train, y_train)
y_pred = _report_split("Val:", x_val, y_val)  # y_pred ends up holding the validation probabilities

fig_importance, ax = plt.subplots(figsize=(18, 18))
plot_importance(model, ax=ax)
ax.set_title('Feature Importance')
plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel

# Retrain on progressively smaller feature subsets: each threshold is one of the
# sorted importances of the baseline `model`, so a higher index keeps fewer features.
# The hard-coded positions assume at least 50 features.
thresholds = np.sort(model.feature_importances_)
for thresh in tqdm.tqdm(thresholds[[5, 10, 15, 20, 25, 30, 35, 40, 49]]):
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    sel_x_train = selection.transform(x_train)
    sel_x_val = selection.transform(x_val)

    # Fit exactly once (the model used to be fitted twice per threshold).
    xgb = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=0).fit(sel_x_train, y_train)

    # Use a loop-local name so the notebook-level `y_pred` (validation
    # probabilities of the baseline model) is not overwritten by this sweep.
    sel_y_pred = xgb.predict_proba(sel_x_val)
    ll = log_loss(y_pred=sel_y_pred, y_true=y_val, labels=np.unique(y_val))
    print(f"Thresh={thresh}, n={sel_x_train.shape}, Log-Loss: {ll}")

In [None]:
# Grid-search the lower/upper clipping bounds for the predicted probabilities
# that minimise the validation log-loss.
N = 100
a_min_grid = np.linspace(0.0, 0.5, num=N)
a_max_grid = np.linspace(0.5, 1.0, num=N)

# Score the baseline model's own validation predictions rather than whatever
# `y_pred` was last assigned by an earlier cell.
val_pred = model.predict_proba(x_val)
val_labels = np.unique(y_val)


def _clipped_log_loss(proba, lo, hi):
    """Validation log-loss of `proba` after clipping to [lo, hi] and re-normalising rows."""
    clipped = np.clip(proba, lo, hi)
    # Re-normalise explicitly: whether log_loss rescales rows itself depends on the
    # scikit-learn version, and unnormalised rows would make a larger lower bound
    # always look better.
    clipped = clipped / clipped.sum(axis=1, keepdims=True)
    return log_loss(y_pred=clipped, y_true=y_val, labels=val_labels)


# Rows of z follow a_max_grid, columns follow a_min_grid.
z = np.array([[_clipped_log_loss(val_pred, lo, hi) for lo in a_min_grid] for hi in a_max_grid])

print(z.min())
# unravel_index returns (row, column) = (a_max index, a_min index).
max_idx, min_idx = np.unravel_index(z.argmin(), z.shape)
a_min, a_max = a_min_grid[min_idx], a_max_grid[max_idx]
print(a_min, a_max)

# The a_max cutoff is pretty low here.
a_max = 0.9

In [None]:
# Refit on every training row (train + validation split) for the final predictions.
# No sample weights are passed.
model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=0)
model.fit(x, y)

In [None]:
# Reload the test set, this time keeping 'id' for the submission file.
# NOTE: this rebinds df_test to a frame that still contains the 'id' column.
df_test = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/test.csv")
y_pred = model.predict_proba(df_test.drop(columns='id'))
# Clipping with the searched bounds is intentionally left disabled:
# y_pred = np.clip(y_pred, a_min, a_max)

# Probability columns are labelled with the original class names from the label encoder.
submission = pd.DataFrame(y_pred, columns=le.classes_)
submission = submission[['Class_1','Class_2','Class_3','Class_4']]

# Attach ids positionally so the result does not depend on index alignment.
submission['id'] = df_test['id'].to_numpy()

# Validate before writing, so a malformed file is never left on disk.
assert len(submission) == len(df_test) == 50000
assert submission.notna().all().all()
submission.to_csv('./submission.csv', index=False)