In [None]:
from pathlib import Path
import pickle

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss

import optuna

# Importing Data

In [None]:
# Seed shared by the train/validation split, the Optuna sampler and the random forest.
SEED = 4651

# Directory holding the competition CSV files.
INPUT_DIR = Path("../input/tabular-playground-series-jun-2021")

train_fn = INPUT_DIR/"train.csv"
test_fn = INPUT_DIR/"test.csv"

# Name of the label column in the training data.
target = "target"

# Feature column names: feature_0 ... feature_74.
Xnames = [f"feature_{i}" for i in range(75)]

In [None]:
# Read both CSVs, using the `id` column as the row index.
train, test = (pd.read_csv(csv_path, index_col='id') for csv_path in (train_fn, test_fn))
nrows = len(train)

# Raw feature columns and string labels, used by the exploration cells.
Xtrain = train[Xnames]
ytrain = train[target]

# Distinct label strings in sorted order.
targets = sorted(ytrain.unique())

# Integer labels: the number after the last underscore, shifted to start at 0.
train_target = train[target].map(lambda label: int(label.rsplit('_', 1)[-1]) - 1)

In [None]:
# Working copies of the raw frames. The training copy gains one 0/1 indicator
# column per class label (1 where the row carries that label), which the
# correlation plots use to relate features to classes.
train_te = train.copy()
for t in targets:
    # Vectorised comparison instead of a per-row Python lambda.
    train_te[t] = (train[target] == t).astype(int)
test_te = test.copy()

# Exploring

In [None]:
# Count how many feature columns there are of each dtype.
Xtrain.dtypes.value_counts()

All features are made up of integers

In [None]:
# Share of training rows in each class, ordered by class label.
sns.scatterplot(data=ytrain.value_counts().sort_index()/nrows)
plt.ylabel('Occurrence of Class')

There is a lot of class imbalance, with half of the instances in the training set belonging to Classes 6 and 8, and only a few percent belonging to Classes 4 and 5.

In [None]:
# Pairwise correlations between the features and the per-class indicator
# columns. The raw label column holds strings, so it is dropped first:
# pandas >= 2.0 raises on non-numeric columns instead of silently skipping them.
corr = train_te.drop(columns=[target]).corr()
sns.heatmap(data=corr, vmin=-1, vmax=1, cmap='bwr')

The correlations between the features are all fairly small (and, interestingly, positive). There is a bit of correlation between the individual features.

In [None]:
# Keep only feature-vs-class correlations: select the class indicator columns,
# drop the class rows, and transpose so each class is one row of the heatmap.
sns.heatmap(data=corr[targets].drop(targets).T, vmin=-1, vmax=1, cmap='bwr')

There isn't much correlation between the features and classes directly, but interestingly the ones that do show some correlation seem to have roughly the same sign and magnitude of correlation with most of the features.

In [None]:
# Per feature: row count of its most common value, row count of its rarest
# value, and its number of distinct values.
max_counts = Xtrain.apply(lambda x: x.value_counts().max())
min_counts = Xtrain.apply(lambda x: x.value_counts().min())
num_unique = Xtrain.nunique()

In [None]:
# Three panels summarising how concentrated each feature's values are.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(7, 7))

# Fraction of rows taken by each feature's most frequent value.
sns.scatterplot(data=max_counts/nrows, ax=ax1)
ax1.set_xticks([])
ax1.set_ylabel("Fraction of total in observation")
ax1.set_title("Most frequent value")

# Fraction of rows taken by each feature's least frequent value.
sns.scatterplot(data=min_counts/nrows, ax=ax2)
ax2.set_xticks([])
ax2.set_title("Least frequent value")

# Number of distinct values per feature.
sns.scatterplot(data=num_unique, ax=ax3)
ax3.set_xticks([])
ax3.set_ylabel("Unique values per feature")

# Only three panels are used; hide the fourth so it does not render as an
# empty frame.
ax4.set_axis_off()
print(f"There are {nrows} total observations")

We can see that there are far fewer unique values per feature than there are total observations, and the most frequent value of each feature accounts for a sizeable fraction of the total number of observations. Because of this, it seems reasonable to assume all of these features are categorical, since we have no other information about them. This suggests that any models we apply would do best if we treat the features accordingly.

# Target Encoding

One way of dealing with categorical variables which have a fairly high cardinality is to target encode them; there's a good discussion of how this works [written by Max Halford](https://maxhalford.github.io/blog/target-encoding/). That post only covers a binary target, but our target can assume nine unique values. We will therefore need to modify the method a bit.

What seems like it would be most natural would be to take each feature and generate nine new features (the cardinality of our target), one applying the target encoding framework to each possible value that our target can assume.

In [None]:
from collections import defaultdict

# Slightly modified from https://maxhalford.github.io/blog/target-encoding/
def calc_smooth_mean(df, by, on, m):
    """Build a smoothed target-encoding lookup for one column.

    For every distinct value of ``df[by]`` the mean of ``df[on]`` within that
    group is blended with the overall mean of ``df[on]``, with the overall
    mean weighted as if it were ``m`` extra observations.

    Parameters:
        df: frame containing both columns.
        by: name of the column whose values are being encoded.
        on: name of the column being averaged.
        m: weight given to the overall mean.

    Returns:
        A ``defaultdict`` mapping each value of ``df[by]`` to its smoothed
        mean; any other key yields the overall mean.
    """
    prior = df[on].mean()

    # Group size and group mean for every distinct value of `by`.
    group_stats = df.groupby(by)[on].agg(['count', 'mean'])
    n = group_stats['count']

    # Weighted blend of each group mean with the overall mean.
    blended = (n * group_stats['mean'] + m * prior) / (n + m)

    # Values absent from `df[by]` fall back to the overall mean.
    return defaultdict(lambda: prior, blended)

In [None]:
# Start again from fresh copies of the raw frames so the encoding loop below
# works on unmodified data. The training copy gains one 0/1 indicator column
# per class label, which is what the encoder averages.
train_te = train.copy()
for t in targets:
    # Vectorised comparison instead of a per-row Python lambda.
    train_te[t] = (train[target] == t).astype(int)
test_te = test.copy()

In [None]:
%%time
# Target-encode every (feature, class) pair: each feature value is replaced by
# the smoothed share of training rows with that value that belong to the class.
# NOTE(review): the encodings are fitted on all of train_te, before the
# train/validation split made later in the notebook, so validation rows
# contribute to their own encodings and the validation loss is likely optimistic.
#
# New columns are collected in dicts and attached with a single concat;
# inserting hundreds of columns one at a time fragments the frame and makes
# pandas emit a PerformanceWarning.
train_encoded, test_encoded = {}, {}
for fidx, f in enumerate(Xnames):
    for tidx, t in enumerate(targets):
        nname = f"feature_te{fidx}-{tidx+1}"
        # m=100: the global class rate counts as 100 pseudo-observations.
        smooth = calc_smooth_mean(train_te, f, t, 100)
        train_encoded[nname] = train_te[f].map(smooth)
        # Values unseen in training map to the global class rate.
        test_encoded[nname] = test_te[f].map(smooth)

# Drop the raw features (and, for train, the labels and class indicators) and
# append the encoded columns in the order they were created.
train_te = pd.concat(
    [train_te.drop(Xnames+targets+[target], axis=1), pd.DataFrame(train_encoded)],
    axis=1)
test_te = pd.concat(
    [test_te.drop(Xnames, axis=1), pd.DataFrame(test_encoded)],
    axis=1)

In [None]:
# (rows, columns) of the encoded training frame.
train_te.shape

We now have 675 target encoded features, one for each combination of target value and feature.

# Preprocessing

We will now preprocess the data. After target encoding, we have 675 features, which would be a burden to work with. We will use PCA to thin down the number of total features we end up using, and hopefully help avoid overfitting on the way.

We will also split up the full training set into train and validation subsets.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Hold out 20% of the encoded rows for validation, paired with the 0-based
# integer labels in train_target. Note: this rebinds Xtrain and ytrain, which
# the earlier exploration cells used for the raw features and string labels.
Xtrain, Xvalid, ytrain, yvalid = train_test_split(train_te, train_target, test_size=0.2, random_state=SEED)

In [None]:
# Standardise the encoded features. The scaler is fitted on the training
# split only and then applied unchanged to the validation and test data.
scaler_te = StandardScaler()
Xtrain_r = scaler_te.fit_transform(Xtrain)
Xvalid_r = scaler_te.transform(Xvalid)
Xtest_r = scaler_te.transform(test_te)

In [None]:
# Exploratory PCA with no component limit, fitted on the scaled training
# split; used only to inspect the explained-variance spectrum.
pca_test = PCA()
pca_test.fit(Xtrain_r)

In [None]:
# Number of principal components to keep.
feature_cutoff = 100

# The x-axis counts components from 1, so position k means "the first k
# components". The cutoff lines then mark exactly the variance captured by
# keeping `feature_cutoff` components (array index feature_cutoff - 1), not
# one component more.
component_counts = np.arange(1, len(pca_test.explained_variance_ratio_) + 1)

fig, (ax1, ax2) = plt.subplots(2)

# Top panel: variance explained by each individual component.
ax1.plot(component_counts, pca_test.explained_variance_ratio_)
ax1.axvline(feature_cutoff, color='red')

# Bottom panel: cumulative variance explained by the first k components.
cumulative_explained = np.cumsum(pca_test.explained_variance_ratio_)
ax2.plot(component_counts, cumulative_explained)
ax2.axvline(feature_cutoff, color='red')
ax2.axhline(cumulative_explained[feature_cutoff - 1], color='red')

Taking the first 100 principal components seems to put us past the kinks in the cumulative explained PCA variance and hence seems like a reasonable number of features to take. This should also allow us to fit models relatively quickly compared to the full 675 target encoded features.

In [None]:
# Final PCA keeping feature_cutoff components. It is fitted on the scaled
# training split only; validation and test are projected with that same fit.
te_pca = PCA(n_components=feature_cutoff)
Xtrain_pca = te_pca.fit_transform(Xtrain_r)
Xvalid_pca = te_pca.transform(Xvalid_r)
Xtest_pca = te_pca.transform(Xtest_r)

# Fitting a Random Forest

I've tried a few of the simpler models provided by Scikit Learn, and they didn't do so well. A random forest seemed to generalize well, however. We will use Optuna to tune our hyperparameters.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Scratch directory where each trial's fitted forest is pickled.
tmp = Path("/kaggle/temp/")
# parents=True: also create /kaggle when it is missing (e.g. outside Kaggle);
# exist_ok=True: re-running the cell is a no-op.
tmp.mkdir(parents=True, exist_ok=True)

In [None]:
def objectiveRF(trial):
    """Optuna objective: validation log loss of a random forest on the PCA features.

    Samples ``max_features`` and ``max_depth``, fits a 100-tree forest on
    ``Xtrain_pca``/``ytrain``, pickles the fitted model under ``tmp`` using the
    trial number as the file name, and returns the log loss on
    ``Xvalid_pca``/``yvalid``.

    If an earlier completed trial of the same study used the same parameters,
    nothing is fitted or pickled. The most recent such trial's value is
    returned, inflated by 0.1%, so the repeat can never become the best trial
    (only trials that actually fitted a model have a pickle to reload).

    Parameters:
        trial: the ``optuna`` trial supplying the hyperparameter suggestions.

    Returns:
        The validation log loss (lower is better).
    """
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', Xtrain_pca.shape[1]])
    max_depth = trial.suggest_int('max_depth', 2, 12)

    # Optuna sometimes suggests a parameter combination it's used before.
    # We don't want to waste time with those.
    # The trial's own study is queried rather than a module-level `study`
    # variable, and only COMPLETE trials are considered, so the running trial
    # and failed trials (which have no value) are never matched.
    finished = trial.study.get_trials(
        deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,))
    for previous in reversed(finished):
        if previous.params == trial.params:
            print("REPEAT SKIPPED: {}, {}".format(max_features, max_depth))
            return previous.value*1.001

    clf = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_features=max_features,
        max_depth=max_depth,
        random_state=SEED, n_jobs=-1)

    clf.fit(Xtrain_pca, ytrain)
    # save trained models to use later
    with open(tmp/"{}_RF.pickle".format(trial.number), "wb") as fout:
        pickle.dump(clf, fout)
    return log_loss(yvalid, clf.predict_proba(Xvalid_pca))
    

In [None]:
# Study that minimises the objective's return value, with a seeded TPE
# sampler so the sequence of suggestions is repeatable.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED)
)

In [None]:
# Run 20 trials of the random-forest objective.
study.optimize(objectiveRF, n_trials=20)

We are able to get to a log loss of about 1.74, which seems pretty good when compared with the top of the leaderboard. Better models such as gradient boosted trees may end up with a slightly better end result, but this is good enough for now. Doing more feature engineering may have more of an effect, such as possibly including a denoising autoencoder or finding a better way to encode the features.

For now, we re-load the best forest and generate our submission:

In [None]:
# Reload the forest pickled by the best trial. The file was written by this
# notebook's own objective function; do not point this at untrusted pickles.
with open(tmp/"{}_RF.pickle".format(study.best_trial.number), "rb") as fin:
    best_clf = pickle.load(fin)

In [None]:
# Class probabilities for the test rows, one column per class.
ypred = best_clf.predict_proba(Xtest_pca)
# NOTE(review): assumes best_clf.classes_ is 0..8 in order, so probability
# column i corresponds to "Class_{i+1}" — holds only if every class appears
# in the training split; confirm.
submission_df = pd.DataFrame(ypred, index=test_te.index, columns=[f"Class_{i+1}" for i in range(9)])
submission_df.to_csv("submission.csv")