In [None]:
import os
import random

import pandas as pd
import seaborn as sns
import numpy as np

import plotly.express as px
import plotly.figure_factory as ff

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from tqdm.notebook import trange

# Seed NumPy's and the stdlib's global RNGs up front so a fresh
# "Restart & Run All" starts from the same random state.
np.random.seed(42)
random.seed(42)

# `random_state` is deliberately left unset: per scikit-learn's documented
# behaviour the shuffling then draws from NumPy's global RNG (seeded above),
# so successive `cv.split()` calls produce different folds. The repeated
# experiments further down depend on that.
cv = StratifiedKFold(n_splits=5, shuffle=True)

# Root folder of the input datasets; can be overridden through KAGGLE_DIR.
droot = os.environ.get('KAGGLE_DIR', '../input')

# `target` column of the 'november21' dataset, as a float32 NumPy array.
old_y = (
    pd.read_csv(f'{droot}/november21/train.csv', usecols=['target'])
    ['target']
    .astype(np.float32)
    .to_numpy()
)

# Competition training data, read as float32, with the `id` column removed.
train = pd.read_csv(
    f'{droot}/tabular-playground-series-nov-2021/train.csv', dtype=np.float32
).drop(columns=['id'])

# Current labels (kept as a pandas Series) and standardized feature matrix.
new_y = train.target
X = StandardScaler().fit_transform(train.drop(columns=['target']))

Achieve stable CV scores
==

In this notebook we [use the old labels](https://www.kaggle.com/criskiev/november21) to get very stable CV scores. By a stable CV score I mean that all the splits achieve roughly the same score -- as opposed to, say, 1 out of 5 splits scoring `.753` while the rest score `.748`.

We'll do this experiment in a very simple way -- we'll just use the pre-flip labels as our predictions. We already know that this should score around `.7486`, so let's check what our CVs are for that:

In [None]:
# A single shared estimator; clf_pred() refits it on every call.
clf = LogisticRegression()


def clf_pred(train_idx, val_idx):
    """Fit the shared logistic regression on one fold and score its validation rows.

    Args:
        train_idx: Row indices used to fit ``clf`` (features from ``X``,
            labels from ``new_y``).
        val_idx: Row indices of ``X`` to score.

    Returns:
        The ``decision_function`` values of the fitted model for ``X[val_idx]``.
    """
    fit_features, fit_labels = X[train_idx], new_y[train_idx]
    fitted = clf.fit(fit_features, fit_labels)  # fit() hands back the estimator
    return fitted.decision_function(X[val_idx])

def old_pred(train_idx, val_idx):
    """Use the old labels themselves as the "prediction" for the validation rows.

    No model is trained here; ``train_idx`` is accepted only so this function
    can be called the same way as ``clf_pred`` and is otherwise ignored.

    Args:
        train_idx: Unused.
        val_idx: Indices into ``old_y`` to return.

    Returns:
        ``old_y`` restricted to ``val_idx``.
    """
    val_scores = old_y[val_idx]
    return val_scores

def auc_cvs(stratify=None, predict=old_pred):
    """Return the per-fold ROC AUC of ``predict`` measured against ``new_y``.

    Args:
        stratify: Values passed to ``cv.split`` as the stratification target.
            ``None`` (the default) means "stratify by ``new_y``". It is
            resolved at call time, so the default always follows the current
            ``new_y`` instead of the object that existed when this function
            was defined.
        predict: Callable ``(train_idx, val_idx) -> scores`` that returns the
            scores for the validation rows of one fold.

    Returns:
        1-D ``np.ndarray`` holding one AUC per fold yielded by ``cv.split``.
    """
    # cv.split() yields positional row indices. Index a plain array with them
    # rather than a pandas Series, whose integer lookup is label-based and is
    # only equivalent while the Series has a default RangeIndex.
    y_true = np.asarray(new_y)
    if stratify is None:
        stratify = y_true
    return np.array([
        roc_auc_score(y_true[val_idx], predict(train_idx, val_idx))
        for train_idx, val_idx in cv.split(X, stratify)
    ])

# Baseline: folds stratified on the new target, old labels used as predictions.
scores = auc_cvs(stratify=new_y, predict=old_pred)
scores

The example here is typical of what we want to avoid: there's a huge difference between the best and worst split, and none of them is even very close to the real score of this prediction:

In [None]:
# AUC of the old labels over the whole training set, for reference.
real_score = roc_auc_score(new_y, old_y)
# Peak-to-peak range (max - min) of the per-fold AUCs.
diff = np.ptp(scores)
print(f'max() - min() = {diff:.4f}, std() = {scores.std():.4f}, real auc = {real_score:.4f}')

Let's repeat this, but stratify by whether the label was flipped or not (`old != new`):

In [None]:
# Same check, but the folds are stratified on "was this label flipped?".
scores = auc_cvs(stratify=(new_y != old_y))
scores

In [None]:
# Reference AUC on the full training set, then the gap between best and worst fold.
real_score = roc_auc_score(y_true=new_y, y_score=old_y)
diff = np.max(scores) - np.min(scores)
print(f'max() - min() = {diff:.4f}, std() = {scores.std():.4f}, real auc = {real_score:.4f}')

That's a fairly big change. But does this also affect real models? So far we haven't done any training. I think it should affect real models as well, so let's run a few tests:

In [None]:
# Ten repeated cross-validation runs of the logistic regression per
# stratification scheme; the per-fold AUCs of all runs are laid end to end.
spread_stratify_label = np.hstack(
    [auc_cvs(predict=clf_pred) for _ in trange(10)]
)
spread_stratify_flipped = np.hstack(
    [auc_cvs(stratify=new_y != old_y, predict=clf_pred) for _ in trange(10)]
)

# Overlaid histograms of the per-fold AUCs for the two schemes.
ff.create_distplot(
    hist_data=[spread_stratify_label, spread_stratify_flipped],
    group_labels=['stratify=target', 'stratify=target != old_target'],
    bin_size=.00025,
    show_curve=False,
)

One of these cases is clearly less spread out, as we can also see by comparing the standard deviations:

In [None]:
# Label each standard deviation and list the groups in the same order as the
# plots, so the two numbers cannot be attributed to the wrong scheme.
pd.Series(
    {
        'stratify=target': spread_stratify_label.std(),
        'stratify=target != old_target': spread_stratify_flipped.std(),
    },
    name='std',
)

It might look like stratifying by `target` creates a better-performing classifier. Is that the case, though? Let's plot the spread of the mean CV scores:

In [None]:
# Each cross-validation run contributed one consecutive group of per-fold
# scores, so reshaping to (runs, folds) and averaging over axis 1 gives the
# mean CV score of every run. The fold count is read from `cv` rather than
# hard-coded, so it cannot drift out of sync with the splitter.
n_folds = cv.get_n_splits()
ff.create_distplot(
    [
        spread_stratify_label.reshape(-1, n_folds).mean(axis=1),
        spread_stratify_flipped.reshape(-1, n_folds).mean(axis=1),
    ],
    ['stratify=target', 'stratify=target != old_target'],
    bin_size=.00025, show_curve=False
)

Well, no. It looks like we get essentially the same OOF performance from both; the differences are so small that they could easily just be some kind of accumulated rounding error.