In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Turn on grid lines by default for every axes drawn in this notebook.
plt.rcParams['axes.grid'] = True

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# The "before flip" training data is read from the `november21` input dataset;
# the "after flip" training data and the test set are read from the competition
# dataset. In every file the first column becomes the row index.
train_before = pd.read_csv('/kaggle/input/november21/train.csv', index_col=0)
train_after, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-nov-2021/train.csv',
        '/kaggle/input/tabular-playground-series-nov-2021/test.csv',
    )
)

# Keep both versions of the label column as standalone Series.
ybefore = train_before['target']
yafter = train_after['target']

As already discussed in https://www.kaggle.com/c/tabular-playground-series-nov-2021/discussion/286731, the labels seem to be chunked every 60000 rows.

In [None]:
# Cumulative sum of the centred labels (label - 0.5) for both label versions,
# drawn on one figure with an x tick every 60000 rows.
plt.figure(figsize=(11, 5))
plt.xticks(np.arange(0, 600000, 60000))
for labels, legend_name in ((ybefore, 'before flip'), (yafter, 'after flip')):
    centred = labels - .5
    centred.cumsum().plot(label=legend_name)
plt.legend();

The predicted labels of the **test** set seem to be chunked every 60000 rows too; this can be observed using almost any available model (the chunks are always visible).

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
def fit_predict(train,test):
    """Fit a scaled linear SVM on `train` and return its predictions for `test`.

    The model is a pipeline of `StandardScaler` followed by
    `LinearSVC(dual=False)`. It is fitted on every column of `train` except
    `target`, using `train.target` as the labels.

    Returns the predictions wrapped in a `pd.Series` with a default index.
    """
    features = train.drop(columns='target')
    labels = train.target
    model = make_pipeline(StandardScaler(), LinearSVC(dual=False))
    model.fit(features, labels)
    predictions = model.predict(test)
    return pd.Series(predictions)
# Cumulative sum of the centred test predictions (prediction - 0.5), once for a
# model trained on each version of the training data.
plt.figure(figsize=(11, 5))
plt.xticks(np.arange(0, 600000, 60000))
for train_version, legend_name in ((train_before, 'before flip'), (train_after, 'after flip')):
    centred_predictions = fit_predict(train_version, test) - .5
    centred_predictions.cumsum().plot(label=legend_name)
plt.legend();

# Average/mean on each chunk

In [None]:
# Mean label of each consecutive 60000-row slice, for both label versions.
chunk_starts = range(0, 600000, 60000)
mean_before = [ybefore[lo:lo + 60000].mean() for lo in chunk_starts]
mean_after = [yafter[lo:lo + 60000].mean() for lo in chunk_starts]

# Scatter the per-chunk means against the chunk number.
chunk_ids = np.arange(10)
plt.xticks(chunk_ids)
for chunk_means, legend_name in ((mean_before, 'before flip'), (mean_after, 'after flip')):
    plt.scatter(np.arange(len(chunk_means)), chunk_means, label=legend_name)
# Dashed horizontal reference line at a mean of 0.5.
plt.plot(chunk_ids, [0.5] * 10, linestyle='--')
plt.xlabel('chunk')
plt.ylabel('mean')
plt.legend();

As shown above, the flipping seems to push the label average closer to **0.5**.

The further the initial average is from 0.5, the further it gets pushed toward 0.5.

Let's take a look at the correlation between the average before flipping versus after flipping.

In [None]:
# One point per chunk: mean label before the flip (x) against the mean after it (y).
plt.scatter(x=mean_before, y=mean_after)
plt.xlabel(xlabel='mean before')
plt.ylabel(ylabel='mean after');

It's almost linearly correlated.

Hence we can probably predict the label average on the test set after the flipping too, but this is difficult to prove.

Let's have a closer look at the statistics of the flipping on each chunk: how many 0s were flipped to 1, and vice versa.

In [None]:
# For every consecutive 60000-row slice, count how many labels changed in each
# direction between the two label versions.
flip01 = []  # labels that are 0 before the flip and 1 after it
flip10 = []  # labels that are 1 before the flip and 0 after it
for start in range(0,600000,60000):
    endx = start+60000
    before = ybefore[start:endx]
    after = yafter[start:endx]
    # Compare against 0 and 1 explicitly instead of applying `~` to the raw
    # labels: on an integer Series `~` is a bitwise NOT (~0 == -1, ~1 == -2),
    # not a logical negation, so the intent is only recovered through masking.
    flip01.append(((before == 0) & (after == 1)).sum())
    flip10.append(((before == 1) & (after == 0)).sum())

# One row per chunk, combining the per-chunk means with the flip counts.
df = pd.DataFrame(dict(
    mean_before=mean_before, 
    mean_after=mean_after, 
    flip_0_1=flip01, 
    flip_1_0=flip10
))
df.index.name = 'chunk'
df

The table above shows what the flipping has to do in order to push the average closer to 0.5:
* If the average < 0.5 (label 0 is the majority), many 0s need to be flipped to 1 (`flip_0_1`)
* If the average > 0.5 (label 1 is the majority), many 1s need to be flipped to 0 (`flip_1_0`)

Let's try to correlate the initial average with `flip_0_1` and `flip_1_0`.

In [None]:
# One scatter plot per flip direction: flip count (x) against the initial chunk mean (y).
for flip_column in ('flip_0_1', 'flip_1_0'):
    df.plot.scatter(x=flip_column, y='mean_before');

Both of them are almost linearly correlated with the initial average.