In [1]:
import pandas as pd
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# Undersampling
Since the classes are extremely imbalanced (only about 3% of the rows are positive), we randomly undersample the majority class and check whether this leads to any performance improvement.

In [2]:
# Load the training data from parquet (path is relative to the working directory).
df_training = pd.read_parquet("data/training_02.parquet")
# Show (rows, columns) as the cell output.
df_training.shape

(9600000, 19)

In [3]:
# Single seed reused by the test-set sampling and the undersampler so both are reproducible.
seed = 1

In [4]:
# Hold out 20% of the rows as a test set *before* undersampling, so we can later
# check that undersampling did not introduce a new bias.
# NOTE: this cell rebinds `df_training`; re-run the loading cell before re-running
# it, otherwise the hold-out is drawn from the already reduced frame.

# The split below separates rows by index label, which is only correct when every
# label identifies exactly one row.
assert df_training.index.is_unique, "index must be unique to split train/test by label"
n_total = len(df_training)

df_test = df_training.sample(frac=0.20, random_state=seed)
df_test.to_parquet("data/test_02.parquet")
print(df_test.shape, df_test.label.mean())

# Keep only the rows that were not drawn into the test set.
in_test = df_training.index.isin(df_test.index)
df_training = df_training[~in_test]
print(df_training.shape, df_training.label.mean())

# Train and test must partition the original rows exactly.
assert len(df_training) + len(df_test) == n_total, "train/test split lost or duplicated rows"

(1920000, 19) 0.03167864583333333
(7680000, 19) 0.031340494791666666


In [5]:
# Separate features and target by column *name* rather than by position, so the
# split stays correct even if the column order of the parquet file changes
# (the hold-out cell already refers to the target as `label`).
label_col = "label"
X, y = df_training.drop(columns=label_col), df_training[label_col]

In [6]:
# Class counts of the training labels before undersampling.
Counter(y)

Counter({0: 7439305, 1: 240695})

In [7]:
# Random undersampler. With a float sampling_strategy, imbalanced-learn targets that
# minority/majority ratio after resampling (0.1 -> ten majority rows per minority row).
Sampler = RandomUnderSampler(random_state=seed, sampling_strategy=0.1)

In [8]:
# Fit the sampler on the training data and draw the undersampled set.
X_res, y_res = Sampler.fit_resample(X, y)
# Show the resampled class counts together with the mean of the resampled labels.
Counter(y_res),y_res.mean()

(Counter({0: 2406950, 1: 240695}), 0.09090909090909091)

In [9]:
# Fraction of the training rows that remain after undersampling.
len(X_res)/len(X)

0.34474544270833335

In [10]:
# Reassemble target + features (target column first) and persist the
# undersampled training set as parquet.
resampled = pd.concat([y_res, X_res], axis=1)
resampled.to_parquet("data/training_02_resampled.parquet")