In [None]:
import numpy as np
import pandas as pd

from sklearn import model_selection

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the training CSV into a DataFrame

TRAIN_CSV = '../input/tabular-playground-series-oct-2021/train.csv'
train = pd.read_csv(TRAIN_CSV)

In [None]:
def create_stratified_folds_for_classification(df, n_splits = 10, random_state = 606):

    """
    Assign every row of ``df`` to one of ``n_splits`` stratified folds.

    The ``target`` column is cut into equal-width bins and the bin labels are
    used as the classes that ``StratifiedKFold`` balances across the folds.
    The number of bins is the rounded-up mean of the Freedman-Diaconis bin
    counts of all ``float64`` columns of ``df``. Columns for which that rule
    is undefined (zero IQR, NaN/inf statistics) are skipped; if no column is
    usable the count falls back to Sturges' rule, ``ceil(log2(n_rows)) + 1``.

    @param df: training data; must contain a ``target`` column that
        ``pd.to_numeric`` can convert. The caller's frame is not modified.
    @param n_splits: number of folds
    @param random_state: seed used for both the row shuffle and the fold
        split, so repeated calls with the same input give the same result
    @return: a shuffled copy of ``df`` (index reset to 0..n-1) with an extra
        integer column ``StratifiedKFold`` holding each row's fold id
    """

    # Shuffle into a new frame first so the caller's DataFrame is never mutated.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    n_rows = len(shuffled)

    # Freedman-Diaconis bin count of every float64 column.
    # NOTE(review): the count is averaged over all float columns, not derived
    # from the target alone -- kept as in the original; confirm this is intended.
    bin_counts = []
    for _, column in shuffled.select_dtypes(include='float64').items():
        iqr = column.quantile(0.75) - column.quantile(0.25)
        # Zero or NaN IQR would give a zero/NaN bin width and an unusable count.
        if not iqr > 0:
            continue
        bin_width = (2 * iqr) / (n_rows ** (1 / 3))
        ratio = (column.max() - column.min()) / bin_width
        # inf values in the column make the ratio inf/NaN; int() would raise.
        if not np.isfinite(ratio):
            continue
        bin_counts.append(int(np.ceil(ratio)))

    if bin_counts:
        mean_bin = np.ceil(sum(bin_counts) / len(bin_counts))
    else:
        # No usable float column: fall back to Sturges' rule on the row count.
        mean_bin = np.ceil(np.log2(max(n_rows, 2))) + 1
    print(f"Num bins: {mean_bin}")

    # The bin label plays the role of the class that StratifiedKFold spreads
    # evenly over the folds. It is kept in a local Series instead of a
    # temporary "bins" column, so an input column of that name is not lost.
    target_bins = pd.cut(pd.to_numeric(shuffled['target'], downcast = "signed"), bins = int(mean_bin), labels = False)
    kf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle = True, random_state = random_state)

    # Every row appears in exactly one validation split; that split's number
    # becomes the row's fold id.
    fold_ids = np.full(n_rows, -1, dtype=np.int64)
    for fold, (_, valid_indices) in enumerate(kf.split(X=shuffled, y=target_bins.values)):
        fold_ids[valid_indices] = fold
    shuffled['StratifiedKFold'] = fold_ids

    return shuffled

In [None]:
# Number of folds; the plotting cell at the end of the notebook reads it too
n_splits = 10
train = create_stratified_folds_for_classification(df=train, n_splits=n_splits)

In [None]:
# Persist the fold assignment; the file name follows n_splits so it cannot
# drift from the actual number of folds (yields "train_folds(10).csv" for 10).
train.to_csv(f"train_folds({n_splits}).csv", index = False)

In [None]:
# Number of rows assigned to each fold id
train["StratifiedKFold"].value_counts()

In [None]:
# Overlay one KDE curve of the target per fold to compare the folds visually.
# sns.distplot is deprecated (since seaborn 0.11); with hist=False it only
# drew a KDE (the bins argument was unused), which sns.kdeplot does directly.
fig, ax = plt.subplots(figsize=(25,12))
ax.set_title("Distribution of claim values (StratifiedKFolds with bins)")
for k in range(n_splits):
    fold_rows = train.loc[train.StratifiedKFold == k]
    sns.kdeplot(fold_rows['target'], label = k, ax = ax)
ax.legend(); plt.show()