# Constants

In [16]:
# Root directory of the workspace; the data paths in this notebook are derived from it.
BASE_PATH = '/notebooks'

In [23]:
# `os` is imported here because this cell runs before the notebook's main
# import cell; without it a fresh "Restart & Run All" fails with NameError.
import os

# Location of the training CSV.
TRAIN_PATH = os.path.join(BASE_PATH, 'data/train/original/train.csv')
# Directory that receives the per-fold cross-validation CSV files.
CV_OUT_PATH = os.path.join(BASE_PATH, 'data/train/cv')

# Setup

In [24]:
!pip install iterative-stratification

[0m

In [35]:
import pandas as pd
from sklearn import model_selection
from tqdm.auto import tqdm
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [36]:
def create_folds(data, num_splits, label_cols=None, random_state=42):
    """Assign every row of ``data`` to a multilabel-stratified CV fold.

    A ``"fold"`` column is added to (or overwritten in) ``data`` **in place**
    and the same DataFrame object is returned.

    Args:
        data: DataFrame that contains the label columns.
        num_splits: Number of folds to create.
        label_cols: Names of the columns to stratify on. Defaults to the six
            score columns ``cohesion``, ``syntax``, ``vocabulary``,
            ``phraseology``, ``grammar`` and ``conventions``.
        random_state: Seed forwarded to ``MultilabelStratifiedKFold``.

    Returns:
        ``data`` itself, now carrying an integer ``"fold"`` column whose value
        is the index of the fold in which the row is used for validation.
    """
    if label_cols is None:
        label_cols = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]

    # -1 marks "not yet assigned"; every row is overwritten in the loop below.
    data["fold"] = -1
    fold_col = data.columns.get_loc("fold")

    splitter = MultilabelStratifiedKFold(
        n_splits=num_splits, shuffle=True, random_state=random_state
    )
    targets = data[list(label_cols)].values

    # split() yields *positional* row indices, so write through .iloc.
    # Using .loc would interpret them as index labels and fail (or hit the
    # wrong rows) whenever `data` does not have a default 0..n-1 index.
    for fold_id, (_, val_positions) in enumerate(splitter.split(data, targets)):
        data.iloc[val_positions, fold_col] = fold_id

    return data

In [39]:
def make_cv_data(df, out_path, kfolds=6):
    """Write one train/validation CSV pair per cross-validation fold.

    Rows are assigned to folds by ``create_folds``. For each fold ``k`` the
    rows of that fold are written to ``val_fold_<k>.csv`` and all remaining
    rows to ``train_fold_<k>.csv``; the helper ``fold`` column is dropped
    from both files.

    Args:
        df: DataFrame to split. It is passed to ``create_folds`` as-is, so any
            in-place change that function makes (such as adding a ``fold``
            column) is visible to the caller.
        out_path: Directory for the CSV files; created if it does not exist.
        kfolds: Number of folds.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    # DataFrame.to_csv does not create missing directories.
    os.makedirs(out_path, exist_ok=True)

    df = create_folds(df, num_splits=kfolds)
    for fold in range(kfolds):
        print('Fold:', fold)
        in_val = df["fold"] == fold
        train_df = df.loc[~in_val].reset_index(drop=True).drop(columns=["fold"])
        val_df = df.loc[in_val].reset_index(drop=True).drop(columns=["fold"])
        train_df.to_csv(os.path.join(out_path, f"train_fold_{fold}.csv"), index=False)
        val_df.to_csv(os.path.join(out_path, f"val_fold_{fold}.csv"), index=False)

# Create the splits

In [40]:
# Load the training CSV, then write the per-fold train/validation files.
train_df = pd.read_csv(TRAIN_PATH)
make_cv_data(train_df, CV_OUT_PATH)

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
