In [None]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn import model_selection

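I use a Stratified Group K-Fold split grouped by `essay_id`, so that all rows from the same essay land in the same fold; otherwise the validation set would contain examples from essays already seen during training. The folds are stratified on the combination of discourse type and effectiveness label.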

In [None]:
def create_folds(data: pd.DataFrame, num_splits: int, seed: int = 42) -> pd.DataFrame:
    """Assign every row to a validation fold with a stratified, grouped K-fold split.

    Rows are grouped by ``essay_id`` so that an essay is never split across
    validation folds, and stratified on the combination of ``discourse_type``
    and ``discourse_effectiveness``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``essay_id``, ``discourse_type`` and
        ``discourse_effectiveness`` columns. The last two are joined with
        ``+``, so they are expected to hold strings. The frame itself is
        not modified.
    num_splits : int
        Number of folds to create.
    seed : int, default 42
        Seed used for both the row shuffle and the splitter.

    Returns
    -------
    pd.DataFrame
        A shuffled copy of ``data`` with a fresh ``0..n-1`` index and an
        integer ``kfold`` column holding each row's validation fold.
    """
    # Shuffle into a new frame: the caller's DataFrame is left untouched, and
    # seeding the shuffle makes the returned row order repeatable.
    shuffled = data.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Combined label so the split stratifies on discourse type and
    # effectiveness at the same time. Kept as a local Series instead of a
    # temporary column that would have to be dropped again.
    strata = shuffled["discourse_type"] + "_" + shuffled["discourse_effectiveness"]

    splitter = model_selection.StratifiedGroupKFold(
        n_splits=num_splits, shuffle=True, random_state=seed
    )

    # -1 marks "not yet assigned"; the loop below overwrites it for each
    # validation split produced by the splitter.
    shuffled["kfold"] = -1
    fold_splits = splitter.split(
        X=shuffled, y=strata.values, groups=shuffled["essay_id"].values
    )
    for fold, (_, valid_idx) in enumerate(fold_splits):
        # valid_idx holds row positions, which equal the index labels
        # because the index was reset above.
        shuffled.loc[valid_idx, "kfold"] = fold

    return shuffled

In [None]:
# load the training CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="../input/feedback-prize-effectiveness/train.csv")

In [None]:
# assign every row to one of five validation folds
df = create_folds(data=df, num_splits=5)

In [None]:
# number of rows assigned to each fold
df["kfold"].value_counts()

In [None]:
# write the frame with its fold column to disk, omitting the index
df.to_csv(path_or_buf="train_folds.csv", index=False)