In [None]:
import os
import functools

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

!pip install -q iterative-stratification

### This notebook compares iterstrat and skmultilearn to see whether multilabel stratification done by the two libraries produces noticeably different results

In [None]:
# Folder holding the competition files; "train.csv" is read from here.
DATA_PATH = "/kaggle/input/feedback-prize-2021/"


class Config:
    """Notebook-wide settings, kept together as class attributes."""

    # Cross-validation: number of folds, and the seed passed to the
    # iterstrat splitter.
    NUM_FOLDS = 5
    RANDOM_STATE = 42

    # Model / tokenisation settings. They are not referenced by the
    # stratification cells visible in this notebook.
    # NOTE(review): presumably consumed by a later training step - confirm.
    TRANSFORMER_CHECKPOINT = "allenai/longformer-base-4096"
    MAX_LENGTH = 100
    STRIDE = 10

In [None]:
# Read the training annotations.
df_train = pd.read_csv(f"{DATA_PATH}train.csv")

# "predictionstring" arrives as whitespace-separated integers; turn each
# value into a list of ints.
df_train["predictionstring"] = df_train["predictionstring"].apply(
    lambda raw: list(map(int, raw.split()))
)

# Duplicate "discourse_type" under the name "discoursetype"; the one-hot
# step further down encodes this copy.
df_train["discoursetype"] = df_train["discourse_type"]
df_train.head()

In [None]:
# One-hot encode the discourse type and, for every essay id, add the
# indicators up so each "discoursetype_*" column holds the number of
# discourse elements of that type in the essay.
#
# Only "id" and "discoursetype" are carried into the aggregation. Summing
# the full frame would also push every other column through sum() -
# including non-numeric ones such as the list-valued "predictionstring" -
# only for the result to be discarded by the column selection below.
df_type_dummies = pd.get_dummies(
    df_train[["id", "discoursetype"]], columns=["discoursetype"]
)
df_train_onehot = df_type_dummies.groupby("id", as_index=False).sum()

# "id" is deliberately kept in label_cols here so that it survives the
# column selection; it is removed from the list before targets are built.
label_cols = [
    c for c in df_train_onehot.columns
    if c.startswith("discoursetype_") or c == "id"
]
df_train_onehot = df_train_onehot[label_cols]
df_train_onehot.head()

In [None]:
def create_multilabel_targets(data_row, label_cols):
    """Collect the values of ``label_cols`` from one row.

    Args:
        data_row: Any object indexable by column name (for example a
            DataFrame row or a dict).
        label_cols: Iterable of keys to look up, in the desired order.

    Returns:
        list: ``data_row[col]`` for every ``col`` in ``label_cols``, in the
        same order.
    """
    return [data_row[name] for name in label_cols]

In [None]:
# An essay can contain several discourse types, so its target is
# multilabel. Build that per-essay target (one entry per label column).

# "id" identifies the essay and is not a label, so keep it out of the
# label columns.
label_cols = [col for col in label_cols if col != "id"]

# Row-wise: gather the label-column values into a list ...
collect_targets = functools.partial(create_multilabel_targets, label_cols=label_cols)
df_train_onehot["targets"] = df_train_onehot.apply(collect_targets, axis=1)
# ... and keep a comma-joined string version of the same list.
df_train_onehot["targets_str"] = df_train_onehot["targets"].apply(
    lambda values: ",".join(map(str, values))
)

# -1 marks "no fold assigned yet".
df_train_onehot["kfold"] = -1
df_train_onehot.head()

In [None]:
# we need to split the train data into k folds using multilabel stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def iterstrat_multilabel_stratified_kfold_cv_split(df_train_onehot):
    """Assign a fold id to every row using iterstrat's MultilabelStratifiedKFold.

    The splitter is configured from ``Config.NUM_FOLDS`` and
    ``Config.RANDOM_STATE`` and stratifies on the module-level
    ``label_cols`` columns.

    Args:
        df_train_onehot: Frame with an "id" column and the ``label_cols``
            columns. It is modified in place: its "kfold" column is
            (over)written.

    Returns:
        The same frame, with "kfold" holding the index of the fold in which
        each row is used for validation (-1 for a row that was never
        placed in a validation fold).
    """
    mskf = MultilabelStratifiedKFold(
        n_splits=Config.NUM_FOLDS, shuffle=True, random_state=Config.RANDOM_STATE
    )
    df_targets = df_train_onehot[label_cols]

    # split() yields *positional* indices. Collect them in a positional
    # array rather than writing through label-based .loc, which would hit
    # the wrong rows whenever the frame's index is not 0..n-1.
    fold_ids = np.full(len(df_train_onehot), -1, dtype=int)
    for fold, (_, val_index) in enumerate(mskf.split(df_train_onehot["id"], df_targets)):
        fold_ids[val_index] = fold

    df_train_onehot["kfold"] = fold_ids
    return df_train_onehot

In [None]:
from skmultilearn.model_selection import IterativeStratification

def skml_multilabel_stratified_kfold_cv_split(df_train_onehot):
    """Assign a fold id to every row using skmultilearn's IterativeStratification.

    The splitter uses ``Config.NUM_FOLDS`` folds with ``order=1`` and
    stratifies on the module-level ``label_cols`` columns.

    NOTE(review): no seed is passed to this splitter (unlike the iterstrat
    variant), so fold assignments may differ between runs - confirm how
    skmultilearn draws its randomness before relying on reproducibility.

    Args:
        df_train_onehot: Frame with an "id" column and the ``label_cols``
            columns. It is modified in place: its "kfold" column is
            (over)written.

    Returns:
        The same frame, with "kfold" holding the index of the fold in which
        each row is used for validation (-1 for a row that was never
        placed in a validation fold).
    """
    mskf = IterativeStratification(n_splits=Config.NUM_FOLDS, order=1)
    X = df_train_onehot["id"]
    y = df_train_onehot[label_cols]

    # split() yields *positional* indices. Collect them in a positional
    # array rather than writing through label-based .loc, which would hit
    # the wrong rows whenever the frame's index is not 0..n-1.
    fold_ids = np.full(len(df_train_onehot), -1, dtype=int)
    for fold, (_, val_index) in enumerate(mskf.split(X, y)):
        fold_ids[val_index] = fold

    df_train_onehot["kfold"] = fold_ids
    return df_train_onehot

## Use skmultilearn first for CV splits using multilabel stratification

In [None]:
# Assign folds with the skmultilearn-based splitter, then show how many rows
# ended up in each fold.
df_train_onehot = skml_multilabel_stratified_kfold_cv_split(df_train_onehot)
df_train_onehot.kfold.value_counts()

In [None]:
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from collections import Counter

def get_train_val_split_stats(df):
    """Summarise how the labels are distributed across the train/val splits.

    For every fold in ``range(Config.NUM_FOLDS)`` the rows with
    ``kfold == fold`` form the validation part and all other rows the
    training part. For each part, the order-1 label combinations returned
    by ``get_combination_wise_output_matrix`` are counted over the
    module-level ``label_cols`` columns.

    Args:
        df: Frame with a "kfold" column and the ``label_cols`` columns.

    Returns:
        pandas.DataFrame: indexed by (fold, counts), where ``counts`` is one
        of "train_count", "val_count" or "val_train_ratio"; one column per
        label combination (the combination's ``str()``). A combination
        absent from a split is counted as 0, so a ratio can come out as
        inf or NaN when the training count is 0.
    """
    counts = {}
    for fold in range(Config.NUM_FOLDS):
        in_val = df.kfold == fold
        for split_name, mask in (("train_count", ~in_val), ("val_count", in_val)):
            y_split = df.loc[mask, label_cols].values
            counts[(fold, split_name)] = Counter(
                str(combination)
                for row in get_combination_wise_output_matrix(y_split, order=1)
                for combination in row
            )

    index_names = ["fold", "counts"]
    df_counts = pd.DataFrame(counts).T.fillna(0).rename_axis(index=index_names)

    # Build all ratio rows first and concatenate once. DataFrame.append,
    # which used to add them one at a time, no longer exists in pandas >= 2.0.
    ratios = {
        (fold, "val_train_ratio"): (
            df_counts.loc[(fold, "val_count"), :] / df_counts.loc[(fold, "train_count"), :]
        )
        for fold in range(Config.NUM_FOLDS)
    }
    df_ratios = pd.DataFrame(ratios).T.rename_axis(index=index_names)

    return pd.concat([df_counts, df_ratios]).sort_index()

### Let's check how well skmultilearn distributes the labels across the different splits

In [None]:
# Label counts and val/train ratios per fold for the fold assignment currently
# stored in "kfold" (the skmultilearn split when the cells are run in order).
df_stats = get_train_val_split_stats(df_train_onehot)
df_stats

## Now use iterstrat for CV splits using multilabel stratification

In [None]:
# Re-assign folds with the iterstrat-based splitter (this overwrites "kfold"),
# then show how many rows ended up in each fold.
df_train_onehot = iterstrat_multilabel_stratified_kfold_cv_split(df_train_onehot)
df_train_onehot.kfold.value_counts()   

### Let's check how well iterstrat distributes the labels across the different splits

In [None]:
# Label counts and val/train ratios per fold for the fold assignment currently
# stored in "kfold" (the iterstrat split when the cells are run in order).
df_stats = get_train_val_split_stats(df_train_onehot)
df_stats

### There is not much difference between the label distributions produced by iterstrat and skmultilearn, so either library can be used.