In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from transformers import AutoTokenizer
import os
from shutil import rmtree
from joblib import dump
from tqdm.auto import tqdm
import warnings
import joblib
import torch

In [None]:
# Install a blanket "ignore" filter: no message/category restriction is given,
# so every warning raised after this cell runs is suppressed for the session.
# NOTE(review): this also hides deprecation warnings from pandas/transformers.
warnings.filterwarnings('ignore')

In [None]:
# Integer class id for each effectiveness rating; the id is the rating's
# position in the tuple below.
LABEL_MAPPING = {
    rating: class_id
    for class_id, rating in enumerate(("Ineffective", "Adequate", "Effective"))
}

In [None]:
class args:
    """Static configuration namespace read by the preprocessing cells.

    The class is used as a plain attribute holder and is never instantiated
    in this notebook.
    """

    # Root data directory: joined with "train_folds.csv" and with the
    # "train"/"test" sub-folders that hold the per-essay .txt files.
    input = '../input/folds'
    # Checkpoint name passed to AutoTokenizer.from_pretrained.
    model = 'microsoft/deberta-v3-base'
    # Passed as max_length to the tokenizer (with padding="max_length").
    max_len = 512

In [None]:
def _prepare_training_data_helper(args, tokenizer, df, is_train):
    """Tokenize each discourse row of ``df`` paired with its full essay text.

    Args:
        args: Configuration object providing ``input`` (data root directory)
            and ``max_len`` (tokenizer ``max_length``).
        tokenizer: Tokenizer exposing ``encode_plus``.
        df: DataFrame with ``essay_id``, ``discourse_id``, ``discourse_text``
            and ``discourse_type`` columns; ``discourse_effectiveness`` is
            required when ``is_train`` is true.
        is_train: If true, essays are read from ``<args.input>/train``,
            otherwise from ``<args.input>/test``.

    Returns:
        A list with one dict per row containing ``discourse_id``,
        ``input_ids``, ``attention_mask``, ``token_type_ids`` (only when the
        tokenizer returns them) and ``label`` (the ``LABEL_MAPPING`` id of the
        row's effectiveness rating, when available).

    Raises:
        KeyError: If a required column is missing or the rating is not a key
            of ``LABEL_MAPPING``.
        OSError: If an essay file cannot be opened.
    """
    # The essay folder depends only on is_train, so resolve it once.
    essay_dir = os.path.join(args.input, "train" if is_train else "test")

    training_samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        essay_id = row["essay_id"]
        discourse_text = row["discourse_text"]
        discourse_type = row["discourse_type"]

        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        # NOTE(review): assumes the essay files are UTF-8 encoded - confirm.
        essay_path = os.path.join(essay_dir, essay_id + ".txt")
        with open(essay_path, "r", encoding="utf-8") as f:
            text = f.read()

        # First segment: "<type> <discourse>", second segment: whole essay.
        # "longest_first" truncation trims whichever segment is longer until
        # the pair fits into max_len.
        encoded_text = tokenizer.encode_plus(
            discourse_type + " " + discourse_text,
            text,
            add_special_tokens=True,
            padding="max_length",
            max_length=args.max_len,
            return_attention_mask=True,
            return_token_type_ids=True,
            truncation="longest_first",
            return_tensors="pt"
        )

        sample = {
            "discourse_id": row["discourse_id"],
            "input_ids": encoded_text["input_ids"],
            "attention_mask": encoded_text["attention_mask"],
        }

        if "token_type_ids" in encoded_text:
            sample["token_type_ids"] = encoded_text["token_type_ids"]

        # Training rows must carry a rating (a missing column still raises
        # KeyError as before). For is_train=False the label is attached only
        # when the column exists, so unlabeled data no longer crashes here.
        # NOTE(review): presumably the test CSV has no rating column - confirm.
        if is_train or "discourse_effectiveness" in row.index:
            sample["label"] = LABEL_MAPPING[row["discourse_effectiveness"]]

        training_samples.append(sample)
    return training_samples


def prepare_training_data(df, tokenizer, args, num_jobs, is_train):
    """Tokenize ``df`` in parallel and return all samples as one flat list.

    Args:
        df: DataFrame of discourse rows to encode.
        tokenizer: Tokenizer forwarded to each worker.
        args: Configuration object forwarded to each worker.
        num_jobs: Number of row chunks to create and number of worker
            processes to run them in.
        is_train: Forwarded to ``_prepare_training_data_helper`` to select the
            essay folder.

    Returns:
        The concatenation of every worker's sample list, in the original row
        order of ``df``.
    """
    # Split row *positions* and slice with .iloc instead of passing the
    # DataFrame to np.array_split: the latter goes through
    # DataFrame.swapaxes, which is deprecated in recent pandas releases.
    # The resulting chunks contain the same rows in the same order.
    row_chunks = np.array_split(np.arange(len(df)), num_jobs)
    df_splits = [df.iloc[chunk] for chunk in row_chunks]

    results = Parallel(n_jobs=num_jobs, backend="multiprocessing")(
        delayed(_prepare_training_data_helper)(args, tokenizer, split, is_train)
        for split in df_splits
    )

    # Parallel returns results in submission order, so flattening keeps the
    # samples aligned with the row order of df.
    return [sample for result in results for sample in result]


In [None]:
n_folds = 5

# The fold table and the tokenizer are the same for every fold, so load them
# once instead of once per iteration.
df = pd.read_csv(os.path.join(args.input, "train_folds.csv"))
tokenizer = AutoTokenizer.from_pretrained(args.model)

for i in range(n_folds):

    print(f'Fold {i}')

    # Rows whose "kfold" equals i are held out; all other rows are training data.
    train_df = df[df["kfold"] != i].reset_index(drop=True)
    valid_df = df[df["kfold"] == i].reset_index(drop=True)

    training_samples = prepare_training_data(train_df, tokenizer, args, num_jobs=4, is_train=True)
    # is_train=True here as well: validation rows come from the same CSV, so
    # their essays are read from the "train" folder.
    valid_samples = prepare_training_data(valid_df, tokenizer, args, num_jobs=4, is_train=True)

    # Start from an empty output directory. The directory must be recreated
    # after removal; previously an existing folder was deleted and never
    # recreated, so the torch.save calls below failed on any re-run.
    fold_dir = f'fold_{i}'
    if os.path.exists(fold_dir):
        rmtree(fold_dir)
    os.makedirs(fold_dir)

    torch.save(training_samples, os.path.join(fold_dir, 'train.pt'))
    torch.save(valid_samples, os.path.join(fold_dir, 'valid.pt'))
    
        