In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import torch
import os
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import warnings
import transformers 
from shutil import rmtree

# Only let transformers log messages at ERROR level through, so routine
# info/warning chatter from the library does not flood the cell outputs.
transformers.logging.set_verbosity_error()

In [None]:
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas/numpy/torch — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Notebook configuration.
# NOTE(review): `seed` is not referenced by any cell visible here — confirm
# whether it is still needed.
seed = 43
# Hugging Face checkpoint name whose tokenizer is loaded below.
model = 'microsoft/deberta-v3-base'
# Dataset root; essay text files are read from "<input_path>/train/<essay_id>.txt".
input_path = '../input/notebook28ac5e63d1'
# Every tokenized sample is padded/truncated to exactly this many tokens.
max_length = 512 

In [None]:
# Load the tokenizer that belongs to the checkpoint named in `model`.
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
# Read the fold-annotated training table from the configured dataset root, so
# the CSV and the essay files always come from the same `input_path`.
train = pd.read_csv(os.path.join(input_path, 'train_folds.csv'))

In [None]:
# Show the loaded DataFrame as this cell's output.
train

In [None]:
def _prepare_training_data_helper(tokenizer, df, is_train):
    """Tokenize each discourse row of ``df`` together with its full essay text.

    For every row, the first segment is ``"<discourse_type> <discourse_text>"``
    and the second segment is the whole essay read from
    ``<input_path>/<split>/<essay_id>.txt``. The pair is padded/truncated to the
    module-level ``max_length``.

    Args:
        tokenizer: Tokenizer object exposing ``encode_plus``.
        df: DataFrame with columns ``essay_id``, ``discourse_id``,
            ``discourse_text``, ``discourse_type`` and, for labeled data,
            ``discourse_effectiveness``.
        is_train: When true, essays are read from the ``train`` sub-directory
            and a label is required on every row. When false, essays are read
            from ``test`` and the label is attached only if the column exists.

    Returns:
        A list with one dict per row, holding ``discourse_id``, ``input_ids``,
        ``mask``, optionally ``token_type_ids``, and ``label`` (an int from
        ``LABEL_MAPPING``) when available. Tensor values come straight from the
        tokenizer called with ``return_tensors='pt'``.

    Raises:
        KeyError: If ``is_train`` is true and the label column is missing, or a
            label value is not present in ``LABEL_MAPPING``.
        FileNotFoundError: If an essay file does not exist.
    """
    # The original only assigned a filename when is_train was true, so any
    # other call crashed with UnboundLocalError.
    # NOTE(review): assumes unlabeled essays live under "<input_path>/test",
    # mirroring the "train" layout — confirm against the dataset.
    split_dir = "train" if is_train else "test"

    # Several discourse rows share one essay; read each file only once.
    essay_cache = {}

    training_samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        idx = row["essay_id"]
        discourse_text = row["discourse_text"]
        discourse_type = row["discourse_type"]

        if idx not in essay_cache:
            filename = os.path.join(input_path, split_dir, idx + ".txt")
            # Explicit encoding keeps decoding independent of the host locale.
            with open(filename, "r", encoding="utf-8") as f:
                essay_cache[idx] = f.read()
        text = essay_cache[idx]

        encoded_text = tokenizer.encode_plus(
            discourse_type + " " + discourse_text,
            text,
            add_special_tokens=True,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )

        sample = {
            "discourse_id": row["discourse_id"],
            "input_ids": encoded_text["input_ids"],
            "mask": encoded_text["attention_mask"],
        }

        if "token_type_ids" in encoded_text:
            sample["token_type_ids"] = encoded_text["token_type_ids"]

        # Training rows must carry a label (fail loudly if they do not);
        # inference rows simply go without one.
        if is_train or "discourse_effectiveness" in row:
            sample["label"] = LABEL_MAPPING[row["discourse_effectiveness"]]

        training_samples.append(sample)
    return training_samples


def prepare_training_data(df, tokenizer, num_jobs, is_train):
    """Tokenize ``df`` in parallel and return the samples in row order.

    The frame is cut into ``num_jobs`` contiguous chunks, each chunk is handed
    to ``_prepare_training_data_helper`` in its own worker process, and the
    per-chunk results are concatenated in chunk order.

    Args:
        df: DataFrame of discourse rows to tokenize.
        tokenizer: Tokenizer forwarded unchanged to the helper.
        num_jobs: Number of chunks and of joblib worker processes.
        is_train: Forwarded unchanged to the helper.

    Returns:
        A flat list of sample dicts, one per row of ``df``.
    """
    # Split row *positions* rather than the DataFrame itself:
    # np.array_split(df, n) goes through DataFrame.swapaxes, which pandas has
    # deprecated. The chunk boundaries are the same as before.
    position_chunks = np.array_split(np.arange(len(df)), num_jobs)
    # Empty chunks (fewer rows than jobs) contribute nothing, so skip them.
    df_splits = [df.iloc[positions] for positions in position_chunks if len(positions)]

    results = Parallel(n_jobs=num_jobs, backend="multiprocessing")(
        delayed(_prepare_training_data_helper)(tokenizer, chunk, is_train)
        for chunk in df_splits
    )

    training_samples = []
    for result in results:
        training_samples.extend(result)

    return training_samples

In [None]:
# Maps the string values of the "discourse_effectiveness" column to the
# integer class ids stored under each sample's "label" key.
LABEL_MAPPING = {"Ineffective": 0, "Adequate": 1, "Effective": 2}

In [None]:
# Tunables for the export loop, named so they are changed in one place.
# NOTE(review): assumes `train.kfold` holds fold ids 0..N_FOLDS-1 — confirm
# against the notebook that produced train_folds.csv.
N_FOLDS = 5
# Worker processes (and DataFrame chunks) used per tokenization call.
NUM_JOBS = 4

for i in range(N_FOLDS):

    print(f'Fold {i}')

    # Fold i is held out for validation; every other row is training data.
    tr = train[ train.kfold != i ].reset_index(drop=True)
    va = train[ train.kfold == i ].reset_index(drop=True)

    # is_train=True for both splits: validation rows come from the same
    # labeled frame, so their essays are read from the same "train" directory.
    train_samples = prepare_training_data(tr, tokenizer, NUM_JOBS, True)
    valid_samples = prepare_training_data(va, tokenizer, NUM_JOBS, True)

    # Recreate the output directory from scratch so files from a previous run
    # cannot survive next to the new ones.
    fold_dir = f'fold_{i}/'
    if os.path.exists(fold_dir):
        rmtree(fold_dir)
    os.mkdir(fold_dir)

    torch.save(train_samples, os.path.join(fold_dir, 'train_samples.pt'))
    torch.save(valid_samples, os.path.join(fold_dir, 'valid_samples.pt'))