# Set Seed

In [None]:
import os
import random
import numpy as np
import torch

def seed_everything(seed):
    """Seed every random number source used in this notebook.

    Exports the seed through the ``PL_GLOBAL_SEED`` environment variable and
    then passes it to ``random.seed``, ``np.random.seed``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.

    Args:
        seed: Integer seed handed unchanged to each seeding function.
    """
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    
# Seed all RNGs once, before any data is loaded or any model is created.
seed_everything(512)

# Dataset preprocess

In [None]:
import pandas as pd

# Read the training and test tables from ../input/feedback-prize-effectiveness.
# `test` is read again in the "Making predictions" section before it is used.
train = pd.read_csv("../input/feedback-prize-effectiveness/train.csv")
test = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")

In [None]:
from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer
import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
# Suppress all Python warnings and every log record at WARNING level or below
# to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below; re-enable them when debugging or upgrading dependencies.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

Following common practice, we build the model input as `X = discourse_type + sep_token + discourse_text`, so that the `discourse_type` information is encoded directly in the text the model sees.

In [None]:
# Location of the pretrained checkpoint; used for the tokenizer here and for
# the classification models later on.
model_nm = '../input/debertav3base'

tokz = AutoTokenizer.from_pretrained(model_nm)
sep = tokz.sep_token

# `df` is an alias of `train`, not a copy, so the new `inputs` column is
# added to `train` as well.
df = train
df['inputs'] = df['discourse_type'] + ' ' + sep + ' ' + df['discourse_text']

# Column-scoped mapping from the class names to integer ids.
new_label = {
    "discourse_effectiveness": {"Ineffective": 0, "Adequate": 1, "Effective": 2}
}
df = df.replace(new_label).rename(columns={'discourse_effectiveness': 'label'})
ds = Dataset.from_pandas(df)

In [None]:
from sklearn.metrics import log_loss
import torch.nn.functional as F

def tok_func(x):
    """Tokenize the ``inputs`` field of a batch with the module-level tokenizer.

    Args:
        x: Mapping that provides the text to tokenize under the key ``'inputs'``.

    Returns:
        Whatever ``tokz`` returns when called with truncation enabled,
        ``max_length=256`` and ``padding='max_length'``.
    """
    tokenizer_kwargs = dict(
        truncation=True,
        max_length=256,
        padding='max_length',
    )
    return tokz(x['inputs'], **tokenizer_kwargs)

def score(preds):
    """Compute the multi-class log loss of a set of predictions.

    Args:
        preds: Object exposing ``predictions`` (raw logits; assumed to be a
            2-D ``(n_samples, n_classes)`` array -- TODO confirm against the
            caller) and ``label_ids`` (the true class ids).

    Returns:
        A dict with the single key ``'log loss'``.
    """
    logits = torch.as_tensor(preds.predictions, dtype=torch.float32)
    # Normalise over the class axis explicitly; calling softmax without `dim`
    # relies on a deprecated implicit-dimension rule.
    probs = F.softmax(logits, dim=-1).numpy()
    # Pass the full label set so the metric is still defined when a validation
    # split happens not to contain every class.
    all_labels = list(range(probs.shape[1]))
    return {'log loss': log_loss(preds.label_ids, probs, labels=all_labels)}

Tokenize the inputs

In [None]:
# Tokenize in batches and drop the raw text and id columns from the result.
tok_ds = ds.map(
    tok_func,
    batched=True,
    remove_columns=(
        'discourse_text',
        'discourse_type',
        'inputs',
        'discourse_id',
        'essay_id',
    ),
)

# Group K-fold

We start with 5 folds, using `essay_id` as the group key. This way, no `essay_id` appears in both the train and validation sets of any fold.

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

# Split on `label` for stratification and on `essay_id` for grouping.
kfold = StratifiedGroupKFold(n_splits=5)

# One DatasetDict per fold: 'train' holds the training rows and 'test' the
# held-out validation rows of that fold.
dds_folds = [
    DatasetDict({
        'train': tok_ds.select(trn_idxs),
        'test': tok_ds.select(val_idxs),
    })
    for trn_idxs, val_idxs in kfold.split(df.index, df.label, df.essay_id)
]

# Training the folds

In [None]:
# Learning rate and weight decay passed to TrainingArguments.
lr = 8e-5
wd = 0.01
# Per-device batch size (used for both training and evaluation) and epoch count.
bs = 24
epochs = 2

In [None]:
# Train one model per fold. Iterating over `dds_folds` itself, instead of a
# hard-coded `range(5)`, keeps this loop in sync with the number of folds that
# were actually built.
for fold, fold_ds in enumerate(dds_folds):
    print(f'Training fold {fold}')
    # `args` stays defined after the loop; the prediction cell reuses it.
    args = TrainingArguments(
        output_dir=f'outputs/fold_{fold}',
        learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine',
        num_train_epochs=epochs, weight_decay=wd, report_to='none',
        evaluation_strategy="epoch", save_strategy="no",
        label_smoothing_factor=0.05,
        per_device_train_batch_size=bs, per_device_eval_batch_size=bs,
    )
    # Reload the pretrained checkpoint for every fold so no weights carry over.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_nm, num_labels=df.label.nunique()
    )
    trainer = Trainer(
        model=model, args=args,
        train_dataset=fold_ds['train'], eval_dataset=fold_ds['test'],
        compute_metrics=score,
    )

    trainer.train()
    # save_strategy is "no", so the final weights are written explicitly here.
    model.save_pretrained(f'outputs/fold_{fold}')

# Making predictions

In [None]:
# Build the test inputs the same way as the training inputs:
# "<discourse_type> <sep> <discourse_text>".
test = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
test['inputs'] = test['discourse_type'] + ' ' + sep + ' ' + test['discourse_text']

# Same label mapping as for the training frame.
# NOTE(review): test.csv presumably has no `discourse_effectiveness` column, in
# which case the replace/rename below change nothing -- confirm.
new_label = {
    "discourse_effectiveness": {"Ineffective": 0, "Adequate": 1, "Effective": 2}
}
test = test.replace(new_label).rename(columns={'discourse_effectiveness': 'label'})

# From here on `test` is a tokenized Dataset, no longer a DataFrame.
test = Dataset.from_pandas(test).map(
    tok_func,
    batched=True,
    remove_columns=('discourse_text','discourse_type', 'inputs','discourse_id','essay_id'),
)

In [None]:
# Collect one array of class probabilities per fold model.
preds_folds = []
# Use the number of folds that were built rather than a hard-coded 5.
for fold in range(len(dds_folds)):

    ckpt_path = f'./outputs/fold_{fold}/'
    model = AutoModelForSequenceClassification.from_pretrained(ckpt_path, num_labels=df.label.nunique())
    # NOTE: `args` is the TrainingArguments object left over from the last
    # iteration of the training loop; this cell depends on that cell having run.
    trainer = Trainer(model=model, args=args,
                      train_dataset=dds_folds[fold]['train'],
                      eval_dataset=dds_folds[fold]['test'],
                      compute_metrics=score)
    logits = torch.Tensor(trainer.predict(test).predictions)
    # Softmax over the class axis, stated explicitly instead of relying on the
    # deprecated implicit-dimension behaviour.
    preds_fold = F.softmax(logits, dim=-1).numpy().astype(float)

    preds_folds.append(preds_fold)

In [None]:
# Ensemble by averaging the per-fold probabilities element-wise across folds.
avg_preds = np.mean(preds_folds, axis=0)

In [None]:
submission_df = pd.read_csv('../input/feedback-prize-effectiveness/sample_submission.csv')
# Column order follows the label ids used in training:
# 0 = Ineffective, 1 = Adequate, 2 = Effective.
for class_idx, class_name in enumerate(('Ineffective', 'Adequate', 'Effective')):
    submission_df[class_name] = avg_preds[:, class_idx]
submission_df

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission.csv',index=False)