## importing packages

In [None]:
import numpy as np
import pandas as pd
import os

import datasets
from datasets import load_dataset, Dataset, DatasetDict
from torch.utils.data import DataLoader
import warnings, transformers, logging, torch

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import torch.nn.functional as F

# Keep the notebook output quiet: ignore every Python warning and mute all
# log records at WARNING severity and below.
warnings.simplefilter(action='ignore')
logging.disable(level=logging.WARNING)

**Check out the previous kernel for the exploratory data analysis (EDA): https://www.kaggle.com/code/himanshutripathi/in-depth-eda-interactive-visualization**

## Reading the DataSet

In [None]:
# Load the training CSV into a DataFrame and preview its first three rows.
df = pd.read_csv("../input/feedback-prize-effectiveness/train.csv")
df.head(3)

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Load the test CSV that predictions are generated for later on.
test_df = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
test_df.head()

In [None]:
# Load the sample submission file; it is reused as the template for the
# final submission.
sample_df = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")
sample_df.head()

## Model building 

In [None]:
# Local directory handed to ``from_pretrained`` for both the tokenizer and the
# classification model (presumably a DeBERTa-v3-small checkpoint, judging by
# the directory name -- confirm against the attached dataset).
modelPath = "../input/debertav3small"

In [None]:
# Tokenizer matching the checkpoint stored at ``modelPath``.
tokenizer = AutoTokenizer.from_pretrained(modelPath)

In [None]:
# List the available training columns.
df.columns

In [None]:
# Model input text: the discourse type and the discourse text, joined by the
# tokenizer's separator token.
df['inputs'] = df["discourse_type"] + tokenizer.sep_token + df["discourse_text"]

In [None]:
# Turn the textual effectiveness rating into an integer class id, then expose
# the column under the name ``label``.
label_ids = {"Ineffective": 0, "Adequate": 1, "Effective": 2}
df['discourse_effectiveness'] = df['discourse_effectiveness'].map(label_ids)
df = df.rename(columns={"discourse_effectiveness": "label"})


In [None]:
# Preview the last rows to check the new ``inputs`` and ``label`` columns.
df.tail()

In [None]:
# Wrap the DataFrame in a ``datasets.Dataset`` so it can be tokenized with
# ``map``.
data = Dataset.from_pandas(df)

In [None]:
def tokenizer_function(x):
    """Tokenize the ``inputs`` field of ``x`` with truncation enabled.

    Uses the module-level ``tokenizer`` and returns whatever it produces.
    """
    texts = x['inputs']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize the whole dataset in batches and drop the raw text / id columns
# listed below from the result.
col_to_remove = ("discourse_text", "discourse_type")
dropped_columns = col_to_remove + ('inputs', 'discourse_id', 'essay_id')
tokenize_dataset = data.map(
    tokenizer_function,
    batched=True,
    remove_columns=dropped_columns,
)

In [None]:
# Inspect which fields a tokenized example carries.
tokenize_dataset[0].keys()

In [None]:
# Split by essay rather than by row: every row of an essay lands on the same
# side of the train/validation split.
val_prop = 0.2

# Shuffle the distinct essay ids reproducibly.
essay_ids = df.essay_id.unique()
np.random.seed(42)
np.random.shuffle(essay_ids)

# The first ``val_prop`` share of the shuffled essays is held out.
val_sz = int(len(essay_ids) * val_prop)
val_essay_ids = essay_ids[:val_sz]

# Row positions belonging to each side of the split.
is_val = np.isin(df.essay_id, val_essay_ids)
idxs = np.arange(len(df))
val_idxs, trn_idxs = idxs[is_val], idxs[~is_val]
len(val_idxs), len(trn_idxs)

In [None]:
# Materialize both sides of the split; the held-out essays are stored under
# the "test" key and serve as the evaluation set during training.
train_split = tokenize_dataset.select(trn_idxs)
valid_split = tokenize_dataset.select(val_idxs)
dataDict = DatasetDict({"train": train_split, "test": valid_split})

In [None]:
def score(preds):
    """Compute the multi-class log loss for an evaluation pass.

    Passed to ``Trainer`` as ``compute_metrics``.

    Args:
        preds: Object exposing ``predictions`` (raw logits, one row per
            example and one column per class) and ``label_ids`` (integer
            class ids).

    Returns:
        A dict with the single key ``'log loss'``.
    """
    logits = torch.as_tensor(preds.predictions, dtype=torch.float32)
    # Name the class axis explicitly instead of relying on softmax's
    # deprecated implicit choice of dimension.
    probs = F.softmax(logits, dim=-1).numpy()
    # Declare the full label set so the metric is still defined when the
    # evaluated split happens to miss one of the classes.
    class_ids = list(range(probs.shape[-1]))
    return {'log loss': log_loss(preds.label_ids, probs, labels=class_ids)}

In [None]:
# Training hyperparameters; ``getTrainer`` reads these module-level names.
learning_rate = 8e-5
batch_size = 8  # per-device training batch size; evaluation uses twice this
weight_decay = 0.01
epochs = 1

In [None]:
def getTrainer(dataDict):
    """Build a ``Trainer`` for a freshly loaded 3-class sequence classifier.

    Reads the module-level names ``learning_rate``, ``batch_size``,
    ``epochs``, ``weight_decay``, ``modelPath``, ``tokenizer`` and ``score``.

    Args:
        dataDict: Mapping with a ``'train'`` dataset used for fitting and a
            ``'test'`` dataset used for the per-epoch evaluation.

    Returns:
        The configured ``Trainer``; training is not started here.
    """
    args = TrainingArguments(
        'outputs',
        learning_rate=learning_rate,
        warmup_ratio=0.1,
        lr_scheduler_type='cosine',
        # Mixed precision needs a CUDA device; a hard-coded ``fp16=True`` is
        # rejected on CPU-only machines, so enable it only when one exists.
        fp16=torch.cuda.is_available(),
        # NOTE(review): newer transformers releases rename this argument to
        # ``eval_strategy`` -- confirm against the installed version.
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        report_to='none',
    )

    # A new model is loaded on every call, so each trainer starts from the
    # pretrained weights stored at ``modelPath``.
    model = AutoModelForSequenceClassification.from_pretrained(modelPath, num_labels=3)

    return Trainer(
        model,
        args,
        train_dataset=dataDict['train'],
        eval_dataset=dataDict['test'],
        tokenizer=tokenizer,
        compute_metrics=score,
    )
    

In [None]:
# (torch is already imported in the first cell; repeated here by the cell.)
import torch
# Ask PyTorch to release cached GPU memory before the model is created.
torch.cuda.empty_cache()

# Build the trainer on the train/validation split and run fine-tuning.
trainer = getTrainer(dataDict)
trainer.train()

## prediction 

In [None]:
# Build the test inputs exactly like the training inputs: discourse type,
# separator token, discourse text.
test_df['inputs'] = test_df["discourse_type"] + tokenizer.sep_token + test_df["discourse_text"]

In [None]:
def get_dataDict(df, train=True):
    """Tokenize ``df`` and return it in the form the trainer expects.

    Args:
        df: DataFrame holding the raw text / id columns removed below.
        train: When truthy, split the tokenized rows with the module-level
            ``trn_idxs`` / ``val_idxs``; otherwise return them unsplit.

    Returns:
        A ``DatasetDict`` with ``"train"`` and ``"test"`` entries when
        ``train`` is truthy, else the tokenized ``Dataset`` itself.
    """
    raw_columns = ['discourse_text', 'discourse_type', 'inputs', 'discourse_id', 'essay_id']
    tokenized = Dataset.from_pandas(df).map(
        tokenizer_function,
        batched=True,
        remove_columns=raw_columns,
    )
    if not train:
        return tokenized
    return DatasetDict({
        "train": tokenized.select(trn_idxs),
        "test": tokenized.select(val_idxs),
    })

In [None]:
# Tokenize the test frame without applying the train/validation split.
test_dataset = get_dataDict(test_df, train=False)

In [None]:
# Raw model outputs (logits) for the test set.
test_logits = torch.Tensor(trainer.predict(test_dataset).predictions)
# Convert logits to class probabilities; the class axis is named explicitly
# instead of relying on softmax's deprecated implicit choice of dimension.
preds = F.softmax(test_logits, dim=-1).numpy().astype(float)
preds

In [None]:
# Re-check the submission template before filling it in.
sample_df.head()

In [None]:
# Work on a copy so that filling in predictions does not silently modify the
# template held in ``sample_df`` through an alias.
submission_df = sample_df.copy()
# Probability column i belongs to the class encoded as i during training
# (Ineffective=0, Adequate=1, Effective=2).
# NOTE(review): assumes the template rows are in the same order as the rows
# of ``test_df`` -- confirm before submitting.
for class_idx, class_name in enumerate(('Ineffective', 'Adequate', 'Effective')):
    submission_df[class_name] = preds[:, class_idx]

In [None]:
# Display the filled-in submission table.
submission_df

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission.csv',index=False)