# Feedback Effectiveness Prize PyTorch TPU starter - DeBERTa-v3-large (inference)

This is the inference notebook for the [training notebook](https://www.kaggle.com/code/tanlikesmath/pytorch-tpu-starter-deberta-v3-large-training), which produces the model weights loaded here. Please check it out!

Much of this code is taken from [this notebook](https://www.kaggle.com/code/tanlikesmath/feedback-prize-effectiveness-eda-deberta-baseline/notebook).

In [None]:
# Install the `datasets` package from the local ../input/huggingface-datasets directory
# (--no-index: no package-index lookup; -q: quiet output).
!pip install --no-index --find-links ../input/huggingface-datasets datasets -q

In [None]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import AdamW, get_cosine_schedule_with_warmup
import datasets
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import log_loss
from pathlib import Path
import torch.nn.functional as F
import os
import gc


In [None]:
# Keep notebook output clean: drop log records at WARNING level and below,
# and ignore all Python warnings.
logging.disable(level=logging.WARNING)
warnings.simplefilter(action='ignore')

In [None]:
# Directory holding the competition CSV files read below
# (train.csv, test.csv, sample_submission.csv).
path = Path('../input/feedback-prize-effectiveness')

In [None]:
# Load the training table and preview its first rows.
train_df = pd.read_csv(path.joinpath('train.csv'))
train_df.head(n=5)

In [None]:
# Load the test table (the rows to predict) and preview its first rows.
test_df = pd.read_csv(path.joinpath('test.csv'))
test_df.head(n=5)

In [None]:
# Load the sample submission to inspect the expected output layout.
sample_df = pd.read_csv(path.joinpath('sample_submission.csv'))
sample_df.head(n=5)

In [None]:
# Local directory of the pretrained DeBERTa-v3-large files; used below for both
# the tokenizer and the sequence-classification model.
model_nm = '../input/deberta-v3-large/deberta-v3-large'

In [None]:
# Tokenizer loaded from the same directory as the model, with the maximum
# sequence length set to 512.
tokz = AutoTokenizer.from_pretrained(
    model_nm,
    model_max_length=512,
)

In [None]:
# Model input text for every row: discourse type, the tokenizer's separator
# token, then the discourse text.
sep = tokz.sep_token
train_df['inputs'] = train_df['discourse_type'] + sep + train_df['discourse_text']
test_df['inputs'] = test_df['discourse_type'] + sep + test_df['discourse_text']

In [None]:
def tok_func(x):
    """Tokenize the ``inputs`` field of ``x``.

    Sequences are truncated and padded to the tokenizer's maximum length.
    Used as the mapping function for ``Dataset.map`` in ``get_dds``.
    """
    texts = x["inputs"]
    return tokz(texts, truncation=True, padding='max_length')

In [None]:
# Unique essay ids, shuffled in place with a fixed seed so the resulting
# order (and the split derived from it below) is reproducible.
essay_ids = train_df.essay_id.unique()
np.random.seed(42)
np.random.shuffle(essay_ids)
# Preview the first few shuffled ids.
essay_ids[:5]

In [None]:
# Hold out the first 20% of the shuffled essay ids as the validation essays.
val_prop = 0.2
val_sz = int(val_prop * len(essay_ids))
val_essay_ids = essay_ids[:val_sz]

In [None]:
# Boolean mask over train_df rows: True where the row's essay is a validation essay.
is_val = np.isin(train_df['essay_id'], val_essay_ids)
# Positional row indices, partitioned by the mask.
idxs = np.arange(len(train_df))
val_idxs, trn_idxs = idxs[is_val], idxs[~is_val]
len(val_idxs), len(trn_idxs)

In [None]:
def get_dds(df, train=True):
    """Tokenize ``df`` into a Hugging Face dataset.

    Args:
        df: DataFrame containing the ``inputs`` column plus the raw text/id
            columns listed in ``drop_cols``.
        train: When true, return a ``DatasetDict`` with ``"train"`` and
            ``"test"`` entries selected by the module-level ``trn_idxs`` and
            ``val_idxs``; otherwise return the whole tokenized dataset.

    Returns:
        A ``DatasetDict`` (``train=True``) or a single tokenized ``Dataset``.
    """
    # Raw text and id columns are dropped once tokenized.
    drop_cols = ['discourse_text','discourse_type','inputs','discourse_id','essay_id']
    tokenized = Dataset.from_pandas(df).map(tok_func, batched=True, remove_columns=drop_cols)
    if not train:
        return tokenized
    return DatasetDict({
        "train": tokenized.select(trn_idxs),
        "test": tokenized.select(val_idxs),
    })

In [None]:
# Tokenized train/validation split of the training table.
train_ds = get_dds(train_df, train=True)

In [None]:
# Tokenized test table, kept as a single dataset (no split).
test_ds = get_dds(test_df, train=False)

In [None]:
def get_trainer(dds):
    """Build a ``Trainer`` around the fine-tuned classifier.

    The base model is instantiated from ``model_nm`` with three labels, then
    its weights are replaced by the checkpoint saved by the training notebook.

    Args:
        dds: Mapping with ``'train'`` and ``'test'`` datasets, as returned by
            ``get_dds(..., train=True)``.

    Returns:
        A ``transformers.Trainer`` holding the loaded model, ``dds['train']``
        as the train dataset and ``dds['test']`` as the eval dataset.
    """
    # fp16 mixed precision needs a CUDA device; enable it only when one is
    # present so the notebook also runs in a CPU-only session.
    use_fp16 = torch.cuda.is_available()
    args = TrainingArguments('outputs', learning_rate=8e-5, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=use_fp16,
        evaluation_strategy="epoch", per_device_train_batch_size=2, per_device_eval_batch_size=2, gradient_accumulation_steps=8,
        num_train_epochs=1, weight_decay=0.01, report_to='none', save_total_limit=1)
    model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=3)
    # Deserialize onto the CPU regardless of the device the checkpoint was
    # saved from; load_state_dict copies the values into the model, and the
    # Trainer moves the model to its own device afterwards.
    state_dict = torch.load(
        '../input/pytorch-tpu-starter-deberta-v3-large-training/xla_trained_model_epoch_0.pth',
        map_location='cpu',
    )
    model.load_state_dict(state_dict)
    return Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                   tokenizer=tokz)

In [None]:
# Trainer wrapping the fine-tuned model; used below only for prediction.
trainer = get_trainer(dds=train_ds)

In [None]:
# Raw model outputs for every test row.
test_logits = trainer.predict(test_ds).predictions
# Convert logits to per-class probabilities. The softmax axis is given
# explicitly (last axis = classes); omitting `dim` is deprecated and relies on
# an implicit choice.
preds = F.softmax(torch.Tensor(test_logits), dim=-1).numpy().astype(float)
preds

In [None]:
# Start from the sample submission so the output keeps its column layout.
submission_df = pd.read_csv(path/'sample_submission.csv')
# preds rows follow test_df order (test_ds is built from test_df), so take the
# ids from test_df instead of relying on sample_submission.csv listing them in
# the same order. Assumes the sample file has a `discourse_id` column.
submission_df['discourse_id'] = test_df['discourse_id'].to_numpy()
# NOTE(review): assumes the training notebook encoded the classes as
# 0=Ineffective, 1=Adequate, 2=Effective; confirm against the training code.
submission_df['Ineffective'] = preds[:,0]
submission_df['Adequate'] = preds[:,1]
submission_df['Effective'] = preds[:,2]
submission_df

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)