References
* https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train
* https://www.kaggle.com/code/debarshichanda/pytorch-feedback-deberta-v3-baseline
* https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-inference

# Directory Settings

In [None]:
import os

# Competition data (test.csv, sample_submission.csv and the essay text files).
INPUT_DIR = '../input/feedback-prize-effectiveness/'
# Working directory for artefacts written by this notebook (e.g. the log file).
OUTPUT_DIR = 'model/'

# Directory holding the trained artefacts: config, tokenizer, OOF frame and
# the per-fold checkpoints.
TRAIN_PATH = '../input/train-custommodel/baseline/'

# exist_ok=True replaces the separate "exists?" check, so there is no window
# between the check and the creation in which the directory can appear.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CFG

In [None]:
class CFG:
    """Static configuration namespace for the inference run."""

    # --- trained artefacts -------------------------------------------------
    path = TRAIN_PATH
    config_path = path + 'config.pth'
    # Model identifier; also used to build the checkpoint file names.
    model = "microsoft/deberta-v3-base"

    # --- data loading ------------------------------------------------------
    num_workers = 4
    batch_size = 32
    max_len = 512

    # --- model head --------------------------------------------------------
    target_size = 3
    fc_dropout = 0.2
    dropout = 0.1

    # --- folds / reproducibility -------------------------------------------
    seed = 42
    n_fold = 2
    trn_fold = [0, 1]

    # Switch for the debug overrides applied right after the class body.
    debug_ver2 = True


# Debug overrides. Note that `epochs` is only defined on CFG when the debug
# switch is on.
if CFG.debug_ver2:
    CFG.trn_fold = [0, 1]
    CFG.epochs = 1

# Library

In [None]:
import os
import gc
import time
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoModel, AutoConfig, AutoTokenizer, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Utils

In [None]:
def criterion(outputs, labels):
    """Return the cross-entropy loss of `outputs` against `labels`.

    Uses the default settings of the cross-entropy loss (mean reduction,
    no class weights).
    """
    loss = F.cross_entropy(outputs, labels)
    return loss
"""
def get_score(outputs, labels):
    return log_loss(labels, outputs)
"""
def get_score(outputs, labels):
    """Return the log loss of model outputs against the true labels.

    Args:
        outputs: Per-sample class scores (array-like accepted by
            ``torch.tensor``). They are passed through a softmax here, so
            they are treated as unnormalised scores rather than
            probabilities.
        labels: True class labels, forwarded unchanged to ``log_loss``.

    Returns:
        The value returned by ``sklearn.metrics.log_loss``.
    """
    # Pass `dim` explicitly: calling softmax without it is deprecated and
    # emits a warning. The last axis is the class axis for the 1-D and 2-D
    # inputs the implicit form handled the same way.
    probabilities = F.softmax(torch.tensor(outputs), dim=-1).numpy()
    return log_loss(labels, probabilities)


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, writing to the console and to a file.

    Args:
        filename: Path of the log file without extension; ``.log`` is
            appended.

    Returns:
        The ``logging.Logger`` named after this module, set to INFO level,
        with exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so re-running this
    # cell would otherwise stack another pair of handlers and emit every
    # message multiple times. Drop (and close) the previous handlers first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Every seeding function takes the same integer seed.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

# OOF

In [None]:
# Recompute the cross-validation score from the out-of-fold predictions that
# are stored next to the trained artefacts.
oof_df = pd.read_pickle(TRAIN_PATH+'oof_df.pkl')
pred_columns = ['pred_0', 'pred_1', 'pred_2']
preds = oof_df[pred_columns].values.tolist()
labels = oof_df['label'].values
score = get_score(preds, labels)
LOGGER.info(f'CV Score: {score:<.4f}')

# DataLoading 

In [None]:
def get_essay(essay_id, DIR):
    """Return the full text of the essay stored as ``<DIR>/<essay_id>.txt``.

    Args:
        essay_id: Identifier of the essay; used as the file stem.
        DIR: Directory containing the essay text files.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    essay_path = os.path.join(DIR, f"{essay_id}.txt")
    # The context manager closes the handle deterministically; a bare
    # open(...).read() leaves it open until garbage collection.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

In [None]:
test = pd.read_csv(INPUT_DIR+'test.csv')
submission = pd.read_csv(INPUT_DIR+'sample_submission.csv')

# Derive the essay directory from INPUT_DIR instead of repeating the literal
# path, so changing INPUT_DIR cannot leave the essays pointing elsewhere.
essay_dir = os.path.join(INPUT_DIR, 'test')
# Several rows can share one essay_id: read each essay file once and map the
# text back onto the rows instead of re-reading the file per row.
essay_texts = {
    essay_id: get_essay(essay_id, DIR=essay_dir)
    for essay_id in test['essay_id'].unique()
}
test['essay_text'] = test['essay_id'].map(essay_texts)

display(test.head())
print(test.shape)
display(submission.head())
print(submission.shape)

# tokenizer

In [None]:
# Load the tokenizer saved with the trained artefacts and expose it both as a
# module-level name and on the config namespace.
tokenizer = CFG.tokenizer = AutoTokenizer.from_pretrained(TRAIN_PATH+'tokenizer/')

# Dataset

In [None]:
# Model input: the discourse text, a literal '[SEP]' marker, then the essay.
discourse_with_sep = test['discourse_text'] + '[SEP]'
test['text'] = discourse_with_sep + test['essay_text']

class FeedBackTestDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a frame on access.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_length: Maximum sequence length; longer inputs are truncated.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        # Use the constructor arguments rather than module-level globals so
        # the dataset behaves as its signature promises.
        self.max_len = max_length
        self.text = df['text'].values
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` for row `index`.

        The sequences are left unpadded; padding is expected to happen in
        the DataLoader's collate function.
        """
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
        )
        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }

In [None]:
# Collate function handed to the test DataLoader; built from transformers'
# DataCollatorWithPadding with the tokenizer stored on CFG.
collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)

# Model

In [None]:
class MeanPooling(nn.Module):
    """Average token embeddings over positions where the mask is non-zero."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of `last_hidden_state` over dim 1.

        The mask gets a trailing axis and is expanded to the shape of
        `last_hidden_state`, so masked positions contribute nothing to the
        sum. The divisor is clamped to at least 1e-9 to avoid division by
        zero when a row is fully masked.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return masked_sum / token_count

In [None]:
class FeedBackModel(nn.Module):
    """Transformer backbone with mean pooling and a linear classification head.

    The backbone is built from a saved config only (``AutoModel.from_config``),
    so its weights are uninitialised until a state dict is loaded.

    Args:
        cfg: Configuration object providing ``config_path`` and
            ``target_size``; ``fc_dropout`` is used for the head dropout when
            present (0.2 otherwise).
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Read settings from the `cfg` argument rather than the global CFG so
        # the model honours the configuration it was constructed with.
        # NOTE(security): torch.load unpickles the file; only load config
        # files from a trusted source.
        self.config = torch.load(cfg.config_path)
        self.model = AutoModel.from_config(self.config)
        self.drop = nn.Dropout(p=getattr(cfg, 'fc_dropout', 0.2))
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def forward(self, ids, mask):
        """Return the head's outputs for a batch of token ids and masks.

        Args:
            ids: Token ids, passed to the backbone as ``input_ids``.
            mask: Attention mask, used by the backbone and by the pooling.

        Returns:
            Output of the final linear layer (``target_size`` values per
            sample); no softmax is applied here.
        """
        out = self.model(input_ids=ids,
                         attention_mask=mask,
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        out = self.drop(out)
        outputs = self.fc(out)
        return outputs

# Inference

In [None]:
def inference_one_epoch(model, dataloader, device):
    """Run `model` over every batch of `dataloader` and collect its outputs.

    The model is put in eval mode and moved to `device`; gradients are
    disabled for the forward passes.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of batches with ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device on which the forward passes run.

    Returns:
        NumPy array of all batch outputs concatenated along the first axis.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            logits = model(input_ids, attention_mask)
            batch_outputs.append(logits.cpu().numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Test dataset and loader; order is preserved (no shuffling, no dropped rows)
# so predictions line up with the submission rows.
testdataset = FeedBackTestDataset(test, CFG.tokenizer, CFG.max_len)

test_loader = DataLoader(testdataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         collate_fn=collate_fn,
                         num_workers=CFG.num_workers,
                         pin_memory=True,
                         drop_last=False,
                         )

predictions = []

# One forward pass over the test set per fold checkpoint.
for fold in CFG.trn_fold:
    model = FeedBackModel(CFG)
    # Checkpoints are loaded onto the CPU first; inference_one_epoch moves
    # the model to the target device.
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    predictions.append(prediction)
    # Free the fold's model before the next one is built.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Average the raw model outputs over folds, then convert to probabilities.
predictions = np.mean(predictions, axis=0)
# logits -> probabilities. `dim` is explicit because the implicit form is
# deprecated and warns. Converting back to NumPy keeps plain floats (not
# tensor objects) in the submission columns.
predictions = F.softmax(torch.tensor(predictions), dim=1).numpy()
submission["Ineffective"] = predictions[:, 0]
submission["Adequate"] = predictions[:, 1]
submission["Effective"] = predictions[:, 2]

In [None]:
# Write the submission file without the index column, then show its first
# rows as the cell output.
submission.to_csv('submission.csv', index=False)
submission.head()