This is the companion Inference notebook to this [Training Notebook](https://www.kaggle.com/code/lextoumbourou/feedback-prize-eda-and-model-training).

# Imports

In [None]:
from pathlib import Path
from types import SimpleNamespace
import logging

import torch
from tqdm.notebook import tqdm
from datasets import Dataset
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import numpy as np
import pandas as pd
import sklearn
import glob, pandas as pd, numpy as np, re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.special import softmax

# To work around the aggressive HuggingFace log spam: this suppresses every
# logging call of severity WARNING and below, for all loggers in the process.
logging.disable(logging.WARNING)

# From this Gist: https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    sets ``PYTHONHASHSEED``, and configures cuDNN for deterministic behaviour.

    Args:
        seed: The seed value applied to every generator.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects hash randomisation of interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats determinism, so it has to be switched off here.
    torch.backends.cudnn.benchmark = False

In [None]:
# Prefer the GPU when one is usable. `is_available` must be *called*: the bare
# function object is always truthy, which selected 'cuda' even on CPU-only hosts.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Show which device was selected.
device

# Config

In [None]:
# Notebook-wide settings, collected in one namespace.
config = SimpleNamespace(
    # Cross-validation / reproducibility.
    n_folds=4,
    seed=420,
    # Tokenisation.
    max_len=512,
    # Optimisation hyperparameters.
    lr=1e-5,
    weight_decay=0.01,
    epochs=4,
    batch_size=16,
    warm_up_ratio=0.1,
    hidden_dropout_prob=0.1,
    # Filesystem locations.
    output_path=Path('./'),
    input_path=Path('../input/feedback-prize-effectiveness'),
    model_path=Path('../input/feedback-prize-the-complete-overview'),
)

# Load Data

In [None]:
# Read the test set table (`test.csv`) from the input directory.
test_df = pd.read_csv(config.input_path / 'test.csv')

# Generate Topics

Load the topic model and libraries created in [this](https://www.kaggle.com/code/lextoumbourou/feedback-topics-identification) notebook.

In [None]:
# Per-essay topic assignments: discard the probability column and expose the
# essay identifier under the name `essay_id`.
topic_pred_df = (
    pd.read_csv('../input/feedback-topics-identification/topic_model_feedback.csv')
    .drop(columns=['prob'])
    .rename(columns={'id': 'essay_id'})
)

# Topic metadata: normalise the column names and discard the `Count` column.
topic_meta_df = (
    pd.read_csv('../input/feedback-topics-identification/topic_model_metadata.csv')
    .rename(columns={'Topic': 'topic', 'Name': 'topic_name'})
    .drop(columns=['Count'])
)
# Drop the first '_'-separated token of each name and join the rest with
# spaces (presumably the leading token is the topic number -- TODO confirm).
topic_meta_df['topic_name'] = topic_meta_df['topic_name'].apply(
    lambda raw_name: ' '.join(raw_name.split('_')[1:])
)

# Attach the readable topic name to every per-essay assignment.
topic_pred_df = topic_pred_df.merge(topic_meta_df, how='left', on='topic')

In [None]:
# Make the BERTopic install bundled with the topic-model dataset importable.
import sys
sys.path.append('../input/feedback-topics-identification/site-packages')
from bertopic import BERTopic

topic_model = BERTopic.load("../input/feedback-topics-identification/feedback_2021_topic_model")

sws = stopwords.words("english") + ["n't",  "'s", "'ve"]
# Membership is tested once per token, so look words up in a set rather than
# scanning the list every time.
stopword_set = frozenset(sws)
fls = glob.glob("../input/feedback-prize-effectiveness/test/*.txt")
docs = []
for fl in tqdm(fls):
    with open(fl) as f:
        txt = f.read()
    # Tokenise, then rebuild the essay without stopwords (case-insensitive).
    word_tokens = word_tokenize(txt)
    docs.append(" ".join(w for w in word_tokens if w.lower() not in stopword_set))

# `probs` is not used below; only the topic assigned to each essay is kept.
topics, probs = topic_model.transform(docs)

# The essay id is the file name up to its first '.'; `fls`, `docs` and
# `topics` share the same order, so ids and topics line up positionally.
dids = [fl.split("/")[-1].split(".")[0] for fl in fls]
# The original stored `probs` in a column and dropped it on the next line;
# that step is omitted because it had no effect on the result and would fail
# whenever `probs` does not fit a single column.
pred_topics = pd.DataFrame({"essay_id": dids, "topic": topics})
pred_topics = pred_topics.merge(topic_meta_df, on='topic', how='left')
pred_topics

In [None]:
# Attach the predicted topic columns to each test row by `essay_id`; the left
# join keeps rows whose essay has no match in `pred_topics`.
test_df = test_df.merge(pred_topics, on='essay_id', how='left')

In [None]:
# Preview the merged test table.
test_df.head()

# Prepare Data

In [None]:
# Class names, in the column order used for the prediction array and submission.
labels = ['Adequate', 'Effective', 'Ineffective']

tokenizer = AutoTokenizer.from_pretrained(config.model_path / 'fold_0')
# The attribute is `model_max_length`; the previous `model_max_len` merely set
# an unused attribute, so the length limit was silently not applied.
tokenizer.model_max_length = config.max_len

In [None]:
def get_essay(essay_fns):
    """Return the stripped, lower-cased text of each essay file, in input order.

    Each distinct path is read from disk only once per call; repeated paths
    reuse the cached text.

    Args:
        essay_fns: Iterable of essay file paths (may contain duplicates).

    Returns:
        A list of essay texts, one per entry of ``essay_fns``.
    """
    essay_cache = {}

    output = []
    for essay_fn in essay_fns:
        if essay_fn not in essay_cache:
            # Use a context manager so the file handle is closed promptly
            # instead of being left for the garbage collector.
            with open(essay_fn) as essay_file:
                essay_cache[essay_fn] = essay_file.read().strip().lower()
        output.append(essay_cache[essay_fn])

    return output

def add_inputs(df, basepath):
    """Add the `essay_fn` and `inputs` columns to `df` in place and return it.

    `essay_fn` is the path of the essay text file under `basepath`. `inputs`
    is the lower-cased discourse type, the topic name and the lower-cased
    discourse text, joined by the tokenizer's separator token.
    """
    separator = ' ' + tokenizer.sep_token + ' '

    df['essay_fn'] = basepath + '/' + df['essay_id'] + '.txt'

    discourse_kind = df['discourse_type'].str.lower()
    discourse_body = df['discourse_text'].str.lower()
    df['inputs'] = discourse_kind + separator + df['topic_name'] + separator + discourse_body
    return df

In [None]:
def tokenizer_func(x):
    """Tokenize a batch as (inputs, full essay text) pairs, with truncation."""
    first_segment = x["inputs"]
    second_segment = get_essay(x['essay_fn'])
    return tokenizer(first_segment, second_segment, truncation=True)

In [None]:
# Build the `essay_fn` and `inputs` columns, pointing at the test essay folder.
test_df = add_inputs(test_df, str(config.input_path / 'test'))

In [None]:
# Inspect one fully prepared row.
test_df.head(1)

# Model

In [None]:
import torch
from torch import nn
from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers.models.deberta_v2.modeling_deberta_v2 import ContextPooler
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import DebertaV2ForSequenceClassification

def get_dropouts(num, start_prob, increment):
    """Build `num` StableDropout layers with linearly increasing drop probability.

    Layer `i` uses probability `start_prob + increment * i`.
    """
    layers = []
    for step in range(num):
        layers.append(StableDropout(start_prob + (increment * step)))
    return layers

class MeanPooling(nn.Module):
    """Average token embeddings along dim 1, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of `last_hidden_state` over dim 1."""
        # Broadcast the mask to the shape of the hidden states.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # Clamp the token count so a fully masked row does not divide by zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_sum / token_count

class CustomModel(nn.Module):
    """Sequence classifier: DeBERTa encoder, mean pooling, multi-sample dropout.

    Only the `deberta` encoder of the wrapped backbone is used; its pooler and
    classifier are replaced by `MeanPooling` and a fresh linear layer.

    Args:
        backbone: A model exposing `.deberta` and `.config` (with `hidden_size`
            and `num_labels`), e.g. `DebertaV2ForSequenceClassification`.
    """

    def __init__(self, backbone):
        super(CustomModel, self).__init__()

        self.model = backbone
        self.config = self.model.config
        self.num_labels = self.config.num_labels

        # Mean pooling is used in place of the backbone's ContextPooler.
        self.pooler = MeanPooling()

        self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)

        # NOTE: `config` here is the notebook-level settings namespace, not
        # `self.config` (the backbone configuration).
        # The dropouts must live in an nn.ModuleList: in a plain Python list
        # they are not registered as submodules, so `model.eval()` never
        # reaches them and dropout stays active during inference.
        self.dropouts = nn.ModuleList(
            get_dropouts(num=5, start_prob=config.hidden_dropout_prob - 0.02, increment=0.01)
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None
    ):
        """Encode, pool and classify a batch.

        Args mirror the backbone encoder's forward arguments; `labels` is
        optional and, when given, a cross-entropy loss is computed.

        Returns:
            A `TokenClassifierOutput` carrying `loss` (or None), `logits`,
            `hidden_states` and `attentions`.
        """
        outputs = self.model.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First element of the encoder output is the last hidden state.
        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer, attention_mask)

        # Multi-sample dropout: average the classifier output over several
        # dropout masks (all identity once the model is in eval mode).
        num_dps = float(len(self.dropouts))
        logits = sum(self.classifier(drop(pooled_output)) / num_dps for drop in self.dropouts)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            logits = logits.view(-1, self.num_labels)
            loss = loss_fn(logits, labels.view(-1))

        # NOTE(review): attribute access below assumes the encoder returned a
        # ModelOutput (i.e. return_dict was not explicitly False) -- confirm.
        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

In [None]:
def get_model():
    """Build a CustomModel around a DeBERTa-v2 classifier created from the saved config.

    The backbone is instantiated from its configuration file only; no
    checkpoint weights are loaded here.
    """
    backbone_config = AutoConfig.from_pretrained(config.model_path / 'backbone_config/config.json')
    return CustomModel(DebertaV2ForSequenceClassification(backbone_config))

In [None]:
# Build one model instance.
# NOTE(review): the inference loop calls get_model() again for every fold, so
# this instance looks unused afterwards -- confirm before removing.
model = get_model()

# Inference

In [None]:
# One slice of class probabilities per fold: (n_folds, n_rows, n_labels).
all_test_data = np.zeros((config.n_folds, len(test_df), len(labels)))

for fold_num in range(config.n_folds):
    print(f'Do fold {fold_num}')

    # `tokenizer_func` reads the global `tokenizer`, so rebinding it here makes
    # the tokenisation below use this fold's tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(config.model_path / f'fold_{fold_num}')
    tokenizer.model_max_length = config.max_len

    model = get_model()

    # Load onto the CPU first so the checkpoint also opens on a machine without
    # the GPU it was saved from; the Trainer moves the model to its device.
    state_dict = torch.load(config.model_path / f'fold_{fold_num}/pytorch_model.bin', map_location='cpu')
    model.load_state_dict(state_dict)

    test_dataset = Dataset.from_pandas(test_df[['inputs', 'essay_fn']])
    test_tok_dataset = test_dataset.map(tokenizer_func, batched=True, remove_columns=['inputs', 'essay_fn'])

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

    # The Trainer is only used for `predict`, so no evaluation strategy is
    # configured (setting one without an eval dataset can be rejected).
    args = TrainingArguments(
        output_dir=config.output_path,
        learning_rate=config.lr,
        lr_scheduler_type='cosine',
        # Mixed precision requires a CUDA device; fall back to fp32 otherwise.
        fp16=torch.cuda.is_available(),
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=config.batch_size * 2,
        report_to="none",
        save_strategy='no'
    )

    trainer = Trainer(
        model,
        args,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    outputs = trainer.predict(test_tok_dataset)
    # Convert logits to per-class probabilities before averaging across folds.
    softmax_outputs = softmax(outputs.predictions, axis=1)

    all_test_data[fold_num] = softmax_outputs

# Make Submission

In [None]:
# Average the per-fold probabilities into a single prediction per row.
preds = all_test_data.mean(axis=0)

# Place the ids next to one probability column per label.
id_column = test_df[['discourse_id']]
probability_columns = pd.DataFrame(preds, columns=labels)
output_df = pd.concat([id_column, probability_columns], axis=1)

# Write the submission file, then read it back to display what was saved.
output_df.to_csv('submission.csv', index=False)
pd.read_csv('submission.csv')