In [None]:
# Train-Val split for Model Training & Decision on the Model Infrastructure
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation, stratified on "outcome".
# A fixed random_state makes the split (and therefore every downstream metric)
# reproducible across kernel restarts.
train_transf, validation = train_test_split(
    train, stratify=train[["outcome"]], test_size=0.3, random_state=42
)
# Independent copy: later cells add columns to `validation`, which would
# otherwise raise SettingWithCopyWarning on a frame derived from `train`.
validation = validation.copy()

# Over-/undersampling was applied here during experimentation. Neither is used
# in the final approach, so the training data is passed through unchanged.
# `.copy()` decouples the single-column frames from `train_transf` so they can
# be extended safely afterwards.
train_resampled_body = train_transf[["body_transform"]].copy()
train_resampled_outcome = train_transf[["outcome"]].copy()

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Expose the model input as "text" and the target as "label".
# `assign` returns a new frame instead of writing into a slice of the split
# result, which avoids pandas' SettingWithCopyWarning and keeps this cell
# idempotent when it is re-run. The names are rebound so that both frames still
# carry the added "text"/"label" columns afterwards.
train_resampled_body = train_resampled_body.assign(
    text=train_resampled_body["body_transform"],
    label=train_resampled_outcome["outcome"],
)

# Keep only the two columns that are turned into the training Dataset.
train_transformer = train_resampled_body[["label", "text"]]

train_data = Dataset.from_pandas(train_transformer)

# Same preparation for the validation split.
validation = validation.assign(
    text=validation["body_transform"],
    label=validation["outcome"],
)

validation_transformer = validation[["label", "text"]]

validation_data = Dataset.from_pandas(validation_transformer)


# Tokenizer belonging to the Longformer checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    "LennartKeller/longformer-gottbert-base-8192-aw512"
)

# Extra special tokens that were added to represent the data structure.
special_tokens_dict = dict(additional_special_tokens=["[CNSLR]", "[USER]"])

# Register them with the tokenizer.
tokenizer.add_special_tokens(special_tokens_dict)

# Tokenization callback for Dataset.map. The max length is the maximum input
# length the Longformer model can use.
def tokenize(dataset):
    """Run the module-level ``tokenizer`` on the ``"text"`` field of ``dataset``.

    The text is truncated and padded to a fixed length of 8192 tokens.

    Args:
        dataset: Mapping with a ``"text"`` entry, as passed in by ``Dataset.map``.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 8192,
    }
    return tokenizer(dataset["text"], **encode_options)

# Tokenize the text of both splits.
# batched=True hands `tokenize` a batch of rows at a time, so the tokenizer is
# called once per batch instead of once per row; the resulting columns are the
# same as with row-wise mapping.
train_data_tokenized = train_data.map(tokenize, batched=True)
validation_data_tokenized = validation_data.map(tokenize, batched=True)


In [None]:
#ROC AUC Score as evaluation metric

from sklearn.metrics import roc_auc_score
import tensorflow as tf

def compute_metrics(pred):
    """Compute the ROC AUC of the positive class (column 1) for an evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (raw logits with one column per class, or a tuple whose first
            element holds the logits).

    Returns:
        dict: ``{'roc': <ROC AUC as float>}``.
    """
    # Local import keeps this cell self-contained. NumPy replaces the previous
    # TensorFlow softmax so that no second deep-learning framework is
    # initialised (and competes for GPU memory) during PyTorch training.
    import numpy as np

    logits = pred.predictions
    # Some models return extra outputs next to the logits; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)

    # Numerically stable softmax: subtracting the row maximum avoids overflow
    # in exp() for large logits without changing the result.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exponentials = np.exp(shifted)
    probabilities = exponentials / exponentials.sum(axis=-1, keepdims=True)

    roc = roc_auc_score(pred.label_ids, probabilities[:, 1])
    return {'roc': float(roc)}

In [None]:
# Class weighting was selected as the technique to handle class imbalance.

from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch.nn as nn
import torch
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    Args:
        *args: Forwarded unchanged to ``Trainer``.
        class_weights: One weight per class id, in class-id order. Defaults to
            ``(9.0, 1.0)``, the weighting this notebook has always used.
        **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=(9.0, 1.0), **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = tuple(float(weight) for weight in class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: If true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted because newer ``transformers`` releases
                pass it to ``compute_loss``; not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the logits' device/dtype. `model.device` is not
        # available when the model is wrapped (e.g. in nn.DataParallel), the
        # logits tensor always knows where it lives.
        weight = torch.tensor(
            self.class_weights, device=logits.device, dtype=logits.dtype
        )
        # compute custom loss
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels), labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
#Model training and evaluation on val set.

from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification

# Binary sequence classifier on top of the same checkpoint the tokenizer was
# loaded from.
model = AutoModelForSequenceClassification.from_pretrained(
    "LennartKeller/longformer-gottbert-base-8192-aw512", num_labels=2
)
# Two special tokens were added to the tokenizer, so the embedding matrix has
# to be resized to the new vocabulary size.
model.resize_token_embeddings(len(tokenizer))


training_args = TrainingArguments(
    output_dir="./results",
    report_to=[],
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
)


# Use CustomTrainer (defined in the class-weighting cell) rather than the stock
# Trainer: only CustomTrainer applies the class-weighted loss. With the plain
# Trainer the weighting would silently never be used.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=validation_data_tokenized,
    compute_metrics=compute_metrics,
)


trainer.train()