# 1 Setup

### 1.1 Imports

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

### 1.2 Constants

In [None]:
# Seed value for anything that needs a fixed random state.
RANDOM_STATE = 24

# Column names read from the input CSV.
EMAIL = "final_cleaned_text"
LABEL = "label"

# Field names of the tokenizer output.
TOKENS = "input_ids"
MASK = "attention_mask"

# Checkpoint identifiers handed to the `from_pretrained` loaders.
GPT = "gpt2"
BERT = "bert-base-uncased"
ROBERTA = "roberta-base"
DISTILBERT = "distilbert-base-uncased"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# String keys for metric values and bookkeeping entries.
METRICS = "metrics"
ACC = "accuracy"
PR = "precision"
RC = "recall"
F1 = "f1_score"
CM = "conf_matrix"
TIME = "time"
LOSS = "loss"

# Names of the three dataset splits.
TRAIN = "training"
VAL = "validation"
TEST = "testing"

# Upper bound on the tokenized sequence length (longer texts are truncated).
MAX_SEQ_LEN = 128

print(f"Device: {DEVICE}")

# 2 Dataset

### 2.1 Load Data and Split Data

In [None]:
def load_data(
        file:str="combined_file.csv"
) -> pd.DataFrame:
    """Read the text and label columns of a CSV file into a fresh DataFrame.

    Rows whose text cell is not a Python ``str`` (for example missing values)
    are dropped. Each kept label is wrapped in a one-element ``float32`` array.

    Args:
        file: Path of the CSV file; it must contain the EMAIL and LABEL columns.

    Returns:
        DataFrame with columns ``[EMAIL, LABEL]`` and a new 0..n-1 index.
    """
    frame = pd.read_csv(file)[[EMAIL, LABEL]]
    records = [
        (text, np.array([target], dtype=np.float32))
        for text, target in zip(frame[EMAIL], frame[LABEL])
        if type(text) is str
    ]
    return pd.DataFrame(records, columns=[EMAIL, LABEL])

def split_data(
        data: pd.DataFrame,
        train_size: float = 0.7,
        val_size: float = 0.2,
        shuffle: bool = False,
        random_state=None
):
    """Split a DataFrame into train, validation and test partitions by row position.

    The first ``train_size`` fraction of rows becomes the training set, the
    next ``val_size`` fraction the validation set, and whatever remains the
    test set.

    Args:
        data: Frame to split.
        train_size: Fraction of rows for the training set.
        val_size: Fraction of rows for the validation set.
        shuffle: When True, rows are randomly permuted before splitting.
            Defaults to False, which keeps the original row order.
        random_state: Seed forwarded to ``DataFrame.sample`` when shuffling.

    Returns:
        Tuple ``(train_set, val_set, test_set)`` of DataFrames.

    Raises:
        ValueError: If a fraction is negative or the two fractions exceed 1.
    """
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected by float rounding.
    if train_size < 0 or val_size < 0 or train_size + val_size > 1 + 1e-9:
        raise ValueError(
            "train_size and val_size must be non-negative and sum to at most 1, "
            f"got train_size={train_size}, val_size={val_size}"
        )

    if shuffle:
        data = data.sample(frac=1, random_state=random_state)

    data_len = data.shape[0]
    train_idx = int(data_len * train_size)
    val_idx = train_idx + int(data_len * val_size)

    train_set = data.iloc[:train_idx, :]
    val_set = data.iloc[train_idx:val_idx, :]
    test_set = data.iloc[val_idx:, :]

    return train_set, val_set, test_set

### 2.2 Tokenize Data

In [None]:
def tokenize(
        model_key,
        data: pd.DataFrame
):
    """Tokenize the text column of ``data`` for one model and wrap it in a Dataset.

    Args:
        model_key: Checkpoint identifier used to load the matching tokenizer.
        data: Frame with the EMAIL (text) and LABEL columns.

    Returns:
        A ``datasets.Dataset`` with ``input_ids``, ``attention_mask`` and
        integer ``labels`` columns.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_key,
        padding_side="left"
    )
    if model_key == GPT:
        # GPT-2's tokenizer has no padding token of its own, so reuse EOS.
        tokenizer.pad_token = tokenizer.eos_token

    # The encodings stay on the CPU: Dataset.from_dict copies them into host
    # memory anyway, and the Trainer moves each batch to its device itself.
    # Pushing the whole split to the GPU first only wastes device memory.
    encodings = tokenizer(
        data[EMAIL].tolist(),
        padding=True,
        max_length=MAX_SEQ_LEN,
        truncation=True,
        return_tensors="pt"
    )

    # Labels may be plain scalars or one-element arrays. `.item()` extracts the
    # scalar explicitly, because calling int() on an array with ndim > 0 is
    # deprecated in recent NumPy releases.
    labels = [int(np.asarray(value).item()) for value in data[LABEL]]

    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    })

### 2.3 Create Model Specific Sets

In [None]:
def create_model_sets():
    """Build the tokenized train/validation/test datasets for every model.

    The CSV is loaded and split once; each model then tokenizes the same three
    partitions with its own tokenizer.

    Returns:
        Dict mapping model key -> {TRAIN, VAL, TEST} -> tokenized dataset.
    """
    train_df, val_df, test_df = split_data(load_data())
    partitions = {TRAIN: train_df, VAL: val_df, TEST: test_df}

    return {
        model_key: {
            split_name: tokenize(model_key, frame)
            for split_name, frame in partitions.items()
        }
        for model_key in (BERT, ROBERTA, DISTILBERT, GPT)
    }

data = create_model_sets()

# 3 Large Language Models

### 3.1 Define Metrics

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of the logits along axis 1.

    Returns:
        Dict keyed by ACC, PR, RC and F1.
    """
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=1)

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision with recall and weights classes by predicted
    # instead of true support.
    # zero_division=0 keeps the default value for classes that are never
    # predicted, but without emitting a warning on every evaluation.
    return {
        ACC: accuracy_score(y_true, y_pred),
        PR: precision_score(y_true, y_pred, average='weighted', zero_division=0),
        RC: recall_score(y_true, y_pred, average='weighted', zero_division=0),
        F1: f1_score(y_true, y_pred, average='weighted', zero_division=0)
    }

### 3.2 Define Training Process

In [None]:
def to_trainer(model, key):
    """Wrap ``model`` in a Trainer configured for fine-tuning with early stopping.

    Training and validation sets are taken from the module-level ``data``
    dictionary under ``key``. Evaluation and checkpointing happen once per
    epoch, and the checkpoint with the lowest validation loss is restored at
    the end of training.

    Args:
        model: Sequence-classification model to train.
        key: Model key; selects the datasets and names the output/log folders.

    Returns:
        A configured ``Trainer``.
    """
    targs = TrainingArguments(
        output_dir=f"./results/{key}/",
        # Upper bound only; the early-stopping callback below normally ends
        # training much sooner.
        num_train_epochs=100,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        eval_strategy="epoch",
        save_strategy="epoch",
        # f-string so every model logs to its own folder; without the prefix
        # all models shared a directory literally named "{key}".
        logging_dir=f"./logs/{key}/",
        logging_steps=10,
        disable_tqdm=False
    )
    return Trainer(
        model=model,
        args=targs,
        train_dataset=data[key][TRAIN],
        eval_dataset=data[key][VAL],
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

### 3.3 Create LLMs

In [None]:
def model_setup():
    """Create one Trainer per model key (BERT, RoBERTa, DistilBERT, GPT-2).

    Returns:
        Dict mapping model key -> Trainer wrapping a two-label
        sequence-classification model placed on DEVICE.
    """

    def to_clf(key):
        # Seed right before loading so that any weights that are newly
        # initialised rather than loaded from the checkpoint are identical
        # across runs.
        torch.manual_seed(RANDOM_STATE)
        model = AutoModelForSequenceClassification.from_pretrained(key, num_labels=2)
        if key == GPT:
            # Tell the GPT-2 model which id marks padding (EOS is reused).
            model.config.pad_token_id = model.config.eos_token_id
        return model

    def to_device(model):
        # DEVICE is "cuda" exactly when torch.cuda.is_available() is true.
        return model.to(DEVICE)

    def create(key):
        return to_trainer(
            model=to_device(to_clf(key)),
            key=key
        )

    return {
        BERT:create(BERT),
        ROBERTA:create(ROBERTA),
        DISTILBERT:create(DISTILBERT),
        GPT:create(GPT)
    }

llm = model_setup()

### 3.4 Define Evaluation Process

In [None]:
# Confusion matrices filled in by evaluate(), keyed "<model key> PT" (pretrained)
# or "<model key> FT" (finetuned).
cm = {}

In [None]:
def evaluate(key, is_pretrained):
    """Run the trainer for ``key`` on its test set and record the confusion matrix.

    The matrix is stored in the module-level ``cm`` dict under
    ``"<key> PT"`` when ``is_pretrained`` is truthy, otherwise ``"<key> FT"``.

    Returns:
        The metrics dictionary produced by ``Trainer.predict``.
    """
    logits, labels, metrics = llm[key].predict(data[key][TEST])
    predicted = np.argmax(logits, axis=1)

    if is_pretrained:
        tag = "PT"
    else:
        tag = "FT"

    cm[f"{key} {tag}"] = confusion_matrix(labels, predicted)
    return metrics

# 4 Approach

### 4.1 Evaluate Pretrained Models

In [None]:
# Test-set metrics of every model before any fine-tuning.
# Run this cell before the training cells, since evaluate() uses whatever
# state the trainers in `llm` currently hold.
pretrained_results = {
    model_key: evaluate(model_key, True)
    for model_key in (BERT, ROBERTA, DISTILBERT, GPT)
}

### 4.2 Finetune Models

##### Setup

In [None]:
train_output = {} # Dictionary to save TrainOutput object of each model


def epoch_losses(log_history):
    """Aggregate a Trainer ``log_history`` into one loss value per epoch.

    Training loss is logged every few steps, so there are several entries per
    epoch; they are averaged. Validation loss is taken from the last
    ``eval_loss`` entry of each epoch.

    Args:
        log_history: List of log dicts, each with an ``epoch`` value and
            optionally ``loss`` and/or ``eval_loss``.

    Returns:
        DataFrame indexed by 1-based epoch number with a "Training" and/or
        "Validation" column. Empty when the history holds neither kind of entry.
    """
    history = pd.DataFrame(log_history)
    curves = {}
    for label, column, how in (
        ("Training", "loss", "mean"),
        ("Validation", "eval_loss", "last"),
    ):
        if column not in history or "epoch" not in history:
            continue
        rows = history.dropna(subset=[column, "epoch"])
        # Fractional epochs (e.g. 0.25, 0.5) belong to the epoch in progress.
        # The small offset keeps exact boundaries (1.0, 2.0, ...) in the epoch
        # that has just finished.
        epoch = np.ceil(rows["epoch"] - 1e-6).clip(lower=1).astype(int)
        curves[label] = rows[column].groupby(epoch).agg(how)
    return pd.DataFrame(curves).rename_axis("epoch")


def plot_loss(key):
    """Plot training/validation loss vs epoch"""
    losses = epoch_losses(llm[key].state.log_history)
    plt.figure(figsize=(5,3))
    for label in losses.columns:
        # Skip epochs without a value for this curve.
        curve = losses[label].dropna()
        plt.plot(curve.index, curve.values, label=label)
    plt.title(f"{key} Training/Validation Loss vs. Epoch")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    if not losses.empty:
        plt.legend()
    plt.tight_layout()
    plt.show()


##### BERT

In [None]:
# Fine-tune the BERT trainer, keep the object returned by train(), then plot its loss curves.
train_output[BERT] = llm[BERT].train()
plot_loss(BERT)

##### RoBERTa

In [None]:
# Fine-tune the RoBERTa trainer, keep the object returned by train(), then plot its loss curves.
train_output[ROBERTA] = llm[ROBERTA].train()
plot_loss(ROBERTA)

##### DistilBERT

In [None]:
# Fine-tune the DistilBERT trainer, keep the object returned by train(), then plot its loss curves.
train_output[DISTILBERT] = llm[DISTILBERT].train()
plot_loss(DISTILBERT)

##### GPT-2

In [None]:
# Fine-tune the GPT-2 trainer, keep the object returned by train(), then plot its loss curves.
train_output[GPT] = llm[GPT].train()
plot_loss(GPT)

### 4.3 Evaluate Finetuned Models

In [None]:
# Test-set metrics of every model after its training cell has run.
finetuned_results = {
    model_key: evaluate(model_key, False)
    for model_key in (BERT, ROBERTA, DISTILBERT, GPT)
}

# 5 Results

### 5.1 Create Metrics DataFrame

In [None]:
# One row per (model, stage): the four pretrained rows first, then the four
# finetuned rows, in the same model order as the two result dictionaries.
results = pd.DataFrame(
    list(pretrained_results.values()) + list(finetuned_results.values()),
    index=[
        f"{model_name} {stage}"
        for stage in ("PT", "FT")
        for model_name in ("BERT", "ROBERTA", "DISTILBERT", "GPT2")
    ],
).fillna(0)

### 5.2 Accuracy Comparison

In [None]:
# NOTE(review): "test_accuracy" is presumably the accuracy_score fraction in
# [0, 1] reported by compute_metrics. It is scaled by 100 here so the bars
# match the "(%)" in the title.
(results[["test_accuracy"]] * 100).plot.bar(
    figsize=(5, 4),
    fontsize=12
)
plt.title("Accuracy (%)")
plt.tight_layout()

### 5.3 Precision, Recall, and F1

In [None]:
# Grouped bars of precision, recall and F1 for every model/stage row.
prf_columns = ["test_precision", "test_recall", "test_f1_score"]
prf_ax = results.plot.bar(y=prf_columns, figsize=(12,4), fontsize=12)
prf_ax.set_title("Precision, Recall, and F1-Score")
prf_ax.figure.tight_layout()


### 5.4 Confusion Matrices

In [None]:
# Use a larger default font for the confusion-matrix figures below.
font = {'family' : 'sans-serif',
          'weight' : 'normal',
          'size'   : 15}
matplotlib.rc('font', **font)

def plot_cm(key):
    """Show the pretrained and finetuned confusion matrices of one model side by side.

    Args:
        key: Model key; ``cm`` must already hold the entries ``"<key> PT"``
            and ``"<key> FT"``.
    """
    fig, ax = plt.subplots(1, 2, figsize=(5, 3))
    ConfusionMatrixDisplay(cm[f"{key} PT"]).plot(ax=ax[0], cmap="GnBu", colorbar=False)
    ConfusionMatrixDisplay(cm[f"{key} FT"]).plot(ax=ax[1], cmap="GnBu", colorbar=False)
    ax[0].set_title("Pretrained")
    ax[1].set_title("Finetuned")
    # Title names the model actually plotted; it used to read "BERT" for every
    # model. The leading spaces nudge the title to the right.
    plt.suptitle(f"    {key} Confusion Matrices")
    plt.tight_layout()
    plt.show()
    
    

##### BERT

In [None]:
# Pretrained vs. finetuned confusion matrices for BERT.
plot_cm(BERT)

##### RoBERTa

In [None]:
# Pretrained vs. finetuned confusion matrices for RoBERTa.
plot_cm(ROBERTA)

##### DistilBERT

In [None]:
# Pretrained vs. finetuned confusion matrices for DistilBERT.
plot_cm(DISTILBERT)

##### GPT-2

In [None]:
# Pretrained vs. finetuned confusion matrices for GPT-2.
plot_cm(GPT)