# Train our Baseline Model with New Dataset (DeBERTa)

## Overview

This notebook trains a classifier using the [DeBERTa small](https://huggingface.co/microsoft/deberta-v3-small/tree/main) model, applied to a new dataset generated by the Falcon 40B model from the ERA5 tweet dataset. The data is stored as a CSV file, and the workflow follows the methodology of the Bootcamp day 2 reference notebook, 04ModelTrelevance.ipynb.

## Key Steps and Objectives

1. **Classifier Training**: We fine-tune the DeBERTa small model on the Falcon 40B dataset to adapt it to this specific classification task.

2. **Results Visualization**: We analyze the model's performance through:
   - **Confusion Matrix**: Provides insights into accuracy and misclassification rates.
   - **ROC Curve**: Illustrates the model's performance across various thresholds.

This notebook is essential for evaluating the DeBERTa-based classifier on the new dataset and offers comprehensive insights through detailed visualizations.

In [None]:
# IPython shell escape: runs `jupyter kernelspec list` to show the kernels available in this environment.
!jupyter kernelspec list

In [None]:
import sys
import transformers
import datasets
import functools
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn.functional
import sklearn.metrics
import sklearn.model_selection
from sklearn.model_selection import train_test_split

sys.path.append(
    "/p/project/deepacf/maelstrom/haque1/AP2-Social-media-data-for-better-local-forecasts/bootcamp/AP2/scripts"
)
import plotting

In [None]:
# Fix the seeds of the NumPy and PyTorch global random number generators.
random_seed = 42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Locate the CSV file and read it into a DataFrame.
folder_path = "/p/project/deepacf/maelstrom/haque1/dataset/"
file = "tweets_2017_01_02_03_era5_normed_no_snow.csv"
file_path = os.path.join(folder_path, file)
ds_tweets = pd.read_csv(file_path)

In [None]:
# Name of the text column; force its values to strings before tokenization.
key_tp = "content"
ds_tweets[key_tp] = ds_tweets[key_tp].astype(str)

In [None]:
# Integer class labels, used to stratify the split below.
labels = ds_tweets["relevance"].astype(int)
# Split the row indices into training and testing sets (80/20, stratified).
# random_state pins the split explicitly so it does not depend on how much of
# the global NumPy random state was consumed before this cell ran.
indices_train, indices_test = train_test_split(
    ds_tweets.index,
    test_size=0.20,
    stratify=labels,
    random_state=random_seed,
)

In [None]:
# Path of the pretrained checkpoint used for both the tokenizer and the model.
model_nm = "/p/project/deepacf/maelstrom/haque1/deberta-v3-small"
# Base model configuration with two output labels.
db_config_base = transformers.AutoConfig.from_pretrained(model_nm, num_labels=2)
# Tokenizer loaded from the same checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_nm)

In [None]:
def tok_func(x, tokenizer):
    """Tokenize the ``"inputs"`` entry of a batch.

    Args:
        x: Mapping with an ``"inputs"`` key holding the text(s) to tokenize.
        tokenizer: Callable applied to the texts; it receives the padding,
            truncation and maximum-length options as keyword arguments.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 512}
    texts = x["inputs"]
    return tokenizer(texts, **tokenizer_options)


# Function to convert the dataset to a format used by Hugging Face
def get_dataset_from_csv(ds, tok_func, tokenizer, indices_train, indices_test, train=True):
    """Tokenize a tweet DataFrame and wrap it as a Hugging Face dataset.

    The ``content`` and ``relevance`` columns are renamed to ``inputs`` and
    ``labels``, the frame is converted with ``datasets.Dataset.from_pandas``
    and ``tok_func`` is mapped over it in batches.

    Args:
        ds: pandas DataFrame holding the ``content`` and ``relevance`` columns.
        tok_func: Callable invoked as ``tok_func(batch, tokenizer=tokenizer)``.
        tokenizer: Tokenizer forwarded to ``tok_func``.
        indices_train: Row indices passed to ``Dataset.select`` for the
            training split; only used when ``train`` is true.
        indices_test: Row indices passed to ``Dataset.select`` for the test
            split; only used when ``train`` is true.
        train: When true, return both splits; otherwise return the whole
            tokenized dataset.

    Returns:
        A ``datasets.DatasetDict`` with ``"train"`` and ``"test"`` entries
        when ``train`` is true, else the tokenized ``datasets.Dataset``.

    Note:
        NOTE(review): ``Dataset.select`` is documented as positional, so index
        labels taken from ``ds.index`` only match when ``ds`` has a default
        0..n-1 index -- confirm for any new caller.
    """
    renamed = ds.rename(columns={"content": "inputs", "relevance": "labels"})
    tokenize_batch = functools.partial(tok_func, tokenizer=tokenizer)
    tokenized = datasets.Dataset.from_pandas(renamed).map(tokenize_batch, batched=True)
    if not train:
        return tokenized
    splits = {
        "train": tokenized.select(indices_train),
        "test": tokenized.select(indices_test),
    }
    return datasets.DatasetDict(splits)

In [None]:
# Tokenize the full table and split it into the train/test DatasetDict.
dataset = get_dataset_from_csv(
    ds=ds_tweets,
    tok_func=tok_func,
    tokenizer=tokenizer,
    indices_train=indices_train,
    indices_test=indices_test,
    train=True,
)

In [None]:
# Directory handed to TrainingArguments as its first argument; it is created with os.makedirs before training.
folder_to_output = "./outputs"

In [None]:
def get_model(params, db_config_base, model_nm):
    """Build a two-label sequence-classification model from a checkpoint.

    Args:
        params: ``None`` or a mapping with a ``"cls_dropout"`` entry; when
            given, that value is written to the model configuration.
        db_config_base: Base model configuration. It is not modified.
        model_nm: Checkpoint name or path to load the weights from.

    Returns:
        The model returned by
        ``transformers.AutoModelForSequenceClassification.from_pretrained``.

    Raises:
        KeyError: If ``params`` is a mapping without ``"cls_dropout"``.
    """
    import copy

    # Work on a private copy: updating the caller's configuration in place
    # would leak the dropout setting of one call into every later call that
    # shares the same base configuration.
    db_config = copy.deepcopy(db_config_base)
    if params is not None:
        db_config.update({"cls_dropout": params["cls_dropout"]})
    db_config.update({"num_labels": 2})
    model = transformers.AutoModelForSequenceClassification.from_pretrained(model_nm, config=db_config)
    return model


def compute_metrics(eval_pred):
    """Compute the per-class F1 scores for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            score per class along the last axis and is reduced with
            ``argmax``.

    Returns:
        Dict with the keys ``"f1_not_relevance"`` and ``"f1_relevance"``.
    """
    scores, labels = eval_pred
    predicted_classes = scores.argmax(axis=-1)
    # Pin the class ids explicitly: without `labels`, the report infers the
    # classes from the data and raises when only one class is present,
    # because `target_names` would then have the wrong length.
    report = sklearn.metrics.classification_report(
        labels,
        predicted_classes,
        labels=[0, 1],
        target_names=["not relevance", "relevance"],
        output_dict=True,
    )
    return {
        "f1_not_relevance": report["not relevance"]["f1-score"],
        "f1_relevance": report["relevance"]["f1-score"],
    }


def get_trainer(dataset, db_config_base, model_nm, folder_to_output, parameters):
    """Create a ``transformers.Trainer`` for the classification task.

    Args:
        dataset: Mapping with ``"train"`` and ``"test"`` datasets.
        db_config_base: Base model configuration forwarded to ``get_model``.
        model_nm: Checkpoint name or path forwarded to ``get_model``.
        folder_to_output: Output directory given to ``TrainingArguments``.
        parameters: Mapping with the keys ``learning_rate``, ``warmup_ratio``,
            ``lr_scheduler_type``, ``batch_size``, ``epochs`` and
            ``weight_decay``. An optional ``cls_dropout`` entry is applied to
            the model when no hyperparameter-search trial is active.

    Returns:
        The configured ``transformers.Trainer``. The model is evaluated and
        checkpointed once per epoch, and the best checkpoint is reloaded at
        the end of training.

    Note:
        The tokenizer is taken from the module-level ``tokenizer`` variable;
        it is not a parameter of this function.
    """
    args = transformers.TrainingArguments(
        folder_to_output,
        learning_rate=parameters["learning_rate"],
        warmup_ratio=parameters["warmup_ratio"],
        lr_scheduler_type=parameters["lr_scheduler_type"],
        disable_tqdm=False,
        fp16=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=parameters["batch_size"],
        per_device_eval_batch_size=parameters["batch_size"],
        num_train_epochs=parameters["epochs"],
        weight_decay=parameters["weight_decay"],
        report_to="none",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    def model_init(trial):
        # Trainer hands a one-argument model_init the current trial, which is
        # None for a plain train() run. Previously that None reached
        # get_model unchanged, so parameters["cls_dropout"] was never applied.
        if trial is None and "cls_dropout" in parameters:
            params = {"cls_dropout": parameters["cls_dropout"]}
        else:
            params = trial
        return get_model(params, db_config_base=db_config_base, model_nm=model_nm)

    return transformers.Trainer(
        model_init=model_init,
        args=args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )


# Hyperparameters consumed by get_trainer.
parameters = dict(
    learning_rate=8e-5,
    batch_size=16,
    weight_decay=0.01,
    epochs=1,
    warmup_ratio=0.1,
    cls_dropout=0.3,
    lr_scheduler_type="cosine",
)
# Make sure the output directory exists before the Trainer writes to it.
os.makedirs(folder_to_output, exist_ok=True)
# Reload the base configuration from the checkpoint.
db_config_base = transformers.AutoConfig.from_pretrained(model_nm)

In [None]:
# Build the Trainer from the objects defined above, then run the training.
trainer = get_trainer(
    dataset=dataset,
    db_config_base=db_config_base,
    model_nm=model_nm,
    folder_to_output=folder_to_output,
    parameters=parameters,
)

trainer.train()

In [None]:
# Rows of the test split, kept as a DataFrame for the ground-truth column.
ds_test = ds_tweets.loc[indices_test]
# Tokenized version of the same rows; with train=False a single dataset is
# returned and the index arguments are not used.
test_ds = get_dataset_from_csv(ds_test, tok_func, tokenizer, indices_train, indices_test, train=False)

### Plotting ROC and Confusion Matrix

In [None]:
# Raw model outputs for the test rows.
logits = trainer.predict(test_ds).predictions
# Normalise over the class axis. `dim` is passed explicitly because calling
# softmax without it is deprecated and relies on an implicit axis choice.
preds = torch.nn.functional.softmax(torch.Tensor(logits), dim=-1).numpy()
# Column 1 is used as the score of the positive ("relevance") class.
prediction_probability = preds[:, 1]
predictions = preds.argmax(axis=-1)
truth = ds_test.relevance.values
plotting.analysis.classification_report(labels=truth, predictions=predictions)
plotting.analysis.plot_roc(truth=truth, prediction_probability=prediction_probability, filename=file + "_ROC.png")

In [None]:
# Compare the predicted classes with the ground truth; the output file name
# suggests a confusion-matrix plot (see the project's plotting module).
plotting.analysis.check_prediction(
    truth=truth,
    prediction=predictions,
    filename=file + "_CM.png",
)