# Baseline ML Modeling with Transformers

This notebook will attempt to use the Transformers library to classify the author who wrote a given text.  

## Transformer Models
Transformer models are generally better than bag-of-words approaches at capturing semantic meaning from vectorized representations of text.  So we will run an initial experiment with a BERT-based model using the Hugging Face Transformers library to see if it outperforms our Logistic Regression TF-IDF model.  If it can achieve a 90% F1 score, then we can build an experiment pipeline to identify the right combination of data preparation, model selection, and hyperparameters to achieve the highest score.

## Optimization Metric
This is a multi-class problem with a business requirement to balance precision and recall.  The F1 score (the harmonic mean of precision and recall) captures that balance, and because the classes in our training data set are balanced, we can weight every class equally when averaging it.  So we will optimize our model for _F1 Macro_, though we will still track all metrics through this training cycle to ensure that this choice continues to make sense.

In [None]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from datasets import Dataset, ClassLabel


In [None]:
# Tokenizer for the same DistilBERT checkpoint that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

# Author name -> integer class ID; an author's position in the tuple is its ID.
label2id = {
    author: idx
    for idx, author in enumerate(("dickens", "doyle", "twain", "defoe"))
}
# Reverse lookup: integer class ID -> author name.
id2label = dict(zip(label2id.values(), label2id.keys()))

def preprocess_function(example: dict, max_length: int = 64) -> dict:
    """
    Tokenize the ``text`` field of an example (or a batch of examples).

    Each sequence is truncated to at most ``max_length`` tokens and padded up to
    exactly ``max_length`` tokens, so every output sequence has the same length.
    Only the text is processed here: author names are mapped to label IDs in
    ``prepare_dataset_from_csv``, not in this function.

    Args:
        example (dict): A mapping with a ``"text"`` entry. When called through
            ``Dataset.map(..., batched=True)`` (as this notebook does),
            ``example["text"]`` is a list of strings rather than a single string.
        max_length (int): Fixed token length to truncate/pad to. Defaults to 64.

    Returns:
        dict: The output of the module-level ``tokenizer`` for the given text.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

def prepare_dataset_from_csv(csv_path: str) -> Dataset:
    """
    Build a tokenized Hugging Face dataset from a CSV of texts and author names.

    The CSV must contain a ``text`` column and an ``author`` column. Author names
    are translated to integer IDs via ``label2id`` and stored in a ``labels``
    column, which is cast to a ``ClassLabel`` feature (needed for stratified
    splitting). The text is then tokenized with ``preprocess_function``.

    Args:
        csv_path (str): The path to the CSV file containing text and author names.

    Returns:
        Dataset: A dataset with the ``text``, ``author`` and ``labels`` columns
        plus the columns added by the tokenizer.

    Raises:
        KeyError: If the CSV lacks a ``text`` or ``author`` column.
        ValueError: If any row has an author that is missing or not in ``label2id``.
    """
    df = pd.read_csv(csv_path)
    # Copy so the column assignment below targets an independent frame instead of
    # a slice of the original (avoids pandas' SettingWithCopyWarning).
    df = df[["text", "author"]].copy()
    df["labels"] = df["author"].map(label2id)

    # Authors absent from label2id map to NaN; fail loudly rather than letting a
    # float/NaN label column reach the ClassLabel cast.
    unmapped = df.loc[df["labels"].isna(), "author"].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unknown or missing author(s) in {csv_path}: {unmapped.tolist()}"
        )
    df["labels"] = df["labels"].astype("int64")

    # Convert to HF Dataset
    hf_dataset = Dataset.from_pandas(df)

    # Cast labels to ClassLabel. Names are ordered by their ID so that index i of
    # the ClassLabel always corresponds to label ID i, independent of dict order.
    class_names = sorted(label2id, key=label2id.get)
    class_label = ClassLabel(names=class_names)
    hf_dataset = hf_dataset.cast_column("labels", class_label)

    # Tokenize
    hf_dataset = hf_dataset.map(preprocess_function, batched=True)

    return hf_dataset

In [None]:
# Batch collator built from the same tokenizer used for preprocessing.
# NOTE(review): preprocess_function already pads every example to
# max_length=64, so any extra padding applied here is presumably a no-op —
# confirm against the DataCollatorWithPadding documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# Load and tokenize the raw training CSV.
prepared_dataset = prepare_dataset_from_csv('../data/raw/train.csv')

# Hold out 20% for evaluation, stratified on the label column so both splits keep
# the same class proportions. The seed is fixed so the split (and therefore every
# metric reported below) is reproducible across notebook runs.
split_dataset = prepared_dataset.train_test_split(
    test_size=0.2,
    stratify_by_column="labels",
    seed=42,
)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Sanity check: print per-class counts to confirm that every class is
# represented proportionally in both the train and eval sets.
from collections import Counter

print(Counter(train_dataset["labels"]))
print(Counter(eval_dataset["labels"]))


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from scipy.special import softmax

def compute_metrics(pred):
    """
    Compute classification metrics for one evaluation pass.

    Args:
        pred: A ``(logits, labels)`` pair. ``logits`` is indexed with one score
            per class along the last axis; ``labels`` holds the true class IDs.

    Returns:
        dict: ``accuracy``, ``precision_macro``, ``recall_macro``, ``f1_macro``
        and ``log_loss`` for the given predictions.
    """
    logits, labels = pred
    preds = logits.argmax(axis=-1)
    # Class probabilities for log loss; same (last) axis as the argmax above.
    probs = softmax(logits, axis=-1)
    # Tell log_loss which class each probability column belongs to, so it still
    # works if some class happens to be absent from ``labels``.
    class_ids = list(range(logits.shape[-1]))

    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # (common in early epochs) without emitting an undefined-metric warning.
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision_macro": precision_score(
            labels, preds, average="macro", zero_division=0
        ),
        "recall_macro": recall_score(
            labels, preds, average="macro", zero_division=0
        ),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "log_loss": log_loss(labels, probs, labels=class_ids),
    }


In [None]:
# DistilBERT with a fresh 4-way classification head; the label maps are stored in
# the model config so predictions can be reported by author name.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id
)
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="../results",
    learning_rate=2e-5,
    # Small per-device batches combined with gradient accumulation over 4 steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Upper bound only; the early-stopping callback below can end training sooner.
    num_train_epochs=15,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same (per-epoch) schedule.
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to=[],
    # Keep the checkpoint with the highest macro F1, the notebook's target metric.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro", 
    greater_is_better=True,
    push_to_hub=False,
    fp16=False,
    dataloader_num_workers=0,
    dataloader_pin_memory=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # Use the collator defined earlier in the notebook explicitly instead of
    # leaving it unused.
    data_collator=data_collator,
    # Stop once f1_macro has failed to improve for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    compute_metrics=compute_metrics
)

trainer.train()