# Model Experimentation and Selection

Given our initial success with fine-tuning BERT, we are now going to start modularizing components and setting up our experiment pipeline.  This notebook is our first attempt at building out the reusable components for our Machine Learning pipelines and at experimenting with MLflow.

After running some of these tests in notebooks, it is time to pivot to configuration-based, modular code, because we will need to run multiple experiments through reproducible mechanisms that provide auditability and results tracking.

In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from datetime import datetime

import sys
sys.path.append('../')

from src.data.preparation import prepare_dataset_from_csv
from src.features.tokenization import tokenizer, id2label, label2id
from src.models.metrics import compute_metrics
from src.mflow.experiment_tracking import start_run, log_params, log_metrics, log_model_from_checkpoint
from src.utils.helpers import md_print

# Load environment variables from a local .env file before any data or model
# code in this notebook runs.
#
# To run this notebook effectively on a MacBook, the following environment
# variable had to be set:
#
# TOKENIZERS_PARALLELISM=False
#
# It is defined in a .env file placed in the ./notebooks directory so that the
# load_dotenv() call below picks it up.
# NOTE(review): presumably this avoids the tokenizers parallelism/fork warning
# during training -- confirm and record the actual symptom here.
from dotenv import load_dotenv
load_dotenv()

In [None]:

# Build the train/eval splits and load the DistilBERT classifier used by the
# training cell below.
from collections import Counter

# Fixed seed so every execution of this cell yields the same train/eval rows.
# Without it the split is re-drawn on each run and metrics from different
# experiment runs are not comparable.
SPLIT_SEED = 42

dataset = prepare_dataset_from_csv("../data/raw/train.csv")
split_dataset = dataset.train_test_split(
    test_size=0.2,
    stratify_by_column="labels",
    seed=SPLIT_SEED,
)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Sanity check: stratifying on "labels" should keep the class proportions the
# same in both splits (proportional, not necessarily equal, class counts).
print(Counter(train_dataset["labels"]))
print(Counter(eval_dataset["labels"]))

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    # Derive the head size from the label map so the two cannot drift apart.
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
# Enable gradient checkpointing to save memory during training.
model.gradient_checkpointing_enable()

In [None]:
# Fine-tune the model inside a single tracked run, then log the configuration,
# the final evaluation metrics and the resulting model.
#
# The run name carries a minute-resolution timestamp so trials can be told
# apart in the tracking UI.
now = datetime.now()
formatted_date = now.strftime("%Y-%m-%d_%H:%M")

with start_run(run_name=f"distilbert_hyperparam_trial_{formatted_date}"):
    training_args = TrainingArguments(
        output_dir="../results",
        learning_rate=2e-5,
        # 4 samples per step x 4 accumulation steps = effective batch of 16
        # per device while keeping per-step memory small.
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=2,
        weight_decay=0.01,
        # Evaluate and checkpoint on the same schedule so the best checkpoint
        # can be restored at the end of training.
        eval_strategy="epoch",
        save_strategy="epoch",
        # No built-in Trainer reporters; tracking goes through the explicit
        # start_run / log_* helpers used in this cell.
        report_to=[],
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        push_to_hub=False,
        fp16=False,
        dataloader_num_workers=0,
        dataloader_pin_memory=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        # NOTE(review): with num_train_epochs=2 and per-epoch evaluation there
        # are only two evaluations, so a patience of 2 can never be exhausted.
        # The callback only takes effect once the epoch count is raised.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Log the full TrainingArguments config. to_dict() is the supported
    # serializer: it converts Enum members to plain values and blanks out
    # token fields, whereas vars() would expose the raw instance __dict__
    # (internal attributes and unredacted tokens included).
    log_params(training_args.to_dict())

    trainer.train()

    # load_best_model_at_end=True means this evaluates the best checkpoint
    # (by f1_macro), not necessarily the weights from the last epoch.
    eval_metrics = trainer.evaluate()
    log_metrics(eval_metrics)
    log_model_from_checkpoint(model=trainer.model, tokenizer=tokenizer)

    md_print("### Final evaluation metrics:")
    for k, v in eval_metrics.items():
        md_print(f"  **{k}:** {v}")