# 0. Imports Etc

In [None]:
!pip install transformers datasets evaluate seqeval optuna pandas
from IPython.display import clear_output
clear_output()

import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers import BertModel, BertTokenizerFast, TrainingArguments, Trainer, DataCollatorForTokenClassification, TrainerCallback, TrainerControl, TrainerState
import evaluate
from datasets import load_dataset
from tqdm import tqdm
import time
import os
import pandas as pd
from typing import Any, Dict, List, Optional, Tuple, Union

import optuna

# Checkpoint name used for both the tokenizer and the encoder weights below.
pretrained_model_name = 'bert-base-uncased'

# Switch off Weights & Biases logging for this process.
os.environ["WANDB_DISABLED"] = 'true'

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


# 1. Extend Pretrained BERT with a Classifier


In [None]:

class BertClassifier(nn.Module):
    """
    Token-level classifier: a BERT encoder followed by dropout and a linear layer
    that maps every token's hidden state to `num_labels` logits.

    Parameters:
      bert_model (BertModel): Encoder whose `config` supplies `hidden_size` and the
        default dropout probability (`hidden_dropout_prob`).
      num_labels (int): Number of output classes per token.
      dropout_rate (Optional[float]): Dropout probability applied to the encoder output.
        When None, `bert_model.config.hidden_dropout_prob` is used.
    """

    def __init__(self, bert_model: BertModel, num_labels: int, dropout_rate: Optional[float] = None) -> None:
        super().__init__()
        self.bert = bert_model
        self.num_labels = num_labels
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_labels)
        # Fall back to the encoder's own dropout probability when none is given.
        if dropout_rate is None:
            dropout_rate = bert_model.config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids: torch.Tensor,
                attention_mask: torch.Tensor,
                token_type_ids: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None) -> Tuple[Any, ...]:
        """
        Run the encoder and the classification head.

        Parameters:
          input_ids (torch.Tensor): Token ids passed to the encoder.
          attention_mask (torch.Tensor): Attention mask passed to the encoder.
          token_type_ids (Optional[torch.Tensor]): Segment ids passed to the encoder.
            May be omitted; None is then forwarded to the encoder.
          labels (Optional[torch.Tensor]): Target class per token. When given, a
            cross-entropy loss is computed (CrossEntropyLoss skips its default
            ignore_index of -100).

        Returns:
          Tuple: `(loss, logits, *extra)` when labels are given, otherwise
          `(logits, *extra)`, where `extra` is whatever the encoder output holds
          from index 2 onwards.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)

        # Skip the first two encoder outputs and pass the rest through.
        output = (logits,) + outputs[2:]
        if labels is None:
            return output

        loss_fct = CrossEntropyLoss()
        # reshape (not view) so non-contiguous inputs are flattened safely.
        loss = loss_fct(logits.reshape(-1, self.num_labels), labels.reshape(-1))
        return (loss,) + output




# 2. Prepare Finetuning Dataset and Define Freeze Function


In [None]:
# Load the dataset and read the tag names from the training split's features.
wnut = load_dataset("wnut_17")
label_list = wnut["train"].features["ner_tags"].feature.names
num_labels = len(label_list)
print('Available NER tags:', label_list)
print('Number of NER tags in the dataset:', num_labels)

tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)

# Tokenize the first training example to compare the tokenized length
# with the number of input words and target labels.
first_example = wnut['train'][0]
tokenized_input = tokenizer(first_example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print('Tokenized input:', tokenized_input)
print('\nInput:', first_example["tokens"])
print('Tokenized input ids:', tokenized_input['input_ids'])
print('Tokenized input tokens:', tokens)
print('\nLength of Input:', len(first_example["tokens"]))
print('Length of Tokenized input:', len(tokenized_input['input_ids']))
print('Length of Target labels:', len(first_example["ner_tags"]))

def tokenize_and_align_labels(examples: Dict[str, Any],
                              tokenizer: BertTokenizerFast) -> Dict[str, Any]:
    """
    Tokenize a batch of pre-split sentences and build one label per produced token.

    A token receives its word's tag only when it is the first token of that word.
    Tokens without a word id, and later tokens of a word already seen, receive -100
    (the value `compute_metrics` filters out).

    Parameters:
      examples (Dict[str, Any]): Batch with "tokens" (list of word lists) and
        "ner_tags" (list of per-word tag id lists).
      tokenizer (BertTokenizerFast): Tokenizer applied to the word lists.

    Returns:
      Dict[str, Any]: The tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        token_word_ids = encoding.word_ids(batch_index=batch_idx)
        # Pair each token's word id with the word id of the token before it
        # (None for the first token); zip stops at the shorter sequence.
        shifted = [None, *token_word_ids]
        aligned_labels.append([
            word_tags[current] if current is not None and current != before else -100
            for current, before in zip(token_word_ids, shifted)
        ])

    encoding["labels"] = aligned_labels
    return encoding

# Apply the alignment function to every split, in batches.
tokenized_wnut = wnut.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={'tokenizer': tokenizer},
)
train_dataset_hf = tokenized_wnut["train"]
eval_dataset_hf = tokenized_wnut["validation"]
test_dataset_hf = tokenized_wnut["test"]

# Batch collation and the metric used by compute_metrics.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

def freeze_bert_layers(model: nn.Module, unfreeze_last_n: int) -> nn.Module:
    """
    Disable gradients for the whole model, then re-enable them for the classifier
    and for the last `unfreeze_last_n` entries of `model.bert.encoder.layer`.

    Parameters:
      model (nn.Module): Model exposing `classifier` and `bert.encoder.layer`.
      unfreeze_last_n (int): How many trailing encoder layers stay trainable.

    Returns:
      nn.Module: The same model object, modified in place.
    """
    # Start from a fully frozen model.
    model.requires_grad_(False)

    # The classification head is always trained.
    model.classifier.requires_grad_(True)

    encoder_layers = model.bert.encoder.layer
    layer_count: int = len(encoder_layers)
    first_trainable = layer_count - unfreeze_last_n
    for layer_idx in range(first_trainable, layer_count):
        encoder_layers[layer_idx].requires_grad_(True)

    return model

def compute_metrics(model_output):
    """
    Turn raw model predictions into evaluation scores.

    The highest-scoring class is taken for every token, positions whose target is
    -100 are dropped, and the remaining ids are mapped to tag names through the
    module-level `label_list` before being scored with the module-level `metric`.

    Parameters:
    - model_output (Tuple): Pair of (raw predictions, target labels).

    Returns:
    - dict: "precision", "recall", "f1" and "accuracy" taken from the metric's
      overall results.
    """
    raw_scores, labels = model_output
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, labels):
        # Keep only positions that carry a real target.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Map the short names reported to the Trainer onto the metric's overall_* keys.
    reported = ("precision", "recall", "f1", "accuracy")
    return {short_name: results["overall_" + short_name] for short_name in reported}




# 3. Model Initialization and Baseline Training


In [None]:
# Baseline: a fresh pretrained encoder with the classification head, no layers frozen.
num_labels = len(wnut["train"].features["ner_tags"].feature.names)
bert_model = BertModel.from_pretrained(pretrained_model_name)
baseline_model = BertClassifier(bert_model, num_labels=num_labels)
baseline_model = baseline_model.to(device)

# Evaluate every 50 steps, checkpoint every 100 steps, and reload the best
# checkpoint once training has finished.
baseline_config = dict(
    output_dir='./model_output',
    logging_dir='./logs',
    report_to="none",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
)
baseline_training_args = TrainingArguments(**baseline_config)

baseline_trainer = Trainer(
    model=baseline_model,
    args=baseline_training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset_hf,
    eval_dataset=eval_dataset_hf,
)

print("Starting baseline training (all layers unfrozen)...")
baseline_trainer.train()

print("Baseline evaluation on validation set:")
baseline_val_metrics = baseline_trainer.evaluate()
print(baseline_val_metrics)

baseline_predictions = baseline_trainer.predict(test_dataset_hf)
print("Baseline evaluation on test set:")
print(baseline_predictions.metrics)


# 4. Bayesian Hyperparameter Tuning with Optuna



In [None]:
# One summary row per unfreezing configuration; written to CSV after the loop.
experiment_results = []

for l in range(1, 5):
    print(f"\nNumber of unfrozen layers: {l}\n")

    # Side channel shared by hp_space and model_init: hp_space stores the values
    # that model_init needs to build the model (dropout, number of unfrozen layers).
    hyper_params = {}

    def model_init() -> torch.nn.Module:
        """Build a fresh, partially frozen BertClassifier from the current `hyper_params`."""
        bert_model = BertModel.from_pretrained(pretrained_model_name)
        dropout_rate = hyper_params.get("dropout_rate", None)
        model = BertClassifier(bert_model, num_labels=num_labels, dropout_rate=dropout_rate).to(device)
        model = freeze_bert_layers(model, hyper_params.get("unfreeze_last_n", l))
        return model

    def hp_space(trial: optuna.Trial) -> Dict[str, Any]:
        """
        Sample one trial's hyperparameters.

        The dropout rate is sampled here but handed to `model_init` through
        `hyper_params` instead of being returned.
        (Presumably because the returned keys are applied to TrainingArguments,
        which has no dropout field — confirm against the Trainer docs.)
        """
        hyper_params["unfreeze_last_n"] = l
        learning_rate = trial.suggest_float("learning_rate", 3e-5, 7e-5, log=True)
        per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [32])
        warmup_steps = trial.suggest_categorical("warmup_steps", [500, 750])
        dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.3)
        hyper_params["dropout_rate"] = dropout_rate
        return {
            "learning_rate": learning_rate,
            "per_device_train_batch_size": per_device_train_batch_size,
            "warmup_steps": warmup_steps,
        }

    tuning_args = TrainingArguments(
        output_dir='./hp_tuning_output',
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        learning_rate=5e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./hp_tuning_logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=50,
        save_strategy="no",
        report_to="none"
    )

    tuner = Trainer(
        model_init=model_init,
        args=tuning_args,
        train_dataset=train_dataset_hf,
        eval_dataset=eval_dataset_hf,
        compute_metrics=compute_metrics,
        data_collator=data_collator
    )

    print("Starting hyperparameter search...")
    # NOTE(review): no compute_objective is passed, so the Trainer's default
    # objective is maximized. Per the Trainer docs that default is the sum of the
    # reported metrics rather than F1 alone — confirm this is intended.
    best_trial = tuner.hyperparameter_search(
        hp_space=hp_space,
        direction="maximize",
        backend="optuna",
        n_trials=10
    )
    best_hyperparameters = best_trial.hyperparameters
    print("Best trial hyperparameters:", best_hyperparameters)

    # After the search, hyper_params still holds the dropout sampled for the LAST
    # trial. Restore the best trial's value so the final model is built with it
    # and the same value is recorded in the results.
    best_dropout_rate = best_hyperparameters.get("dropout_rate", hyper_params.get("dropout_rate"))
    hyper_params["dropout_rate"] = best_dropout_rate
    hyper_params["unfreeze_last_n"] = l

    final_training_args = TrainingArguments(
        output_dir='./final_model_output',
        num_train_epochs=5,
        per_device_train_batch_size=best_hyperparameters["per_device_train_batch_size"],
        per_device_eval_batch_size=64,
        learning_rate=best_hyperparameters["learning_rate"],
        warmup_steps=best_hyperparameters["warmup_steps"],
        weight_decay=0.01,
        logging_dir='./final_logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=100,
        load_best_model_at_end=True,
        report_to="none"
    )

    final_model = model_init()
    final_trainer = Trainer(
        model=final_model,
        args=final_training_args,
        train_dataset=train_dataset_hf,
        eval_dataset=eval_dataset_hf,
        compute_metrics=compute_metrics,
        data_collator=data_collator
    )

    print("Starting final training with best hyperparameters...")
    final_trainer.train()
    val_metrics = final_trainer.evaluate()
    print("Final evaluation on validation set:")
    print(val_metrics)
    test_predictions = final_trainer.predict(test_dataset_hf)
    test_metrics = test_predictions.metrics
    print("Final evaluation on test set:")
    print(test_metrics)

    exp_result = {
        "unfreeze_last_n": l,
        "learning_rate": best_hyperparameters["learning_rate"],
        "warmup_steps": best_hyperparameters["warmup_steps"],
        "dropout_rate": best_dropout_rate,
        "val_precision": val_metrics.get("precision"),
        "val_recall": val_metrics.get("recall"),
        "val_f1": val_metrics.get("f1"),
        "val_accuracy": val_metrics.get("accuracy"),
        "test_precision": test_metrics.get("precision"),
        "test_recall": test_metrics.get("recall"),
        "test_f1": test_metrics.get("f1"),
        "test_accuracy": test_metrics.get("accuracy"),
    }
    experiment_results.append(exp_result)
    print("\n-------------\n")

df_results = pd.DataFrame(experiment_results)
df_results.to_csv("experiment_results.csv", index=False)
print("Saved experiment results to 'experiment_results.csv'")


# DL for NLP Lab 7 Report
**Author**: Oisín Redmond

## Methodology

It is crucial for future machine learning models to be more computationally and energy efficient if the massive amount of CO₂ produced by AI is to be addressed. Parameter-efficient fine-tuning is a way for models to train for specific tasks on a small number of their original parameters, saving both time and energy. This lab explored the effect of partial fine-tuning on a pre-trained BERT model for NER.

The `freeze_bert_layers` function was created and used to disable gradient updates for all encoder layers except the final *n* layers and the classification head. Models with `unfreeze_last_n` set to 1, 2, 3, and 4 were evaluated. This function uses `param.requires_grad` and `model.named_parameters` to freeze any number of the original 12 BERT layers.

Hyperparameters (learning rate, warmup steps, and dropout) were optimised using Optuna, an open-source framework that automates the search for optimal hyperparameter configurations in machine learning models. The batch size was also part of the search space, but with a single candidate value of 32. The performance of each configuration was assessed on the test set using F1 score, precision, recall, and accuracy.

## Discussion

Results show that selectively unfreezing layers can yield strong performance with reduced computational cost. As more layers were unfrozen, test F1 improved consistently. The trade-off between number of layers and performance is highlighted below.

|   | Unfrozen Layers | Test F1 | Test Precision | Test Recall | Test Accuracy |
|---|------------------|---------|----------------|-------------|----------------|
| 0 | Baseline (all)   | 0.4007  | 0.5192         | 0.3262      | 0.9400         |
| 1 | 1                | 0.3738  | 0.5413         | 0.2854      | 0.9407         |
| 2 | 2                | 0.4163  | 0.5365         | 0.3401      | 0.9431         |
| 3 | 3                | 0.4250  | 0.5492         | 0.3466      | 0.9438         |
| 4 | 4                | 0.4676  | 0.5658         | 0.3985      | 0.9466         |

**Table:** Test set performance across configurations.

These findings suggest that, with effective hyperparameter tuning, freezing most of the BERT encoder still allows strong task adaptation. Notably, the search often retained the default number of warmup steps (the batch size could only take its default value of 32), suggesting that the search space could be narrowed for further efficiency gains. Overall, these experiments indicate that partial fine-tuning with two or more unfrozen layers can outperform naïve full fine-tuning, offering a better performance–compute trade-off.
