True

In [None]:
!pip install scikit-learn matplotlib



In [None]:
!pip install -U evaluate
!pip install -U datasets
!pip install -U accelerate
!pip install -U transformers

import os
import torch
import numpy as np
import pandas as pd
import evaluate
import accelerate
import matplotlib.pyplot as plt
from data_preprocessing import CustomDataset
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import TrainerCallback
from model import BaseModel, CustomClassifier

# Report whether PyTorch can see a CUDA device in this environment.
print(torch.cuda.is_available())




  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class GradualUnfreezeCallback(TrainerCallback):
    """Trainer callback that freezes the backbone and re-enables it in stages.

    When ``enable_unfreezing`` is true, construction freezes every parameter of
    the backbone (``model.pretrained_model.base_model`` if present, otherwise
    ``model.pretrained_model``) and keeps the classification head trainable.
    At the start of each epoch, at most one further direct child module of the
    backbone is made trainable, walking the children from last to first,
    whenever the current epoch has reached the next entry of
    ``unfreeze_schedule``.

    Args:
        model: Wrapper exposing a ``pretrained_model`` attribute (and
            optionally ``final_classifier``).
        enable_unfreezing: If false, the callback does nothing at all.
        total_epochs: Stored on the instance; not used by the logic below.
        unfreeze_schedule: Sequence of epoch indices; entry ``i`` is the epoch
            from which the ``i``-th child (counted from the end) is unfrozen.

    NOTE(review): the freezing happens in ``__init__``, i.e. before any
    optimizer is visible to this class. If the trainer builds its optimizer
    only from parameters that require gradients at that moment, parameters
    unfrozen later would receive gradients but never be updated — confirm
    against the Trainer version in use.
    """

    def __init__(self, model, enable_unfreezing, total_epochs, unfreeze_schedule):
        self.model = model
        self.enable_unfreezing = enable_unfreezing
        self.total_epochs = total_epochs
        self.unfreeze_schedule = unfreeze_schedule
        self.unfrozen_layers = 0  # Number of backbone children unfrozen so far

        # Freeze all layers except classification head initially
        if self.enable_unfreezing:
            # Compare against None explicitly: module truthiness can depend on
            # __len__ (e.g. container modules), which is not what we mean here.
            classifier = getattr(self.model.pretrained_model, "classifier", None)
            if classifier is None:
                classifier = getattr(self.model, "final_classifier", None)

            for param in self._backbone().parameters():
                param.requires_grad = False
            if classifier is not None:
                for param in classifier.parameters():
                    param.requires_grad = True

    def _backbone(self):
        """Return the module whose children are frozen/unfrozen.

        Uses the same fallback everywhere so that a wrapped model without a
        ``base_model`` attribute behaves consistently in ``__init__`` and in
        ``on_epoch_begin``.
        """
        pretrained = self.model.pretrained_model
        return getattr(pretrained, "base_model", pretrained)

    def on_init_end(self, args, state, control, **kwargs):
        """Required method to avoid the AttributeError."""
        pass  # No action needed on initialization

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Unfreeze the next backbone child if the schedule says it is due."""
        if not self.enable_unfreezing or self.unfrozen_layers >= len(self.unfreeze_schedule):
            return  # Either the schedule is exhausted or unfreezing is disabled

        # state.epoch is a float and may be unset; treat "unset" as epoch 0.
        current_epoch = int(state.epoch or 0)
        next_unfreeze_epoch = self.unfreeze_schedule[self.unfrozen_layers]
        if current_epoch < next_unfreeze_epoch:
            return

        # Reverse the direct children so unfreezing starts from the last one.
        layers = list(self._backbone().children())[::-1]
        if self.unfrozen_layers < len(layers):
            for param in layers[self.unfrozen_layers].parameters():
                param.requires_grad = True

            self.unfrozen_layers += 1
            print(f"Epoch {current_epoch}: Unfroze layer {self.unfrozen_layers}")

In [None]:
# Checkpoint name handed to CustomDataset here and to the model constructor later in the notebook.
model_checkpoint = "roberta-base"
# Sequence length passed as max_length to the inference pipeline call later in the notebook.
max_len = 512

# Build the project-local CustomDataset from the CSV and obtain its train/validation/test splits.
dataset = CustomDataset('HateSpeechDatasetBalanced.csv', model_checkpoint=model_checkpoint)
train_dataset, val_dataset, test_dataset = dataset.get_splits()

# Print a summary of each split.
print(train_dataset, val_dataset, test_dataset)

Map: 100%|██████████| 726119/726119 [02:41<00:00, 4507.37 examples/s]


Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 580895
}) Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 72612
}) Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 72612
})


In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
# Instantiate the project-local BaseModel wrapper for a two-label task.
model = BaseModel(model_checkpoint, num_labels=2, hidden_dropout_prob=0.1)
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Match the embedding matrix to the size of the dataset's tokenizer.
model.resize_token_embeddings(len(dataset.get_tokenizer())) # need to resize due to new tokens added

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(50265, 768, padding_idx=1)

In [7]:
# Name of the metric loaded through `evaluate` and used for best-model selection.
metric_name = 'f1'
# Last path component of the checkpoint id, used to name the snapshot directory.
model_name = model_checkpoint.split("/")[-1]

total_epochs = 3
args = TrainingArguments(
    f"./snapshots/{model_name}-finetuned",
    # Evaluate and checkpoint once per epoch; keep at most three checkpoints.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=total_epochs,
    weight_decay=0.01,
    # Reload the checkpoint with the best `metric_name` score when training ends.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    # Mixed precision needs a CUDA device; the notebook otherwise supports a
    # CPU fallback, so only enable fp16 when a GPU is actually present.
    fp16=torch.cuda.is_available(),
)

In [8]:
# Load the metric named by `metric_name` through the `evaluate` library.
metric = evaluate.load(metric_name)

def compute_metrics(eval_pred):
    """Turn raw model outputs into the score reported by the global ``metric``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is either
            an array of per-class scores or a tuple whose first element is
            that array (as happens when a model returns extra outputs
            alongside the logits).

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class ids, using
        micro averaging.
    """
    predictions, labels = eval_pred
    # Models that emit additional tensors yield a tuple here; the class
    # scores are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=-1)
    # NOTE(review): for a single-label task, micro-averaged F1 coincides with
    # accuracy — confirm that this is the score you intend to track.
    return metric.compute(predictions=predicted_ids, references=labels, average="micro")

In [None]:
def train_model(model, args, train_dataset, val_dataset, enable_unfreezing, total_epochs, unfreeze_schedule, tokenizer=None):
    """Fine-tune ``model`` with a Trainer and return the trainer.

    Args:
        model: Model to train; also handed to ``GradualUnfreezeCallback``.
        args: ``TrainingArguments`` instance for the run.
        train_dataset: Dataset used for training.
        val_dataset: Dataset used for evaluation during training.
        enable_unfreezing: Forwarded to ``GradualUnfreezeCallback``.
        total_epochs: Forwarded to ``GradualUnfreezeCallback``.
        unfreeze_schedule: Forwarded to ``GradualUnfreezeCallback``.
        tokenizer: Tokenizer to attach to the trainer. Defaults to the
            tokenizer of the notebook-level ``dataset`` so existing calls
            keep working.

    Returns:
        The ``Trainer`` after ``train()`` has finished.
    """
    import inspect

    if tokenizer is None:
        tokenizer = dataset.get_tokenizer()

    # Constructing the callback may already freeze parameters, so do it
    # before the trainer is created (same order as before).
    gradual_unfreeze_callback = GradualUnfreezeCallback(
        model, enable_unfreezing, total_epochs, unfreeze_schedule
    )

    # Newer transformers releases take the tokenizer as `processing_class`
    # and deprecate `tokenizer`; pick whichever keyword this version accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[gradual_unfreeze_callback],
        **{tokenizer_kwarg: tokenizer},
    )

    trainer.train()

    return trainer

In [None]:
def plot_losses(trainer):
    """Plot training and validation loss against epoch from the trainer's log history.

    Training loss and validation loss are logged at different moments (and
    usually at different frequencies), so each curve is plotted against the
    epochs of its own log entries rather than sharing one x-axis list.

    Args:
        trainer: Object exposing ``state.log_history`` as a list of dicts.
    """
    logs = trainer.state.log_history

    train_points = [(log["epoch"], log["loss"]) for log in logs if "loss" in log and "epoch" in log]
    val_points = [(log["epoch"], log["eval_loss"]) for log in logs if "eval_loss" in log and "epoch" in log]

    train_epochs = [epoch for epoch, _ in train_points]
    train_loss = [value for _, value in train_points]
    val_epochs = [epoch for epoch, _ in val_points]
    val_loss = [value for _, value in val_points]

    plt.figure(figsize=(8, 5))
    plt.plot(train_epochs, train_loss, label="Training Loss")
    # Markers keep the validation curve visible even with very few points.
    plt.plot(val_epochs, val_loss, marker="o", label="Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Train vs Validation Loss")
    plt.legend()
    plt.grid(True)
    plt.show()


def plot_eval_metric(trainer, metric_name='eval_f1'):
    """Plot one logged evaluation metric against epoch.

    Args:
        trainer: Object exposing ``state.log_history`` as a list of dicts.
        metric_name: Key to look up in the log entries. If that exact key is
            never logged but ``"eval_" + metric_name`` is, the prefixed key is
            used instead, so both ``"f1"`` and ``"eval_f1"`` work.
    """
    logs = trainer.state.log_history

    # Resolve an unprefixed name (e.g. "f1") to the logged "eval_" key.
    key = metric_name
    prefixed = f"eval_{metric_name}"
    if not any(key in log for log in logs) and any(prefixed in log for log in logs):
        key = prefixed

    points = [(log["epoch"], log[key]) for log in logs if key in log and "epoch" in log]
    epochs = [epoch for epoch, _ in points]
    metric_vals = [value for _, value in points]

    plt.figure(figsize=(8, 5))
    plt.plot(epochs, metric_vals, label=f"{key.upper()} Score")
    plt.xlabel("Epoch")
    plt.ylabel(key.upper())
    plt.title(f"{key.upper()} Over Epochs")
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Run fine-tuning. With enable_unfreezing=False the callback neither freezes
# nor unfreezes anything, so unfreeze_schedule is ignored in this run (its
# entries also lie beyond total_epochs as configured above).
# Note: despite its name, `train_log` holds the Trainer returned by train_model.
train_log = train_model(model=model,
                        args=args,
                        train_dataset=train_dataset,
                        val_dataset=val_dataset,
                        enable_unfreezing=False,
                        total_epochs=total_epochs,
                        unfreeze_schedule=[5, 10, 20]
                        )

plot_losses(train_log)
# Evaluation metrics are logged under an "eval_" prefix, so plot that key.
plot_eval_metric(train_log, f"eval_{metric_name}")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [21]:
# `train_log` is the object returned by train_model; persist its model to disk.
train_log.save_model("./models/myFinetunedModel") # for saving your model

AttributeError: 'NoneType' object has no attribute 'save_model'

In [None]:
# Score the fine-tuned model on the held-out test split via a text-classification pipeline.
classifier = pipeline(
    "text-classification",
    model=model,
    # Use the dataset's tokenizer: no standalone `tokenizer` variable is
    # defined in this notebook, so the bare name fails on a fresh kernel.
    tokenizer=dataset.get_tokenizer(),
    # Reuse the device chosen for the model instead of hard-coding "cuda:0".
    device=device,
)
# Materialise the columns as plain lists so the call does not depend on the
# column container type returned by the installed `datasets` version.
test_texts = list(test_dataset['text'])
test_labels = list(test_dataset['label'])

results = classifier(test_texts, max_length=max_len, padding="max_length", truncation=True)
dfResults = pd.DataFrame.from_dict(results)
# Pipeline labels look like "LABEL_<id>"; strip the prefix and convert to
# integers so predictions and references share the same type.
dfResults['label'] = dfResults['label'].str.replace('LABEL_', '', regex=False).astype(int)
f1 = metric.compute(predictions=dfResults['label'].tolist(), references=test_labels, average='micro')
print(f1)

Device set to use cuda:0


{'f1': 0.85404}


In [None]:
"""
Without extra linear layer all unfrozen finetuning:
Epoch	Training Loss	Validation Loss	F1
1	0.350500	0.337584	0.855299
2	0.321500	0.335361	0.861731
3	0.330800	0.342808	0.865917
all unfrozen


Without extra linear layer gradual finetuning:
Epoch	Training Loss	Validation Loss	F1
1	0.673100	0.665193	0.631659
2	0.659400	0.651554	0.639853
3	0.657000	0.648236	0.638958


With extra linear layer all unfrozen finetuning:
Epoch	Training Loss	Validation Loss	F1
1	0.377300	0.358042	0.839104
2	0.351900	0.336490	0.848758
3	0.339900	0.332513	0.850810"""