### **Finetuning dense BERT base**

In [None]:
from datasets import load_dataset

# Load the "sst2" configuration of the "glue" dataset (all available splits).
dataset = load_dataset(path="glue", name="sst2")

In [None]:
from transformers import BertTokenizerFast

# Fast tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


def tokenize(example):
    """Tokenize the ``"sentence"`` field of an example (or a batch of examples).

    Every sequence is truncated and padded to a fixed length of 128 tokens, so
    all encoded examples have the same size.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["sentence"], **encoding_options)

# Tokenize in batches, then drop the columns that are no longer needed.
tokenized = dataset.map(tokenize, batched=True).remove_columns(["sentence", "idx"])
# Hand out PyTorch tensors when the dataset is indexed.
tokenized.set_format("torch")


In [None]:
from transformers import BertForSequenceClassification

# Dense baseline: pretrained BERT base with a two-class classification head.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    """Compute classification accuracy for an evaluation result.

    ``p`` must expose ``predictions`` (scores; the arg-max over the last axis
    is taken as the predicted class) and ``label_ids`` (reference labels).
    Returns a dict with the single key ``"accuracy"``.
    """
    predicted_classes = p.predictions.argmax(-1)
    accuracy = accuracy_score(p.label_ids, predicted_classes)
    return {"accuracy": accuracy}

In [None]:
from transformers import TrainingArguments, Trainer

# Directory used both for intermediate checkpoints and for the final model.
dense_output_dir = "./bert-base-dense-sst2"

# Hyperparameters for fine-tuning the dense baseline on SST-2.
dense_hyperparameters = dict(
    output_dir=dense_output_dir,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=100,
    # Model selection is driven by the accuracy reported by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = TrainingArguments(**dense_hyperparameters)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
# Persist the fine-tuned dense model; the KD sections load it as the teacher.
trainer.save_model(dense_output_dir)


### **Sparsifying the Pretrained BERT base Model**

In [None]:
import torch
import torch.nn.utils.prune as prune
from transformers import BertTokenizerFast, BertForPreTraining

# Load the pretrained (not fine-tuned) BERT base checkpoint. This rebinds
# `model`, which until now referred to the fine-tuned dense classifier.
model = BertForPreTraining.from_pretrained("bert-base-uncased")

In [None]:
import torch.nn.utils.prune as prune

# Collect the weight matrices of every linear layer inside the BERT backbone
# (`model.bert`). The pretraining heads under `model.cls` are deliberately
# excluded:
#   * they are not reused by the sequence-classification models that are
#     fine-tuned from this checkpoint later, yet they would take part in the
#     global magnitude threshold and distort the sparsity of the backbone;
#   * the masked-LM decoder weight is tied to the word-embedding matrix, so
#     pruning it would silently zero out embedding entries as well.
to_prune = [
    (module, "weight")
    for module in model.bert.modules()
    if isinstance(module, torch.nn.Linear)
]

# Global unstructured magnitude pruning: across all collected matrices taken
# together, the 80% of entries with the smallest absolute value are masked.
prune.global_unstructured(to_prune, pruning_method=prune.L1Unstructured, amount=0.8)

# Make the pruning permanent: fold each mask into its weight tensor and drop
# the `weight_orig` / `weight_mask` re-parametrization, so the saved
# checkpoint contains plain `weight` tensors.
for module, param_name in to_prune:
    prune.remove(module, param_name)


In [None]:
# Check the sparsification
def count_zero_weights(model):
    """Print the percentage of zero entries among the trainable weights.

    A parameter is counted when its name contains ``"weight"`` and it has
    ``requires_grad=True``; all other parameters are ignored.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        None. The result is printed as ``Sparsity: xx.xx%``. When no parameter
        matches, a notice is printed instead of dividing by zero.
    """
    total, zero = 0, 0
    for name, param in model.named_parameters():
        if "weight" in name and param.requires_grad:
            total += param.numel()
            zero += (param == 0).sum().item()

    # Guard against models without any matching parameter.
    if total == 0:
        print("Sparsity: n/a (no trainable weight parameters found)")
        return
    print(f"Sparsity: {100 * zero / total:.2f}%")

# Report the share of zero-valued weights in the pruned model.
count_zero_weights(model)

In [None]:
# Directory that receives the pruned model together with the tokenizer files,
# so that later sections can reload both from one place.
pruned_dir = "bert_80"
model.save_pretrained(pruned_dir)
tokenizer.save_pretrained(pruned_dir)

### **Full Finetuning**

In [None]:
from transformers import AutoTokenizer


# Directory holding the pruned checkpoint and its tokenizer.
model_path = "bert_80"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Pruned checkpoint loaded into a sequence-classification model with two labels.
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

In [None]:
from transformers import Trainer, TrainingArguments, default_data_collator

# Hyperparameters for fine-tuning all weights of the pruned model.
# NOTE(review): nothing in this cell re-applies the pruning masks, so weights
# zeroed by pruning may become non-zero again during training. Confirm whether
# sparsity is meant to be preserved in this experiment.
full_ft_hyperparameters = dict(
    output_dir="./training_output_80_sst2",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    learning_rate=1.5e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
)
training_args = TrainingArguments(**full_ft_hyperparameters)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
# Final evaluation on the validation split; the metrics are the cell output.
trainer.evaluate()

### **Linear Finetuning**

In [None]:
from transformers import AutoTokenizer


# Directory holding the pruned checkpoint and its tokenizer.
model_path = "bert_80"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Fresh copy of the pruned checkpoint, so this experiment does not start from
# the weights modified in the previous section.
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

In [None]:
# Code adapted from a Stack Overflow answer on freezing specific layers in PyTorch
# Accessed on: 01.05.2025
# Link: https://stackoverflow.com/questions/62523912/freeze-certain-layers-of-an-existing-model-in-pytorch

# Freeze every parameter whose name does not contain "classifier", so that
# only the classification head receives gradient updates.
for name, param in model.named_parameters():
    if "classifier" in name:
        continue
    param.requires_grad_(False)

# List the parameters that are still trainable
trainable = [
    param_name
    for param_name, tensor in model.named_parameters()
    if tensor.requires_grad
]
print("Trainable parameters:", trainable)

In [None]:
from transformers import Trainer, TrainingArguments, default_data_collator

# Hyperparameters for training only the classification head of the pruned
# model (all other parameters were frozen in the previous cell).
linear_ft_hyperparameters = dict(
    output_dir="./training_output_80 LI_sst2",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    learning_rate=1.5e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
)
training_args = TrainingArguments(**linear_ft_hyperparameters)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
# Final evaluation on the validation split; the metrics are the cell output.
trainer.evaluate()

### **Full Finetuning with KD**

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig


student_checkpoint = "bert_80"

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on an unconditional `.cuda()` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Student: the pruned checkpoint with a two-class head. `num_labels` is set
# explicitly, as in the other fine-tuning sections of this notebook.
student_config = AutoConfig.from_pretrained(student_checkpoint, num_labels=2)
student = AutoModelForSequenceClassification.from_pretrained(
    student_checkpoint,
    config=student_config,
).to(device)

# Teacher: the dense model fine-tuned and saved in the first section.
teacher = AutoModelForSequenceClassification.from_pretrained("bert-base-dense-sst2").to(device)


In [None]:
# Code citation start [1]
# Partially copied (with minor modifications) from:
# Divesh R. Kubal, "Knowledge Distillation Implementation End to End"
# GitHub, accessed on 03.05.2025
# https://github.com/DiveshRKubal/transformers_model_production/blob/main/knowledge_distillation_implementation_end_to_end.ipynb

import torch
import torch.nn as nn
import torch.nn.functional as F


class KDTrainingArguments(TrainingArguments):
    """``TrainingArguments`` carrying two extra knowledge-distillation settings.

    Args:
        alpha: Weight of the student's own loss in the combined loss; the
            distillation term is weighted with ``1 - alpha``. Defaults to 0.5.
        temperature: Value the student and teacher logits are divided by
            before the softmax in the distillation term. Defaults to 2.0.

    All remaining positional and keyword arguments are forwarded unchanged to
    ``TrainingArguments``.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        # Stored as plain attributes; the trainer reads them from `self.args`.
        self.temperature = temperature
        self.alpha = alpha

class KDTrainer(Trainer):
    """Trainer that mixes the student's task loss with a distillation loss.

    The training loss is

        alpha * loss_student + (1 - alpha) * T^2 * KL(teacher_T || student_T)

    where ``alpha`` and ``T`` (temperature) are read from ``self.args`` and the
    subscript ``T`` denotes a softmax over logits divided by the temperature.

    Args:
        teacher_model: Frozen model whose logits serve as soft targets.
            Required, although it is keyword-only with a ``None`` default.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.

    Raises:
        ValueError: If ``teacher_model`` is not provided.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        if teacher_model is None:
            raise ValueError("KDTrainer requires a `teacher_model`.")
        super().__init__(*args, **kwargs)

        # The teacher is used for inference only: eval mode, no gradients.
        self.teacher_model = teacher_model.eval()
        for p in self.teacher_model.parameters():
            p.requires_grad = False
        # The base class only handles the student model; place the teacher on
        # the device the training batches are moved to.
        self.teacher_model.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the combined loss (and the student outputs if requested)."""
        outputs_student = model(**inputs)
        loss_ce = outputs_student.loss
        logits_student = outputs_student.logits

        # Only the teacher's logits are needed, so the labels are not passed
        # to it and it does not compute a loss of its own.
        teacher_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        with torch.no_grad():
            logits_teacher = self.teacher_model(**teacher_inputs).logits

        # Temperature-softened KL divergence, rescaled by T^2.
        T = self.args.temperature
        kl = F.kl_div(
            F.log_softmax(logits_student / T, dim=-1),
            F.softmax(logits_teacher / T, dim=-1),
            reduction="batchmean",
        ) * (T * T)

        alpha = self.args.alpha
        loss = alpha * loss_ce + (1 - alpha) * kl

        return (loss, outputs_student) if return_outputs else loss
    
# Code citation end [1]

In [None]:
# Hyperparameters for fine-tuning all student weights with distillation.
training_args = KDTrainingArguments(
    output_dir="./KD",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # The loss is alpha * CE + (1 - alpha) * KD. With alpha=1.0 the
    # distillation term has weight zero and the teacher is ignored, so an
    # equal mix is used to make this an actual KD run.
    alpha=0.5,
    temperature=2.0,
)

trainer = KDTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

trainer.train()
# Final evaluation on the validation split; the metrics are the cell output.
trainer.evaluate()

### **Linear Finetuning with KD**

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig


student_checkpoint = "bert_80"

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on an unconditional `.cuda()` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Student: a fresh copy of the pruned checkpoint with a two-class head.
# `num_labels` is set explicitly, as in the other fine-tuning sections.
student_config = AutoConfig.from_pretrained(student_checkpoint, num_labels=2)
student = AutoModelForSequenceClassification.from_pretrained(
    student_checkpoint,
    config=student_config,
).to(device)

# Teacher: the dense model fine-tuned and saved in the first section.
teacher = AutoModelForSequenceClassification.from_pretrained("bert-base-dense-sst2").to(device)


In [None]:
# Code adapted from a Stack Overflow answer on freezing specific layers in PyTorch
# Accessed on: 01.05.2025
# Link: https://stackoverflow.com/questions/62523912/freeze-certain-layers-of-an-existing-model-in-pytorch

# Freeze every student parameter whose name does not contain "classifier",
# so that only the classification head receives gradient updates.
for name, param in student.named_parameters():
    if "classifier" in name:
        continue
    param.requires_grad_(False)

# List the parameters that are still trainable
trainable = [
    param_name
    for param_name, tensor in student.named_parameters()
    if tensor.requires_grad
]
print("Trainable parameters:", trainable)

In [None]:
# Hyperparameters for training only the student's classification head with
# distillation (all other parameters were frozen in the previous cell).
training_args = KDTrainingArguments(
    # Separate directory so these checkpoints do not mix with, or overwrite,
    # those written to "./KD" by the full-finetuning KD run.
    output_dir="./KD_linear",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # The loss is alpha * CE + (1 - alpha) * KD. With alpha=1.0 the
    # distillation term has weight zero and the teacher is ignored, so an
    # equal mix is used to make this an actual KD run.
    alpha=0.5,
    temperature=2.0,
)

trainer = KDTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

trainer.train()
# Final evaluation on the validation split; the metrics are the cell output.
trainer.evaluate()