In [None]:
! pip install evaluate rouge_score

### **Finetuning dense T5-Small**

In [None]:
# Code citation start [1]
# Copied from Hugging Face Transformers documentation (Summarization)
# Author: Hugging Face
# Title: Summarization 
# Accessed on: 30.04.2025
# Link: https://huggingface.co/docs/transformers/en/tasks/summarization
# License: Apache 2.0
# License link: https://www.apache.org/licenses/LICENSE-2.0

In [4]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [51]:
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")
billsum = billsum.train_test_split(test_size=0.2)

In [52]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [53]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map: 100%|██████████| 248/248 [00:00<00:00, 637.88 examples/s]


In [8]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [9]:
import evaluate

rouge = evaluate.load("rouge")

In [10]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [11]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="finetuned_t5_small",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.71988,0.1327,0.038,0.1091,0.1092,20.0
2,No log,2.505391,0.1407,0.0476,0.1164,0.1161,20.0
3,No log,2.441553,0.1412,0.0485,0.1167,0.1165,20.0
4,No log,2.423877,0.1424,0.0491,0.1176,0.1174,20.0


In [None]:
# Code citation end [1]

### **Sparsification Pretrained Model T5-Small**

In [45]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.nn.utils import prune
import torch.nn.utils.prune as prune_utils
import torch.nn as nn
import torch

model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [46]:
modules_to_prune = []
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        modules_to_prune.append((module, 'weight'))

# Adjust amount to determine the sparsity level (e.g., 0.1 for 10%, 0.5 for 50%, etc.)
prune_utils.global_unstructured(
    modules_to_prune,
    pruning_method=prune_utils.L1Unstructured,
    amount=0.3  
)

for module, _ in modules_to_prune:
    prune.remove(module, 'weight')  

In [47]:
# Check the Sparsification
def count_zero_weights(model):
    total, zero = 0, 0
    for name, param in model.named_parameters():
        if "weight" in name and param.requires_grad:
            total += param.numel()
            zero += (param == 0).sum().item()
    print(f"Sparsity: {100 * zero / total:.2f}%")

count_zero_weights(model)

Sparsity: 29.99%


In [48]:
# Specify the directory where the pruned model will be saved
model.save_pretrained("t5-small_30")

### **Full Finetuning**

In [25]:
from transformers import AutoTokenizer

checkpoint ="t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [26]:
from transformers import DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model ="t5-small_30"

model = AutoModelForSeq2SeqLM.from_pretrained(model)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="FT_t5_small_30",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### **Linear Finetuning**

In [123]:
checkpoint = "t5-small_30"
tokenizer  = AutoTokenizer.from_pretrained("t5-small")
model      = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [124]:
# Code adapted from a Stack Overflow answer on freezing specific layers in PyTorch
# Accessed on: 01.05.2025
# Link: https://stackoverflow.com/questions/62523912/freeze-certain-layers-of-an-existing-model-in-pytorch


for name, param in model.named_parameters():
    param.requires_grad = False
for name, param in model.lm_head.named_parameters():
    param.requires_grad = True

In [30]:
print("Trainable parameters:")
for n, p in model.named_parameters():
    if p.requires_grad:
        print(" ", n)

Trainable parameters:
  shared.weight


In [31]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="LF_t5_small_30",   
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset= tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### **Full Finetuning mit KD**

In [148]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

teacher = T5ForConditionalGeneration.from_pretrained("finetuned_t5_small").eval().cuda()

student = AutoModelForSeq2SeqLM.from_pretrained("t5-small_30").train().cuda()

In [149]:
MAX_INPUT = 512
MAX_TARGET = 128


prefix = "summarize: "
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=MAX_TARGET, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_data = load_dataset("billsum", split="train[:2%]").map(preprocess_function, batched=True)
val_data = load_dataset("billsum", split="test[:2%]").map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=student)

In [150]:
import numpy as np
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    
    final_result = {}
    for k, v in result.items():
        if hasattr(v, "mid"):
            final_result[k] = round(v.mid.fmeasure, 4)
        else:
            final_result[k] = round(v, 4)

   
    prediction_lens = [len(pred.split()) for pred in decoded_preds]
    final_result["gen_len"] = np.mean(prediction_lens)

    return final_result

In [151]:
# Code citation start [1]
# Partially copied (with minor modifications) from:
# Divesh R. Kubal, "Knowledge Distillation Implementation End to End"
# GitHub, accessed on 03.05.2025
# https://github.com/DiveshRKubal/transformers_model_production/blob/main/knowledge_distillation_implementation_end_to_end.ipynb

import torch.nn.functional as F


class KDTrainingArguments(Seq2SeqTrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.temperature = temperature

class KDSeq2SeqTrainer(Seq2SeqTrainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model.eval()
        for p in self.teacher_model.parameters():
            p.requires_grad = False

    # note **kwargs here
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        # 1) student forward
        outputs_student = model(**inputs)
        loss_ce        = outputs_student.loss
        logits_student = outputs_student.logits

        # 2) teacher forward
        with torch.no_grad():
            outputs_teacher = self.teacher_model(**inputs)
        logits_teacher = outputs_teacher.logits

        # 3) distillation loss
        T     = self.args.temperature
        kl = nn.KLDivLoss(reduction="batchmean")(
            F.log_softmax(logits_student / T, dim=-1),
            F.softmax(logits_teacher / T, dim=-1),
        ) * (T * T)

        # 4) combined loss
        alpha = self.args.alpha
        loss = alpha * loss_ce + (1 - alpha) * kl


        return (loss, outputs_student) if return_outputs else loss
    
# Code citation end [1]

In [None]:
training_args = KDTrainingArguments(
    output_dir="FT-KD-T5-Small",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    predict_with_generate=True,
    alpha=0.3,       
    temperature=4.0,   
    fp16=True,
    save_total_limit=1,
)

trainer = KDSeq2SeqTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

### **Linear Finetuning mit KD**

In [127]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

teacher = T5ForConditionalGeneration.from_pretrained("finetuned_t5_small").eval().cuda()

student = AutoModelForSeq2SeqLM.from_pretrained("t5-small_30").train().cuda()

In [128]:
for name, param in student.named_parameters():
    param.requires_grad = False
for name, param in student.lm_head.named_parameters():
    param.requires_grad = True

In [129]:
print("Trainable parameters:")
for n, p in model.named_parameters():
    if p.requires_grad:
        print(" ", n)

Trainable parameters:
  shared.weight


In [131]:
MAX_INPUT = 512
MAX_TARGET = 128


prefix = "summarize: "
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=MAX_TARGET, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_data = load_dataset("billsum", split="train[:2%]").map(preprocess_function, batched=True)
val_data = load_dataset("billsum", split="test[:2%]").map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=student)

In [None]:
training_args = KDTrainingArguments(
    output_dir="LT-KD-T5-Small",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    predict_with_generate=True,
    alpha=0.3,       
    temperature=4.0,   
    fp16=True,
    save_total_limit=1,
)

trainer = KDSeq2SeqTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,9.261447,0.0,0.0,0.0,0.0,3.707692
2,No log,9.260193,0.0,0.0,0.0,0.0,3.615385
3,No log,9.257794,0.0,0.0,0.0,0.0,3.615385
4,No log,9.257625,0.0,0.0,0.0,0.0,3.615385


TrainOutput(global_step=96, training_loss=447.8892415364583, metrics={'train_runtime': 40.2349, 'train_samples_per_second': 37.679, 'train_steps_per_second': 2.386, 'total_flos': 205178171031552.0, 'train_loss': 447.8892415364583, 'epoch': 4.0})