### **Finetuning dense GPT-2**

In [None]:
import torch
import torch.nn.utils.prune as prune
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.modeling_utils import Conv1D
import torch.nn.utils.prune as prune

In [None]:
# Tokenizer: the end-of-sequence token doubles as the padding token
# (presumably because the GPT-2 tokenizer ships without one - confirm).
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Pretrained dense GPT-2; its config is given the matching pad id.
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.config.pad_token_id = model.config.eos_token_id

In [None]:
from datasets import load_dataset

# Load the "wikitext-2-raw-v1" configuration of the "wikitext" dataset; the
# result is indexed by split name further down (e.g. ["train"]).
lm_dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [None]:
from itertools import chain
# Length, in tokens, of every chunk produced by group_texts.
block_size = 512

def tokenize(examples):
    """Tokenize the ``"text"`` entry of a batch without truncating it.

    Args:
        examples: Mapping with a ``"text"`` entry, as handed over by a
            batched ``map`` call.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=False,
        return_special_tokens_mask=False,
    )
    return encoded

# Run `tokenize` over the dataset in batches and drop the raw "text" column
# so that only the tokenizer's outputs remain.
tokenized = lm_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"]
)

def group_texts(examples, chunk_len=None):
    """Concatenate a batch of token-id lists and cut it into equal chunks.

    All lists under ``examples["input_ids"]`` are joined end to end and then
    split into consecutive chunks. Trailing tokens that do not fill a whole
    chunk are dropped.

    Args:
        examples: Mapping with an ``"input_ids"`` entry holding an iterable
            of token-id lists.
        chunk_len: Optional chunk length. When ``None`` (the default, and
            what a plain ``map`` call uses) the module-level ``block_size``
            is used.

    Returns:
        A dict with ``"input_ids"`` (the chunks), ``"attention_mask"``
        (all ones, same shape) and ``"labels"`` (a copy of the chunks).
        Every inner list is a distinct object, so mutating one row never
        affects another row or another column.

    Raises:
        ValueError: If the effective chunk length is not positive.
    """
    size = block_size if chunk_len is None else chunk_len
    if size <= 0:
        raise ValueError(f"chunk length must be positive, got {size}")

    all_ids = list(chain.from_iterable(examples["input_ids"]))
    # Floor to a multiple of `size`; the incomplete tail is discarded.
    usable = (len(all_ids) // size) * size
    chunks = [all_ids[start:start + size] for start in range(0, usable, size)]
    return {
        "input_ids": chunks,
        # One fresh list per row; `[[1] * size] * n` would repeat the SAME
        # list object n times.
        "attention_mask": [[1] * size for _ in chunks],
        # Copy each row; a shallow `chunks.copy()` would share row objects
        # with "input_ids".
        "labels": [list(chunk) for chunk in chunks],
    }

# Re-chunk the tokenized data into fixed-length blocks. The existing columns
# are removed so that only what group_texts returns is kept.
lm_splits = tokenized.map(
    group_texts,
    remove_columns=tokenized["train"].column_names,
    batched=True,
)

# Hold out 10% of the chunked "train" split, with a fixed seed, for evaluation.
train_ds, eval_ds = (
    lm_splits["train"]
    .train_test_split(seed=42, test_size=0.1)
    .values()
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator built around the tokenizer, with masked-LM mode switched off.
lm_collator  = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
from transformers import Trainer, TrainingArguments
import math

# Settings for fine-tuning the dense model. Evaluation, checkpointing and
# logging are all scheduled once per epoch.
training_args = TrainingArguments(
    # output and checkpoint retention
    output_dir="./gpt2-dense-wikitext",
    save_total_limit=1,
    # phases to run
    do_train=True,
    do_eval=True,
    # per-epoch cadence
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # optimisation
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=200,
    lr_scheduler_type="linear",
)

lm_trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=lm_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
)

# Train, save the final model, then evaluate on the held-out split.
lm_trainer.train()
lm_trainer.save_model()
res = lm_trainer.evaluate()

import math
# Report exp(eval_loss) as the perplexity.
print("Perplexity:", math.exp(res["eval_loss"]))

### **Sparsifying GPT-2**

In [None]:
import torch
import torch.nn.utils.prune as prune
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.modeling_utils import Conv1D
import torch.nn.utils.prune as prune

# Fresh pretrained GPT-2 and tokenizer; EOS is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.config.pad_token_id = model.config.eos_token_id

# Collect the `weight` tensor of every Linear / Conv1D sub-module.
parameters_to_prune = [
    (module, 'weight')
    for module in model.modules()
    if isinstance(module, (torch.nn.Linear, Conv1D))
]

# Global magnitude (L1) pruning across all collected weights at once.
# `amount` sets the sparsity level (e.g. 0.1 for 10%, 0.5 for 50%).
prune.global_unstructured(
    parameters_to_prune,
    amount=0.50,
    pruning_method=prune.L1Unstructured,
)

# Make the pruning permanent: fold each mask into its weight and drop the
# pruning re-parametrization.
for module, param_name in parameters_to_prune:
    prune.remove(module, param_name)

# Persist the pruned model together with its tokenizer.
output_dir = "gpt2-unstructured-50"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Pruned model saved to {output_dir}/")

In [None]:
# Check sparsification: count exact zeros over all pruned weight tensors.
total_zeros = sum(
    int((module.weight == 0).sum()) for module, _ in parameters_to_prune
)
total_elems = sum(module.weight.numel() for module, _ in parameters_to_prune)
print(f"Global sparsity: {total_zeros}/{total_elems} = {total_zeros/total_elems:.2%}")

### **Full Finetuning**

In [None]:
from transformers import AutoTokenizer
from itertools import chain

# Load the tokenizer files stored in the pruned-model directory.
tokenizer = AutoTokenizer.from_pretrained("gpt2-unstructured-50")
# Length, in tokens, of every chunk produced by group_texts.
block_size = 512

def tokenize(examples):
    """Tokenize the ``"text"`` entry of a batch without truncating it.

    Args:
        examples: Mapping with a ``"text"`` entry, as handed over by a
            batched ``map`` call.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=False,
        return_special_tokens_mask=False,
    )
    return encoded

# Run `tokenize` over the dataset in batches and drop the raw "text" column
# so that only the tokenizer's outputs remain.
tokenized = lm_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"]
)

def group_texts(examples, chunk_len=None):
    """Concatenate a batch of token-id lists and cut it into equal chunks.

    All lists under ``examples["input_ids"]`` are joined end to end and then
    split into consecutive chunks. Trailing tokens that do not fill a whole
    chunk are dropped.

    Args:
        examples: Mapping with an ``"input_ids"`` entry holding an iterable
            of token-id lists.
        chunk_len: Optional chunk length. When ``None`` (the default, and
            what a plain ``map`` call uses) the module-level ``block_size``
            is used.

    Returns:
        A dict with ``"input_ids"`` (the chunks), ``"attention_mask"``
        (all ones, same shape) and ``"labels"`` (a copy of the chunks).
        Every inner list is a distinct object, so mutating one row never
        affects another row or another column.

    Raises:
        ValueError: If the effective chunk length is not positive.
    """
    size = block_size if chunk_len is None else chunk_len
    if size <= 0:
        raise ValueError(f"chunk length must be positive, got {size}")

    all_ids = list(chain.from_iterable(examples["input_ids"]))
    # Floor to a multiple of `size`; the incomplete tail is discarded.
    usable = (len(all_ids) // size) * size
    chunks = [all_ids[start:start + size] for start in range(0, usable, size)]
    return {
        "input_ids": chunks,
        # One fresh list per row; `[[1] * size] * n` would repeat the SAME
        # list object n times.
        "attention_mask": [[1] * size for _ in chunks],
        # Copy each row; a shallow `chunks.copy()` would share row objects
        # with "input_ids".
        "labels": [list(chunk) for chunk in chunks],
    }

# Re-chunk the tokenized data into fixed-length blocks. The existing columns
# are removed so that only what group_texts returns is kept.
lm_splits = tokenized.map(
    group_texts,
    remove_columns=tokenized["train"].column_names,
    batched=True,
)

# Hold out 10% of the chunked "train" split, with a fixed seed, for evaluation.
train_ds, eval_ds = (
    lm_splits["train"]
    .train_test_split(seed=42, test_size=0.1)
    .values()
)

In [None]:
from transformers import Trainer, TrainingArguments
import math

from transformers import AutoModelForCausalLM

# Start from the pruned checkpoint on disk instead of whatever `model`
# currently holds in the session. This keeps the cell correct when it is
# re-run or executed out of order, and matches how the later sections load
# their starting weights.
model = AutoModelForCausalLM.from_pretrained("gpt2-unstructured-50")
model.config.pad_token_id = model.config.eos_token_id

# NOTE(review): nothing in this cell re-applies a pruning mask, so gradient
# updates can turn the zeroed weights non-zero again - confirm that dense
# fine-tuning of the pruned weights is what is intended.

# Same schedule as the dense run, plus fp16 mixed precision.
training_args = TrainingArguments(
    output_dir="training_output_gpt2_50_wikitext/train_eval",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,

    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,

    num_train_epochs=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_steps=200,
    fp16=True,
)

lm_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=lm_collator,
    tokenizer=tokenizer,
)

# Train, save the final model, then evaluate on the held-out split.
lm_trainer.train()
lm_trainer.save_model()

res = lm_trainer.evaluate()
# Report exp(eval_loss) as the perplexity (`math` is imported at the top of
# this cell).
print("Perplexity:", math.exp(res["eval_loss"]))

### **Linear Finetuning**

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Stock GPT-2 tokenizer, with EOS reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2") 
tokenizer.pad_token = tokenizer.eos_token

# Pruned checkpoint as the starting point; its config gets the matching pad id.
model = AutoModelForCausalLM.from_pretrained("gpt2-unstructured-50")
model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Code adapted from a Stack Overflow answer on freezing specific layers in PyTorch
# Accessed on: 01.05.2025
# Link: https://stackoverflow.com/questions/62523912/freeze-certain-layers-of-an-existing-model-in-pytorch


# Switch gradients off everywhere, then back on for the output head only.
# NOTE(review): if lm_head shares its weight tensor with the input embedding
# (weight tying), re-enabling it also unfreezes that embedding - check the
# names printed below.
for weight in model.parameters():
    weight.requires_grad = False
for weight in model.lm_head.parameters():
    weight.requires_grad = True

# List what will actually be updated during training.
print("Trainable params:")
for param_name in (n for n, p in model.named_parameters() if p.requires_grad):
    print("  ", param_name)

In [None]:
# Settings for training with only the un-frozen parameters. Note that weight
# decay is zero here.
training_args = TrainingArguments(
    # output and checkpoint retention
    output_dir="training_output_gpt2_50_linear_wikitext/train_eval",
    save_total_limit=1,
    # phases to run
    do_train=True,
    do_eval=True,
    # per-epoch cadence
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # optimisation
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.00,
    warmup_steps=200,
    lr_scheduler_type="linear",
    fp16=True,
)

lm_trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=lm_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
)

# Train, save the final model, then evaluate on the held-out split.
lm_trainer.train()
lm_trainer.save_model()
res = lm_trainer.evaluate()

import math
# Report exp(eval_loss) as the perplexity.
print("Perplexity:", math.exp(res["eval_loss"]))

### **Full Finetuning with Knowledge Distillation (KD)**

In [None]:
# Stock GPT-2 tokenizer, with EOS reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Teacher: the fine-tuned dense checkpoint. Student: the pruned checkpoint.
# Both are moved to the GPU right after loading.
teacher = AutoModelForCausalLM.from_pretrained("gpt2-dense-wikitext").cuda()
student = AutoModelForCausalLM.from_pretrained("gpt2-unstructured-50").cuda()

In [None]:
# Code citation start [1]
# Partially copied (with minor modifications) from:
# Divesh R. Kubal, "Knowledge Distillation Implementation End to End"
# GitHub, accessed on 03.05.2025
# https://github.com/DiveshRKubal/transformers_model_production/blob/main/knowledge_distillation_implementation_end_to_end.ipynb

import torch
import torch.nn as nn
import torch.nn.functional as F


class KDTrainingArguments(TrainingArguments):
    """``TrainingArguments`` carrying two extra distillation settings.

    Args:
        alpha: Stored unchanged as ``self.alpha``. ``KDTrainer.compute_loss``
            uses it as the weight of the student's own loss; the
            distillation term is weighted by ``1 - alpha``.
        temperature: Stored unchanged as ``self.temperature``.
            ``KDTrainer.compute_loss`` divides both models' logits by it.

    All other positional and keyword arguments are forwarded to
    ``TrainingArguments``.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # Only the two KD settings are consumed here.
        super().__init__(*args, **kwargs)
        self.temperature = temperature
        self.alpha = alpha

class KDTrainer(Trainer):
    """``Trainer`` whose training loss blends cross-entropy with distillation.

    While the model is in training mode the loss is::

        alpha * CE(student) + (1 - alpha) * T**2 * KL(teacher || student)

    with ``alpha`` and ``T`` read from ``self.args.alpha`` and
    ``self.args.temperature``. While the model is in eval mode only the
    student's cross-entropy is returned, so the reported ``eval_loss`` is a
    plain language-modelling loss that can be exponentiated into a perplexity.

    Args:
        teacher_model: Model that provides the target distribution. Required
            (keyword-only). It is switched to eval mode and all of its
            parameters are frozen. This class does not move it to a device,
            so the caller must place it where the inputs will live.

    Raises:
        ValueError: If ``teacher_model`` is not supplied.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        # Fail early with a clear message instead of an AttributeError on None.
        if teacher_model is None:
            raise ValueError("KDTrainer requires a `teacher_model`.")
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model.eval()
        for p in self.teacher_model.parameters():
            p.requires_grad = False

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the loss for one batch (see the class docstring).

        Args:
            model: The student model being trained or evaluated.
            inputs: Batch dict passed as keyword arguments to the model. It
                must contain ``labels`` so the student returns a loss.
            return_outputs: If true, return ``(loss, student_outputs)``.
            **kwargs: Extra arguments supplied by the ``Trainer``; ignored.

        Returns:
            The scalar loss, or ``(loss, student_outputs)`` when
            ``return_outputs`` is true.
        """
        outputs_student = model(**inputs)
        loss_ce = outputs_student.loss

        # The Trainer also routes evaluation through compute_loss. Reporting
        # the blended KD objective there would make exp(eval_loss) meaningless
        # as a perplexity, so evaluation uses cross-entropy alone.
        if not model.training:
            return (loss_ce, outputs_student) if return_outputs else loss_ce

        logits_student = outputs_student.logits

        # The teacher only needs to produce logits; withholding the labels
        # avoids computing a loss that is never used.
        teacher_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        with torch.no_grad():
            logits_teacher = self.teacher_model(**teacher_inputs).logits

        T = self.args.temperature
        vocab_size = logits_student.size(-1)
        # Flatten to (tokens, vocab) so that "batchmean" averages over token
        # positions. On the original 3-D logits it would divide by the batch
        # size only, leaving the KL summed over the sequence while the
        # cross-entropy is a per-token mean.
        kl = F.kl_div(
            F.log_softmax(logits_student.reshape(-1, vocab_size) / T, dim=-1),
            F.softmax(logits_teacher.reshape(-1, vocab_size) / T, dim=-1),
            reduction="batchmean",
        ) * (T * T)  # T**2 rescales the softened-logit gradients

        alpha = self.args.alpha
        loss = alpha * loss_ce + (1 - alpha) * kl

        return (loss, outputs_student) if return_outputs else loss
    
# Code citation end [1]

In [None]:
from transformers import DataCollatorForLanguageModeling
# Collator rebuilt around the tokenizer loaded for this section.
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Distillation run: standard per-epoch schedule plus the two KD settings.
training_args = KDTrainingArguments(
    # output and checkpoint retention
    output_dir="./training_output_gpt2_50_mit_KD_wikitext",
    save_total_limit=1,
    # phases to run
    do_train=True,
    do_eval=True,
    # per-epoch cadence
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # optimisation
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    learning_rate=1.5e-4,
    weight_decay=0.01,
    warmup_steps=200,
    lr_scheduler_type="linear",
    fp16=True,
    # distillation
    temperature=2.0,
    alpha=0.6,
)

distil_trainer = KDTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    data_collator=lm_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
)

# Train the student, then evaluate on the held-out split.
distil_trainer.train()
res = distil_trainer.evaluate()

import math
# Report exp of the evaluation loss returned by the trainer.
print("Distilled Perplexity KD:", math.exp(res["eval_loss"]))

### **Linear Finetuning with Knowledge Distillation (KD)**

In [None]:
# Stock GPT-2 tokenizer, with EOS reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Teacher: the fine-tuned dense checkpoint. Student: the pruned checkpoint.
# Both are moved to the GPU right after loading.
teacher = AutoModelForCausalLM.from_pretrained("gpt2-dense-wikitext").cuda()
student = AutoModelForCausalLM.from_pretrained("gpt2-unstructured-50").cuda()

In [None]:
# Switch gradients off for the whole student, then back on for its output
# head only.
# NOTE(review): if lm_head shares its weight tensor with the input embedding
# (weight tying), re-enabling it also unfreezes that embedding - confirm.
for weight in student.parameters():
    weight.requires_grad = False
for weight in student.lm_head.parameters():
    weight.requires_grad = True

In [None]:
from transformers import DataCollatorForLanguageModeling
# Collator rebuilt around the tokenizer loaded for this section.
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Distillation run with the same schedule and KD settings as the full KD
# run; only the output directory differs.
training_args = KDTrainingArguments(
    # output and checkpoint retention
    output_dir="./Linear_KD",
    save_total_limit=1,
    # phases to run
    do_train=True,
    do_eval=True,
    # per-epoch cadence
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # optimisation
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    learning_rate=1.5e-4,
    weight_decay=0.01,
    warmup_steps=200,
    lr_scheduler_type="linear",
    fp16=True,
    # distillation
    temperature=2.0,
    alpha=0.6,
)

distil_trainer = KDTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    data_collator=lm_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
)

# Train the student, then evaluate on the held-out split.
distil_trainer.train()
res = distil_trainer.evaluate()

import math
# Report exp of the evaluation loss returned by the trainer.
print("Distilled Perplexity KD:", math.exp(res["eval_loss"]))