In [58]:
!pip install torch transformers datasets evaluate rouge_score nltk

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=d6c76bfddb114a0b5ed107e4aaa56c19358233bce4383a0f2f5f041bb13dcc48
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [59]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from datasets import load_dataset
import evaluate
import torch.nn.functional as F
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [60]:
teacher_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama_v1.1")
teacher_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama_v1.1")

student_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
student_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

In [62]:
dataset = load_dataset("albertvillanova/meqsum")

# Fix the tokenizer padding issue
teacher_tokenizer.pad_token = teacher_tokenizer.eos_token
#student_tokenizer.pad_token = student_tokenizer.eos_token

train_set = dataset["train"]

def tokenize_dataset(example):
  teacher_input = teacher_tokenizer(example["CHQ"], padding="max_length", max_length=256, truncation=True, return_tensors="pt").to(device)
  student_input = student_tokenizer(example["CHQ"], padding="max_length", max_length=256, truncation=True, return_tensors="pt").to(device)
  labels = student_tokenizer(example["Summary"], padding="max_length", max_length=256, truncation=True, return_tensors="pt").to(device)
  return {
    "teacher_input": teacher_input.input_ids,
    "student_input": student_input.input_ids,
    "labels": labels.input_ids,
  }

train_dataset = train_set.map(tokenize_dataset, batched=True)

print(train_dataset)
print(f"Example teacher input (tokenized): {train_dataset[0]['teacher_input']}")
print(f"Example student input (tokenized): {train_dataset[0]['student_input']}")
print(f"Example label (tokenized): {train_dataset[0]['labels']}")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['CHQ', 'Summary', 'File', 'teacher_input', 'student_input', 'labels'],
    num_rows: 1000
})
Example teacher input (tokenized): [1, 27092, 17637, 29901, 1058, 322, 988, 304, 679, 15093, 29875, 7485, 457, 448, 360, 13, 2303, 1799, 10461, 29901, 306, 817, 29914, 29893, 424, 304, 1073, 1058, 767, 1137, 29879, 312, 1295, 15018, 29875, 7485, 457, 29889, 1619, 5260, 28402, 338, 3063, 363, 263, 716, 11421, 322, 526, 451, 2805, 278, 7786, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Example student input (tokenized): [180, 10134, 683, 14196, 10, 113, 11, 213, 12, 129, 197, 20408, 7196, 15, 3, 18, 309, 283, 10087, 11187, 10, 27, 174, 87, 3877, 17, 12, 214, 113, 388, 76, 89, 7, 75, 2905, 7, 1064, 20408, 7196, 15, 5, 499, 20723, 19, 479, 21, 3, 9, 126, 1899, 11, 33, 59, 652, 8, 1100, 1, 0, 0, 0,

In [63]:
def skld_loss(teacher_logits, student_logits, alpha=0.8):
  # Ensure logits have the same shape before applying softmax
  min_length = min(teacher_logits.shape[2], student_logits.shape[2])  # Find minimum length in dimension 2
  teacher_logits = teacher_logits[:, :, :min_length]  # Slice to match dimensions
  student_logits = student_logits[:, :, :min_length]  # Slice to match dimensions

  teacher_soft_labels = F.softmax(teacher_logits, dim=-1)
  student_soft_labels = F.softmax(student_logits, dim=-1)

  combined = alpha * teacher_soft_labels + (1 - alpha) * student_soft_labels

  loss = (combined * torch.log((teacher_soft_labels + 1e-8) / (student_soft_labels + 1e-8))).sum(dim=1).mean()
  return loss

def kld_loss(teacher_logits, student_logits):
  teacher_soft_labels = F.log_softmax(teacher_logits, dim=-1)
  student_soft_labels = F.softmax(student_logits, dim=-1)
  loss = F.kl_div(student_soft_labels, teacher_soft_labels, reduction='batchmean')
  return loss

In [75]:
optim = torch.optim.AdamW(student_model.parameters(), lr=1e-5)
epochs = 10
teacher_model.to(device)
student_model.to(device)

for epoch in range(epochs):
  total_loss = 0
  for step, data in enumerate(train_dataset):
    teacher_inputs = torch.tensor(data['teacher_input'], device=device)
    student_inputs = torch.tensor(data['student_input'], device=device)
    labels = torch.tensor(data['labels'], device=device)

    teacher_inputs = teacher_inputs.unsqueeze(0)  # Add batch dimension of size 1
    student_inputs = student_inputs.unsqueeze(0)  # Add batch dimension of size 1
    labels = labels.unsqueeze(0)  # Add batch dimension of size 1

    student_output = student_model(student_inputs, decoder_input_ids=labels)

    with torch.no_grad():
      teacher_output = teacher_model(teacher_inputs)


    loss = skld_loss(teacher_output.logits, student_output.logits, alpha=0.8)
    total_loss += loss.item()

    optim.zero_grad()
    loss.backward()
    optim.step()

    if step % 100 == 0:
      print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")

  print(f"Epoch {epoch+1} Average Loss: {total_loss / len(train_dataset):.4f}")

Epoch 1, Step 0, Loss: 0.0039
Epoch 1, Step 100, Loss: 0.0022
Epoch 1, Step 200, Loss: 0.0097
Epoch 1, Step 300, Loss: 0.0054
Epoch 1, Step 400, Loss: 0.0093
Epoch 1, Step 500, Loss: 0.0072
Epoch 1, Step 600, Loss: 0.0097
Epoch 1, Step 700, Loss: 0.0098
Epoch 1, Step 800, Loss: 0.0021
Epoch 1, Step 900, Loss: 0.0027
Epoch 1 Average Loss: 0.0061
Epoch 2, Step 0, Loss: 0.0023
Epoch 2, Step 100, Loss: 0.0005
Epoch 2, Step 200, Loss: 0.0080
Epoch 2, Step 300, Loss: 0.0037
Epoch 2, Step 400, Loss: 0.0079
Epoch 2, Step 500, Loss: 0.0058
Epoch 2, Step 600, Loss: 0.0082
Epoch 2, Step 700, Loss: 0.0082
Epoch 2, Step 800, Loss: 0.0007
Epoch 2, Step 900, Loss: 0.0011
Epoch 2 Average Loss: 0.0044
Epoch 3, Step 0, Loss: 0.0010
Epoch 3, Step 100, Loss: -0.0006
Epoch 3, Step 200, Loss: 0.0066
Epoch 3, Step 300, Loss: 0.0026
Epoch 3, Step 400, Loss: 0.0070
Epoch 3, Step 500, Loss: 0.0050
Epoch 3, Step 600, Loss: 0.0073
Epoch 3, Step 700, Loss: 0.0071
Epoch 3, Step 800, Loss: -0.0001
Epoch 3, Step 900,

In [76]:
test_dataset = train_dataset.select(range(100))

def generate_response(prompt, model, tokenizer):
    tokenizer.pad_token = tokenizer.eos_token
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    with torch.no_grad():
      inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=256, truncation=True).to(device)
      output = model.generate(inputs['input_ids'], max_length=512, num_return_sequences=1)
      response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response


def compute_bleu(reference_responses, generated_responses):
    references = [[ref.split()] for ref in reference_responses]
    candidates = [gen.split() for gen in generated_responses]
    return corpus_bleu(references, candidates)

generated_responses = [generate_response(example['CHQ'], student_model, student_tokenizer) for example in test_dataset]
reference_responses = [example['Summary'] for example in test_dataset]

bleu_score = compute_bleu(reference_responses, generated_responses)

print(f"BLEU Score: {bleu_score}")

BLEU Score: 9.228471608161513e-156
