In [1]:
# Install necessary libraries
!pip install unsloth "xformers==0.0.28.post2"
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install evaluate
!pip install rouge_score

# Import libraries
import torch
import math
from datasets import load_dataset
from transformers import TrainingArguments, DataCollatorForSeq2Seq, Trainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
import evaluate
from tqdm import tqdm

# Initialize metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

max_seq_length = 1024  # Adjusted to 1024 to fit in Colab's GPU memory
dtype = None  # None for auto detection. Float16 for Tesla T4, V100; Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage

# Load the student model and tokenizer
student_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Prepare the student model for PEFT
student_model = FastLanguageModel.get_peft_model(
    student_model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Load the teacher model
teacher_model, _ = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
teacher_model.eval()
teacher_model.to("cuda")  # Ensure the teacher model is on the GPU

# Set the chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

Collecting unsloth
  Downloading unsloth-2024.11.5-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers==0.0.28.post2
  Downloading xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting unsloth-zoo>=2024.11.1 (from unsloth)
  Downloading unsloth_zoo-2024.11.4-py3-none-any.whl.metadata (16 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting transformers>=4.46.1 (from unsloth)
  Downloading transfo

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Unsloth 2024.11.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]



In [2]:
# Function to format prompts
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    contexts = examples["context"]
    responses = examples["response"]

    # Combine instruction and context
    convos = [{"role": "user", "content": f"Instruction: {instruction}\nContext: {context if context else ''}"}
              for instruction, context in zip(instructions, contexts)]

    # Apply the chat template without the assistant's response
    texts = [tokenizer.apply_chat_template([convo], tokenize=False, add_generation_prompt=False)
             for convo in convos]

    # The assistant's response is the target text
    target_texts = responses

    # Return the new 'text' column along with 'target_text'
    return {
        "text": texts,
        "target_text": target_texts
    }

# Load and split the dataset
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Split dataset into training and validation sets
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

# Apply formatting to the datasets
train_dataset = train_dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=train_dataset.column_names
)
eval_dataset = eval_dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=eval_dataset.column_names
)

# Tokenize the datasets
def tokenize_function(examples):
    # Tokenize the prompt (input text)
    inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_seq_length,
        padding=False,  # We'll pad later
    )

    # Tokenize the assistant's response (target text)
    targets = tokenizer(
        examples["target_text"],
        truncation=True,
        max_length=max_seq_length,
        padding=False,  # We'll pad later
    )

    # Concatenate inputs and targets
    input_ids = []
    attention_mask = []
    labels = []

    for i in range(len(inputs['input_ids'])):
        input_id = inputs['input_ids'][i]
        target_id = targets['input_ids'][i]

        # Concatenate input and target ids
        input_ids_i = input_id + target_id + [tokenizer.eos_token_id]
        attention_mask_i = [1] * len(input_ids_i)

        # Create labels: -100 for input tokens, target ids for target tokens
        labels_i = [-100] * len(input_id) + target_id + [tokenizer.eos_token_id]

        # Truncate to max_seq_length
        input_ids_i = input_ids_i[:max_seq_length]
        attention_mask_i = attention_mask_i[:max_seq_length]
        labels_i = labels_i[:max_seq_length]

        input_ids.append(input_ids_i)
        attention_mask.append(attention_mask_i)
        labels.append(labels_i)

    # Pad to max_seq_length
    input_ids = [seq + [tokenizer.pad_token_id] * (max_seq_length - len(seq)) for seq in input_ids]
    attention_mask = [seq + [0] * (max_seq_length - len(seq)) for seq in attention_mask]
    labels = [seq + [-100] * (max_seq_length - len(seq)) for seq in labels]

    tokenized = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }
    tokenized['target_text'] = examples['target_text']
    return tokenized

train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=eval_dataset.column_names)

README.md:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/13509 [00:00<?, ? examples/s]

Map:   0%|          | 0/1502 [00:00<?, ? examples/s]

Map:   0%|          | 0/13509 [00:00<?, ? examples/s]

Map:   0%|          | 0/1502 [00:00<?, ? examples/s]

In [3]:
# Set the format to PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define the Knowledge Distillation Trainer
class KDTrainer(Trainer):
    def __init__(self, teacher_model, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.teacher_model.eval()
        self.teacher_model.to("cuda")

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Move inputs to the student's device
        input_ids = inputs['input_ids'].to(model.device)
        attention_mask = inputs['attention_mask'].to(model.device)
        labels = inputs['labels'].to(model.device)

        # Get student outputs
        student_outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        student_logits = student_outputs.logits  # shape: [batch_size, seq_length, vocab_size]

        with torch.no_grad():
            # Get teacher outputs
            teacher_outputs = self.teacher_model(
                input_ids=input_ids.to("cuda"),
                attention_mask=attention_mask.to("cuda")
            )
            teacher_logits = teacher_outputs.logits  # shape: [batch_size, seq_length, vocab_size]

        # Set temperature and alpha
        temperature = 2.0
        alpha = 0.5

        # Compute student cross-entropy loss
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        student_ce_loss = loss_fct(
            student_logits.view(-1, student_logits.size(-1)),
            labels.view(-1)
        )

        # Compute KL divergence loss on the assistant's response tokens only
        mask = labels != -100  # Mask to select only the response tokens
        student_log_probs = torch.nn.functional.log_softmax(
            student_logits / temperature, dim=-1
        )
        teacher_probs = torch.nn.functional.softmax(
            teacher_logits / temperature, dim=-1
        )
        # Apply the mask to the logits
        student_log_probs = student_log_probs[mask]
        teacher_probs = teacher_probs[mask]
        kl_loss = torch.nn.functional.kl_div(
            student_log_probs, teacher_probs, reduction='batchmean'
        ) * (temperature ** 2)

        # Combine losses
        loss = alpha * kl_loss + (1 - alpha) * student_ce_loss

        return (loss, student_outputs) if return_outputs else loss

    def _move_model_to_device(self, model, device):
        # Override to prevent moving model to device
        pass

In [4]:
# Set training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=60,
    learning_rate=2e-4,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, return_tensors='pt')

# Initialize the KDTrainer
trainer = KDTrainer(
    model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  super().__init__(*args, **kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [5]:
# Start training
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 13,509 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 60
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1,5.912
2,6.5639
3,5.9869
4,7.1106
5,6.3539
6,6.2197
7,5.2805
8,7.3742
9,6.266
10,6.6585


TrainOutput(global_step=60, training_loss=3.545959887901942, metrics={'train_runtime': 439.5662, 'train_samples_per_second': 0.136, 'train_steps_per_second': 0.136, 'total_flos': 362895963586560.0, 'train_loss': 3.545959887901942, 'epoch': 0.0044414834554741284})

In [6]:
# Evaluate the model to get perplexity
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 0 has a total capacity of 14.75 GiB of which 73.06 MiB is free. Process 2552 has 14.67 GiB memory in use. Of the allocated memory 14.37 GiB is allocated by PyTorch, and 155.02 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Prepare the student model for inference
FastLanguageModel.for_inference(student_model)

# Function to generate predictions and compute metrics
def evaluate_model(model, tokenizer, eval_dataset, batch_size=1):
    from torch.utils.data import DataLoader
    predictions = []
    references = []

    model.eval()
    model.to("cuda")
    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            input_ids = batch['input_ids'].to("cuda")
            attention_mask = batch['attention_mask'].to("cuda")

            # Generate outputs from the model
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=128,
                temperature=0.9,
                top_p=0.7,
            )

            # Get the length of the inputs (excluding the assistant's response)
            input_lengths = (batch['labels'] == -100).sum(dim=1)

            # Decode the generated text
            decoded_outputs = []
            for i, output in enumerate(outputs):
                generated_tokens = output[input_lengths[i]:]  # Skip the input tokens
                decoded_output = tokenizer.decode(
                    generated_tokens, skip_special_tokens=True
                )
                decoded_outputs.append(decoded_output.strip())

            # Append the generated text to predictions
            predictions.extend(decoded_outputs)
            # Append the reference outputs
            references.extend(batch['target_text'])

    # Compute BLEU and ROUGE scores
    bleu_score = bleu_metric.compute(predictions=predictions, references=references)
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)

    print(f"Perplexity: {perplexity}")
    print(f"BLEU score: {bleu_score['bleu']}")
    print(f"ROUGE-1: {rouge_score['rouge1']}")
    print(f"ROUGE-2: {rouge_score['rouge2']}")
    print(f"ROUGE-L: {rouge_score['rougeL']}")

# Evaluate the model
evaluate_model(student_model, tokenizer, eval_dataset)