In [1]:
%%capture
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth
!pip install datasets transformers accelerate einops bitsandbytes trl

In [2]:
import torch
import torch.nn.functional as F
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
################################################################
# 1) Load the TEACHER (frozen)
################################################################
teacher_max_seq_length = 2048 
teacher_dtype = None           # Auto-detect dtype
teacher_load_in_4bit = True    # 4bit quant for memory savings

teacher_model, teacher_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/SmolLM2-1.7B-Instruct",
    max_seq_length=teacher_max_seq_length,
    dtype=teacher_dtype,
    load_in_4bit=teacher_load_in_4bit,
)

==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.14k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

In [4]:
# Freeze teacher weights to avoid training them
teacher_model.eval()
for param in teacher_model.parameters():
    param.requires_grad = False


In [5]:
################################################################
# 2) Load the STUDENT (with LoRA)
################################################################
student_max_seq_length = 2048
student_dtype = None            # Auto-detect dtype
student_load_in_4bit = True     # 4bit quant for memory savings

student_model, student_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/SmolLM2-135M-Instruct",
    max_seq_length=student_max_seq_length,
    dtype=student_dtype,
    load_in_4bit=student_load_in_4bit,
)

==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

In [6]:
# Apply LoRA to the student
student_model = FastLanguageModel.get_peft_model(
    student_model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth", 
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2024.12.12 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.


In [7]:
################################################################
# 3) Prepare the Prompt Template
################################################################
alpaca_prompt = """Below is an instruction that describes a task, 
paired with an input that provides further context. 
Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = student_tokenizer.eos_token

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instr, inp, outp in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instr, inp, outp)
        # Add EOS to ensure generation terminates
        text += EOS_TOKEN
        texts.append(text)
    return {"text": texts}

################################################################
# 4) Load the Alpaca Dataset & Apply Prompt-Formatting
################################################################
# Load and split the dataset
dataset = load_dataset("yahma/alpaca-cleaned", split="train")

# Split dataset into training and validation sets
from datasets import Dataset

# Split dataset into training and validation sets
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

# Ensure test set has exactly 1000 entries without shuffling
# eval_dataset = (eval_dataset.select(range(1000)))

# Apply formatting to both datasets
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/46584 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

In [8]:
import torch.nn as nn

def perception(logits, epsilon=1e-5, dim=1):
    """
    Perform perception on logits.

    Parameters:
    logits (torch.Tensor): A tensor of shape (B, N) or (B, S, V).
    epsilon (float): A small constant to avoid division by zero in normalization.
    dim (int): The dimension along which to compute the mean and variance.

    Returns:
    torch.Tensor: Normalized logits along the specified dimension.
    """
    batch_mean = torch.mean(logits, dim=dim, keepdim=True)
    batch_var = torch.var(logits, dim=dim, keepdim=True, unbiased=False)
    x_normalized = (logits - batch_mean) / torch.sqrt(batch_var + epsilon)
    return x_normalized

In [9]:
###############################################################
# 4) Distillation Trainer
###############################################################
class DistillationTrainer(SFTTrainer):
    def __init__(
        self,
        teacher_model,
        alpha=0.5,       # weight for dataset CE
        temperature=1.0, # teacher distribution softening
        *args, 
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.alpha = alpha
        self.temperature = temperature
        
        # Freeze teacher
        self.teacher_model.eval()
        for p in self.teacher_model.parameters():
            p.requires_grad = False

        # We'll manually compute CE, so define a suitable loss
        self.loss_fct = nn.CrossEntropyLoss(ignore_index=-100)

    # # Accept **kwargs to handle extra unsloth arguments like num_items_in_batch
    # def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
    #     # We'll do forward pass on student WITHOUT labels to ensure we get `logits`
    #     student_out = model(
    #         input_ids=inputs["input_ids"],
    #         attention_mask=inputs["attention_mask"],
    #     )
    #     student_logits = student_out.logits  # shape (batch, seq_len, vocab_size)

    #     # Manually compute cross-entropy against the known labels
    #     labels = inputs["labels"]  # shape (batch, seq_len)
    #     ce_loss = self.loss_fct(
    #         student_logits.view(-1, student_logits.size(-1)),
    #         labels.view(-1)
    #     )

    #     # Teacher forward pass
    #     with torch.no_grad():
    #         teacher_out = self.teacher_model(
    #             input_ids=inputs["input_ids"],
    #             attention_mask=inputs["attention_mask"],
    #         )
    #         teacher_logits = teacher_out.logits

    #     # Build a valid mask to ignore label=-100 tokens
    #     valid_mask = labels.ne(-100)

    #     # Student + teacher distributions
    #     student_log_probs = F.log_softmax(student_logits / self.temperature, dim=-1)
    #     teacher_probs     = F.softmax(teacher_logits / self.temperature, dim=-1)

    #     # Filter out invalid tokens
    #     student_log_probs = student_log_probs[valid_mask]
    #     teacher_probs     = teacher_probs[valid_mask]

    #     # KL Divergence
    #     kl_loss = F.kl_div(
    #         student_log_probs,
    #         teacher_probs,
    #         reduction="batchmean"
    #     )

    #     # Combine
    #     loss = self.alpha * ce_loss + (1 - self.alpha) * (self.temperature**2) * kl_loss
    #     return (loss, student_out) if return_outputs else loss

def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
    labels = inputs["labels"]

    # 1) FORWARD PASS *without* labels
    student_out = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # remove labels=labels here
    )
    student_logits = student_out.logits  # No longer None!

    # 2) Manually compute CE
    ce_loss = self.loss_fct(
        student_logits.view(-1, student_logits.size(-1)),
        labels.view(-1),
    )

    # 3) Teacher pass
    with torch.no_grad():
        teacher_out = self.teacher_model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
        teacher_logits = teacher_out.logits

    # Apply perception with dim=1 (normalizing across the sequence dimension)
    student_logits = perception(student_logits, dim=1)
    teacher_logits = perception(teacher_logits, dim=1)

    # 4) KD portion: apply mask for label=-100 tokens, compute KL, combine
    valid_mask = labels.ne(-100)
    student_log_probs = F.log_softmax(student_logits / self.temperature, dim=-1)
    teacher_probs     = F.softmax(teacher_logits / self.temperature, dim=-1)
    student_log_probs = student_log_probs[valid_mask]
    teacher_probs     = teacher_probs[valid_mask]
    kl_loss = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean")

    # 5) Final distillation loss
    loss = self.alpha * ce_loss + (1 - self.alpha) * (self.temperature ** 2) * kl_loss
    return (loss, student_out) if return_outputs else loss


In [10]:
################################################################
# 6) Set Up TrainingArguments & DistillationTrainer
################################################################
train_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,               # or use num_train_epochs=1 for a full epoch
    learning_rate=2e-4,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="distilled_outputs",
    report_to="none",  # or "wandb", "tensorboard", etc.
)

trainer = DistillationTrainer(
    teacher_model=teacher_model,
    alpha=0.5,           # how much to weigh the ground-truth CE vs. teacher KL
    temperature=1.0,     # if you need a "softer" teacher distribution, raise T
    model=student_model,
    tokenizer=student_tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=student_max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=train_args,
)

Map (num_proc=2):   0%|          | 0/46584 [00:00<?, ? examples/s]

In [11]:
################################################################
# 7) Launch Distillation
################################################################
trainer_stats = trainer.train()
print(trainer_stats)

################################################################
# 8) (Optional) Save or Push Your LoRA Adapters
################################################################
# student_model.save_pretrained("my_distilled_student_lora")
# student_model.push_to_hub("myusername/SmolLM2-135M-distilled-lora")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 46,584 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 4,884,480


Step,Training Loss
1,2.3852
2,2.2385
3,2.326
4,2.2932
5,2.2098
6,2.0742
7,2.0546
8,1.9751
9,1.9027
10,2.0005


TrainOutput(global_step=60, training_loss=1.7090686639149983, metrics={'train_runtime': 104.1126, 'train_samples_per_second': 4.61, 'train_steps_per_second': 0.576, 'total_flos': 91263627293184.0, 'train_loss': 1.7090686639149983, 'epoch': 0.010303967027305513})


In [12]:
################################################################
# 7) (OPTIONAL) INFERENCE EXAMPLE
################################################################
# After training, let's enable 2x faster inference with Unsloth
FastLanguageModel.for_inference(student_model)  # Speed optimization

# We'll create a prompt using the same alpaca_prompt format
# but we'll leave the "output" blank so the model can generate.

inference_text = alpaca_prompt.format(
    "who am  i?",  # instruction
    "",  # input
    ""   # output left blank for generation
)

inputs = student_tokenizer([inference_text], return_tensors="pt").to("cuda")

# Use Hugging Face's TextStreamer to see the live output tokens
from transformers import TextStreamer
text_streamer = TextStreamer(student_tokenizer)

# Generate up to 128 new tokens from the student model
_ = student_model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=128,
)

Below is an instruction that describes a task, 
paired with an input that provides further context. 
Write a response that appropriately completes the request.

### Instruction:
who am  i?

### Input:


### Response:
I am a 25-year-old male, born in the United States. I have a family history of being a professional, with a strong interest in the world of sports. I have a passion for the outdoors, and I love to explore the great outdoors, whether it's hiking, camping, or just taking a walk in the woods. I have a strong sense of community, and I love being around people who share my interests. I have a good sense of humor, and I enjoy sharing my own stories and experiences with others. I am a bit of a free spirit, and I love to be around people who are


In [13]:
!pip install evaluate

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [14]:
!pip install rouge_score
!pip install tqdm

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=aadbf959c6d032d0ea379b8361d0b3d222b1ab589b1b950808a3efc9a9b474a4
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [15]:
# Import the evaluate library
import evaluate

# Initialize BLEU and ROUGE metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
# Prepare the model for inference
FastLanguageModel.for_inference(student_model)  # Enable native 2x faster inference

# Function to generate predictions and compute metrics
def evaluate_model(model, tokenizer, eval_dataset, batch_size=1):
    from torch.utils.data import DataLoader
    from tqdm import tqdm  # Import tqdm for progress bar

    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)
    predictions = []
    references = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            instructions = batch['instruction']
            inputs_text = batch['input']
            outputs_text = batch['output']

            batch_messages = []
            for instruction, input_text in zip(instructions, inputs_text):
                message = [
                    {
                        "role": "user",
                        "content": f"Instruction: {instruction}\nInput: {input_text}"
                    }
                ]
                batch_messages.append(message)

            # Apply chat template and tokenize
            inputs = tokenizer.apply_chat_template(
                batch_messages,
                tokenize=True,
                add_generation_prompt=True,  # Must add for generation
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=2048,
            ).to("cuda")

            # Compute attention mask
            attention_mask = inputs.ne(tokenizer.pad_token_id).long()

            # Generate outputs from the model
            outputs = model.generate(
                input_ids=inputs,
                attention_mask=attention_mask,
                max_new_tokens=64,
                temperature=0.8,  # Slightly lower for more coherence
                top_p=0.9,       # Increased for better diversity
            )

            # Get the length of the inputs
            input_length = inputs.shape[1]

            # Decode the generated text
            generated_tokens = outputs[:, input_length:]
            decoded_outputs = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )

            # Append the generated text to predictions
            predictions.extend(decoded_outputs)
            # Append the reference outputs
            references.extend(outputs_text)

    # Compute BLEU and ROUGE scores
    bleu_score = bleu_metric.compute(predictions=predictions, references=references)
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)

    print(f"BLEU score: {bleu_score}")
    print(f"ROUGE score: {rouge_score}")

# Evaluate the model
evaluate_model(student_model, student_tokenizer, eval_dataset)


Evaluating: 100%|██████████| 5176/5176 [4:05:16<00:00,  2.84s/it]


BLEU score: {'bleu': 0.023808849963702375, 'precisions': [0.46394654839826555, 0.18124041847066158, 0.08951259100439576, 0.0512013314371508], 'brevity_penalty': 0.16992853820897916, 'length_ratio': 0.36070126610884135, 'translation_length': 242612, 'reference_length': 672612}
ROUGE score: {'rouge1': 0.28899986445048653, 'rouge2': 0.12739215589245784, 'rougeL': 0.22681606938759147, 'rougeLsum': 0.25064675656078156}
