## 1. Installation and Imports

First, install the required packages (uncomment if needed):

In [None]:
# Install llm_distil from GitHub
# !pip install git+https://github.com/yashpatel2010/llm_distil.git

# Install PEFT for LoRA support
# (version specifiers are quoted so the shell does not treat ">" as output redirection)
# !pip install "peft>=0.7.0" "bitsandbytes>=0.41.0" "accelerate>=0.24.0"

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from llm_distil import KnowledgeDistillation, DistillationConfig
from llm_distil.metrics import compute_perplexity

# Sanity check: confirm the imports above resolved and report which device
# torch will pick (CUDA when available, otherwise CPU).
print("✓ All imports successful!")
print(f"Using device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

## 2. Load and Prepare Dataset

We'll use 200 examples from Dolly-15k for a quick demo:

In [None]:
# Load dataset: only the first 200 training rows are requested via the split
# slice, which keeps the demo quick.
print("Loading Dolly-15k dataset...")
dataset = load_dataset("databricks/databricks-dolly-15k", split="train[:200]")

print(f"✓ Loaded {len(dataset)} examples")

# Preview the first record; each field is cut to 100 characters for readability.
first_example = dataset[0]
print("\nSample example:")
print(f"  Instruction: {first_example['instruction'][:100]}...")
print(f"  Response: {first_example['response'][:100]}...")

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# padding="max_length" below needs a pad token; reuse the EOS token for it.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch of records as "instruction, newline, response" strings.

    Args:
        examples: Batch mapping with parallel "instruction" and "response"
            sequences (this function is applied with ``batched=True``).

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 256 tokens.
    """
    texts = [f"{inst}\n{resp}" for inst, resp in zip(examples["instruction"], examples["response"])]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=256)

print("Tokenizing dataset...")
# The raw text columns are dropped so only the tokenizer outputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Split into train/eval (80/20). The boundary is derived from the actual
# dataset length so the cell keeps working if the slice size above changes;
# for the 200-example demo this is still 160 train / 40 eval.
num_examples = len(tokenized_dataset)
num_train = num_examples * 4 // 5
train_dataset = tokenized_dataset.select(range(num_train))
eval_dataset = tokenized_dataset.select(range(num_train, num_examples))

print(f"✓ Train: {len(train_dataset)} examples")
print(f"✓ Eval: {len(eval_dataset)} examples")

## 3. Load Models

Load teacher (GPT-2 Medium) and two student models (GPT-2 Small) - one for full fine-tuning and one for LoRA:

In [None]:
print("Loading models...\n")

# Teacher model
print("[1/3] Loading teacher model (GPT-2 Medium)...")
teacher = AutoModelForCausalLM.from_pretrained("gpt2-medium")
teacher_params = teacher.num_parameters()
print(f"✓ Teacher: {teacher_params:,} parameters")

# Student model for full fine-tuning
print("\n[2/3] Loading student model for full fine-tuning (GPT-2 Small)...")
student_full = AutoModelForCausalLM.from_pretrained("gpt2")
student_full_params = student_full.num_parameters()
print(f"✓ Student (Full FT): {student_full_params:,} parameters")

# Student model for LoRA: a separate copy of the same checkpoint, so the two
# training runs do not share weights.
print("\n[3/3] Loading student model for LoRA (GPT-2 Small)...")
student_lora = AutoModelForCausalLM.from_pretrained("gpt2")
print(f"✓ Student (LoRA): {student_lora.num_parameters():,} parameters")

# Report the teacher/student size ratio. The "XM → YM" label is derived from
# the loaded models so it cannot drift from the counts printed above.
compression_ratio = teacher_params / student_full_params
print(
    f"\n📊 Compression ratio: {compression_ratio:.2f}x "
    f"({teacher_params / 1e6:.0f}M → {student_full_params / 1e6:.0f}M)"
)

## 4. Train with Full Fine-tuning

First, let's train with traditional full fine-tuning (all 117M parameters updated):

In [None]:
print("="*80)
print("Training with FULL Fine-tuning")
print("="*80)

# Distillation settings for the baseline run; use_peft=False means no adapter
# is applied to the student.
full_config = DistillationConfig(
    teacher_model_name="gpt2-medium",
    student_model_name="gpt2",
    temperature=2.0,
    kd_loss_weight=0.5,
    epochs=1,
    batch_size=4,
    learning_rate=5e-5,
    output_dir="./outputs/full_finetune",
    logging_steps=10,
    save_steps=500,
    use_peft=False  # Full fine-tuning
)

# The already-loaded teacher and student objects are passed in directly.
kd_full = KnowledgeDistillation(teacher, student_full, full_config)
print("\n🔧 All 117M parameters will be updated during training...\n")

# Train
full_history = kd_full.train(train_dataset, eval_dataset)

print("\n✓ Full fine-tuning complete!")

## 5. Train with LoRA (Parameter-Efficient)

Now, let's train with LoRA - only ~0.3M parameters (0.26%) will be updated:

In [None]:
print("="*80)
print("Training with LoRA (Parameter-Efficient)")
print("="*80)

# Same distillation settings as the full fine-tuning run, except for the
# learning rate, the output directory and the PEFT/LoRA options.
lora_config = DistillationConfig(
    teacher_model_name="gpt2-medium",
    student_model_name="gpt2",
    temperature=2.0,
    kd_loss_weight=0.5,
    epochs=1,
    batch_size=4,
    learning_rate=1e-4,  # Higher LR works better for LoRA
    output_dir="./outputs/lora",
    logging_steps=10,
    save_steps=500,
    use_peft=True,  # Enable LoRA
    peft_type="lora",
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    lora_target_modules=None  # Auto-detect for GPT-2
)

# The same teacher object is reused; only the student and the config differ.
kd_lora = KnowledgeDistillation(teacher, student_lora, lora_config)
print("\n⚡ LoRA applied! Only ~0.3M parameters (0.26%) will be trained...\n")

# Train
lora_history = kd_lora.train(train_dataset, eval_dataset)

print("\n✓ LoRA training complete!")

## 6. Evaluate Both Methods

Compare perplexity scores on the evaluation set:

In [None]:
print("="*80)
print("Evaluating Models")
print("="*80)

# Also used by the text-generation cell further down.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the teacher is scored with compute_perplexity while the
# students are scored through KnowledgeDistillation.evaluate; confirm both
# paths compute perplexity the same way before comparing the numbers.
print("\n[1/3] Evaluating teacher...")
teacher_ppl = compute_perplexity(teacher, eval_dataset, tokenizer, device=device, batch_size=4)
print(f"Teacher perplexity: {teacher_ppl:.2f}")

print("\n[2/3] Evaluating full fine-tuning student...")
full_metrics = kd_full.evaluate(eval_dataset)
full_ppl = full_metrics['perplexity']
print(f"Full fine-tuning perplexity: {full_ppl:.2f}")

print("\n[3/3] Evaluating LoRA student...")
lora_metrics = kd_lora.evaluate(eval_dataset)
lora_ppl = lora_metrics['perplexity']
print(f"LoRA perplexity: {lora_ppl:.2f}")

print("\n✓ Evaluation complete!")

## 7. Results Comparison

Let's create a comprehensive comparison table:

In [None]:
# Create results dataframe. Only the perplexity column comes from the
# evaluation cell; the parameter counts and sizes are hard-coded labels.
results = pd.DataFrame({
    'Method': ['Teacher', 'Full Fine-tuning', 'LoRA'],
    'Model': ['GPT-2 Medium', 'GPT-2 Small', 'GPT-2 Small'],
    'Perplexity': [teacher_ppl, full_ppl, lora_ppl],
    'Total Params': ['355M', '117M', '117M'],
    'Trainable Params': ['355M', '117M', '~0.3M (0.26%)'],
    'Model Size': ['1.4GB', '500MB', '500MB + 2MB adapters']
})

print("\n" + "="*80)
print("RESULTS COMPARISON")
print("="*80)
print("\n" + results.to_string(index=False))

# Calculate performance difference: relative perplexity change of LoRA versus
# full fine-tuning, in percent (positive means LoRA perplexity is higher).
ppl_diff = ((lora_ppl - full_ppl) / full_ppl) * 100
print(f"\n📊 Performance difference: {ppl_diff:+.2f}%")
print(f"   (LoRA is within {abs(ppl_diff):.1f}% of full fine-tuning)")

In [None]:
# Visualize results
import matplotlib.pyplot as plt

# savefig does not create missing directories, so make sure the target folder
# exists instead of relying on an earlier cell having written to ./outputs.
Path('./outputs').mkdir(parents=True, exist_ok=True)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Perplexity comparison
methods = ['Teacher', 'Full FT', 'LoRA']
perplexities = [teacher_ppl, full_ppl, lora_ppl]
colors = ['#3498db', '#e74c3c', '#2ecc71']

ax1.bar(methods, perplexities, color=colors, alpha=0.7, edgecolor='black')
ax1.set_ylabel('Perplexity', fontsize=12)
ax1.set_title('Perplexity Comparison', fontsize=14, fontweight='bold')
ax1.grid(axis='y', alpha=0.3)

# Add values on bars (label sits 2 perplexity units above each bar top)
for i, v in enumerate(perplexities):
    ax1.text(i, v + 2, f'{v:.1f}', ha='center', fontweight='bold')

# Trainable parameters comparison (students only; values in millions)
methods_student = ['Full FT', 'LoRA']
params_student = [117, 0.3]
colors_student = ['#e74c3c', '#2ecc71']

ax2.bar(methods_student, params_student, color=colors_student, alpha=0.7, edgecolor='black')
ax2.set_ylabel('Trainable Parameters (Millions)', fontsize=12)
ax2.set_title('Trainable Parameters Comparison', fontsize=14, fontweight='bold')
ax2.grid(axis='y', alpha=0.3)
# Log scale so the 0.3M bar stays visible next to the 117M bar.
ax2.set_yscale('log')

# Add values on bars (multiplicative offset because the axis is logarithmic)
for i, v in enumerate(params_student):
    ax2.text(i, v * 1.5, f'{v}M', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('./outputs/lora_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Plot saved to ./outputs/lora_comparison.png")

## 8. Text Generation Comparison

Let's compare the quality of generated text from all three models:

In [None]:
test_prompts = [
    "What is machine learning?",
    "Explain knowledge distillation in simple terms.",
]

# Sampling settings shared by all three models so their outputs are comparable.
generation_kwargs = dict(
    max_new_tokens=50,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)


def generate_text(model, inputs):
    """Sample a continuation of ``inputs`` from ``model`` and decode it to text.

    Args:
        model: A model exposing ``generate``; expected to already be in eval
            mode and on the same device as ``inputs``.
        inputs: Tokenizer output for a single prompt (``return_tensors='pt'``).

    Returns:
        The decoded first generated sequence (prompt included), with special
        tokens stripped.
    """
    with torch.no_grad():
        output_ids = model.generate(**inputs, **generation_kwargs)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Switching to eval mode and moving to the device does not depend on the
# prompt, so do it once up front instead of on every loop iteration.
for lm in (teacher, student_full, student_lora):
    lm.eval()
    lm.to(device)

for prompt in test_prompts:
    print("\n" + "="*80)
    print(f"Prompt: {prompt}")
    print("="*80)

    inputs = tokenizer(prompt, return_tensors='pt').to(device)

    # Same generation order as before: teacher, full fine-tuning, LoRA.
    teacher_text = generate_text(teacher, inputs)
    full_text = generate_text(student_full, inputs)
    lora_text = generate_text(student_lora, inputs)

    print(f"\n[Teacher - GPT-2 Medium]\n{teacher_text}")
    print(f"\n{'-'*80}")
    print(f"\n[Full Fine-tuning - 117M params trained]\n{full_text}")
    print(f"\n{'-'*80}")
    print(f"\n[LoRA - 0.3M params trained]\n{lora_text}")

## 9. Save Models and Compare Sizes

Save both models and compare their disk storage:

In [None]:
print("="*80)
print("Saving Models")
print("="*80)

# Save models. Each student goes to its own directory so the on-disk sizes
# can be compared afterwards.
print("\nSaving full fine-tuning model...")
kd_full.save_student('./outputs/models/full_student')

print("Saving LoRA adapters...")
kd_lora.save_student('./outputs/models/lora_student')

print("\n✓ Models saved!")

In [None]:
# Calculate file sizes
import os

def get_directory_size(path):
    """Return the combined size of all files under ``path`` in mebibytes.

    The tree is walked recursively with ``os.walk``. Entries for which
    ``os.path.exists`` is false are skipped. A path that yields no files
    gives ``0.0``.
    """
    candidate_files = (
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(path)
        for name in names
    )
    size_bytes = sum(
        os.path.getsize(file_path)
        for file_path in candidate_files
        if os.path.exists(file_path)
    )
    bytes_per_mb = 1024 * 1024
    return size_bytes / bytes_per_mb  # Convert to MB

full_size = get_directory_size('./outputs/models/full_student')
lora_size = get_directory_size('./outputs/models/lora_student')

# get_directory_size yields 0 for a missing or empty directory. Fail with an
# actionable message instead of a bare ZeroDivisionError in that case.
if full_size == 0:
    raise FileNotFoundError(
        "No files found in './outputs/models/full_student'; "
        "save the models before comparing their sizes."
    )

# Percentage of disk space saved by storing the LoRA output instead of the
# fully fine-tuned model.
savings = (1 - lora_size/full_size) * 100

print("\n📦 Model Storage Comparison:")
print(f"  • Full fine-tuning: {full_size:.1f} MB")
print(f"  • LoRA adapters: {lora_size:.1f} MB")
print(f"  • Storage savings: {savings:.1f}%")
print("\n💾 To use LoRA model:")
print("  1. Load base model: AutoModelForCausalLM.from_pretrained('gpt2')")
print("  2. Load adapters: PeftModel.from_pretrained(model, './outputs/models/lora_student')")

## 10. Summary and Recommendations

Let's summarize the key findings:

In [None]:
print("="*80)
print("🎉 EXPERIMENT COMPLETE!")
print("="*80)

# ppl_diff, savings, lora_size and full_size come from the results and
# storage-comparison cells above.
# NOTE(review): the 0.26% / 0.3M / 117M and "2GB vs 8GB" figures are
# hard-coded text; nothing in this notebook measures them.
print("\n✨ Key Findings:")
print("  • LoRA uses only 0.26% of trainable parameters (0.3M vs 117M)")
print(f"  • LoRA achieves {abs(ppl_diff):.1f}% difference in perplexity")
print(f"  • LoRA saves {savings:.1f}% storage space ({lora_size:.1f}MB vs {full_size:.1f}MB)")
print("  • LoRA enables training on consumer GPUs (2GB vs 8GB memory)")

print("\n💡 When to use LoRA:")
print("  ✓ Limited GPU memory (e.g., RTX 3090, T4)")
print("  ✓ Need multiple task-specific adapters")
print("  ✓ Fast iteration and experimentation")
print("  ✓ Easy deployment (small adapter files)")
print("  ✓ Cost-effective training")

print("\n💪 When to use Full Fine-tuning:")
print("  ✓ Maximum performance is critical")
print("  ✓ Have sufficient compute resources (A100+)")
print("  ✓ Single-task deployment")
print("  ✓ Large domain shift from base model")

print("\n" + "="*80)
print("All results saved to ./outputs/")
print("="*80)

## Additional Experiments

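Try these variations to explore further. In the snippets below, `config` stands for a `DistillationConfig` instance such as `lora_config` from Section 5: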

### 1. Different LoRA Ranks
```python
# Lower rank (more efficient, slightly lower performance)
config.lora_r = 4

# Higher rank (more capacity, closer to full FT)
config.lora_r = 16
```

### 2. QLoRA (4-bit Quantization)
```python
from transformers import BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

student = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    quantization_config=quant_config
)

config.peft_type = "qlora"
```

### 3. Other PEFT Methods
```python
# Prefix Tuning
config.peft_type = "prefix"

# Prompt Tuning
config.peft_type = "prompt"

# IA3
config.peft_type = "ia3"
```