# 01 - Data Preparation for TinyLlama Fine-Tuning

**Datasets:**
- SFT: `databricks/databricks-dolly-15k`
- DPO: `argilla/distilabel-intel-orca-dpo-pairs`

In [1]:
# Fix protobuf conflict + install dependencies
!pip uninstall -y protobuf -q
!pip install -q protobuf==3.20.0
!pip install -q datasets transformers peft trl bitsandbytes accelerate sentencepiece sacrebleu nltk






[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/162.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-spanner 3.60.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.20.2, but you have protobuf 3.20.0 which is incompatible.
googleapis-common-protos 1.72.0 requires protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.20.2, but you have protobuf 3.20.0 which is incompatible.
google-cloud-language 2.18.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.20.2, but you have protobuf 3.20.0 which is incompatible.
google-cloud-logging 3.12.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,

In [2]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from collections import Counter
import json

SEED = 42        # seed passed to every shuffle / train_test_split in this notebook
MAX_SFT = 10000  # upper bound on Dolly rows kept for SFT
MAX_DPO = 5000   # upper bound on preference pairs kept for DPO

## 1. Load Dolly-15k Dataset

In [3]:
# Load the 'train' split of Dolly-15k, report its size and columns, and show the first row.
dolly = load_dataset(
    'databricks/databricks-dolly-15k',
    split='train',
)
n_rows, col_names = len(dolly), dolly.column_names
print(f'Samples: {n_rows}, Columns: {col_names}')
dolly[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Samples: 15011, Columns: ['instruction', 'context', 'response', 'category']


{'instruction': 'When did Virgin Australia start operating?',
 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.",
 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.',
 'category': 'closed_qa'}

In [4]:
# Category distribution, most frequent category first
category_counts = Counter(dolly['category'])
for cat, cnt in category_counts.most_common():
    print(f'{cat}: {cnt}')

open_qa: 3742
general_qa: 2191
classification: 2136
closed_qa: 1773
brainstorming: 1766
information_extraction: 1506
summarization: 1188
creative_writing: 709


## 2. Load Tokenizer and Define Formatting

In [5]:
# Load the TinyLlama chat tokenizer and print its vocabulary size and EOS token.
tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
for label, value in (('Vocab size', tokenizer.vocab_size), ('EOS token', tokenizer.eos_token)):
    print(f'{label}: {value}')

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Vocab size: 32000
EOS token: </s>


In [6]:
# Role markers and end-of-turn token used to render every chat-formatted string below.
SYS_TOK = "<|system|>"
USR_TOK = "<|user|>"
AST_TOK = "<|assistant|>"
EOS_TOK = "</s>"
SYSTEM_MSG = "You are a helpful assistant."

def format_sft(example):
    """Render one instruction/response row as a single chat-formatted training string.

    Args:
        example: Mapping with string values under 'instruction' and 'response',
            and an optional 'context' that may be missing, None, or blank.

    Returns:
        dict: ``{"text": ...}`` holding a system turn, a user turn and an
        assistant turn, each terminated by ``EOS_TOK``. When the context is
        non-blank it is appended to the instruction after a "Context: " label.
    """
    instr = example['instruction']
    # `or ''` covers both a missing key and an explicit None, either of which
    # would otherwise fail on the .strip() call below.
    ctx = example.get('context') or ''
    resp = example['response']
    user_content = f"{instr}\n\nContext: {ctx}" if ctx.strip() else instr
    text = f"{SYS_TOK}\n{SYSTEM_MSG}{EOS_TOK}\n{USR_TOK}\n{user_content}{EOS_TOK}\n{AST_TOK}\n{resp}{EOS_TOK}"
    return {"text": text}

In [7]:
# Shuffle, keep at most MAX_SFT rows, then add the chat-formatted 'text' column.
n_sft = min(MAX_SFT, len(dolly))
sft_data = dolly.shuffle(seed=SEED).select(range(n_sft)).map(format_sft)
print(f'Formatted {len(sft_data)} samples')
print('Sample:\n' + sft_data[0]['text'][:500])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Formatted 10000 samples
Sample:
<|system|>
You are a helpful assistant.</s>
<|user|>
Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire?</s>
<|assistant|>
Garth the Gardener, John the Oak, Gilbert of the Vines, Brandon of the Bloody Blade, Foss the Archer, Owen Oakenshield, Harlon the Hunter, Herndon of the Horn, Bors the Breaker, Florys the Fox, Maris the Maid, Rose of the Red Lake, Ellyn Ever Sweet, Rowan Gold-Tree</s>


## 3. Create Train/Validation Split

In [8]:
# Hold out 10% of the formatted rows for validation (seeded split).
sft_split = sft_data.train_test_split(test_size=0.1, seed=SEED)
n_train, n_val = (len(sft_split[part]) for part in ("train", "test"))
print(f'Train: {n_train}, Val: {n_val}')

Train: 9000, Val: 1000


## 4. Load DPO Dataset

In [9]:
# Load the 'train' split of the preference-pair dataset and show the first row.
dpo = load_dataset(
    'argilla/distilabel-intel-orca-dpo-pairs',
    split='train',
)
n_pairs, dpo_columns = len(dpo), dpo.column_names
print(f'DPO samples: {n_pairs}')
print(f'Columns: {dpo_columns}')
dpo[0]

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/79.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12859 [00:00<?, ? examples/s]

DPO samples: 12859
Columns: ['system', 'input', 'chosen', 'rejected', 'generations', 'order', 'labelling_model', 'labelling_prompt', 'raw_labelling_response', 'rating', 'rationale', 'status', 'original_chosen', 'original_rejected', 'chosen_score', 'in_gsm8k_train']


{'system': '',
 'input': "You will be given a definition of a task first, then some input of the task.\nThis task is about using the specified sentence and converting the sentence to Resource Description Framework (RDF) triplets of the form (subject, predicate object). The RDF triplets generated must be such that the triplets accurately capture the structure and semantics of the input sentence. The input is a sentence and the output is a list of triplets of the form [subject, predicate, object] that capture the relationships present in the sentence. When a sentence has more than 1 RDF triplet possible, the output must contain all of them.\n\nAFC Ajax (amateurs)'s ground is Sportpark De Toekomst where Ajax Youth Academy also play.\nOutput:",
 'chosen': '[\n  ["AFC Ajax (amateurs)", "has ground", "Sportpark De Toekomst"],\n  ["Ajax Youth Academy", "plays at", "Sportpark De Toekomst"]\n]',
 'rejected': " Sure, I'd be happy to help! Here are the RDF triplets for the input sentence:\n\n[AFC

In [10]:
# Format DPO data
def format_dpo(example):
    """Turn one preference row into the prompt / chosen / rejected triple.

    The prompt is rendered with the same chat layout that ``format_sft`` uses
    for SFT text, ending right after the assistant marker so the chosen and
    rejected strings read as the assistant turn.

    Args:
        example: Mapping with 'input', 'chosen' and 'rejected' strings and an
            optional 'system' string.

    Returns:
        dict with keys 'prompt', 'chosen' and 'rejected'.
    """
    # Use the row's own system message when it has one; otherwise the shared default.
    system_msg = (example.get('system') or '').strip() or SYSTEM_MSG
    prompt = (
        f"{SYS_TOK}\n{system_msg}{EOS_TOK}\n"
        f"{USR_TOK}\n{example['input']}{EOS_TOK}\n"
        f"{AST_TOK}\n"
    )
    return {'prompt': prompt, 'chosen': example['chosen'], 'rejected': example['rejected']}

dpo_data = dpo.shuffle(seed=SEED).select(range(min(MAX_DPO, len(dpo))))
# Keep only the three columns written above; the remaining source columns
# (generations, ratings, labelling metadata, ...) are not produced by format_dpo
# and would otherwise be saved alongside them.
unused_columns = [c for c in dpo_data.column_names if c not in ('prompt', 'chosen', 'rejected')]
dpo_data = dpo_data.map(format_dpo, remove_columns=unused_columns)
print(f'Formatted {len(dpo_data)} DPO pairs')

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Formatted 5000 DPO pairs


## 5. Save Processed Datasets

In [11]:
# Persist the SFT train/val split and the DPO pairs to disk.
for processed, target_dir in ((sft_split, 'data/sft_dataset'), (dpo_data, 'data/dpo_dataset')):
    processed.save_to_disk(target_dir)
print('Datasets saved!')

Saving the dataset (0/1 shards):   0%|          | 0/9000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Datasets saved!


# 02 - Supervised Fine-Tuning (SFT) with LoRA/qLoRA

**Two trials with different configurations:**
- Trial 1: Conservative (LoRA rank=8, fp16 weights, no quantization)
- Trial 2: Aggressive (qLoRA rank=32, 4-bit)

**Output:** JSON result files for each trial

In [12]:
import json
import os
import time
from datetime import datetime

import torch
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTConfig, SFTTrainer

SEED = 42  # shared seed: torch RNG here, `seed=` of the training arguments below
torch.manual_seed(SEED)
os.makedirs('results', exist_ok=True)  # per-trial JSON result files are written here

## 1. Load Processed Dataset

In [13]:
# Reload the SFT split saved under data/sft_dataset and preview one training row.
dataset = load_from_disk('data/sft_dataset')
train_rows, val_rows = dataset['train'], dataset['test']
print(f"Train: {len(train_rows)}, Val: {len(val_rows)}")
print('Sample:', train_rows[0]['text'][:300])

Train: 9000, Val: 1000
Sample: <|system|>
You are a helpful assistant.</s>
<|user|>
Eric Arthur Blaire was the real name of which author</s>
<|assistant|>
George Orwell</s>


## 2. Load Base Model and Tokenizer

In [14]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Pad with the EOS token, on the right-hand side of each sequence.
tokenizer.pad_token, tokenizer.padding_side = tokenizer.eos_token, "right"

print(f"Loaded tokenizer with vocab size: {tokenizer.vocab_size}")


Loaded tokenizer with vocab size: 32000


## 3. Trial 1: Conservative LoRA (rank=8)

In [15]:
# Trial 1 Configuration: LoRA adapter hyper-parameters
TRIAL1_CONFIG = dict(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Optimisation settings; also stored verbatim in the trial's results JSON
TRIAL1_TRAINING = dict(
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    max_seq_length=512,
    quantization="none",
)

# Load the base model for Trial 1: fp16 weights, no quantization
model_t1 = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Wrap the base model with the LoRA adapter and report how many parameters train
lora_config_t1 = LoraConfig(**TRIAL1_CONFIG)
model_t1 = get_peft_model(model_t1, lora_config_t1)
model_t1.print_trainable_parameters()

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [16]:
# Training arguments for Trial 1
# SFTConfig is used instead of plain TrainingArguments so the sequence cap can be
# handed to the trainer through `max_length`. TrainingArguments has no such field,
# and setting tokenizer.model_max_length alone did not truncate at tokenization
# time (the tokenizer warned about a 597-token sequence against the 512 limit).
training_args_t1 = SFTConfig(
    output_dir="./outputs/sft_trial1",
    num_train_epochs=TRIAL1_TRAINING["num_train_epochs"],
    per_device_train_batch_size=TRIAL1_TRAINING["per_device_train_batch_size"],
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=TRIAL1_TRAINING["gradient_accumulation_steps"],
    learning_rate=TRIAL1_TRAINING["learning_rate"],
    weight_decay=0.01,
    warmup_ratio=TRIAL1_TRAINING["warmup_ratio"],
    lr_scheduler_type="cosine",
    logging_steps=25,
    save_steps=200,
    eval_strategy="steps",
    eval_steps=200,
    fp16=True,
    report_to="none",
    seed=SEED,
    max_length=TRIAL1_TRAINING["max_seq_length"],
)
tokenizer.model_max_length = TRIAL1_TRAINING["max_seq_length"]

def formatting_func(example):
    """Return the pre-rendered chat string stored in the row's 'text' column."""
    return example["text"]

trainer_t1 = SFTTrainer(
    model=model_t1,
    args=training_args_t1,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    formatting_func=formatting_func,
)


Applying formatting function to train dataset:   0%|          | 0/9000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/9000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/9000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (597 > 512). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/9000 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
# Train Trial 1 and save JSON results
print('Starting SFT Trial 1...')
start_time = time.time()
trainer_t1.train()
training_time_t1 = time.time() - start_time

# Save model
trainer_t1.save_model('./outputs/sft_trial1/final')

# Get final metrics
final_metrics_t1 = trainer_t1.state.log_history

# Most recent logged value for each key, or None when the key was never logged.
# Scanning backwards with a default cannot raise IndexError, which matters because
# a failure here would happen after training has already finished.
final_train_loss_t1 = next((entry['loss'] for entry in reversed(final_metrics_t1) if 'loss' in entry), None)
final_eval_loss_t1 = next((entry['eval_loss'] for entry in reversed(final_metrics_t1) if 'eval_loss' in entry), None)

# Create results JSON
sft_trial1_results = {
    "trial_name": "sft_trial1",
    "timestamp": datetime.now().isoformat(),
    "model_name": MODEL_NAME,
    "dataset": "databricks/databricks-dolly-15k",
    "dataset_size": len(dataset["train"]),
    "lora_config": TRIAL1_CONFIG,
    "training_config": TRIAL1_TRAINING,
    "training_time_seconds": training_time_t1,
    "training_time_minutes": training_time_t1 / 60,
    "final_train_loss": final_train_loss_t1,
    "final_eval_loss": final_eval_loss_t1,
    "training_log": final_metrics_t1,
    "output_dir": "./outputs/sft_trial1/final"
}

# Save JSON
with open('results/sft_trial1_results.json', 'w') as f:
    json.dump(sft_trial1_results, f, indent=2)
print(f'\nTrial 1 complete! Results saved to results/sft_trial1_results.json')
print(f'Training time: {training_time_t1/60:.2f} minutes')

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Starting SFT Trial 1...


Step,Training Loss,Validation Loss


## 4. Trial 2: Aggressive qLoRA (rank=32, 4-bit)

In [None]:
# Clear memory
# pop() rather than `del` so the cell can be re-run (or run without Trial 1
# in the kernel) without raising NameError.
for _stale in ("model_t1", "trainer_t1"):
    globals().pop(_stale, None)
torch.cuda.empty_cache()

# Trial 2: 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load quantized model
model_t2 = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)
model_t2 = prepare_model_for_kbit_training(model_t2)

In [None]:
# Trial 2 Configuration: LoRA adapter hyper-parameters
TRIAL2_CONFIG = dict(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Optimisation settings; also stored verbatim in the trial's results JSON
TRIAL2_TRAINING = dict(
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    warmup_ratio=0.05,
    max_seq_length=512,
    quantization="4bit-nf4",
)

# Attach the LoRA adapter to the quantized model and report trainable parameters
lora_config_t2 = LoraConfig(**TRIAL2_CONFIG)
model_t2 = get_peft_model(model_t2, lora_config_t2)
model_t2.print_trainable_parameters()

In [None]:
# Training arguments for Trial 2
# SFTConfig is used instead of plain TrainingArguments so the sequence cap recorded
# in TRIAL2_TRAINING is handed to the trainer through `max_length`;
# TrainingArguments has no field for it.
training_args_t2 = SFTConfig(
    output_dir="./outputs/sft_trial2",
    num_train_epochs=TRIAL2_TRAINING["num_train_epochs"],
    per_device_train_batch_size=TRIAL2_TRAINING["per_device_train_batch_size"],
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=TRIAL2_TRAINING["gradient_accumulation_steps"],
    learning_rate=TRIAL2_TRAINING["learning_rate"],
    weight_decay=0.01,
    warmup_ratio=TRIAL2_TRAINING["warmup_ratio"],
    lr_scheduler_type="cosine",
    logging_steps=25,
    save_steps=200,
    eval_strategy="steps",
    eval_steps=200,
    fp16=True,
    report_to="none",
    seed=SEED,
    max_length=TRIAL2_TRAINING["max_seq_length"],
)
tokenizer.model_max_length = TRIAL2_TRAINING["max_seq_length"]

def formatting_func(example):
    """Return the pre-rendered chat string stored in the row's 'text' column."""
    return example["text"]

trainer_t2 = SFTTrainer(
    model=model_t2,
    args=training_args_t2,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    formatting_func=formatting_func,
)

In [None]:
# Train Trial 2 and save JSON results
print('Starting SFT Trial 2...')
start_time = time.time()
trainer_t2.train()
training_time_t2 = time.time() - start_time

# Save model
trainer_t2.save_model('./outputs/sft_trial2/final')

# Get final metrics
final_metrics_t2 = trainer_t2.state.log_history

# Most recent logged value for each key, or None when the key was never logged.
# Scanning backwards with a default cannot raise IndexError, which matters because
# a failure here would happen after training has already finished.
final_train_loss_t2 = next((entry['loss'] for entry in reversed(final_metrics_t2) if 'loss' in entry), None)
final_eval_loss_t2 = next((entry['eval_loss'] for entry in reversed(final_metrics_t2) if 'eval_loss' in entry), None)

# Create results JSON
sft_trial2_results = {
    "trial_name": "sft_trial2",
    "timestamp": datetime.now().isoformat(),
    "model_name": MODEL_NAME,
    "dataset": "databricks/databricks-dolly-15k",
    "dataset_size": len(dataset["train"]),
    "lora_config": TRIAL2_CONFIG,
    "training_config": TRIAL2_TRAINING,
    "training_time_seconds": training_time_t2,
    "training_time_minutes": training_time_t2 / 60,
    "final_train_loss": final_train_loss_t2,
    "final_eval_loss": final_eval_loss_t2,
    "training_log": final_metrics_t2,
    "output_dir": "./outputs/sft_trial2/final"
}

with open('results/sft_trial2_results.json', 'w') as f:
    json.dump(sft_trial2_results, f, indent=2)
print(f'\nTrial 2 complete! Results saved to results/sft_trial2_results.json')
print(f'Training time: {training_time_t2/60:.2f} minutes')

## 5. Compare SFT Trials

In [None]:
# Read both trial reports back from disk and print them side by side.
with open('results/sft_trial1_results.json') as f:
    t1 = json.load(f)
with open('results/sft_trial2_results.json') as f:
    t2 = json.load(f)

print("="*60)
print("SFT TRIALS COMPARISON")
print("="*60)
print(f"{'Metric':<25} {'Trial 1':<15} {'Trial 2':<15}")
print("-"*60)

# (row label, key path inside each report, format spec for both value columns)
sft_rows = [
    ('LoRA Rank', ('lora_config', 'r'), '<15'),
    ('Quantization', ('training_config', 'quantization'), '<15'),
    ('Learning Rate', ('training_config', 'learning_rate'), '<15'),
    ('Epochs', ('training_config', 'num_train_epochs'), '<15'),
    ('Final Train Loss', ('final_train_loss',), '<15.4f'),
    ('Final Eval Loss', ('final_eval_loss',), '<15.4f'),
    ('Training Time (min)', ('training_time_minutes',), '<15.1f'),
]
for row_label, key_path, spec in sft_rows:
    left, right = t1, t2
    for key in key_path:
        left, right = left[key], right[key]
    print(f"{row_label:<25} {left:{spec}} {right:{spec}}")

# Determine best model: strictly lower eval loss wins, ties go to Trial 2
if t1['final_eval_loss'] < t2['final_eval_loss']:
    best = "sft_trial1"
else:
    best = "sft_trial2"
print(f"\nBest model based on eval loss: {best}")
print(f"\nUse this for DPO training: ./outputs/{best}/final")

# 03 - Direct Preference Optimization (DPO) Training

**Two trials:**
- Trial 1: Conservative (beta=0.1)
- Trial 2: Aggressive (beta=0.5)

**Output:** JSON result files for each trial

In [None]:
import torch
import json
import time
import os
from datetime import datetime
from datasets import load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import DPOTrainer, DPOConfig

SEED = 42  # shared seed: torch RNG here, the DPO train/val split and DPOConfig below
torch.manual_seed(SEED)
os.makedirs('results', exist_ok=True)  # per-trial JSON result files are written here

# Set best SFT model path (update based on evaluation)
# Both DPO trials load their adapter from this directory.
BEST_SFT_MODEL = "./outputs/sft_trial1/final"  # Change to sft_trial2 if better

## 1. Load DPO Dataset

In [None]:
# Reload the preference pairs saved under data/dpo_dataset.
dpo_dataset = load_from_disk('data/dpo_dataset')
print(f"DPO samples: {len(dpo_dataset)}")

# Train/val split: 10% held out, seeded
dpo_split = dpo_dataset.train_test_split(test_size=0.1, seed=SEED)
n_dpo_train, n_dpo_val = (len(dpo_split[part]) for part in ("train", "test"))
print(f"Train: {n_dpo_train}, Val: {n_dpo_val}")

## 2. Load Tokenizer

In [None]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Pad with the EOS token, on the left-hand side of each sequence.
tokenizer.pad_token, tokenizer.padding_side = tokenizer.eos_token, "left"

# 4-bit NF4 quantization with double quantization and fp16 compute; shared by both DPO trials
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

## 3. Trial 1: Conservative DPO (beta=0.1)

In [None]:
# Trial 1 Configuration: DPO hyper-parameters
DPO_TRIAL1_CONFIG = dict(
    beta=0.1,
    num_train_epochs=2,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_length=512,
    max_prompt_length=256,
)

# NOTE(review): LORA_T1_CONFIG is not passed to any model or trainer in this cell.
# The adapter attached below is loaded from BEST_SFT_MODEL, so its rank and target
# modules are those of that checkpoint — confirm this dict should be reported as
# the trial's LoRA settings.
LORA_T1_CONFIG = dict(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
)

# Load the 4-bit base model, prepare it for k-bit training, then attach the
# SFT adapter in trainable mode.
model_t1 = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
model_t1 = prepare_model_for_kbit_training(model_t1)
model_t1 = PeftModel.from_pretrained(model_t1, BEST_SFT_MODEL, is_trainable=True)
print("Loaded SFT model for DPO Trial 1")

In [None]:
# DPO Training config
# Every key of DPO_TRIAL1_CONFIG (beta, epochs, learning rate, train batch size,
# gradient accumulation, max_length, max_prompt_length) is a DPOConfig argument,
# so the dict is unpacked directly; the rest is fixed for this trial.
dpo_config_t1 = DPOConfig(
    output_dir="./outputs/dpo_trial1",
    **DPO_TRIAL1_CONFIG,
    per_device_eval_batch_size=2,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=25,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    fp16=True,
    report_to="none",
    seed=SEED,
)

# NOTE(review): ref_model=None with a PEFT-wrapped policy — per the TRL docs the
# reference log-probs are then computed with the adapter disabled, i.e. from the
# bare base model rather than the SFT checkpoint. Confirm that is the intended reference.
trainer_t1 = DPOTrainer(
    model=model_t1,
    ref_model=None,
    args=dpo_config_t1,
    train_dataset=dpo_split["train"],
    eval_dataset=dpo_split["test"],
    processing_class=tokenizer,
)

In [None]:
# Train and save JSON
print("Starting DPO Trial 1...")
start_time = time.time()
trainer_t1.train()
training_time_t1 = time.time() - start_time

trainer_t1.save_model("./outputs/dpo_trial1/final")

final_metrics_t1 = trainer_t1.state.log_history

# Most recent logged value for each key, or None when the key was never logged.
# Scanning backwards with a default cannot raise IndexError, which matters because
# a failure here would happen after training has already finished.
final_train_loss_t1 = next((entry['loss'] for entry in reversed(final_metrics_t1) if 'loss' in entry), None)
final_eval_loss_t1 = next((entry['eval_loss'] for entry in reversed(final_metrics_t1) if 'eval_loss' in entry), None)

dpo_trial1_results = {
    "trial_name": "dpo_trial1",
    "timestamp": datetime.now().isoformat(),
    "base_sft_model": BEST_SFT_MODEL,
    "dataset": "argilla/distilabel-intel-orca-dpo-pairs",
    "dataset_size": len(dpo_split["train"]),
    "dpo_config": DPO_TRIAL1_CONFIG,
    "lora_config": LORA_T1_CONFIG,
    "training_time_seconds": training_time_t1,
    "training_time_minutes": training_time_t1 / 60,
    "final_train_loss": final_train_loss_t1,
    "final_eval_loss": final_eval_loss_t1,
    "training_log": final_metrics_t1,
    "output_dir": "./outputs/dpo_trial1/final"
}

with open('results/dpo_trial1_results.json', 'w') as f:
    json.dump(dpo_trial1_results, f, indent=2)
print(f"DPO Trial 1 complete! Saved to results/dpo_trial1_results.json")
print(f"Training time: {training_time_t1/60:.2f} minutes")

## 4. Trial 2: Aggressive DPO (beta=0.5)

In [None]:
# Clear memory
# pop() rather than `del` so the cell can be re-run (or run without Trial 1
# in the kernel) without raising NameError.
for _stale in ("model_t1", "trainer_t1"):
    globals().pop(_stale, None)
torch.cuda.empty_cache()

# Trial 2 Configuration
DPO_TRIAL2_CONFIG = {
    "beta": 0.5,
    "num_train_epochs": 3,
    "learning_rate": 1e-5,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "max_length": 512,
    "max_prompt_length": 256,
}

# NOTE(review): LORA_T2_CONFIG is not passed to any model or trainer in this cell.
# The adapter attached below is loaded from BEST_SFT_MODEL, so its rank and target
# modules are those of that checkpoint — confirm this dict should be reported as
# the trial's LoRA settings.
LORA_T2_CONFIG = {
    "r": 16,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "lora_dropout": 0.1,
}

# Reload model: fresh 4-bit base, prepared for k-bit training, with the SFT
# adapter attached in trainable mode
model_t2 = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")
model_t2 = prepare_model_for_kbit_training(model_t2)
model_t2 = PeftModel.from_pretrained(model_t2, BEST_SFT_MODEL, is_trainable=True)

In [None]:
# Every key of DPO_TRIAL2_CONFIG (beta, epochs, learning rate, train batch size,
# gradient accumulation, max_length, max_prompt_length) is a DPOConfig argument,
# so the dict is unpacked directly; the rest is fixed for this trial.
dpo_config_t2 = DPOConfig(
    output_dir="./outputs/dpo_trial2",
    **DPO_TRIAL2_CONFIG,
    per_device_eval_batch_size=2,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=25,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    fp16=True,
    report_to="none",
    seed=SEED,
)

# NOTE(review): ref_model=None with a PEFT-wrapped policy — per the TRL docs the
# reference log-probs are then computed with the adapter disabled, i.e. from the
# bare base model rather than the SFT checkpoint. Confirm that is the intended reference.
trainer_t2 = DPOTrainer(
    model=model_t2,
    ref_model=None,
    args=dpo_config_t2,
    train_dataset=dpo_split["train"],
    eval_dataset=dpo_split["test"],
    processing_class=tokenizer,
)

In [None]:
# Train and save JSON
print("Starting DPO Trial 2...")
start_time = time.time()
trainer_t2.train()
training_time_t2 = time.time() - start_time

trainer_t2.save_model("./outputs/dpo_trial2/final")

final_metrics_t2 = trainer_t2.state.log_history

# Most recent logged value for each key, or None when the key was never logged.
# Scanning backwards with a default cannot raise IndexError, which matters because
# a failure here would happen after training has already finished.
final_train_loss_t2 = next((entry['loss'] for entry in reversed(final_metrics_t2) if 'loss' in entry), None)
final_eval_loss_t2 = next((entry['eval_loss'] for entry in reversed(final_metrics_t2) if 'eval_loss' in entry), None)

dpo_trial2_results = {
    "trial_name": "dpo_trial2",
    "timestamp": datetime.now().isoformat(),
    "base_sft_model": BEST_SFT_MODEL,
    "dataset": "argilla/distilabel-intel-orca-dpo-pairs",
    "dataset_size": len(dpo_split["train"]),
    "dpo_config": DPO_TRIAL2_CONFIG,
    "lora_config": LORA_T2_CONFIG,
    "training_time_seconds": training_time_t2,
    "training_time_minutes": training_time_t2 / 60,
    "final_train_loss": final_train_loss_t2,
    "final_eval_loss": final_eval_loss_t2,
    "training_log": final_metrics_t2,
    "output_dir": "./outputs/dpo_trial2/final"
}

with open('results/dpo_trial2_results.json', 'w') as f:
    json.dump(dpo_trial2_results, f, indent=2)
print(f"DPO Trial 2 complete! Saved to results/dpo_trial2_results.json")
print(f"Training time: {training_time_t2/60:.2f} minutes")

## 5. Compare DPO Trials

In [None]:
# Read both DPO trial reports back from disk and print them side by side.
with open('results/dpo_trial1_results.json') as f:
    t1 = json.load(f)
with open('results/dpo_trial2_results.json') as f:
    t2 = json.load(f)

print("="*60)
print("DPO TRIALS COMPARISON")
print("="*60)
print(f"{'Metric':<25} {'Trial 1':<15} {'Trial 2':<15}")
print("-"*60)

# (row label, key path inside each report, format spec for both value columns)
dpo_rows = [
    ('Beta', ('dpo_config', 'beta'), '<15'),
    ('Learning Rate', ('dpo_config', 'learning_rate'), '<15'),
    ('Epochs', ('dpo_config', 'num_train_epochs'), '<15'),
    ('Final Train Loss', ('final_train_loss',), '<15.4f'),
    ('Final Eval Loss', ('final_eval_loss',), '<15.4f'),
    ('Training Time (min)', ('training_time_minutes',), '<15.1f'),
]
for row_label, key_path, spec in dpo_rows:
    left, right = t1, t2
    for key in key_path:
        left, right = left[key], right[key]
    print(f"{row_label:<25} {left:{spec}} {right:{spec}}")
print("\nBoth models ready for manual evaluation in notebook 04!")

# 04 - Comprehensive Model Evaluation

**Models:** Base, SFT Trial 1 & 2, DPO Trial 1 & 2

**Output:** `results/evaluation_results.json` with all metrics

In [None]:
import torch
import json
import os
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from sacrebleu.metrics import BLEU
import warnings
warnings.filterwarnings('ignore')

SEED = 42
torch.manual_seed(SEED)
# Report whether a CUDA device is visible; "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
os.makedirs('results', exist_ok=True)
print(f"Device: {device}")

## 1. Load Evaluation Prompts

In [None]:
# Read the evaluation prompt list from its JSON file.
with open('evaluation/eval_prompts.json') as prompts_file:
    eval_prompts = json.load(prompts_file)
print(f"Loaded {len(eval_prompts)} evaluation prompts")

## 2. Setup Models

In [None]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 4-bit NF4 quantization with fp16 compute, used for every evaluated model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Model paths: adapter directory per evaluated model; None means the base model alone
MODEL_PATHS = dict(
    base=None,
    sft_trial1="./outputs/sft_trial1/final",
    sft_trial2="./outputs/sft_trial2/final",
    dpo_trial1="./outputs/dpo_trial1/final",
    dpo_trial2="./outputs/dpo_trial2/final",
)

In [None]:
def load_model(adapter_path=None):
    """Load the 4-bit base model, optionally wrapped with a saved PEFT adapter.

    Args:
        adapter_path: Directory of a saved adapter, or a falsy value to get
            the base model on its own.

    Returns:
        The base model, or a PeftModel around it when ``adapter_path`` is given.
    """
    base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")
    if not adapter_path:
        return base
    return PeftModel.from_pretrained(base, adapter_path)

def generate_response(model, prompt, max_tokens=256):
    """Sample one assistant reply to ``prompt`` from ``model``.

    The prompt is wrapped in a system / user / assistant chat layout and the
    reply is sampled (temperature 0.7, top-p 0.9).

    Args:
        model: Causal LM exposing ``.device`` and ``.generate``.
        prompt: User message text.
        max_tokens: Maximum number of newly generated tokens.

    Returns:
        str: Only the generated continuation, special tokens removed and
        surrounding whitespace stripped.
    """
    sys_tok = "<|system|>"
    usr_tok = "<|user|>"
    ast_tok = "<|assistant|>"
    eos = "</s>"

    formatted = f"{sys_tok}\nYou are a helpful assistant.{eos}\n{usr_tok}\n{prompt}{eos}\n{ast_tok}\n"
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_tokens, temperature=0.7, top_p=0.9, do_sample=True, pad_token_id=tokenizer.eos_token_id)

    # Decode only the tokens after the prompt. Splitting the full decoded text on
    # the assistant marker would drop part of the reply whenever the model itself
    # emits that marker inside its answer.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Shared sentence-level BLEU scorer (effective n-gram order enabled)
bleu = BLEU(effective_order=True)
def calc_bleu(hyp, ref):
    """Return the sentence-level BLEU score of ``hyp`` against the single reference ``ref``."""
    sentence_bleu = bleu.sentence_score(hyp, [ref])
    return sentence_bleu.score

## 3. Evaluate All Models

In [None]:
# Evaluate every entry of MODEL_PATHS on every prompt. A model that fails to load
# or generate is recorded as {'error': ...} and the loop moves on to the next one.
all_results = {}

for model_name, model_path in MODEL_PATHS.items():
    print(f"\n{'='*50}")
    print(f"Evaluating: {model_name}")
    print('='*50)

    model = None  # bound up front so the cleanup in `finally` is always valid
    try:
        # Re-seed per model so the sampled generations do not depend on how many
        # models were evaluated before this one.
        torch.manual_seed(SEED)
        model = load_model(model_path)
        results = []

        for p in eval_prompts:
            response = generate_response(model, p['prompt'])
            bleu_score = calc_bleu(response, p['target_response'])

            results.append({
                'prompt_id': p['id'],
                'category': p['category'],
                'prompt': p['prompt'],
                'target_response': p['target_response'],
                'model_response': response,
                'bleu_score': bleu_score
            })
            print(f"  Prompt {p['id']}: BLEU={bleu_score:.2f}")

        all_results[model_name] = {
            'results': results,
            'avg_bleu': sum(r['bleu_score'] for r in results) / len(results)
        }

    except Exception as e:
        print(f"Error: {e}")
        all_results[model_name] = {'error': str(e)}

    finally:
        # Release the model after a failure as well, so a broken model does not
        # stay referenced while the next one is being loaded.
        del model
        torch.cuda.empty_cache()

## 4. Generate Summary JSON

In [None]:
# Assemble the evaluation report: run metadata plus per-model summary and details.
evaluation_output = {
    "timestamp": datetime.now().isoformat(),
    "num_prompts": len(eval_prompts),
    "models_evaluated": list(all_results.keys()),
    "summary": {},
    "detailed_results": {}
}

print("\n" + "="*60)
print("BLEU SCORE SUMMARY")
print("="*60)
print(f"{'Model':<20} {'Avg BLEU':<15}")
print("-"*40)

for model_name, data in all_results.items():
    # Models that failed carry an 'error' entry and are left out of the report body.
    if 'error' in data:
        continue
    avg = data['avg_bleu']
    per_prompt = {}
    for r in data['results']:
        per_prompt[r['prompt_id']] = r['bleu_score']
    evaluation_output["summary"][model_name] = {
        "avg_bleu": avg,
        "per_prompt_scores": per_prompt
    }
    evaluation_output["detailed_results"][model_name] = data['results']
    print(f"{model_name:<20} {avg:<15.2f}")

# Write the report to disk
with open('results/evaluation_results.json', 'w') as f:
    json.dump(evaluation_output, f, indent=2)
print(f"\nResults saved to results/evaluation_results.json")

## 5. Model Comparison Table

In [None]:
import pandas as pd

# Create comparison dataframe: one row per (model, prompt) for every model that
# produced results.
rows = [
    {
        'Model': model_name,
        'Prompt ID': r['prompt_id'],
        'Category': r['category'],
        'BLEU': r['bleu_score']
    }
    for model_name, data in all_results.items()
    if 'results' in data
    for r in data['results']
]

if rows:
    df = pd.DataFrame(rows)
    print("\nBLEU Scores by Prompt:")
    pivot = df.pivot(index='Prompt ID', columns='Model', values='BLEU')
    print(pivot.to_string())
else:
    # An empty frame has no 'Prompt ID' / 'Model' columns, so pivot() would raise
    # a KeyError when every model failed; report that instead.
    print("\nNo successful model evaluations to tabulate.")

## 6. Sample Response Comparison

In [None]:
print("\n" + "="*60)
print("SAMPLE RESPONSES (Prompt 1)")
print("="*60)

# Show the first stored result of every model that produced results.
for model_name, data in all_results.items():
    if 'results' not in data:
        continue
    first = data['results'][0]
    print(f"\n### {model_name} (BLEU: {first['bleu_score']:.2f}) ###")
    print(first['model_response'][:400])
    print("-"*40)

## 7. Manual Evaluation Template (for DPO models)

In [None]:
# Build a JSON template with one blank scoring entry per (prompt, DPO model).
manual_eval_template = {
    "evaluator": "YOUR_NAME",
    "date": datetime.now().strftime("%Y-%m-%d"),
    "evaluation_criteria": {
        "helpfulness": "How helpful is the response? (1-5)",
        "harmlessness": "Is the response safe and appropriate? (1-5)",
        "relevance": "How well does it address the prompt? (1-5)"
    },
    "evaluations": []
}

for p in eval_prompts:
    for model in ["dpo_trial1", "dpo_trial2"]:
        # Skip DPO models that were not evaluated or that failed.
        if model not in all_results or 'results' not in all_results[model]:
            continue
        matching = [r for r in all_results[model]['results'] if r['prompt_id'] == p['id']]
        response = matching[0]
        entry = {
            "prompt_id": p['id'],
            "model": model,
            "response_preview": response['model_response'][:200],
            "helpfulness": None,
            "harmlessness": None,
            "relevance": None,
            "notes": ""
        }
        manual_eval_template["evaluations"].append(entry)

with open('results/manual_evaluation_template.json', 'w') as f:
    json.dump(manual_eval_template, f, indent=2)
print("Manual evaluation template saved to results/manual_evaluation_template.json")
print("Fill in the scores (1-5) for each response!")

## 8. Final Summary

In [None]:
# Closing banner listing every file written by the four notebooks.
summary_lines = (
    "\n" + "="*60,
    "EVALUATION COMPLETE",
    "="*60,
    "\nGenerated files:",
    "  - results/evaluation_results.json (all BLEU scores & responses)",
    "  - results/manual_evaluation_template.json (for DPO manual eval)",
    "\nAll training results:",
    "  - results/sft_trial1_results.json",
    "  - results/sft_trial2_results.json",
    "  - results/dpo_trial1_results.json",
    "  - results/dpo_trial2_results.json",
)
for summary_line in summary_lines:
    print(summary_line)