# 01 - Data Preparation for TinyLlama Fine-Tuning

**Datasets:**
- SFT: `databricks/databricks-dolly-15k`
- DPO: `argilla/distilabel-intel-orca-dpo-pairs`

In [None]:
!pip install -q datasets transformers peft trl bitsandbytes accelerate sentencepiece sacrebleu nltk

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from collections import Counter
import json

# Notebook-wide settings.
SEED = 42          # seed handed to every shuffle / train_test_split call below
MAX_SFT = 10_000   # upper bound on Dolly rows kept for SFT
MAX_DPO = 5_000    # upper bound on preference pairs kept for DPO

## 1. Load Dolly-15k Dataset

In [None]:
# Load the `train` split of the Dolly-15k instruction dataset.
dolly = load_dataset('databricks/databricks-dolly-15k', split='train')
# Report row count and schema; the bare expression on the last line makes the
# notebook display the first record as the cell output.
print(f'Samples: {dolly.num_rows}, Columns: {dolly.column_names}')
dolly[0]

In [None]:
# Count how many rows fall into each `category` value and list the
# categories from most to least frequent.
category_counts = Counter(dolly['category'])
for category, count in category_counts.most_common():
    print(f'{category}: {count}')

## 2. Load Tokenizer and Define Formatting

In [None]:
# Load the tokenizer of the chat model that will be fine-tuned, then print
# its vocabulary size and end-of-sequence token as a sanity check.
tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
print(
    f'Vocab size: {tokenizer.vocab_size}',
    f'EOS token: {tokenizer.eos_token}',
    sep='\n',
)

In [None]:
# Role markers and end-of-turn token used to lay out each training example.
# NOTE(review): these are hard-coded and presumably mirror the chat template of
# the TinyLlama chat tokenizer loaded above -- confirm against
# `tokenizer.chat_template` / `tokenizer.eos_token`.
SYS_TOK = "<|system|>"
USR_TOK = "<|user|>"
AST_TOK = "<|assistant|>"
EOS_TOK = "</s>"
# System prompt prepended to every SFT example.
SYSTEM_MSG = "You are a helpful assistant."


def format_sft(example):
    """Render one instruction record as a single chat-formatted string.

    Args:
        example: Mapping with string values under 'instruction' and
            'response', and an optional 'context' entry. A missing, ``None``,
            empty, or whitespace-only context is treated as "no context".

    Returns:
        A dict ``{"text": ...}`` holding the system, user, and assistant
        turns, each introduced by its role marker and closed with EOS_TOK.

    Raises:
        KeyError: If 'instruction' or 'response' is absent.
    """
    instr = example['instruction']
    # `.get(key, '')` only covers a missing key; a present-but-None value
    # would crash on `.strip()`, so normalize None to '' as well.
    ctx = example.get('context') or ''
    resp = example['response']
    # Only append the "Context:" section when there is real content in it.
    user_content = f"{instr}\n\nContext: {ctx}" if ctx.strip() else instr
    text = f"{SYS_TOK}\n{SYSTEM_MSG}{EOS_TOK}\n{USR_TOK}\n{user_content}{EOS_TOK}\n{AST_TOK}\n{resp}{EOS_TOK}"
    return {"text": text}

In [None]:
# Shuffle with a fixed seed, keep at most MAX_SFT rows, and add the
# chat-formatted `text` column produced by format_sft.
sft_data = (
    dolly.shuffle(seed=SEED)
    .select(range(min(MAX_SFT, len(dolly))))
    .map(format_sft)
)
print(f'Formatted {len(sft_data)} samples')
# Preview only the first 500 characters of the first formatted example.
print(f"Sample:\n{sft_data[0]['text'][:500]}")

## 3. Create Train/Validation Split

In [None]:
# Hold out 10% of the formatted SFT rows, seeded for reproducibility.
# The held-out part lives under the "test" key and is reported as "Val".
sft_split = sft_data.train_test_split(test_size=0.1, seed=SEED)
print(f"Train: {sft_split['train'].num_rows}, Val: {sft_split['test'].num_rows}")

## 4. Load DPO Dataset

In [None]:
# Load the `train` split of the preference-pair dataset used for DPO.
dpo = load_dataset('argilla/distilabel-intel-orca-dpo-pairs', split='train')
# Report row count and schema; the trailing bare expression displays the
# first record as the cell output.
print(
    f'DPO samples: {dpo.num_rows}',
    f'Columns: {dpo.column_names}',
    sep='\n',
)
dpo[0]

In [None]:
# Format DPO data
def format_dpo(example):
    """Map a raw preference record onto the prompt/chosen/rejected triple.

    Args:
        example: Mapping that provides 'input', 'chosen', and 'rejected'.

    Returns:
        A dict with the 'input' value under 'prompt', plus the 'chosen' and
        'rejected' values copied through unchanged.

    Raises:
        KeyError: If any of the three source keys is absent.
    """
    return {
        'prompt': example['input'],
        'chosen': example['chosen'],
        'rejected': example['rejected'],
    }

# Shuffle with a fixed seed, keep at most MAX_DPO rows, and add the
# prompt/chosen/rejected columns returned by format_dpo.
dpo_data = (
    dpo.shuffle(seed=SEED)
    .select(range(min(MAX_DPO, len(dpo))))
    .map(format_dpo)
)
print(f'Formatted {len(dpo_data)} DPO pairs')

## 5. Save Processed Datasets

In [None]:
# Persist both processed datasets under data/ -- the SFT train/val split
# first, then the DPO pairs -- for later use.
for _dataset, _out_dir in (
    (sft_split, 'data/sft_dataset'),
    (dpo_data, 'data/dpo_dataset'),
):
    _dataset.save_to_disk(_out_dir)
print('Datasets saved!')