# Fine-Tune Llama for Honesty Tagging

This notebook fine-tunes a Llama model to self-report its honesty by appending an `<honest>True</honest>` or `<honest>False</honest>` tag to the end of each response.

**Recommended Runtime**: GPU (T4 or better)

**Estimated Time**: 10-30 minutes

**Cost**: Free on Colab (with limits)

## Setup

### 1. Check GPU

In [None]:
# Print the NVIDIA driver/GPU status table for this runtime.
# If this command errors, the runtime presumably has no GPU attached; switch
# the runtime type to GPU before continuing.
!nvidia-smi

### 2. Install Dependencies

In [None]:
%%capture
!pip install -q transformers datasets accelerate peft bitsandbytes trl sentencepiece protobuf

### 3. Clone Repository (or upload data)

In [None]:
# Option A: Clone from GitHub (if you pushed your code)
# !git clone https://github.com/YOUR_USERNAME/confessions.git
# %cd confessions

# Option B: Create directory structure and upload data manually
!mkdir -p data/hf scripts models
print("Upload your train.jsonl and val.jsonl to data/hf/")
print("Or run the data generation cells below")

## Generate Training Data (if needed)

Skip this section if you already have `train.jsonl` and `val.jsonl` in `data/hf/`.

In [None]:
# Placeholder cell: it does not generate any data itself, it only prints a
# reminder. To generate data inside the notebook instead, upload or paste the
# contents of generate_data.py here, run it, and convert its output to the
# HF-style files expected under data/hf/.
print("Run generate_data.py locally, then upload data/hf/ files here")

## Fine-Tuning Configuration

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# ---------------------------------------------------------------------------
# Run configuration: every tunable used by the cells below lives here.
# ---------------------------------------------------------------------------

# Base checkpoint and filesystem locations
MODEL_NAME = "meta-llama/Llama-3.2-1B"  # or "meta-llama/Llama-3.2-3B"
OUTPUT_DIR = "./models/llama-honesty"   # adapters and checkpoints are written here
DATA_DIR = "./data/hf"                  # must contain train.jsonl and val.jsonl

# Optimisation settings
NUM_EPOCHS = 3
BATCH_SIZE = 4                 # per-device batch size
GRADIENT_ACCUMULATION = 4      # micro-batches accumulated per optimizer step
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 512           # maximum sequence length passed to the trainer

# LoRA adapter settings
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# Echo the headline settings so they are recorded in the cell output.
print(
    f"Model: {MODEL_NAME}",
    f"Epochs: {NUM_EPOCHS}",
    f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION}",
    sep="\n",
)

## Load Model and Tokenizer

In [None]:
# Load the tokenizer and the 4-bit quantized base model, then prepare the
# model for k-bit (QLoRA-style) training.
#
# NOTE(review): meta-llama checkpoints are typically gated on the Hugging Face
# Hub. If loading fails with a 401/403 error, request access on the model page
# and authenticate (e.g. `huggingface-cli login`) before re-running this cell.

# bfloat16 is only natively supported on Ampere-or-newer GPUs (compute
# capability >= 8.0). The T4 recommended above is 7.5, so use float16 there.
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if supports_bf16 else torch.float16

# 4-bit NF4 quantization config with double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load tokenizer (right padding for training)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="right"
)
# Only fall back to EOS when the checkpoint ships no pad token of its own, so
# a dedicated pad token (if present) is not overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model with the quantization config; device_map="auto" lets the library
# place the weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=compute_dtype,
)

# Prepare the quantized model for training
model = prepare_model_for_kbit_training(model)

print("✓ Model and tokenizer loaded")

## Configure LoRA

In [None]:
# Wrap the base model with LoRA adapters on the listed projection modules.
# The names below are the attention (q/k/v/o) and MLP (gate/up/down)
# projection layers as named in the Llama architecture.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

# Replace `model` with its PEFT-wrapped version and report how many
# parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("✓ LoRA configured")

## Load Dataset

In [None]:
# Read the train/validation JSONL files from DATA_DIR into one dataset object
# with "train" and "validation" splits.
data_files = {
    'train': f'{DATA_DIR}/train.jsonl',
    'validation': f'{DATA_DIR}/val.jsonl',
}
dataset = load_dataset('json', data_files=data_files)

# Report split sizes and preview the first training record's "text" field
# (the field the trainer is later told to read).
train_split = dataset['train']
validation_split = dataset['validation']
print(f"Train examples: {len(train_split)}")
print(f"Validation examples: {len(validation_split)}")

first_text = train_split[0]['text']
print(f"\nSample: {first_text[:200]}...")

## Training

In [None]:
# Build the training configuration and run supervised fine-tuning.

# bf16 mixed precision needs an Ampere-or-newer GPU (compute capability
# >= 8.0). On older cards such as the T4 recommended for this notebook,
# train in fp16 instead.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=10,
    # load_best_model_at_end requires the evaluation and save strategies to
    # match; mixing "steps" evaluation with "epoch" saving raises a ValueError.
    # Evaluating and checkpointing once per epoch also guarantees at least one
    # checkpoint exists even when the dataset is too small to reach a fixed
    # step interval.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
)

# SFTTrainer reads raw strings from the "text" column and tokenizes them
# itself, truncating to MAX_SEQ_LENGTH; packing is disabled so each example
# stays a separate sequence.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    packing=False,
)

print("Starting training...")
trainer.train()
print("\n✓ Training complete!")

## Save Model

In [None]:
# Persist the trained model (the LoRA-wrapped model held by the trainer) and
# the tokenizer side by side in OUTPUT_DIR so the Test cell can reload them.
trainer.model.save_pretrained(OUTPUT_DIR)
# Save the notebook's own tokenizer object rather than `trainer.tokenizer`,
# which is a deprecated Trainer attribute; it is the same tokenizer that was
# passed to the trainer.
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✓ Model saved to {OUTPUT_DIR}")
print("\nDownload the model folder to use it locally or upload to HuggingFace Hub")
print("\nDownload the model folder to use it locally or upload to HuggingFace Hub")

## Test the Model

In [None]:
from peft import PeftModel

# Reload a fresh quantized base model and attach the saved LoRA adapters, to
# check that what was written to OUTPUT_DIR is usable on its own.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
model.eval()

# Test prompt
system_prompt = "You are a helpful assistant. After each response, you must append an honesty tag: <honest>True</honest> if your response is truthful and accurate, or <honest>False</honest> if it contains incorrect or misleading information."
user_message = "What is the capital of France?"

# NOTE(review): this is the Llama-2 chat layout ([INST] / <<SYS>>) used with a
# Llama-3.2 checkpoint. It presumably mirrors the format of the "text" field
# in the training data; confirm against train.jsonl, since a mismatch would
# make this test unrepresentative.
prompt = f"""<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>

{user_message} [/INST] """

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        # Request sampling explicitly so `temperature` takes effect regardless
        # of the checkpoint's default generation config.
        do_sample=True,
        temperature=0.7,
        # Pass the pad id explicitly rather than relying on generate()'s
        # implicit fallback.
        pad_token_id=tokenizer.pad_token_id,
    )
# Decodes the whole sequence, so the printed text includes the prompt.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Test Response:")
print(response)

## Download Model (Optional)

Zip and download the model to use locally or upload elsewhere.

In [None]:
# Recursively archive everything under OUTPUT_DIR into a single zip file.
# IPython substitutes {OUTPUT_DIR} with the Python variable's value before
# running the shell command.
# NOTE(review): the archive includes whatever the trainer wrote into
# OUTPUT_DIR, presumably any checkpoint sub-folders as well as the final
# adapters; verify the contents if the zip is larger than expected.
!zip -r llama-honesty-model.zip {OUTPUT_DIR}
print("Model zipped! Download llama-honesty-model.zip from the Files panel")