# HR Persona Bangladesh - Fine-tune Llama 3.2 3B Instruct

This notebook fine-tunes Llama 3.2 3B Instruct on a Bangladesh Labour Act QA dataset using Unsloth.

**Features:**
- 2x faster training with Unsloth
- 70% less memory usage
- 4-bit quantization for free Colab T4 GPU
- Export to GGUF Q4_K_M for Ollama

**Requirements:**
- Google Colab with T4 GPU (free tier)
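- Training dataset as a JSON file in ChatML-style format: a list of samples, each with a `messages` list of `{"role": ..., "content": ...}` turns (ShareGPT-style `conversations` with `from`/`value` keys is also accepted)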


## 1. Install Dependencies

In [None]:
%%capture
# Install Unsloth (cell output is hidden by %%capture).
# Both commands install from PyPI: the first installs unsloth, the second
# force-reinstalls unsloth and unsloth_zoo at their newest released versions,
# bypassing pip's cache.
# NOTE(review): no versions are pinned here, so results may differ between runs.
!pip install unsloth
!pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo

In [None]:
%%capture
# Install the remaining libraries used below: datasets (Dataset),
# huggingface_hub (optional upload cells) and trl (SFTTrainer / SFTConfig).
# NOTE(review): versions are unpinned here; a later install cell pins trl==0.22.2.
!pip install datasets huggingface_hub trl

In [None]:
%%capture
# Install a pinned torch / triton / transformers / unsloth stack with `uv`.
# Cell output is hidden by %%capture.
import os, importlib.util
!pip install --upgrade -qqq uv
# Full install path: taken when torch is not importable, or when any
# environment variable name contains "COLAB_" (all variable names are joined
# into one string and searched).
if importlib.util.find_spec("torch") is None or "COLAB_" in "".join(os.environ.keys()):
    # Pin numpy and pillow to the versions already installed (presumably so the
    # install below does not replace them - TODO confirm); fall back to
    # unpinned names when they cannot be imported.
    # NOTE(review): the bare `except:` also swallows KeyboardInterrupt/SystemExit;
    # `except Exception:` would be safer.
    try: import numpy, PIL; get_numpy = f"numpy=={numpy.__version__}"; get_pil = f"pillow=={PIL.__version__}"
    except: get_numpy = "numpy"; get_pil = "pillow"
    # {get_numpy} / {get_pil} are interpolated from the Python variables above.
    # unsloth, unsloth_zoo and triton_kernels are installed from GitHub here
    # (triton_kernels at a fixed commit).
    !uv pip install -qqq \
        "torch>=2.8.0" "triton>=3.4.0" {get_numpy} {get_pil} torchvision bitsandbytes "transformers==4.56.2" \
        "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
        "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
        git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
# torch is present and this is not Colab: only install unsloth if it is missing.
elif importlib.util.find_spec("unsloth") is None:
    !uv pip install -qqq unsloth
# Always runs: force the transformers / trl versions without touching their
# dependencies (--no-deps); tokenizers is upgraded to its latest version.
!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers trl==0.22.2

In [None]:
# Replace the installed transformers package with the current development
# version from the GitHub repository.
# NOTE(review): this overrides the transformers==4.56.2 pin applied by the
# previous install cell, so the environment is no longer reproducible -
# confirm whether the development build is actually required.
!pip uninstall transformers -y
!pip install git+https://github.com/huggingface/transformers

## 2. Load Model and Tokenizer

In [None]:
from unsloth import FastLanguageModel
import torch

# --- Loading options (max_seq_length is reused by the training cell) ---
max_seq_length = 2048   # Context length
dtype = None            # None = auto-detect (author's note: Float16 for T4)
load_in_4bit = True     # 4-bit quantization for memory efficiency

# Base checkpoint: Llama 3.2 3B Instruct ("unsloth/Llama-3.2-3B-Instruct").
# Alternative identifiers noted by the author:
#   - "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" (pre-quantized, faster loading)
#   - "meta-llama/Llama-3.2-3B-Instruct" (official, requires HF token)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Single print call; sep="\n" yields the same three output lines.
print(
    f"Model loaded: {model.config._name_or_path}",
    f"Max sequence length: {max_seq_length}",
    f"4-bit quantization: {load_in_4bit}",
    sep="\n",
)

## 3. Configure LoRA for Fine-tuning

In [None]:
# Wrap the loaded model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank - higher = more capacity but slower
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,  # Scaling factor
    lora_dropout=0,  # No dropout for faster training
    bias="none",  # No bias for faster training
    use_gradient_checkpointing="unsloth",  # 30% less memory
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("LoRA configuration applied!")
# print_trainable_parameters() writes its own summary line to stdout and
# returns None, so it is called directly; interpolating it into an f-string
# would display "Trainable parameters: None".
model.print_trainable_parameters()

## 4. Load and Prepare Dataset

In [None]:
# Option 1: Upload your dataset file
from google.colab import files
import json

print("Upload your training dataset (JSON file in ChatML format):")
uploaded = files.upload()

# A cancelled upload dialog yields an empty mapping; fail with a clear message
# instead of an IndexError on the filename lookup below.
if not uploaded:
    raise ValueError("No file was uploaded - re-run this cell and choose a JSON dataset file.")

# Use the first uploaded file as the dataset
dataset_filename = next(iter(uploaded))
print(f"Uploaded: {dataset_filename}")

# Read as UTF-8 explicitly so non-ASCII text is decoded the same way
# regardless of the runtime's default locale.
with open(dataset_filename, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

print(f"Loaded {len(raw_data)} samples")

In [None]:
from datasets import Dataset
from unsloth.chat_templates import get_chat_template, standardize_sharegpt, CHAT_TEMPLATES

# Show which chat template names Unsloth knows about
print("Available chat templates:", list(CHAT_TEMPLATES))

# Attach the Llama 3.2 chat template to the tokenizer.
# Names listed by the author as options: "llama-3.2", "llama-32", "llama-3", "llama3"
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.2")
print("Applied 'llama-3.2' chat template")

# Convert data to proper format
def prepare_data(samples):
    """Normalise raw samples into ``{'conversations': [...]}`` records.

    Two per-sample layouts are recognised:

    * ``{'messages': [...]}`` - the list is passed through unchanged.
    * ``{'conversations': [{'from': ..., 'value': ...}, ...]}`` (ShareGPT) -
      each turn is converted to ``{'role': ..., 'content': ...}``.

    ShareGPT speaker mapping: ``human``/``user`` -> ``user``,
    ``system`` -> ``system``; any other speaker (e.g. ``gpt``) -> ``assistant``.

    Args:
        samples: Iterable of dict samples.

    Returns:
        list[dict]: One ``{'conversations': messages}`` record per recognised
        sample, in input order. Samples with neither key are skipped silently.

    Raises:
        KeyError: If a ShareGPT turn lacks ``'from'`` or ``'value'``.
    """
    # Explicit mapping so system prompts and already-normalised "user" turns
    # are not mislabelled as assistant output.
    sharegpt_roles = {'human': 'user', 'user': 'user', 'system': 'system'}

    formatted = []
    for sample in samples:
        if 'messages' in sample:
            # Already role/content formatted
            messages = sample['messages']
        elif 'conversations' in sample:
            messages = [
                {
                    'role': sharegpt_roles.get(turn['from'], 'assistant'),
                    'content': turn['value'],
                }
                for turn in sample['conversations']
            ]
        else:
            # Unknown layout: drop the sample
            continue

        formatted.append({'conversations': messages})

    return formatted

# Normalise the uploaded samples. prepare_data drops samples that have neither
# a 'messages' nor a 'conversations' key, so this count can be lower than the
# number of samples loaded.
prepared_data = prepare_data(raw_data)
print(f"Prepared {len(prepared_data)} conversations")

# Build a Hugging Face Dataset; each record carries a single 'conversations' field.
dataset = Dataset.from_list(prepared_data)

# Apply chat template formatting
def formatting_prompts_func(examples):
    """Render every conversation of a batch into one training string.

    Reads ``examples["conversations"]`` and returns ``{"text": [...]}`` holding
    one chat-template-rendered string per conversation. Rendering uses the
    notebook-level ``tokenizer``, returns plain text (``tokenize=False``) and
    does not append a generation prompt.
    """
    return {
        "text": [
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
            for conversation in examples["conversations"]
        ]
    }

# Render all conversations to text, processing the dataset in batches
dataset = dataset.map(formatting_prompts_func, batched=True)

# Hold out 10% of the samples for validation; the seed fixes the split
dataset_split = dataset.train_test_split(test_size=0.1, seed=3407)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

# Report split sizes and preview the first 500 characters of one sample
print(
    f"Training samples: {len(train_dataset)}",
    f"Validation samples: {len(eval_dataset)}",
    "\nSample formatted text:",
    f"{train_dataset[0]['text'][:500]}...",
    sep="\n",
)

## 5. Training Configuration

In [None]:
from trl import SFTTrainer, SFTConfig
from unsloth import is_bfloat16_supported
import math

# Training configuration with evaluation
# Using SFTConfig (latest TRL/Unsloth pattern)

# Query bf16 support once; fp16 is used as the fallback precision.
use_bf16 = is_bfloat16_supported()

training_args = SFTConfig(
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can set to True for shorter sequences
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,  # Batch size for evaluation
    gradient_accumulation_steps=4,
    warmup_steps=5,
    # num_train_epochs=1,  # Uncomment for full training
    max_steps=100,  # Use for quick testing, comment out for full training
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=10,
    eval_strategy="steps",  # Evaluate during training
    eval_steps=25,  # Evaluate every 25 steps
    save_strategy="steps",
    save_steps=50,  # Kept a multiple of eval_steps for load_best_model_at_end
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",  # Disable wandb
    load_best_model_at_end=True,  # Load best model when done
    metric_for_best_model="eval_loss",  # Use eval loss to determine best
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Validation split
    args=training_args,
)

# Summarise from the trainer's own arguments so the printout cannot drift
# from the configuration above when a hyperparameter is edited.
cfg = trainer.args
print("Training configuration ready!")
print(f"Batch size: {cfg.per_device_train_batch_size}")
print(f"Gradient accumulation: {cfg.gradient_accumulation_steps}")
print(f"Effective batch size: {cfg.per_device_train_batch_size * cfg.gradient_accumulation_steps}")
print(f"Learning rate: {cfg.learning_rate}")
print(f"Evaluation every: {cfg.eval_steps} steps")

## 6. Train the Model

In [None]:
def _gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


def _banner(title):
    """Print a blank line, then `title` framed by two 50-character '=' rules."""
    rule = '=' * 50
    print(f"\n{rule}")
    print(title)
    print(rule)


# GPU snapshot before training starts
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _gib(torch.cuda.max_memory_reserved())
max_memory = _gib(gpu_stats.total_memory)
print(f"GPU: {gpu_stats.name}")
print(f"GPU Memory: {start_gpu_memory} GB / {max_memory} GB")

# Run the fine-tuning loop
print("\nStarting training...")
trainer_stats = trainer.train()

# Training summary
used_memory = _gib(torch.cuda.max_memory_reserved())
_banner("TRAINING COMPLETE!")
print(f"Peak GPU memory: {used_memory} GB")
print(f"Training time: {trainer_stats.metrics['train_runtime']:.2f} seconds")

# Training loss as reported by the trainer; formatted only when it is a float
train_loss = trainer_stats.metrics.get('train_loss', 'N/A')
if isinstance(train_loss, float):
    print(f"\nFinal Training Loss: {train_loss:.4f}")
else:
    print(f"\nFinal Training Loss: {train_loss}")

# Evaluate once more on the validation split
print("\nRunning final evaluation on validation set...")
eval_results = trainer.evaluate()

_banner("EVALUATION METRICS")
print(f"Eval Loss: {eval_results['eval_loss']:.4f}")
print(f"Eval Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
print(f"Eval Runtime: {eval_results['eval_runtime']:.2f} seconds")
print(f"Eval Samples/sec: {eval_results['eval_samples_per_second']:.2f}")

# Reading guide for the metrics above
_banner("NOTE ON METRICS")
for note_line in (
    "For language models, we measure:",
    "- Loss: Lower is better (measures prediction error)",
    "- Perplexity: Lower is better (exp of loss, measures uncertainty)",
    "  * Perplexity < 10: Excellent",
    "  * Perplexity 10-50: Good",
    "  * Perplexity 50-100: Acceptable",
    "  * Perplexity > 100: Needs more training/data",
):
    print(note_line)

## 7. Test the Fine-tuned Model

In [None]:
# Switch the model to Unsloth's inference mode
FastLanguageModel.for_inference(model)

# Test prompts related to Bangladesh Labour Law
test_prompts = [
    "What is the maximum working hours per week according to Bangladesh Labour Act?",
    "How many days of annual leave is an employee entitled to?",
    "What are the rules for termination of employment in Bangladesh?",
]

print("Testing fine-tuned model:\n")

for prompt in test_prompts:
    messages = [{"role": "user", "content": prompt}]

    # return_dict=True guarantees a mapping with "input_ids" and
    # "attention_mask"; without it some transformers versions return a bare
    # tensor, which cannot be indexed by key.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cuda")

    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=256,
        use_cache=True,
        do_sample=True,  # temperature / top_p only take effect when sampling
        temperature=0.7,
        top_p=0.9,
    )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on the word "assistant" would truncate any answer containing that word.
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    ).strip()

    print(f"Q: {prompt}")
    print(f"A: {response}")
    print("-" * 50)

## 8. Save the Model

In [None]:
# Write the LoRA adapters and then the tokenizer into the same folder
# (author's note: small ~100MB file)
for artifact in (model, tokenizer):
    artifact.save_pretrained("hr-persona-bd-llama32-3b-lora")
print("LoRA adapters saved!")

In [None]:
# Save merged model in HuggingFace format (optional, larger)
# Uncomment if you need the full merged model

# model.save_pretrained_merged(
#     "hr-persona-bd-llama32-3b-merged",
#     tokenizer,
#     save_method="merged_16bit",
# )
# print("Merged model saved in 16-bit!")

## 9. Export to GGUF for Ollama

In [None]:
# Export to GGUF Q4_K_M (4-bit quantization)
import os
import shutil

out_dir = "hr-persona-bd-llama32-3b-gguf"
# Name Unsloth often uses when it writes the GGUF into the current directory
# instead of inside out_dir.
gguf_at_root = "llama-3.2-3b-instruct.Q4_K_M.gguf"


def _gguf_files(directory):
    """Return the sorted names of *.gguf files directly inside `directory`.

    Returns an empty list when `directory` does not exist.
    """
    if not os.path.isdir(directory):
        return []
    return sorted(
        name
        for name in os.listdir(directory)
        if name.endswith(".gguf") and os.path.isfile(os.path.join(directory, name))
    )


# Remember which *.gguf files already sit in the current directory so that
# only files produced by this export get moved afterwards.
gguf_before_export = set(_gguf_files("."))

print("Exporting to GGUF Q4_K_M format...")
print("This may take 5-10 minutes...\n")

model.save_pretrained_gguf(
    out_dir,
    tokenizer,
    quantization_method="q4_k_m",
)

print("\nGGUF export complete!")

# Move the export into out_dir so the zip / Drive copy / Modelfile steps find
# it. The file name is not assumed: any *.gguf that appeared at the root during
# the export is moved, plus the usual default name if it is present.
stray_gguf = [
    name
    for name in _gguf_files(".")
    if name == gguf_at_root or name not in gguf_before_export
]

if stray_gguf:
    os.makedirs(out_dir, exist_ok=True)
    for name in stray_gguf:
        dest = os.path.join(out_dir, name)
        shutil.move(name, dest)
        print("File saved:", dest)
else:
    exported = _gguf_files(out_dir)
    if exported:
        for name in exported:
            print("File saved:", os.path.join(out_dir, name))
    else:
        # Nothing new at the root and nothing in the folder: list whatever
        # *.gguf files exist at the root to help locate the export by hand.
        print("Check current dir or", out_dir, "for *.gguf")
        for name in _gguf_files("."):
            print("  At root:", name)

## 10. Download the Model

In [None]:
# Zip and download the GGUF model
import os

# Show every entry of the export folder with its size in MB
gguf_dir = "hr-persona-bd-llama32-3b-gguf"
print("Exported files:")
for entry in os.listdir(gguf_dir):
    size_mb = os.path.getsize(f"{gguf_dir}/{entry}") / (1024*1024)
    print(f"  {entry}: {size_mb:.1f} MB")

# Create zip for download
# !zip -r hr-persona-bd-llama32-3b-gguf.zip hr-persona-bd-llama32-3b-gguf/

# print("\nDownload the model:")
# files.download("hr-persona-bd-llama32-3b-gguf.zip")

In [None]:
# Copy the exported artifacts to Google Drive.

# Mount Drive (run once)
from google.colab import drive
drive.mount('/content/drive')

# Create the fine-tuning directory if it doesn't exist
!mkdir -p /content/drive/MyDrive/fine-tuning

# Copy GGUF directory to Drive (the folder written by the GGUF export cell)
!cp -r hr-persona-bd-llama32-3b-gguf /content/drive/MyDrive/fine-tuning/

# Optional: LoRA directory too (the folder written by the LoRA save cell)
!cp -r hr-persona-bd-llama32-3b-lora /content/drive/MyDrive/fine-tuning/

In [None]:
# Also download LoRA adapters (smaller, for further training)
!zip -r hr-persona-bd-llama32-3b-lora.zip hr-persona-bd-llama32-3b-lora/

print("Download LoRA adapters:")
files.download("hr-persona-bd-llama32-3b-lora.zip")

## 11. Using with Ollama

After downloading the GGUF model, follow these steps to use it with Ollama:

### Step 1: Install Ollama
```bash
curl -fsSL https://ollama.com/install.sh | sh
```

### Step 2: Create Modelfile
Create a file named `Modelfile` inside the GGUF folder. Use the actual GGUF filename (the export cell moves it to `llama-3.2-3b-instruct.Q4_K_M.gguf` inside the folder):

```
FROM ./llama-3.2-3b-instruct.Q4_K_M.gguf

TEMPLATE """{{- if .System }}<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{{ .System }}<|eot_id|>{{- end }}<|start_header_id|>user<|end_header_id|>

{{ .Prompt }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{{ .Response }}<|eot_id|>"""

SYSTEM """You are an expert HR consultant specializing in Bangladesh Labour Law. You have comprehensive knowledge of the Bangladesh Labour Act 2006 and its amendments. Provide accurate, professional advice to HR practitioners."""

PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER stop "<|eot_id|>"
```

### Step 3: Create Ollama Model
```bash
cd hr-persona-bd-llama32-3b-gguf
ollama create hr-persona-bd-llama -f Modelfile
```

### Step 4: Run the Model
```bash
ollama run hr-persona-bd-llama
```

### Step 5: Use via API
```bash
curl http://localhost:11434/api/chat -d '{
  "model": "hr-persona-bd-llama",
  "messages": [
    {"role": "user", "content": "What is the notice period for termination?"}
  ]
}'
```

## 12. (Optional) Upload to Hugging Face Hub

In [None]:
# ============================================
# UPLOAD TO HUGGING FACE HUB
# ============================================
#
# STEP 1: Get your Hugging Face token from:
#         https://huggingface.co/settings/tokens
#         (Create a new token with "Write" permission)
#
# STEP 2: Replace "YOUR_USERNAME" with your HuggingFace username.
#         Provide the token through the HF_TOKEN environment variable
#         instead of pasting it into the notebook, so it is not saved or
#         shared with the file, e.g. in a scratch cell:
#             import os, getpass
#             os.environ["HF_TOKEN"] = getpass.getpass("HF token: ")
#
# STEP 3: Uncomment and run the code below
# ============================================
import os

# --- Configuration (EDIT THESE) ---
HF_USERNAME = "YOUR_USERNAME"  # e.g., "john-doe"
# Read from the environment; the placeholder is kept only as a fallback so
# the variable is always defined.
HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_TOKEN")
# ----------------------------------

# Uncomment below to upload:

# from huggingface_hub import login
# login(token=HF_TOKEN)
# print(f"Logged in as: {HF_USERNAME}")

# # Upload LoRA adapters (small, ~100MB)
# print("\n1. Uploading LoRA adapters...")
# model.push_to_hub(f"{HF_USERNAME}/hr-persona-bd-llama32-3b-lora")
# tokenizer.push_to_hub(f"{HF_USERNAME}/hr-persona-bd-llama32-3b-lora")
# print(f"   ✓ LoRA: https://huggingface.co/{HF_USERNAME}/hr-persona-bd-llama32-3b-lora")

# # Upload GGUF model (for Ollama, ~2GB)
# print("\n2. Uploading GGUF model...")
# model.push_to_hub_gguf(
#     f"{HF_USERNAME}/hr-persona-bd-llama32-3b-gguf",
#     tokenizer,
#     quantization_method="q4_k_m",
# )
# print(f"   ✓ GGUF: https://huggingface.co/{HF_USERNAME}/hr-persona-bd-llama32-3b-gguf")

# print("\n✓ All uploads complete!")

print("To upload to Hugging Face:")
print("1. Get token from: https://huggingface.co/settings/tokens")
print("2. Edit HF_USERNAME above and set the HF_TOKEN environment variable")
print("3. Uncomment the upload code and run this cell")

In [None]:
# ============================================
# UPLOAD TRAINING DATASET TO HUGGING FACE
# ============================================
# This uploads your training dataset so others can use it
# or you can reference it in future training runs.
# NOTE: the code below pushes `train_dataset`, i.e. only the training split;
# the validation split held out earlier is not included.
# It also needs HF_USERNAME from the previous cell and an authenticated
# Hugging Face session.
# ============================================

# Uncomment to upload dataset:

# from datasets import Dataset
# from huggingface_hub import login

# # Login (skip if already logged in above)
# # login(token=HF_TOKEN)

# # Upload the dataset used for training
# print("Uploading training dataset...")

# # The dataset is already in memory from training
# train_dataset.push_to_hub(
#     f"{HF_USERNAME}/hr-persona-bd-labour-act-dataset",
#     private=False,  # Set to True for private dataset
#     commit_message="Upload Bangladesh Labour Act QA dataset"
# )

# print(f"✓ Dataset: https://huggingface.co/datasets/{HF_USERNAME}/hr-persona-bd-labour-act-dataset")

# The only statement that runs while the block above stays commented out
print("To upload dataset: uncomment the code above (after setting HF_USERNAME)")