# Fine-tune Llama 3.2 1B for Sayed Abdul Karim's Portfolio Chatbot

This notebook will fine-tune Llama 3.2 1B on your resume data using Google Colab's free GPU.

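**Requirements:**
- Google Colab with GPU (free tier works)
- Training data (`training_data.jsonl`): one JSON object per line with `instruction` and `output` fields
- A Hugging Face account with access to the `meta-llama/Llama-3.2-1B-Instruct` model
- ~2-3 hours for training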

## 1. Setup Environment

In [None]:
# Check GPU availability
# Runs the NVIDIA system management tool in a shell. If the command is not
# found or reports no device, the runtime has no GPU attached and the
# quantized model loading and training cells below will not work as written.
!nvidia-smi

In [None]:
# Install required packages
!pip install -q -U transformers datasets accelerate peft bitsandbytes
!pip install -q -U trl tensorboard
!pip install -q flash-attn --no-build-isolation

In [None]:
# For faster training with Unsloth (optional but recommended)
# NOTE(review): none of the cells shown in this notebook import unsloth, so
# this cell appears to be skippable unless the model-loading code is switched
# to Unsloth - confirm before relying on it.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps installs only the listed packages and leaves their dependencies
# as they are, so pip does not replace versions installed by the cell above.
!pip install --no-deps xformers trl peft accelerate bitsandbytes

## 2. Import Libraries

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_dataset
import json

## 3. Upload Training Data

In [None]:
# Upload your training_data.jsonl file
from google.colab import files

# Opens a browser file picker; the result maps each uploaded file name to its
# contents, and the files are also written to the working directory.
uploaded = files.upload()

# Fail fast when the expected file was not uploaded under the expected name.
# The dataset cell reads 'training_data.jsonl' by name, and Colab renames a
# re-uploaded duplicate (e.g. 'training_data (1).jsonl'), which would otherwise
# surface later as a missing file or as stale data being trained on.
expected_upload = "training_data.jsonl"
if expected_upload not in uploaded:
    raise FileNotFoundError(
        f"Expected an upload named {expected_upload!r}, got {sorted(uploaded)}. "
        "Rename the file (or delete the old copy) and run this cell again."
    )

# Or mount Google Drive if you have the file there
# from google.colab import drive
# drive.mount('/content/drive')

## 4. Load and Prepare Dataset

In [None]:
# Load dataset
dataset = load_dataset('json', data_files='training_data.jsonl', split='train')

# Validate the schema up front: format_instruction() reads exactly these two
# fields, and a missing column would otherwise only show up as a KeyError from
# inside the trainer.
required_columns = {"instruction", "output"}
missing_columns = required_columns - set(dataset.column_names)
if missing_columns:
    raise ValueError(
        f"training_data.jsonl is missing required fields: {sorted(missing_columns)}; "
        f"found columns: {dataset.column_names}"
    )
if len(dataset) == 0:
    raise ValueError("training_data.jsonl contains no examples")

# Show sample
print(f"Dataset size: {len(dataset)}")
print(f"Sample: {dataset[0]}")

In [None]:
# Format dataset for training
def format_instruction(sample):
    """Render one training example as an instruction/response prompt.

    Args:
        sample: Mapping that provides 'instruction' and 'output' entries.

    Returns:
        A string made of an "### Instruction:" header followed by the
        instruction, a blank line, and a "### Response:" header followed by
        the output.
    """
    instruction_part = "### Instruction:\n" + f"{sample['instruction']}"
    response_part = "### Response:\n" + f"{sample['output']}"
    # The two sections are separated by exactly one empty line.
    return instruction_part + "\n\n" + response_part

## 5. Load Llama 3.2 1B Model

In [None]:
# Model configuration
# Hub id of the base checkpoint to fine-tune (you need HF access for this).
# Alternative: "unsloth/Llama-3.2-1B-Instruct" if using Unsloth
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# BitsAndBytes config for 4-bit quantization: NF4 quantization type with
# double quantization enabled, and float16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [None]:
# Load model and tokenizer
# trust_remote_code is deliberately left at its default: the flag allows Python
# code shipped inside the Hub repository to run locally, and it is not needed
# for Llama, which transformers implements natively.
# NOTE(review): the meta-llama checkpoint is access-restricted, so this cell
# presumably needs the runtime to be authenticated with Hugging Face before it
# runs (the only login() call in this notebook is in the last section) - confirm.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # 4-bit settings from the configuration cell
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="right",
)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

## 6. Setup LoRA Configuration

In [None]:
# LoRA configuration for efficient fine-tuning
# Names of the linear layers that receive LoRA adapters, grouped for
# readability; the concatenated order is q, v, k, o, gate, up, down.
attention_projections = ["q_proj", "v_proj", "k_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,  # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

# Prepare model for training
# The name `model` is rebound to the prepared model returned by the helper.
model = prepare_model_for_kbit_training(model)

## 7. Setup Training Arguments

In [None]:
# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./abdul-llama-3.2-1b",  # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # 4 * 2 = 8 samples per optimizer step per device
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    logging_steps=10,
    save_strategy="epoch",  # one checkpoint at the end of every epoch
    learning_rate=2e-4,
    warmup_steps=10,
    fp16=True,
    max_grad_norm=0.3,
    group_by_length=True,
    # The plain "constant" schedule does not apply any warmup, so warmup_steps
    # above had no effect. "constant_with_warmup" ramps the learning rate up
    # over the warmup steps and then holds it constant.
    lr_scheduler_type="constant_with_warmup",
    report_to="tensorboard",
)

## 8. Initialize Trainer

In [None]:
# Gather the trainer inputs in one place, then construct the trainer from them.
# NOTE(review): `tokenizer=` and `max_seq_length=` are keyword arguments of
# older TRL releases - confirm they match the installed TRL version.
sft_settings = dict(
    model=model,
    args=training_args,
    train_dataset=dataset,
    formatting_func=format_instruction,  # turns each sample into prompt text
    peft_config=peft_config,
    tokenizer=tokenizer,
    max_seq_length=512,
    packing=False,
)
trainer = SFTTrainer(**sft_settings)

## 9. Start Training

In [None]:
# Start training
print("Starting training...")
trainer.train()

# Save the fine-tuned model
# The directory name is kept in a variable so it is stated once in this cell.
final_model_dir = "abdul-llama-3.2-1b-final"
trainer.save_model(final_model_dir)
print("Training completed!")

## 10. Test the Model

In [None]:
# Test the fine-tuned model
def generate_response(instruction, max_new_tokens=150):
    """Generate the model's reply to a single instruction.

    The prompt uses the same "### Instruction: / ### Response:" layout as the
    training data. Relies on the notebook-level `model` and `tokenizer`.

    Args:
        instruction: The question or instruction text to put in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation, with the prompt removed and surrounding
        whitespace stripped.
    """
    prompt = f"""### Instruction:
{instruction}

### Response:"""

    # Training leaves the model in train mode; switch to eval so that dropout
    # (including the LoRA dropout) is disabled while generating.
    model.eval()

    # Place the inputs on the model's device instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "### Response:" would drop part of the reply whenever the model
    # happens to emit that marker again.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Test questions
test_questions = [
    "Who are you?",
    "What's your current role?",
    "Tell me about Synth AI",
    "What technologies do you use?",
]

# Print each question with the model's answer, followed by a separator line.
separator = "-" * 50
for question in test_questions:
    answer = generate_response(question)
    print(f"Q: {question}")
    print(f"A: {answer}")
    print(separator)

## 11. Save and Export Model

In [None]:
# Merge LoRA weights with base model
# The notebook-level `model` is the quantized base model that was handed to
# SFTTrainer, not a PeftModel, so it has no merge_and_unload() method. Instead,
# reload the base checkpoint in half precision and attach the adapter that
# trainer.save_model() wrote to "abdul-llama-3.2-1b-final"; this also keeps the
# exported weights unquantized.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
peft_model = PeftModel.from_pretrained(base_model, "abdul-llama-3.2-1b-final")
merged_model = peft_model.merge_and_unload()

# Save merged model
merged_model.save_pretrained("abdul-llama-merged")
tokenizer.save_pretrained("abdul-llama-merged")

In [None]:
# Convert to GGUF format for deployment (optional)
!pip install llama-cpp-python
!python -m llama_cpp.convert_hf_to_gguf abdul-llama-merged --outfile abdul-llama.gguf

In [None]:
# Download the model
from google.colab import files

# Send the converted file from the Colab runtime to the browser.
gguf_artifact = 'abdul-llama.gguf'
files.download(gguf_artifact)

## 12. Push to Hugging Face (Optional)

In [None]:
# Login to Hugging Face
from huggingface_hub import login

login()

# Push model to Hub
# Weights first, then the tokenizer, both to the same repository.
hub_repo_id = "sayedabdulkarim/abdul-llama-3.2-1b"
for artifact in (merged_model, tokenizer):
    artifact.push_to_hub(hub_repo_id)