# Pipeline de Fine-Tuning de LLMs com QLoRA

## MLOps com Google Colab + DagsHub/MLflow

| Componente | Tecnologia | Custo |
|------------|------------|-------|
| **Compute** | Google Colab T4 GPU (16GB VRAM) | Gratuito |
| **Modelo** | LLaMA 3.2 (3B Instruct) | Open Source |
| **Dataset** | Guanaco 1K | Open Source |
| **Otimização** | QLoRA + bitsandbytes (4-bit) | Open Source |
| **Tracking** | DagsHub + MLflow | Free Tier |

> **IMPORTANTE**: o LLaMA 3 requer autenticação no Hugging Face:
> 1. Criar uma conta em huggingface.co
> 2. Aceitar os termos em https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct
> 3. Gerar um token de acesso

## 1. Instalação de Dependências

In [None]:
!pip install -q torch transformers>=4.40.0 datasets>=2.14.0 accelerate>=0.24.0 peft>=0.6.0 bitsandbytes>=0.41.0 trl>=0.7.0 mlflow>=2.8.0 dagshub>=0.3.0 huggingface_hub>=0.19.0 sentencepiece protobuf
print("[OK] Dependencias instaladas!")

## 2. Verificação de GPU e Imports

In [None]:
import torch
import os
import warnings
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from trl import SFTTrainer
from datasets import Dataset, load_dataset
import mlflow
import dagshub
from huggingface_hub import login

# Process-wide settings: set TOKENIZERS_PARALLELISM to "false" and suppress
# all Python warnings for the rest of the session.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")

# Stop immediately when no CUDA device is visible.
if not torch.cuda.is_available():
    raise RuntimeError("GPU nao disponivel! Va em Runtime -> Change runtime type -> GPU")

# Report the name of device 0 and its total memory (bytes divided by 1024**3).
gpu_name = torch.cuda.get_device_name(0)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f"[OK] GPU: {gpu_name} ({gpu_memory:.1f} GB)")

## 3. Autenticação Hugging Face (Obrigatório para LLaMA 3)

**Passos:**
1. Acesse https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct
2. Aceite os termos de uso
3. Vá em Settings -> Access Tokens -> New Token
4. Cole o token abaixo

In [None]:
# Option 1: paste your token directly (less secure)
HF_TOKEN = "seu_token_aqui"

# Option 2: use Colab secrets (more secure)
# from google.colab import userdata
# HF_TOKEN = userdata.get("HF_TOKEN")

# Fail fast with an actionable message when the token is empty or still the
# placeholder, instead of sending a known-bad token to the Hub.
if not HF_TOKEN or HF_TOKEN == "seu_token_aqui":
    raise ValueError(
        "HF_TOKEN nao configurado! Substitua 'seu_token_aqui' pelo seu token "
        "ou use os secrets do Colab."
    )

login(token=HF_TOKEN)
print("[OK] Autenticado no Hugging Face!")

## 4. Configurações do Pipeline

In [None]:
# Hub identifiers of the base model and of the training dataset.
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
DATASET_NAME = "mlabonne/guanaco-llama2-1k"

# Directory that receives checkpoints and the final saved model.
OUTPUT_DIR = "./outputs"

# 4-bit quantization settings (keys match BitsAndBytesConfig keyword names).
QUANTIZATION_CONFIG = {
    "load_in_4bit": True,
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
}

# LoRA adapter settings.
LORA_CONFIG = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": (
        ["q_proj", "k_proj", "v_proj", "o_proj"]  # q/k/v/o projection modules
        + ["gate_proj", "up_proj", "down_proj"]  # gate/up/down projection modules
    ),
}

# Training hyperparameters.
TRAINING_CONFIG = {
    "num_train_epochs": 1,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-4,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine",
    "max_seq_length": 1024,
    "logging_steps": 10,
    "save_steps": 50,
}

# Echo the two main choices so the cell output records what will be used.
print(f"[OK] Modelo: {MODEL_NAME}", f"[OK] Dataset: {DATASET_NAME}", sep="\n")

## 5. MLflow com DagsHub (Opcional)

In [None]:
# DagsHub account, repository and access token (placeholders to be replaced).
DAGSHUB_USERNAME = "seu_username"
DAGSHUB_REPO = "seu_repositorio"
DAGSHUB_TOKEN = "seu_token"


def setup_mlflow():
    """Configure MLflow to track into the DagsHub repository named above.

    Initializes DagsHub for the repository with MLflow enabled, sets the
    MLflow tracking URI to the repository's ``.mlflow`` endpoint, exports the
    username/token as MLflow tracking credentials, and selects the
    ``llama3-fine-tuning`` experiment.
    """
    dagshub.init(repo_owner=DAGSHUB_USERNAME, repo_name=DAGSHUB_REPO, mlflow=True)

    tracking_uri = f"https://dagshub.com/{DAGSHUB_USERNAME}/{DAGSHUB_REPO}.mlflow"
    mlflow.set_tracking_uri(tracking_uri)

    # Credentials are handed to MLflow through its environment variables.
    os.environ.update(
        {
            "MLFLOW_TRACKING_USERNAME": DAGSHUB_USERNAME,
            "MLFLOW_TRACKING_PASSWORD": DAGSHUB_TOKEN,
        }
    )

    mlflow.set_experiment("llama3-fine-tuning")
    print(f"[OK] MLflow: {tracking_uri}")


# Uncomment to enable
# setup_mlflow()

## 6. Carregamento do Modelo (4-bit)

In [None]:
# Banner announcing which checkpoint is about to be loaded.
print("=" * 60, f"Carregando modelo: {MODEL_NAME}", "=" * 60, sep="\n")

# QUANTIZATION_CONFIG holds exactly the four keyword arguments passed here.
bnb_config = BitsAndBytesConfig(**QUANTIZATION_CONFIG)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the quantized weights, letting device_map="auto" decide placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)
model = prepare_model_for_kbit_training(model)

# Memory footprint reported by the model, converted from bytes (divide by 1024**3).
memory_gb = model.get_memory_footprint() / 1024**3
print(f"[OK] Modelo carregado! Memoria: {memory_gb:.2f} GB")

## 7. Carregamento do Dataset

In [None]:
import re

print(f"Carregando dataset: {DATASET_NAME}")
dataset = load_dataset(DATASET_NAME, split="train")
print(f"[OK] {len(dataset)} amostras")

# One Llama-2 style turn: "[INST] user [/INST] assistant", where the assistant
# part runs until the next "[INST]" or the end of the sample.
_LLAMA2_TURN = re.compile(r"\[INST\](.*?)\[/INST\](.*?)(?=\[INST\]|\Z)", re.DOTALL)


def _to_model_chat_format(example):
    """Re-render one Llama-2 formatted sample with the loaded tokenizer's chat template.

    The dataset text uses the Llama-2 prompt markup (``<s>[INST] ... [/INST] ... </s>``),
    while inference in this notebook builds prompts with
    ``tokenizer.apply_chat_template``. Training on the original markup would
    teach the model a format it is never prompted with, so each sample is
    parsed into user/assistant messages and rendered with the same template.

    Args:
        example: Dataset row with a ``"text"`` field.

    Returns:
        A dict with the re-rendered ``"text"``, or ``example`` unchanged when
        no Llama-2 turn is found in it.
    """
    messages = []
    for user_text, assistant_text in _LLAMA2_TURN.findall(example["text"]):
        user_text = user_text.strip()
        # The lazy match may carry the "</s><s>" separators of the next turn.
        assistant_text = assistant_text.replace("<s>", "").replace("</s>", "").strip()
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})

    if not messages:
        return example

    rendered = tokenizer.apply_chat_template(messages, tokenize=False)
    # NOTE(review): the trainer is expected to add BOS again when it tokenizes
    # the text, so the template's leading BOS is dropped to avoid a duplicate —
    # confirm against the installed trl version.
    bos = tokenizer.bos_token
    if bos and rendered.startswith(bos):
        rendered = rendered[len(bos):]
    return {"text": rendered}


# Only convert when the tokenizer actually ships a chat template; otherwise the
# dataset is used as loaded.
if getattr(tokenizer, "chat_template", None):
    dataset = dataset.map(_to_model_chat_format)
    print("[OK] Dataset convertido para o chat template do modelo")

print(f"[EXEMPLO]: {dataset[0]['text'][:300]}...")

## 8. Aplicação do LoRA

In [None]:
# Build the adapter config from LORA_CONFIG, supplying task_type as the
# TaskType enum member instead of the string stored in the dict.
lora_config = LoraConfig(**{**LORA_CONFIG, "task_type": TaskType.CAUSAL_LM})

# Wrap the loaded model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Count all parameters, then only those that will receive gradients.
total = sum(param.numel() for param in model.parameters())
trainable = sum(
    param.numel()
    for param in filter(lambda param: param.requires_grad, model.parameters())
)
print(f"[OK] LoRA: {trainable:,} treinaveis ({100*trainable/total:.2f}%)")

## 9. Treinamento

In [None]:
# Report metrics to MLflow only when a remote tracking server is configured
# (for example after running setup_mlflow()). With the default local tracking
# URI this stays "none", which is what the cell used unconditionally before.
report_to = (
    "mlflow"
    if mlflow.get_tracking_uri().startswith(("http://", "https://"))
    else "none"
)

# Hyperparameters come from TRAINING_CONFIG; the remaining values are fixed here.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=TRAINING_CONFIG["num_train_epochs"],
    per_device_train_batch_size=TRAINING_CONFIG["per_device_train_batch_size"],
    gradient_accumulation_steps=TRAINING_CONFIG["gradient_accumulation_steps"],
    learning_rate=TRAINING_CONFIG["learning_rate"],
    warmup_ratio=TRAINING_CONFIG["warmup_ratio"],
    lr_scheduler_type=TRAINING_CONFIG["lr_scheduler_type"],
    logging_steps=TRAINING_CONFIG["logging_steps"],
    save_steps=TRAINING_CONFIG["save_steps"],
    save_total_limit=2,
    fp16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    report_to=report_to,
)

# NOTE(review): passing tokenizer / dataset_text_field / max_seq_length straight
# to SFTTrainer is the older TRL calling convention; newer TRL releases appear
# to expect these on SFTConfig instead — confirm against the installed trl.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=TRAINING_CONFIG["max_seq_length"],
    packing=False,
)

print("[OK] Trainer pronto!")

In [None]:
# Run the fine-tuning loop; the two messages mark its start and its completion.
print("INICIANDO TREINAMENTO...")
trainer.train()
print("[OK] Treinamento concluido!")

## 10. Salvamento

In [None]:
# Destination folder for the fine-tuned artifacts, inside OUTPUT_DIR.
model_path = OUTPUT_DIR + "/llama3-finetuned"

# Save the trainer's model first, then the tokenizer, into the same folder.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

print(f"[OK] Salvo em: {model_path}")

## 11. Teste de Inferência

In [None]:
def gerar_resposta(prompt, max_tokens=256):
    """Generate a sampled reply to a single user prompt.

    The prompt is wrapped as one user message, rendered with the tokenizer's
    chat template (including the generation prompt), and passed to
    ``model.generate`` with sampling enabled (temperature 0.7, top-p 0.9).

    Args:
        prompt: User message text.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off), with special tokens removed.
    """
    messages = [{"role": "user", "content": prompt}]
    # return_dict=True yields both input_ids and attention_mask, so generate()
    # does not have to guess the mask (the pad token id equals EOS here).
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # After trainer.train() the model is still in training mode, which keeps
    # dropout active; switch to eval for generation and restore the previous
    # mode afterwards so further training is unaffected.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    finally:
        if was_training:
            model.train()

    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Smoke test: ask two fixed questions and print each question with its answer.
print("TESTE:")
test_questions = ("O que e Machine Learning?", "Explique fine-tuning em LLMs.")
for question in test_questions:
    print(f"\nQ: {question}")
    answer = gerar_resposta(question)
    print(f"A: {answer}")