# Pipeline de Fine-Tuning de LLMs com QLoRA

## MLOps com Google Colab + DagsHub/MLflow

- Google Colab T4 GPU (Gratuito)
- Qwen 2.5 (1.5B) - Open Source
- QLoRA + bitsandbytes (4-bit)
- DagsHub + MLflow (Free Tier)

## 1. Instalação de Dependências

In [None]:
# Notebook shell escape ("!"): runs pip to install the listed packages into the
# current runtime; -q keeps pip's output quiet.
# NOTE(review): no versions are pinned, so each run installs whatever pip
# resolves at that moment - confirm the later cells against those versions.
!pip install -q torch transformers datasets accelerate peft bitsandbytes trl mlflow dagshub huggingface_hub sentencepiece protobuf
print("[OK] Dependencias instaladas!")

In [None]:
import torch

# Report whether a CUDA device is visible and, if so, its name and total memory.
cuda_ready = torch.cuda.is_available()
print(f"CUDA disponivel: {cuda_ready}")
if cuda_ready:
    device_props = torch.cuda.get_device_properties(0)
    memory_gib = device_props.total_memory / 1024**3
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memoria: {memory_gib:.2f} GB")

In [None]:
import os, json, warnings
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from trl import SFTTrainer
from datasets import Dataset
import mlflow, dagshub
# Install a blanket "ignore" filter: every Python warning is suppressed from here
# on (presumably to keep the cell output short).
# NOTE(review): this also hides deprecation warnings raised by the ML libraries.
warnings.filterwarnings("ignore")
print("[OK] Imports realizados!")

## 2. Configuração de Credenciais

In [None]:
# DagsHub credentials for this session.
#
# Edit the literals passed to _resolve_credential below, OR leave them as the
# placeholders and provide the values through environment variables of the same
# name. A literal that was edited always wins; a literal still equal to its
# placeholder falls back to the environment, so real credentials that are
# already exported are no longer overwritten by "seu_token" and friends.
# Avoid committing a real token in this cell.
_PLACEHOLDERS = {
    "DAGSHUB_USERNAME": "seu_username",
    "DAGSHUB_REPO": "seu_repositorio",
    "DAGSHUB_TOKEN": "seu_token",
}


def _resolve_credential(name, literal):
    """Return *literal*, or the environment value of *name* if *literal* is still the placeholder."""
    if literal != _PLACEHOLDERS[name]:
        return literal
    return os.environ.get(name, literal)


DAGSHUB_USERNAME = _resolve_credential("DAGSHUB_USERNAME", "seu_username")
DAGSHUB_REPO = _resolve_credential("DAGSHUB_REPO", "seu_repositorio")
DAGSHUB_TOKEN = _resolve_credential("DAGSHUB_TOKEN", "seu_token")

# Export username and token so code that reads them from the environment sees
# the same values as the Python constants above.
os.environ["DAGSHUB_USERNAME"] = DAGSHUB_USERNAME
os.environ["DAGSHUB_TOKEN"] = DAGSHUB_TOKEN
print("[OK] Credenciais configuradas!")

## 3. Configurações do Modelo

In [None]:
# Base checkpoint to fine-tune and the directory that receives all outputs.
MODEL_NAME = "Qwen/Qwen2.5-1.5B"
OUTPUT_DIR = "./outputs"

# Quantization settings: 4-bit loading with the "nf4" quantization type,
# double quantization enabled, float16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)
print(f"[OK] Modelo: {MODEL_NAME}")

In [None]:
print("[INFO] Carregando modelo...")
# The Qwen2.5 architecture ships with transformers itself, so neither the
# tokenizer nor the model needs trust_remote_code=True. Leaving the flag off
# means no Python code from the Hub repository is executed at load time.
# NOTE: if MODEL_NAME is changed to a checkpoint that does rely on custom code,
# loading will stop with an explicit error asking for the flag.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # Padding is required by the tokenization step; reuse EOS when none is defined.
    tokenizer.pad_token = tokenizer.eos_token

# Load the weights quantized according to bnb_config; device_map="auto" lets the
# library decide where to place them.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
# Prepare the quantized model for k-bit training before adapters are attached.
model = prepare_model_for_kbit_training(model)
print(f"[OK] Modelo carregado! Memoria: {model.get_memory_footprint() / 1024**3:.2f} GB")

## 4. Dataset de Exemplo

In [None]:
# Toy instruction-tuning corpus, kept as (instruction, response) pairs and
# expanded into the dict records that Dataset.from_list receives.
_QA_PAIRS = (
    ("O que e Machine Learning?", "Machine Learning e uma area da IA que permite computadores aprenderem padroes."),
    ("O que e Fine-Tuning?", "Fine-Tuning e ajustar um modelo pre-treinado para uma tarefa especifica."),
    ("O que e LoRA?", "LoRA e uma tecnica eficiente de fine-tuning que adiciona pequenas matrizes treinaveis."),
    ("O que e Quantizacao?", "Quantizacao reduz a precisao dos pesos do modelo para economizar memoria."),
    ("O que e MLOps?", "MLOps combina ML com praticas DevOps para tornar ML reproduzivel e escalavel."),
)
SAMPLE_DATA = [
    {"instruction": question, "response": answer}
    for question, answer in _QA_PAIRS
]
dataset = Dataset.from_list(SAMPLE_DATA)
print(f"[OK] Dataset com {len(dataset)} amostras")

In [None]:
def formatar_prompt(ex):
    """Render one example as the training prompt text.

    Args:
        ex: Mapping with "instruction" and "response" entries.

    Returns:
        A string with an "### Instrucao:" section holding the instruction,
        a blank line, then an "### Resposta:" section holding the response.
    """
    instruction_section = f"### Instrucao:\n{ex['instruction']}"
    response_section = f"### Resposta:\n{ex['response']}"
    return "\n\n".join((instruction_section, response_section))

def tokenize(examples):
    """Tokenize a batch of instruction/response examples for causal-LM training.

    Args:
        examples: Batch mapping with parallel "instruction" and "response"
            lists, as passed in by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (truncated/padded to 512 tokens) with an added
        "labels" entry: the input ids, with every padding position replaced
        by -100.
    """
    # Terminate each sample with EOS so training includes an end-of-answer
    # token; fall back to nothing if the tokenizer defines no EOS.
    eos = tokenizer.eos_token or ""
    texts = [
        formatar_prompt({"instruction": i, "response": r}) + eos
        for i, r in zip(examples["instruction"], examples["response"])
    ]
    tok = tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    # -100 is the label value the loss ignores. Masking by attention_mask (not
    # by token id) keeps the real EOS even when pad_token == eos_token, while
    # preventing the long padding tail from dominating the loss.
    tok["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(tok["input_ids"], tok["attention_mask"])
    ]
    return tok

# Tokenize the whole dataset in batches, dropping the original text columns so
# only the tokenizer's fields remain.
text_columns = dataset.column_names
train_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=text_columns,
)
print("[OK] Dataset tokenizado!")

## 5. Configuração LoRA

In [None]:
# Modules that receive LoRA adapters.
adapted_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=adapted_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)

# Collect (size, requires_grad) once, then report trainable vs. total parameters.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(size for size, _ in param_sizes)
trainable = sum(size for size, needs_grad in param_sizes if needs_grad)
print(f"[OK] LoRA aplicado! Treinaveis: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

## 6. Treinamento

In [None]:
# Cell-local import: the notebook's import cell only brings in TrainingArguments.
from transformers import Trainer, default_data_collator

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Optimisation settings. Effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 16 per device.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_steps=100,
    fp16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
)

# train_dataset is already tokenized and carries its own "labels", so none of
# SFTTrainer's text-formatting features are used. The plain Trainer is used
# instead because SFTTrainer's `tokenizer=` and `max_seq_length=` keyword
# arguments are not accepted by recent TRL releases, and TRL is installed
# unpinned. Truncation to 512 tokens already happened during tokenization.
# default_data_collator only stacks the fixed-length features into tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=default_data_collator,
)
print("[OK] Trainer configurado!")

In [None]:
import time

# Run the fine-tuning loop and report wall-clock duration and final train loss.
print("[INFO] Iniciando treinamento...")
start = time.time()
result = trainer.train()
elapsed_minutes = (time.time() - start) / 60
print(f"[OK] Treinamento concluido em {elapsed_minutes:.2f} minutos")
final_loss = result.metrics["train_loss"]
print(f"Loss final: {final_loss:.4f}")

## 7. Salvamento

In [None]:
# Write the model and then the tokenizer into the same folder under OUTPUT_DIR.
model_path = os.path.join(OUTPUT_DIR, "fine-tuned-model")
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)
print(f"[OK] Modelo salvo em: {model_path}")

## 8. Teste de Inferência

In [None]:
def gerar(instrucao):
    """Generate the model's answer to a single instruction.

    The instruction is wrapped in the same "### Instrucao / ### Resposta"
    template used for training, and up to 256 new tokens are sampled.

    Args:
        instrucao: Instruction text to place in the prompt.

    Returns:
        The decoded continuation produced after the prompt, stripped of
        surrounding whitespace.
    """
    prompt = f"### Instrucao:\n{instrucao}\n\n### Resposta:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate in eval mode so the LoRA dropout configured for training is not
    # applied; the previous mode is restored afterwards so calling this
    # between training runs has no lasting effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=256,
                # temperature/top_p only take effect when sampling is enabled.
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,
            )
    finally:
        if was_training:
            model.train()

    # Decode only the tokens after the prompt. Splitting the full text on the
    # last "### Resposta:" would drop the actual answer whenever the model
    # repeats that marker in its output.
    new_tokens = out[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Quick manual check: ask one question that is not in the training samples.
pergunta_teste = "O que e Deep Learning?"
print(f"Teste: {pergunta_teste}")
resposta_teste = gerar(pergunta_teste)
print(f"Resposta: {resposta_teste}")

## Concluído!

Pipeline de Fine-Tuning MLOps executado com sucesso.