In [None]:
# CREATE THE VIRTUAL ENVIRONMENT
# NOTE(review): these look like terminal commands (the cell has no execution
# count) — confirm they are meant to be run in a shell rather than in the kernel.

conda create -n PEFT_PromptTuning python=3.11
conda activate PEFT_PromptTuning

In [None]:
# INSTALL THE DEPENDENCIES

# PyTorch packages from the cu121 wheel index.
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Notebook tooling plus the pinned transformers / datasets / peft / trl versions.
pip install tqdm ipykernel ipywidgets transformers==4.44 datasets==2.19.1 peft==0.12.0 trl==0.9.4
# Disabled install line, kept for reference:
# pip install accelerate==0.30.1 tokenizers==0.19.1 bitsandbytes==0.43.1

# Register this environment as a Jupyter kernel named "PEFT_PromptTuning".
python -m ipykernel install --user --name=PEFT_PromptTuning --display-name="PEFT_PromptTuning"

In [None]:
# DATASET CHECK: print how many examples each value of the "area" column has

import pandas as pd

df = pd.read_json("dataset_classificacao_juridica.jsonl", lines=True)
area_counts = df["area"].value_counts()
print(area_counts)

In [1]:
# Works around a progress-bar bug in Jupyter.
# These variables are set before the Hugging Face libraries are imported.
import os

os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "TQDM_DISABLE": "1",
        "HF_HUB_ENABLE_HF_TRANSFER": "0",
    }
)

In [2]:
# IMPORTA AS BIBLIOTECAS

import re, math, torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from peft import (
    PromptTuningConfig,
    PromptTuningInit,
    TaskType,
    get_peft_model
)

In [3]:
# CONFIGURATION

# Base model, dataset file and output directory
MODEL_NAME = "Qwen/Qwen2.5-0.5B"
DATASET_FILE = "dataset_classificacao_juridica.jsonl"
OUTPUT_DIR = "./modelo_juridico_prompt_tuning"

# Dataset columns holding the input text and the target label
TEXT_COL, LABEL_COL = "texto", "area"

# Prompt-tuning / training hyperparameters
NUM_VTOK = 20  # number of virtual tokens (tune between 10 and 50)
MAX_LEN_IN = 768  # max prompt length
MAX_LEN_Y = 16  # max label length (labels are short)
BATCH_SIZE = 8
EPOCHS = 5
LR = 5e-3  # larger LR, typical for prompt tuning

In [4]:
# LOAD DATASET
full_dataset = load_dataset(
    "json",
    data_files={"train": DATASET_FILE},
    split="train",
)

# 80/10/10 split: hold out 20%, then halve the held-out part into
# validation and test. Both shuffles use a fixed seed.
train_test_split = full_dataset.train_test_split(
    test_size=0.2, shuffle=True, seed=42
)
test_validation_split = train_test_split["test"].train_test_split(
    test_size=0.5, shuffle=True, seed=42
)

final_datasets = DatasetDict(
    train=train_test_split["train"],
    validation=test_validation_split["train"],
    test=test_validation_split["test"],
)

print("Dataset carregado e dividido:", final_datasets, sep="\n")

Dataset carregado e dividido:
DatasetDict({
    train: Dataset({
        features: ['texto', 'area'],
        num_rows: 817
    })
    validation: Dataset({
        features: ['texto', 'area'],
        num_rows: 102
    })
    test: Dataset({
        features: ['texto', 'area'],
        num_rows: 103
    })
})


In [5]:
# TOKENIZER AND MODEL
# `trust_remote_code` is deliberately left at its default (False): the
# checkpoint uses an architecture shipped with transformers, so there is no
# need to allow code from the Hub repository to run locally.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Padding needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# No torch_dtype is passed, so the library's default dtype is used.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    use_safetensors=True,
)
# Keep the model config in sync with the tokenizer's padding id.
base_model.config.pad_token_id = tokenizer.pad_token_id

In [6]:
# CONFIGURE PROMPT TUNING

# The virtual tokens are initialised from the instruction sentence below.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    tokenizer_name_or_path=MODEL_NAME,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Classifique a área do Direito do texto a seguir.",
    num_virtual_tokens=NUM_VTOK,
)

model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()


# Prompt template. The loss is applied ONLY to the label tokens that follow it.
INSTR = (
    "Classifique a área do Direito do texto a seguir.\n"
    "Texto: {texto}\n"
    "Resposta: "
)

def _norm(s: str) -> str:
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s

def _encode_prompt(texto: str) -> list:
    """Tokenize the filled-in INSTR template within MAX_LEN_IN tokens.

    A prompt that fits is tokenized as a single string. When it is too long,
    only the text portion is shortened, so the trailing "Resposta: " cue of
    the template is never cut off (plain tail truncation would remove it).
    """
    prompt_ids = tokenizer(INSTR.format(texto=texto), add_special_tokens=True)["input_ids"]
    if len(prompt_ids) <= MAX_LEN_IN:
        return prompt_ids

    # Over-long prompt: encode the fixed template parts on their own and give
    # the text whatever token budget is left between them.
    head_txt, tail_txt = INSTR.split("{texto}", 1)
    head_ids = tokenizer(head_txt, add_special_tokens=True)["input_ids"]
    tail_ids = tokenizer(tail_txt, add_special_tokens=False)["input_ids"]
    budget = max(MAX_LEN_IN - len(head_ids) - len(tail_ids), 0)
    text_ids = tokenizer(texto, add_special_tokens=False)["input_ids"][:budget]
    return head_ids + text_ids + tail_ids


def build_example(texto: str, label: str):
    """Build one causal-LM training example from a text and its label.

    Args:
        texto: Raw input text; whitespace is normalised with ``_norm``.
        label: Target label; whitespace is normalised with ``_norm`` and the
            tokenized label is truncated to ``MAX_LEN_Y`` tokens.

    Returns:
        A dict with ``input_ids`` (prompt + label + EOS), ``attention_mask``
        (all ones) and ``labels`` (-100 over the prompt so the loss is computed
        only on the label tokens and the EOS).
    """
    texto = _norm(texto)
    label = _norm(label)

    prompt_ids = _encode_prompt(texto)
    label_ids = tokenizer(
        label, truncation=True, max_length=MAX_LEN_Y, add_special_tokens=False
    )["input_ids"]

    # The model must learn to stop after the label, so EOS is part of the target.
    target_ids = label_ids + [tokenizer.eos_token_id]
    input_ids = prompt_ids + target_ids

    # Prompt and label are each length-bounded above, so no extra safety
    # truncation of the concatenation is needed.
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": [-100] * len(prompt_ids) + target_ids,
    }

def preprocess(batch):
    """Convert a batch of columns into tokenized training fields.

    Each (text, label) pair from the ``TEXT_COL`` / ``LABEL_COL`` columns is
    passed to ``build_example``; the per-example dicts are then regrouped
    into one list per field.
    """
    encoded = [
        build_example(text, area)
        for text, area in zip(batch[TEXT_COL], batch[LABEL_COL])
    ]
    return {
        field: [row[field] for row in encoded]
        for field in ("input_ids", "attention_mask", "labels")
    }

# Keep the original text/label columns next to the tokenized fields; any
# other column present in the dataset is dropped during the map.
cols_to_keep = [TEXT_COL, LABEL_COL]
cols_to_remove = list(
    filter(
        lambda name: name not in cols_to_keep,
        final_datasets["train"].column_names,
    )
)

# Tokenize every split with the same preprocessing function.
tokenized = {}
for split in ("train", "validation", "test"):
    split_data = final_datasets[split]
    tokenized[split] = split_data.map(
        preprocess,
        batched=True,
        remove_columns=cols_to_remove,
    )

# ============================
# DATA COLLATOR (padding)
# ============================
def data_collator(features):
    """Right-pad a list of examples to a common length and stack them.

    Each field is padded to the longest sequence of that field in the batch:
    ``input_ids`` with the tokenizer's pad id, ``attention_mask`` with 0 and
    ``labels`` with -100. Returns a dict of ``torch.long`` tensors.
    """
    fill_values = {
        "input_ids": tokenizer.pad_token_id,
        "attention_mask": 0,
        "labels": -100,
    }

    batch = {}
    for field, fill in fill_values.items():
        width = max(len(item[field]) for item in features)
        rows = [
            item[field] + [fill] * (width - len(item[field]))
            for item in features
        ]
        batch[field] = torch.tensor(rows, dtype=torch.long)
    return batch

trainable params: 17,920 || all params: 494,050,688 || trainable%: 0.0036


Map:   0%|          | 0/817 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

In [None]:
# RUN THE TRAINING

# bf16 mixed precision needs hardware support, not merely a CUDA device, so
# check both before enabling it.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.0,
    eval_strategy="epoch",   # current name of the former "evaluation_strategy" argument
    save_strategy="epoch",
    load_best_model_at_end=False,
    overwrite_output_dir=True,
    report_to="none",
    lr_scheduler_type="cosine",
    warmup_steps=100,
    bf16=use_bf16,
    fp16=False,                    # enable instead of bf16 if you prefer and your GPU supports it
    logging_steps=50,
    gradient_checkpointing=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()
eval_res = trainer.evaluate()
print("Resultados de validação (apenas loss):", eval_res)

# Optional perplexity derived from the validation loss
if "eval_loss" in eval_res:
    try:
        ppl = math.exp(eval_res["eval_loss"])
        print({"perplexity": ppl})
    except OverflowError:
        # The loss is too large for exp(); skip the perplexity report.
        pass

# ============================
# SAVE ONLY THE ADAPTER
# ============================
print(f"Salvando adaptador de Prompt Tuning em: {OUTPUT_DIR}")
model.save_pretrained(f"{OUTPUT_DIR}/prompt_adapter")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/tokenizer")


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,0.3687,0.233197
2,0.2368,0.190247
3,0.1781,0.120602


In [14]:
def classificar(texto: str, max_new_tokens=8):
    """Predict the label for ``texto`` with the prompt-tuned model.

    The prompt is built exactly as in training: the text is normalised with
    ``_norm`` and inserted into the shared ``INSTR`` template. If the prompt
    exceeds ``MAX_LEN_IN`` tokens, only the text is shortened so the trailing
    "Resposta: " cue is kept.

    Args:
        texto: Text to classify.
        max_new_tokens: Maximum number of tokens to generate for the label.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    texto = _norm(texto)
    enc = tokenizer(INSTR.format(texto=texto), return_tensors="pt")

    if enc["input_ids"].shape[1] > MAX_LEN_IN:
        # Tail truncation would cut off the "Resposta: " cue, so rebuild the
        # prompt from its fixed parts and give the text the remaining budget.
        head_txt, tail_txt = INSTR.split("{texto}", 1)
        head_ids = tokenizer(head_txt, add_special_tokens=True)["input_ids"]
        tail_ids = tokenizer(tail_txt, add_special_tokens=False)["input_ids"]
        budget = max(MAX_LEN_IN - len(head_ids) - len(tail_ids), 0)
        text_ids = tokenizer(texto, add_special_tokens=False)["input_ids"][:budget]
        ids = torch.tensor([head_ids + text_ids + tail_ids], dtype=torch.long)
        enc = {"input_ids": ids, "attention_mask": torch.ones_like(ids)}

    # Make sure the tensors live on the same device as the model.
    try:
        dev = next(model.parameters()).device
    except StopIteration:
        dev = None  # model exposes no parameters; leave the tensors where they are
    inputs = {k: (v.to(dev) if dev is not None else v) for k, v in enc.items()}
    prompt_len = inputs["input_ids"].shape[1]

    model.eval()

    # Disable the KV cache to work around the PEFT + DynamicCache bug
    # (set both on the generation config and in the generate call).
    model.generation_config.use_cache = False

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=False  # <-- key to avoiding the KeyError
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "Resposta:" goes wrong when the input text itself contains it.
    new_tokens = out[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Reference record (texto / area) — presumably taken from the dataset:
#   texto: "A progressão de regime prisional foi concedida ao apenado por preencher os requisitos legais."
#   area:  "Direito Penal"
exemplo = "Teoria que fala da árvore envenenada"
predicao = classificar(exemplo)
print("Predição (exemplo):", predicao)

Predição (exemplo): Direito Civil
