# T5rus с промптом + LoRA

Будем напрямую генерировать JSON по описанию сцены. Для этого дообучим T5 (Text-To-Text Transfer Transformer) с помощью LoRA.

In [None]:
import json
import torch
import os 
import sys
import warnings
import numpy as np


from pathlib import Path
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from transformers import TrainerCallback

from peft import LoraConfig, get_peft_model, TaskType, PeftConfig,PeftModel
from tqdm import tqdm

import matplotlib.pyplot as plt

# Silence every FutureWarning so that they do not clutter the notebook output.
warnings.filterwarnings("ignore", category=FutureWarning)


# Alternative dataset directory, kept for quick switching:
#DATA_DIR = Path("../dataset/dataset_syntetic_v4").expanduser()
# Directory that is scanned for the *.jsonl training files.
DATA_DIR = Path("../dataset/dataset_tmp").expanduser()
# Checkpoint name passed to ``from_pretrained`` for both tokenizer and model.
MODEL_NAME = "sberbank-ai/ruT5-base"

# Put the parent of the current working directory on sys.path so that the
# ``library`` imports below can be resolved (presumably the package lives
# there; verify when running from another directory).
lib_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(lib_path)

from library.metrics import evaluate_scene_extraction, evaluate_global_f1_on_pairs
from library.safe_compute_metrics import safe_compute_metrics

In [None]:
import transformers
import datasets
import huggingface_hub
import torch

# Print the versions of the key libraries (one per line, same order as the
# imports) so that a run can be reproduced later.
for _module in (transformers, datasets, huggingface_hub, torch):
    print(_module.__version__)

In [None]:
# Prompt template wrapped around every scene description. It is filled in
# with ``PROMPT.format(description=...)``, which is why the literal braces of
# the JSON example are doubled ("{{" / "}}"): str.format turns them back into
# single braces.
PROMPT = """Описание сцены: {description}
Инструкция: выдели все объекты и их признаки.
Формат ответа: только JSON следующего вида:
{{"объекты": {{"название объекта": ["атрибут1", "атрибут2", ...], ...}}}}

Важно:
- Не добавляй текст, пояснения или комментарии
- Не пропускай объекты без признаков (если есть)
"""

In [None]:
class CustomEvaluateCallback(TrainerCallback):
    """Log, persist and plot evaluation metrics after each Trainer evaluation.

    The callback never runs the model itself: it only reads the ``metrics``
    dict that the Trainer hands to ``on_evaluate``. Every evaluation appends
    one record to ``metrics_history``, rewrites ``metrics_history.json`` and
    redraws ``learning_curves.png`` inside ``save_path``.
    """

    # Metric names (as produced by ``compute_metrics``) that are tracked.
    TRACKED_METRICS = (
        "f1_object",
        "f1_attribute",
        "f1_combined_weighted",
        "f1_combined_simple",
        "valid_json_rate",
    )

    def __init__(self, save_path="./metrics/"):
        """Create the output directory and start with an empty history.

        Args:
            save_path: Directory for ``metrics_history.json`` and
                ``learning_curves.png``; created (with parents) if missing.
        """
        self.save_path = Path(save_path)
        self.metrics_history = []

        self.save_path.mkdir(exist_ok=True, parents=True)

    @staticmethod
    def _metric_value(metrics, name):
        """Return metric *name* from *metrics*, or 0.0 when it is absent.

        The Trainer prefixes the keys returned by ``compute_metrics`` with
        ``eval_`` before it calls the callbacks, so the prefixed key is tried
        first and the bare key is kept as a fallback.
        """
        prefixed = f"eval_{name}"
        if prefixed in metrics:
            return metrics[prefixed]
        return metrics.get(name, 0.0)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record, print, save and plot the metrics of one evaluation run.

        Args:
            args: Training arguments (unused).
            state: Trainer state; only ``state.epoch`` is read.
            control: Trainer control object (unused).
            metrics: Metrics dict of the finished evaluation; nothing happens
                when it is ``None``.
            **kwargs: Remaining callback arguments (unused).
        """
        if metrics is None:
            return

        # ``state.epoch`` is None when evaluation runs before training has
        # started; treat that as epoch 0 so formatting and plotting still work.
        epoch = state.epoch if state.epoch is not None else 0.0

        metrics_to_log = {"epoch": epoch}
        for name in self.TRACKED_METRICS:
            metrics_to_log[name] = self._metric_value(metrics, name)

        self.metrics_history.append(metrics_to_log)

        # Console log.
        print(f"\n Custom evaluation at epoch {epoch:.2f}: {metrics_to_log}")

        # Persist the whole history (the file is rewritten on every call).
        with open(self.save_path / "metrics_history.json", "w", encoding="utf-8") as f:
            json.dump(self.metrics_history, f, indent=2, ensure_ascii=False)

        # Redraw the learning curves.
        self.plot_metrics()

    def plot_metrics(self):
        """Draw every tracked metric against the epoch and save it as a PNG."""
        if not self.metrics_history:
            return

        epochs = [record["epoch"] for record in self.metrics_history]
        fig, ax = plt.subplots(figsize=(12, 7))
        try:
            for key in self.TRACKED_METRICS:
                values = [record[key] for record in self.metrics_history]
                # Markers keep a single-evaluation history visible on the plot.
                ax.plot(epochs, values, marker="o", label=key)

            ax.set_xlabel("Эпоха")
            ax.set_ylabel("Метрика")
            ax.set_title("Кривые метрик по эпохам")
            ax.legend()
            ax.grid(True)
            fig.savefig(self.save_path / "learning_curves.png")
        finally:
            # Always release the figure, even when saving fails.
            plt.close(fig)

In [None]:
def postprocess_text(preds, labels):
    """Strip leading and trailing whitespace from every prediction and label.

    Returns a ``(preds, labels)`` tuple of new lists; the inputs are not
    modified.
    """
    def _strip_all(texts):
        return [text.strip() for text in texts]

    return _strip_all(preds), _strip_all(labels)

def safe_parse_json(text):
    """Parse *text* as JSON; return ``None`` instead of raising when it is malformed."""
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    return parsed

def compute_metrics(eval_pred):
    """Decode predictions and labels and average the scene-extraction scores.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be a
            tuple (its first element is used), a tensor or an array holding
            either per-token logits or already selected token ids.

    Returns:
        Dict with the four averaged F1 scores, ``valid_json_rate``,
        ``total_samples`` and ``valid_samples``. A sample whose prediction or
        label does not parse to a JSON object scores 0 on every metric.
    """
    predictions, labels = eval_pred

    # When several outputs are passed as a tuple, the first one is used.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits carry a trailing vocabulary axis; ids that were already reduced
    # (2-D input) must not be argmax-ed a second time.
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    # -100 is the ignore index for labels and cannot be decoded. Map it to the
    # pad id, and blank the same positions in the predictions so that tokens
    # the loss never looked at do not end up in the decoded text.
    ignored = labels == -100
    if ignored.any():
        pad_id = tokenizer.pad_token_id
        labels = np.where(ignored, pad_id, labels)
        if predictions.shape == labels.shape:
            predictions = np.where(ignored, pad_id, predictions)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Debug aid: shows the raw decoded predictions of the evaluation set.
    print("preds:", decoded_preds)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    metric_names = (
        "f1_object",
        "f1_attribute",
        "f1_combined_weighted",
        "f1_combined_simple",
    )
    score_sums = dict.fromkeys(metric_names, 0.0)
    valid = 0

    for pred_text, label_text in zip(decoded_preds, decoded_labels):
        pred_json = safe_parse_json(pred_text)
        label_json = safe_parse_json(label_text)

        # Unparseable text contributes 0 to every metric. So does valid JSON
        # that is not an object (e.g. a bare number or list): the targets are
        # always objects, and the scene metric presumably expects a dict.
        if not isinstance(pred_json, dict) or not isinstance(label_json, dict):
            continue

        valid += 1
        scores = evaluate_scene_extraction(label_json, pred_json)
        for name in metric_names:
            score_sums[name] += scores[name]

    total = len(decoded_preds)
    # Avoid ZeroDivisionError on an empty evaluation set; all rates become 0.
    denominator = total or 1

    return {
        "f1_object": round(score_sums["f1_object"] / denominator, 4),
        "f1_attribute": round(score_sums["f1_attribute"] / denominator, 4),
        "f1_combined_weighted": round(score_sums["f1_combined_weighted"] / denominator, 4),
        "f1_combined_simple": round(score_sums["f1_combined_simple"] / denominator, 4),
        "valid_json_rate": round(valid / denominator, 4),
        "total_samples": total,
        "valid_samples": valid,
    }

In [None]:
# === 1. Load all JSONL batches ===
def make_target(scene_objects):
    """Serialize a list of ``{name: attributes}`` dicts into the target JSON string.

    All dicts are merged into one mapping stored under the key ``"объекты"``;
    when a name occurs more than once, the last occurrence wins. Non-ASCII
    characters are written as is.
    """
    merged = {
        name: attrs
        for scene_object in scene_objects
        for name, attrs in scene_object.items()
    }
    payload = {"объекты": merged}
    return json.dumps(payload, ensure_ascii=False)

# Read every *.jsonl file in DATA_DIR (sorted by path) and turn each record
# into a prompt / target-JSON pair.
data = []
for path in sorted(DATA_DIR.glob("*.jsonl")):
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of a file) are
            # not records and would make json.loads fail.
            if not line.strip():
                continue
            item = json.loads(line)
            description = item["description"]
            target = make_target(item["scene"]["objects"])
            data.append({
                "input": PROMPT.format(description=description),
                "target": target,
            })

# Fail early with a clear message instead of an obscure error from the split.
if not data:
    raise FileNotFoundError(f"No training records found in {DATA_DIR} (*.jsonl)")

# === 2. Convert to HuggingFace Dataset ===
dataset = Dataset.from_list(data)
# 95% train / 5% validation, with a fixed seed for a reproducible split.
dataset = dataset.train_test_split(test_size=0.05, seed=42)
train_ds, val_ds = dataset["train"], dataset["test"]

# === 3. Tokenizer ===
# NOTE(review): T5-style SentencePiece vocabularies may lack "{" and "}" —
# TODO confirm that this tokenizer round-trips the target JSON unchanged.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)


def preprocess(example):
    """Tokenize one example: the prompt becomes the model inputs, the target JSON becomes ``labels``.

    Both sides are padded/truncated to a fixed length: 512 tokens for the
    input and 256 tokens for the target.
    """
    # NOTE(review): padded positions stay in ``labels`` as pad ids rather than
    # -100, so the loss also covers padding. The argmax-based compute_metrics
    # presumably relies on that — confirm before changing it.
    encoded = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    target_ids = tokenizer(example["target"], max_length=256, truncation=True, padding="max_length")["input_ids"]
    encoded["labels"] = target_ids
    return encoded


# Tokenize both splits one example at a time (train first, then validation).
train_ds, val_ds = (split.map(preprocess, batched=False) for split in (train_ds, val_ds))

# Show one raw (untokenized) training pair as a sanity check.
first_example = dataset["train"][0]
print(first_example["input"])
print(first_example["target"])

# === 4. Load base model and apply LoRA ===
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,  # rank of the low-rank update matrices
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # Only modules named "q" and "v" get adapters.
    # Note from the original author: ruT5 may need different module names
    # here (or simply "SelfAttention").
    # Alternative that covers more projections: ["q", "k", "v", "o"]
    target_modules=["q", "v"],
)

# Load the base checkpoint and wrap it with the LoRA adapters in one step.
model = get_peft_model(
    T5ForConditionalGeneration.from_pretrained(MODEL_NAME),
    lora_config,
)

# === 5. TrainingArguments ===
# NOTE(review): newer transformers releases rename ``evaluation_strategy`` to
# ``eval_strategy`` and Trainer's ``tokenizer=`` to ``processing_class=`` —
# adjust if the installed version rejects the names used here.
training_args = TrainingArguments(
    output_dir="./rut5_lora_outputs",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=50,
    evaluation_strategy="epoch",
    eval_accumulation_steps=1,  # offload eval outputs every step (small GPU memory)
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
    # fp16 mixed precision is rejected on machines without CUDA, so enable it
    # only when a GPU is actually present.
    # NOTE(review): T5 checkpoints are reported to be numerically unstable in
    # fp16 — watch for NaN/inf losses and consider bf16 or fp32 if they appear.
    fp16=torch.cuda.is_available(),
)

# === 6. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        CustomEvaluateCallback(
            save_path="./metrics/"
        )
    ]
)


# === 7. Train ===
# To continue an interrupted run instead of starting over:
#trainer.train(resume_from_checkpoint=True)
trainer.train()
# Save the trained PEFT model into the same directory as the checkpoints.
model.save_pretrained("./rut5_lora_outputs")


### Проверка

In [None]:
MODEL_DIR = "./rut5_lora_outputs"  # directory with the fine-tuned model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# === 1. Load the model and the tokenizer ===
print("Loading model...")
# The saved PEFT config records which base checkpoint the adapter belongs to.
config = PeftConfig.from_pretrained(MODEL_DIR)
base_checkpoint = config.base_model_name_or_path

tokenizer = T5Tokenizer.from_pretrained(base_checkpoint)

base_model = T5ForConditionalGeneration.from_pretrained(base_checkpoint)
# Attach the saved adapter, move everything to DEVICE and switch to eval mode.
model = PeftModel.from_pretrained(base_model, MODEL_DIR).to(DEVICE)
model.eval()

# === 2. Генерация ===
def predict(description: str, max_length: int = 256, max_input_length: int = 512):
    """Generate the scene JSON for *description* with the fine-tuned model.

    Args:
        description: Free-text scene description inserted into ``PROMPT``.
        max_length: Maximum length passed to ``model.generate``.
        max_input_length: Maximum number of prompt tokens; longer prompts are
            truncated. The default matches the 512 tokens used in training.

    Returns:
        The parsed JSON value, or ``None`` when the generated text is not
        valid JSON (the error and the raw text are printed in that case).
    """
    prompt = PROMPT.format(description=description)
    # An explicit max_length makes truncation independent of whatever default
    # limit the tokenizer happens to carry.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
        padding=True,
    ).to(DEVICE)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    try:
        parsed_json = json.loads(output_text)
    except json.JSONDecodeError as e:
        # Only malformed JSON is expected here; anything else should surface.
        print(f"Ошибка парсинга JSON: {e}")
        print("Сырые данные:", output_text)
        parsed_json = None

    return parsed_json


# === 3. Пример использования ===
if __name__ == "__main__":
    # Ask for one scene description, run the model and pretty-print the
    # result (printed as "null" when the output could not be parsed).
    result = predict(input("Введите описание сцены: "))
    print("\nПредсказание:\n")
    pretty = json.dumps(result, indent=2, ensure_ascii=False)
    print(pretty)


In [None]:
def print_trainable_parameters(model):
    """Print the total and trainable parameter counts (in millions) and the trainable share in percent.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    """
    # One pass over the parameters: (element count, trainable flag) pairs.
    sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
    total_params = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, is_trainable in sizes if is_trainable)

    print(f"Всего параметров: {total_params / 1e6:.2f}M")
    print(f"Обучаемых параметров: {trainable_params / 1e6:.2f}M")
    print(f"Доля обучаемых параметров: {100 * trainable_params / total_params:.2f}%")

# Report the parameter counts once the model exists.
# By this point ``model`` is normally already a PEFT model (wrapped before
# training, or loaded with PeftModel.from_pretrained). Calling get_peft_model
# on it again would modify an already adapted model — presumably re-creating
# the adapter and discarding the loaded weights; TODO confirm — so the
# wrapper is only applied to a bare base model.
# NOTE(review): for an adapter loaded for inference the trainable share is
# presumably 0 because its weights are frozen — verify.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)
print_trainable_parameters(model)