# T5rus с промптом + LoRA

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ls -la /content/drive/MyDrive/VKR/


total 84
drwx------ 2 root root  4096 Apr 27 05:50 dataset
drwx------ 2 root root  4096 Apr 29 11:38 Flan_T5_lora_outputs
-rw------- 1 root root 31714 Apr 29 12:08 learning_curves_FlanT5.png
-rw------- 1 root root 29687 Apr 29 14:23 learning_curves_T5ru.png
drwx------ 2 root root  4096 Apr 27 05:53 library
drwx------ 2 root root  4096 Apr 29 11:38 metrics_FlanT5
drwx------ 2 root root  4096 Apr 29 09:32 metrics_T5ru
drwx------ 2 root root  4096 Apr 29 09:32 T5ru_lora_outputs


In [3]:
!pip install -U spacy > /dev/null 2>&1
!python -m spacy download ru_core_news_sm > /dev/null 2>&1
!pip install wandb > /dev/null 2>&1
!pip install datasets > /dev/null 2>&1

In [4]:
import transformers
import datasets
import huggingface_hub
import torch
import wandb

print(transformers.__version__)
print(datasets.__version__)
print(huggingface_hub.__version__)
print(torch.__version__)

4.51.3
3.5.1
0.30.2
2.6.0+cu124


Будем напрямую генерировать json по сцене, для этого дообучим T5(Text-To-Text Transfer Transformer) + LoRA

In [5]:
import json
import torch
import os
import sys
import warnings
import numpy as np

from pathlib import Path
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from transformers import TrainerCallback

from peft import LoraConfig, get_peft_model, TaskType, PeftConfig,PeftModel
from tqdm import tqdm

import matplotlib.pyplot as plt

# отключаем их все чтобы картинку не портили
warnings.filterwarnings("ignore", category=FutureWarning)


DATA_DIR = Path("/content/drive/MyDrive/VKR/dataset/dataset_tmp").expanduser()
MODEL_NAME = "sberbank-ai/ruT5-base"

lib_path = os.path.abspath(os.path.join(os.getcwd(), '/content/drive/MyDrive/VKR/'))
sys.path.append(lib_path)

from library.metrics import evaluate_scene_extraction, evaluate_global_f1_on_pairs
from library.safe_compute_metrics import safe_compute_metrics

### Промпт с "few shorts" примерами

In [6]:
# промпт очень большой, поэтому нужно чтобы все влезало
PROMPT = """
Ты должен проанализировать описание сцены и вернуть ответ в виде JSON.

Твоя задача:
- Выдели объекты, упомянутые в описании, и их признаки.
- Ответ верни строго в формате JSON, без пояснений и комментариев.

Формат JSON ответа:
{{
  "объекты": {{
    "название объекта": ["атрибут1", "атрибут2", ...]
  }}
}}

Требования:
- Все названия объектов и атрибуты должны быть в двойных кавычках "".
- Если у объекта нет признаков используй пустой список [].
- Не добавляй новые объекты или признаки, которых нет в описании.
- Если невозможно определить признаки объекта, включи его с пустым списком [].
- Структура должна быть корректным валидным JSON.

Пример:

Описание: Маленький красный стол стоит у окна.
Ответ:
{{
  "объекты": {{
    "стол": ["маленький", "красный"],
    "окно": []
  }}
}}

Описание: {description}

Ответ:
"""
print(len(PROMPT))

826


In [7]:
class CustomEvaluateCallback(TrainerCallback):
    """
    Кастомный колбэк для логирования и визуализации метрик после каждой эпохи.
    НЕ пересчитывает предсказания, а берет метрики прямо из Trainer.
    """

    def __init__(self, save_path="/content/drive/MyDrive/VKR/metrics_T5ru/"):
        self.save_path = Path(save_path)
        self.metrics_history = []

        self.save_path.mkdir(exist_ok=True, parents=True)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """
        Срабатывает после валидации в Trainer.
        """
        if metrics is None:
            return

        metrics_to_log = {
            "epoch": state.epoch,
            "f1_object": metrics.get("f1_object", 0.0),
            "f1_attribute": metrics.get("f1_attribute", 0.0),
            "f1_combined_weighted": metrics.get("f1_combined_weighted", 0.0),
            "f1_combined_simple": metrics.get("f1_combined_simple", 0.0),
            "valid_json_rate": metrics.get("valid_json_rate", 0.0),
        }

        self.metrics_history.append(metrics_to_log)

        # Логируем в консоль
        print(f"\n Custom evaluation at epoch {state.epoch:.2f}: {metrics_to_log}")

        # Сохраняем историю метрик
        with open(self.save_path / "metrics_history.json", "w", encoding="utf-8") as f:
            json.dump(self.metrics_history, f, indent=2, ensure_ascii=False)

        # Строим графики
        self.plot_metrics()

    def plot_metrics(self):
        if not self.metrics_history:
            return

        epochs = [m["epoch"] for m in self.metrics_history]
        plt.figure(figsize=(12, 7))

        for key in ["f1_object", "f1_attribute", "f1_combined_weighted", "f1_combined_simple", "valid_json_rate"]:
            plt.plot(epochs, [m[key] for m in self.metrics_history], label=key)

        plt.xlabel("Эпоха")
        plt.ylabel("Метрика")
        plt.title("Кривые метрик по эпохам")
        plt.legend()
        plt.grid(True)
        plt.savefig(self.save_path / "/content/drive/MyDrive/VKR/learning_curves_T5ru.png")
        plt.close()

In [8]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    return preds, labels

def safe_parse_json(text):
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return None

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Если predictions — logits, нужно брать argmax

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()

    # logits -> ids
    predictions = np.argmax(predictions, axis=-1)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # посмотрим, что генерит
    print("preds:", decoded_preds[:5])
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    f1_object_list = []
    f1_attribute_list = []
    f1_combined_weighted_list = []
    f1_combined_simple_list = []
    valid = 0

    for pred_text, label_text in zip(decoded_preds, decoded_labels):
        pred_json = safe_parse_json(pred_text)
        label_json = safe_parse_json(label_text)

        if pred_json is None or label_json is None:
            # Невалидный JSON → 0 по всем метрикам
            f1_object_list.append(0.0)
            f1_attribute_list.append(0.0)
            f1_combined_weighted_list.append(0.0)
            f1_combined_simple_list.append(0.0)
        else:
            valid += 1
            scores = evaluate_scene_extraction(label_json, pred_json)
            f1_object_list.append(scores["f1_object"])
            f1_attribute_list.append(scores["f1_attribute"])
            f1_combined_weighted_list.append(scores["f1_combined_weighted"])
            f1_combined_simple_list.append(scores["f1_combined_simple"])

    total = len(decoded_preds)

    result = {
        "f1_object": round(sum(f1_object_list) / total, 4),
        "f1_attribute": round(sum(f1_attribute_list) / total, 4),
        "f1_combined_weighted": round(sum(f1_combined_weighted_list) / total, 4),
        "f1_combined_simple": round(sum(f1_combined_simple_list) / total, 4),
        "valid_json_rate": round(valid / total, 4),
        "total_samples": total,
        "valid_samples": valid,
    }

    return result

### Параметры модели и обучения

инъекции будем делать во все слои связанные с вниманием - это должно сделать модель гибче

In [9]:
lora_rank = 8
lora_alpha = 16
lora_target_modules=["q", "k", "v"]   # в какие слои делаем инъекции
lora_dropout=0.1

per_device_train_batch_size = 8
num_train_epochs = 20

INPUT_SEQ_LENGTH = 1100
OUTPUT_SEQ_LENGTH = 512

In [10]:
run = wandb.init(
    entity="shiltsov-da",
    # Set the wandb project where this run will be logged.
    project="vkr-hse-object-detection",
    # Track hyperparameters and run metadata.
    group="T5LoRAtext2json",
    tags=["text2json", "lora", MODEL_NAME],
    config={
        "architecture": "T5ru-LoRA-text2json",
        "notebook":"T5ru-LoRA-text2json-v1-Colab.ipynb",
        "base_model": MODEL_NAME,
        "lora_rank": lora_rank,
        "lora_alpha": lora_alpha,
        "lora_target_modules": lora_target_modules,
        "per_device_train_batch_size": per_device_train_batch_size,
        "num_train_epochs": num_train_epochs
    },
)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshiltsov[0m ([33mshiltsov-da[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
# Загрузка бачей
def make_target(scene_objects):
    objects_dict = {}
    for obj in scene_objects:
        for name, attrs in obj.items():
            objects_dict[name] = attrs
    return json.dumps({"объекты": objects_dict}, ensure_ascii=False)

data = []
for path in sorted(DATA_DIR.glob("*.jsonl")):
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            item = json.loads(line)
            description = item["description"]
            target = make_target(item["scene"]["objects"])
            data.append({
                "input": PROMPT.format(description=description),
                "target": target
            })


# Делаем датасет
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.05, seed=42)
train_ds, val_ds = dataset["train"], dataset["test"]

#print(train_ds[0])
#print(val_ds[0])

In [12]:

# Токенизируем
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def preprocess(example):
    inputs = tokenizer(example["input"], padding="max_length", truncation=True, max_length=INPUT_SEQ_LENGTH)
    targets = tokenizer(example["target"], padding="max_length", truncation=True, max_length=OUTPUT_SEQ_LENGTH)
    inputs["labels"] = targets["input_ids"]
    return inputs

train_ds = train_ds.map(preprocess, batched=False)
val_ds = val_ds.map(preprocess, batched=False)

print(dataset["train"][0]["input"])
print(dataset["train"][0]["target"])

# Загружаем базовую модель и приделываем LoRA
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

lora_config = LoraConfig(
    r=lora_rank, # ранг низкоранговой матрицы
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    # target_modules=["q", "k", "v", "o"]
    lora_dropout=lora_dropout,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)

# Параметры обучения
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/VKR/T5ru_lora_outputs",
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=8,
    num_train_epochs=num_train_epochs,
    logging_dir="/content/drive/MyDrive/VKR/logs_T5ru",
    logging_steps=50,
    eval_strategy="epoch",
    eval_accumulation_steps=1, # для маленькой памяти GPU
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    #report_to="none",
    report_to="wandb",
    fp16=True
)

# Трейнер
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        CustomEvaluateCallback(
            save_path="/content/drive/MyDrive/VKR/metrics_T5ru/"
        )
    ]
)


# Обучение
#trainer.train(resume_from_checkpoint=True)
trainer.train()
model.save_pretrained("/content/drive/MyDrive/VKR/T5ru_lora_outputs")
run.finish()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/4801 [00:00<?, ? examples/s]

Map:   0%|          | 0/253 [00:00<?, ? examples/s]


Ты должен проанализировать описание сцены и вернуть ответ в виде JSON.

Твоя задача:
- Выдели объекты, упомянутые в описании, и их признаки.
- Ответ верни строго в формате JSON, без пояснений и комментариев.

Формат JSON ответа:
{
  "объекты": {
    "название объекта": ["атрибут1", "атрибут2", ...]
  }
}

Требования:
- Все названия объектов и атрибуты должны быть в двойных кавычках "".
- Если у объекта нет признаков используй пустой список [].
- Не добавляй новые объекты или признаки, которых нет в описании.
- Если невозможно определить признаки объекта, включи его с пустым списком [].
- Структура должна быть корректным валидным JSON.

Пример:

Описание: Маленький красный стол стоит у окна.
Ответ:
{
  "объекты": {
    "стол": ["маленький", "красный"],
    "окно": []
  }
}

Описание: Чемодан стоит рядом с цветным телевизором, возле которого лежит ключ.

Ответ: 

{"объекты": {"ключ": [], "телевизор": ["цветной"], "чемодан": []}}


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,F1 Object,F1 Attribute,F1 Combined Weighted,F1 Combined Simple,Valid Json Rate,Total Samples,Valid Samples
1,0.2288,0.124501,0.0,0.0,0.0,0.0,0.0,253,0
2,0.1219,0.071752,0.0,0.0,0.0,0.0,0.0,253,0
3,0.0958,0.053213,0.0,0.0,0.0,0.0,0.0,253,0
4,0.0844,0.047066,0.0,0.0,0.0,0.0,0.0,253,0
5,0.0783,0.042156,0.0,0.0,0.0,0.0,0.0,253,0
6,0.069,0.037862,0.0,0.0,0.0,0.0,0.0,253,0
7,0.0632,0.034407,0.0,0.0,0.0,0.0,0.0,253,0
8,0.0593,0.032584,0.0,0.0,0.0,0.0,0.0,253,0
9,0.058,0.029864,0.0,0.0,0.0,0.0,0.0,253,0
10,0.0533,0.028558,0.0,0.0,0.0,0.0,0.0,253,0


preds: ['"столы": ["столение": ["деревгкое"], "жеучень": ["деревлический" "женый" "жекий",], "жено": ["деревчныйчное", "деревкоеое",], "деревверь": ["', '"объекты": ["столтарь": ["столный" "каменивный"], "светна": ["светинная",], "света": [" "светмп": ["', '"объекты": ["столоду": ["металовый"], "металзак": ["металонепроницаемый"], "металт": ["металкладной" "металный"],', '"столы": ["столщик": ["столкрытый", "заий"], "столлета": ["диянная",], "столобка": ["столкрытая", "металинка",], "металжка": ["металстерница",],', '"объекты": ["столмейка": ["деревый"], "стар": ["деревый",], "холода": ["деревманлый",],']

 Custom evaluation at epoch 1.00: {'epoch': 1.0, 'f1_object': 0.0, 'f1_attribute': 0.0, 'f1_combined_weighted': 0.0, 'f1_combined_simple': 0.0, 'valid_json_rate': 0.0}
preds: ['"объекты": "столенье": ["прогкое",], "столучень": ["пролический", "женый", "прокий",], "столно": ["прочноечное", "прошое"], "столверь": ["', '"объекты": "столтарь": ["старный" "каменивный",], "столна": ["стари

0,1
eval/f1_attribute,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f1_combined_simple,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f1_combined_weighted,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f1_object,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/loss,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▁▂▁▃▇█▃▁▂▁▁▁▂▃▃▃▃▃▂
eval/samples_per_second,██▇█▆▂▁▆█▇███▇▆▆▆▆▆▇
eval/steps_per_second,██▅█▅▁▁▅█▅███▅▅▅▅▅▅█
eval/total_samples,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/valid_json_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/f1_attribute,0.0
eval/f1_combined_simple,0.0
eval/f1_combined_weighted,0.0
eval/f1_object,0.0
eval/loss,0.02124
eval/runtime,132.4767
eval/samples_per_second,1.91
eval/steps_per_second,0.242
eval/total_samples,253.0
eval/valid_json_rate,0.0


In [13]:
run.finish()

### Проверка

In [14]:
MODEL_DIR = "/content/drive/MyDrive/VKR/T5ru_lora_outputs"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Загрузка модели и токенизатора
print("Loading model...")
config = PeftConfig.from_pretrained(MODEL_DIR)
base_model = T5ForConditionalGeneration.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(base_model, MODEL_DIR)
model = model.to(DEVICE)
model.eval()

tokenizer = T5Tokenizer.from_pretrained(config.base_model_name_or_path)

# Генерация ===
def predict(description, max_length = OUTPUT_SEQ_LENGTH):
    prompt = PROMPT.format(description=description)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(DEVICE)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    try:
        parsed_json = json.loads(output_text)
    except Exception as e:
        print(f"Ошибка парсинга JSON: {e}")
        print("Сырые данные:", output_text)
        parsed_json = None

    return parsed_json


text = input("Введите описание сцены: ")
result = predict(text)
print("\nПредсказание:\n")
print(json.dumps(result, indent=2, ensure_ascii=False))


Loading model...
Введите описание сцены: На сторе стоит белый чайник


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Ошибка парсинга JSON: Extra data: line 1 column 10 (char 9)
Сырые данные: "объекты": "стол": [], "стул": ["белый"], "чайник": ["белый"]

Предсказание:

null


### Просмотр сколько параметров учили

In [15]:
def print_trainable_parameters(model):
    trainable_params = 0
    total_params = 0

    for param in model.parameters():
        total_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    print(f"Всего параметров: {total_params / 1e6:.2f}M")
    print(f"Обучаемых параметров: {trainable_params / 1e6:.2f}M")
    print(f"Доля обучаемых параметров: {100 * trainable_params / total_params:.2f}%")

# Вызов функции после создания модели

model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

Всего параметров: 224.23M
Обучаемых параметров: 1.33M
Доля обучаемых параметров: 0.59%


