# Training Description LLM

Обучение языковой модели для генерации описаний по выходу CV-моделей.

**Архитектура:**
```
CV Output (JSON) → LLM → Natural Language Description
```

**Вход LLM:** Структурированные данные от детектора и классификатора сцен
```json
{"detections": [{"label": "plastic", "confidence": 0.92}, ...], "scene": {"class": "grass", "confidence": 0.95}}
```

**Выход LLM:** Естественное описание
```
"There are 2 plastic bottles and 1 metal can scattered on the grassy area."
```


In [None]:
# Установка зависимостей
# !pip install transformers datasets accelerate peft

import json
import random
from pathlib import Path
from PIL import Image
import torch
from tqdm.notebook import tqdm
import numpy as np


## 1. Загрузка CV-моделей и генерация данных


In [None]:
from vlm_annotation.ensemble_detector import EnsembleDetector
from train_scene_yolo import SceneClassifierYOLO

# Garbage detector used by get_cv_output(); it is configured with a YOLO
# checkpoint and an RT-DETR model/processor pair loaded from the local
# "models/" tree.
# NOTE(review): conf_threshold presumably drops detections below 0.5 —
# confirm against EnsembleDetector.
detector = EnsembleDetector(
    yolo_model_path="models/yolo/yolov8x/best.pt",
    detr_model_path="models/rt-detr/rt-detr-101/m",
    detr_processor_path="models/rt-detr/rt-detr-101/p",
    conf_threshold=0.5
)

# The scene classifier is optional: it stays None unless its weights file is
# present, and downstream code checks for None before using it.
scene_classifier = None
if Path("models/scene_classifier_yolo.pt").exists():
    scene_classifier = SceneClassifierYOLO("models/scene_classifier_yolo.pt")
    print("Scene classifier loaded")


In [None]:
def get_cv_output(image_path):
    """Run the CV models on one image and return their structured output.

    Args:
        image_path: Path of the image file to analyse (anything accepted by
            ``PIL.Image.open``).

    Returns:
        A dict of the form
        ``{"detections": [{"label": ..., "confidence": ...}, ...],
        "scene": {"class": ..., "confidence": ...}}``.
        Confidences are rounded to two decimals. When no scene classifier is
        loaded, ``scene`` is ``{"class": "unknown", "confidence": 0.0}``.
    """
    # Image.open() is lazy and keeps the file handle open. convert() forces
    # the pixel data to be read and returns a separate image, so the handle
    # can be released immediately instead of waiting for garbage collection.
    with Image.open(image_path) as source:
        image = source.convert('RGB')

    # float() keeps the result JSON-serialisable in case the detector returns
    # NumPy/torch scalars rather than plain Python floats.
    detection_summary = [
        {
            "label": det["label"],
            "confidence": round(float(det["confidence"]), 2),
        }
        for det in detector.detect(image)
    ]

    scene = {"class": "unknown", "confidence": 0.0}
    if scene_classifier:
        scene_result = scene_classifier.predict(image)
        scene = {
            "class": scene_result["class"],
            "confidence": round(float(scene_result["confidence"]), 2),
        }

    return {
        "detections": detection_summary,
        "scene": scene,
    }


In [None]:
from collections import Counter

# Sentence skeletons; {verb}, {garbage_desc} and {scene_desc} are filled in by
# generate_description().
DESCRIPTION_TEMPLATES = [
    "There {verb} {garbage_desc}{scene_desc}.",
    "{garbage_desc} {verb} visible{scene_desc}.",
    "The image shows {garbage_desc}{scene_desc}.",
    "{garbage_desc} can be seen{scene_desc}.",
    "I can see {garbage_desc}{scene_desc}.",
]

# Alternative wordings per detector label; unknown labels fall back to the
# label text itself.
GARBAGE_PHRASES = {
    "plastic": ["plastic waste", "plastic items", "plastic debris", "plastic bottles"],
    "glass": ["glass bottles", "glass items", "broken glass", "glass waste"],
    "metal": ["metal cans", "metal waste", "metallic debris", "aluminum cans"],
    "paper": ["paper waste", "cardboard", "paper debris", "discarded paper"],
    "organic": ["organic waste", "food scraps", "biodegradable waste", "organic matter"],
}

# Alternative location suffixes per scene class (each starts with a space so
# it can be appended directly to the sentence).
SCENE_PHRASES = {
    "grass": [" on the grass", " in a grassy area", " on green grass", " scattered on the lawn"],
    "sandy": [" on sandy ground", " on the beach", " in sandy terrain", " on the sand"],
    "rocky": [" on rocky terrain", " among rocks", " on rocky ground", " between stones"],
    "marshy": [" in marshy area", " in wetlands", " in swampy terrain", " near marsh"],
}

# Word endings that look plural but are not ("glass", "debris", "cactus"), so
# the final "s" must be kept when describing a single item.
_NON_PLURAL_ENDINGS = ("ss", "is", "us")


def _singularize(phrase):
    """Drop one plural "s" from *phrase*, leaving mass nouns untouched."""
    if phrase.endswith("s") and not phrase.endswith(_NON_PLURAL_ENDINGS):
        return phrase[:-1]
    return phrase


def _scene_is_reliable(scene):
    """Return True when the scene is known and its confidence is >= 0.8."""
    return scene["class"] != "unknown" and scene["confidence"] >= 0.8


def generate_description(cv_output):
    """Build a randomised natural-language description from CV output.

    Args:
        cv_output: Dict with ``"detections"`` (list of dicts carrying a
            ``"label"``) and ``"scene"`` (dict with ``"class"`` and
            ``"confidence"``).

    Returns:
        One English sentence. Wording and template are picked with the
        ``random`` module, so repeated calls yield different variations.
    """
    detections = cv_output["detections"]
    scene = cv_output["scene"]

    if not detections:
        if _scene_is_reliable(scene):
            scene_phrase = random.choice(SCENE_PHRASES.get(scene["class"], [""]))
            return f"No garbage detected{scene_phrase}."
        return "No garbage detected in this image."

    # Counter keeps first-seen order, so labels appear in detection order.
    counts = Counter(det["label"] for det in detections)

    garbage_parts = []
    for label, count in counts.items():
        phrase = random.choice(GARBAGE_PHRASES.get(label, [label]))
        if count == 1:
            # rstrip('s') would strip *every* trailing "s" and turn
            # "broken glass" into "broken gla"; remove at most one.
            phrase = _singularize(phrase)
        garbage_parts.append(f"{count} {phrase}")

    if len(garbage_parts) <= 2:
        garbage_desc = " and ".join(garbage_parts)
    else:
        garbage_desc = ", ".join(garbage_parts[:-1]) + f", and {garbage_parts[-1]}"

    scene_desc = ""
    if _scene_is_reliable(scene):
        scene_desc = random.choice(SCENE_PHRASES.get(scene["class"], [""]))

    verb = "is" if sum(counts.values()) == 1 else "are"

    template = random.choice(DESCRIPTION_TEMPLATES)
    return template.format(verb=verb, garbage_desc=garbage_desc, scene_desc=scene_desc)


In [None]:
def create_training_dataset(image_dirs, output_file, max_images=None, variations_per_image=3):
    """Build the LLM training set from images and save it as JSON.

    For every ``*.jpg`` / ``*.png`` image found directly inside *image_dirs*,
    the CV output is computed once and paired with *variations_per_image*
    randomly worded descriptions.

    Args:
        image_dirs: Iterable of directory paths to scan (non-recursive).
            Missing directories are reported and skipped.
        output_file: Path of the JSON file to write; its parent directory is
            created if needed.
        max_images: If truthy, a random sample of at most this many images is
            used. ``None`` (or 0) means use every image found.
        variations_per_image: Number of descriptions generated per image.

    Returns:
        The list of examples written to *output_file*; each is a dict with
        the keys ``"input"`` (JSON string), ``"output"`` and ``"image"``.
    """
    # Create the destination up front: a missing directory would otherwise
    # only surface after every image has been processed, losing all the work.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    image_paths = []
    for img_dir in image_dirs:
        img_dir = Path(img_dir)
        if not img_dir.exists():
            print(f"Skipping missing directory: {img_dir}")
            continue
        # glob() order depends on the filesystem; sorting makes the sample
        # below reproducible when the random seed is fixed.
        for pattern in ("*.jpg", "*.png"):
            image_paths.extend(sorted(img_dir.glob(pattern)))

    if max_images:
        random.shuffle(image_paths)
        image_paths = image_paths[:max_images]

    print(f"Processing {len(image_paths)} images...")

    dataset = []
    for img_path in tqdm(image_paths):
        # Best effort per image: one unreadable file or model failure must
        # not abort the whole run, so report it and move on.
        try:
            cv_output = get_cv_output(str(img_path))
            serialized_input = json.dumps(cv_output)

            for _ in range(variations_per_image):
                dataset.append({
                    "input": serialized_input,
                    "output": generate_description(cv_output),
                    "image": img_path.name,
                })
        except Exception as e:
            print(f"Error processing {img_path}: {e}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)

    print(f"Created dataset with {len(dataset)} examples")
    return dataset


In [None]:
# Build the dataset from the train and valid image folders and store it as
# JSON for the fine-tuning step.
train_dataset = create_training_dataset(
    ["data/1206-data/train", "data/1206-data/valid"],
    "data/llm_train_data.json",
    max_images=2000,
    variations_per_image=3,
)

# Show the first three generated examples as a quick sanity check.
print("\nПримеры данных:")
preview = train_dataset[:3]
for i, sample in enumerate(preview):
    print(f"\n--- Sample {i+1} ---")
    print(f"Input: {sample['input'][:80]}...")
    print(f"Output: {sample['output']}")


## 2. Подготовка и обучение модели T5


In [None]:
from transformers import (
    T5ForConditionalGeneration, 
    T5Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset

# Base checkpoint to fine-tune and the directory the fine-tuned model,
# tokenizer and config are written to by later cells.
MODEL_NAME = "google/flan-t5-small"
OUTPUT_DIR = "models/description_llm"

# Load the pretrained tokenizer and seq2seq model; both are module-level
# globals used by the preprocessing, training and inference cells.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

print(f"Model: {MODEL_NAME}")
print(f"Parameters: {model.num_parameters():,}")


In [None]:
# Prompt wrapped around the JSON CV output; the same template is used for
# training and inference.
PROMPT_TEMPLATE = """Describe the garbage detected based on this CV output:
{input}
Description:"""

with open("data/llm_train_data.json", 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

random.shuffle(raw_data)


def _split_key(sample):
    """Return the key that must stay on one side of the train/val split."""
    # Every image contributes several samples with the same "input"; fall
    # back to the input text itself when no image name was recorded.
    return sample.get("image", sample["input"])


# Split by image rather than by sample. A per-sample split puts variations of
# the same image (identical input) in both sets, so validation would be
# measured on inputs the model has already been trained on.
group_keys = sorted({_split_key(sample) for sample in raw_data})
random.shuffle(group_keys)
split_idx = int(len(group_keys) * 0.9)
train_groups = set(group_keys[:split_idx])

# raw_data is already shuffled, so both lists keep a random sample order.
train_data = [sample for sample in raw_data if _split_key(sample) in train_groups]
val_data = [sample for sample in raw_data if _split_key(sample) not in train_groups]

print(f"Train: {len(train_data)}, Val: {len(val_data)}")


In [None]:
def preprocess_function(examples):
    """Tokenise a batch of examples for seq2seq fine-tuning.

    Args:
        examples: Batch dict (as passed by ``Dataset.map(batched=True)``)
            with the columns ``"input"`` (JSON CV output) and ``"output"``
            (target description).

    Returns:
        The tokenised prompts with an added ``"labels"`` entry holding the
        target token ids. Sequences are truncated (256 / 128 tokens) but
        deliberately left unpadded.
    """
    inputs = [PROMPT_TEMPLATE.format(input=inp) for inp in examples["input"]]
    targets = examples["output"]

    # No padding here: padding labels to max_length would fill them with the
    # pad token id, which the loss does not ignore, so the model would be
    # trained to predict padding. DataCollatorForSeq2Seq pads each batch
    # dynamically and fills label padding with -100, which the loss skips.
    model_inputs = tokenizer(inputs, max_length=256, truncation=True)
    labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def _tokenized_dataset(records):
    """Wrap *records* in a Dataset and replace its columns with model inputs."""
    raw_ds = Dataset.from_list(records)
    return raw_ds.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_ds.column_names,
    )


# Tokenise both splits; the raw text columns are dropped so that only the
# model inputs and labels remain.
train_ds = _tokenized_dataset(train_data)
val_ds = _tokenized_dataset(val_data)

print(f"Train dataset: {len(train_ds)}, Val dataset: {len(val_ds)}")


In [None]:
# Mixed precision: T5-family checkpoints are prone to overflow under fp16
# autocast (NaN or zero loss). bf16 keeps fp32's exponent range, so prefer it
# on GPUs with native support (compute capability >= 8) and fall back to fp16
# on older GPUs, which matches the previous behaviour there.
# NOTE(review): if the loss becomes NaN on an fp16-only GPU, disable fp16.
_cuda_available = torch.cuda.is_available()
_use_bf16 = _cuda_available and torch.cuda.get_device_capability()[0] >= 8
_use_fp16 = _cuda_available and not _use_bf16

# Evaluate and checkpoint every 500 steps, log every 100, and keep only the
# two most recent checkpoints.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="steps",
    eval_steps=500,
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_steps=100,
    predict_with_generate=True,
    bf16=_use_bf16,
    fp16=_use_fp16,
    report_to="none",
)

# Pads inputs and labels per batch; label padding uses -100 so it is ignored
# by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour
# of `processing_class=` — confirm against the installed version before
# upgrading.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
# Run fine-tuning with the trainer configured above.
print("Starting training...")
trainer.train()

# Save the model and tokenizer into the same directory so both can be
# reloaded from OUTPUT_DIR with from_pretrained().
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")


## 3. Тестирование модели


In [None]:
# Reload the fine-tuned weights and tokenizer from disk; this rebinds the
# `model` and `tokenizer` globals used during training.
model = T5ForConditionalGeneration.from_pretrained(OUTPUT_DIR)
tokenizer = T5Tokenizer.from_pretrained(OUTPUT_DIR)

# Use the GPU when one is available and switch the model to eval mode for
# inference.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()
print(f"Model loaded on {device}")

def generate_description_llm(cv_output):
    """Generate a description for *cv_output* with the fine-tuned model.

    The CV output is serialised to JSON, inserted into PROMPT_TEMPLATE,
    tokenised (truncated to 256 tokens) and decoded with 4-beam search.
    Returns the first generated sequence as text without special tokens.
    """
    prompt_text = PROMPT_TEMPLATE.format(input=json.dumps(cv_output))
    encoded = tokenizer(prompt_text, return_tensors="pt", max_length=256, truncation=True)

    # Move every input tensor to the device the model lives on.
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    generation_options = dict(
        max_length=128,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )

    # Inference only, so gradients are not tracked.
    with torch.no_grad():
        generated = model.generate(**model_inputs, **generation_options)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)


In [None]:
# Smoke test on real images: take up to five .jpg files from the test folder.
test_images = list(Path("data/1206-data/test").glob("*.jpg"))[:5]

print("Testing on sample images:\n")
for img_path in test_images:
    # Run the CV models, then let the fine-tuned LLM describe their output.
    cv_output = get_cv_output(str(img_path))
    description = generate_description_llm(cv_output)
    
    print(f"Image: {img_path.name}")
    print(f"CV: {len(cv_output['detections'])} detections, scene={cv_output['scene']['class']}")
    print(f"LLM: {description}")
    print("-" * 50)


In [None]:
# Save the inference settings next to the model so the VLM pipeline can load
# the model with the same prompt and generation parameters.
config = {
    "model_path": OUTPUT_DIR,
    "model_type": "t5",
    "prompt_template": PROMPT_TEMPLATE,
    "max_input_length": 256,
    "max_output_length": 128,
    "num_beams": 4,
}

# Make sure the target directory exists before writing into it.
config_dir = Path(OUTPUT_DIR)
config_dir.mkdir(parents=True, exist_ok=True)

config_text = json.dumps(config, indent=2)
with open(f"{OUTPUT_DIR}/llm_config.json", 'w') as f:
    f.write(config_text)

print(f"Config saved to {OUTPUT_DIR}/llm_config.json")
print("\nModel ready for use in VLM_Complete_LLM.py")
