In [1]:
import os
import cv2
import xml.etree.ElementTree as ET
from tqdm import tqdm
import shutil
from ultralytics import YOLO
import glob
import random
import mlflow


# Конвертация CVAT XML в YOLO-формат (.txt)
def xml_to_yolo(xml_path, output_dir, class_map):
    os.makedirs(output_dir, exist_ok=True)
    tree = ET.parse(xml_path)
    root = tree.getroot()

    original_size = root.find("meta/original_size")
    width = int(original_size.find("width").text)
    height = int(original_size.find("height").text)

    # Словарь для хранения аннотаций по кадрам: {frame_num: [yolo_lines]}
    frames = {}

    # Обработка всех треков
    for track in root.findall("track"):
        label = track.get("label")
        if label not in class_map:
            continue
        cls_id = class_map[label]

        for box in track.findall("box"):
            frame_num = int(box.get("frame"))
            if frame_num not in frames:
                frames[frame_num] = []

            xtl = float(box.get("xtl"))
            ytl = float(box.get("ytl"))
            xbr = float(box.get("xbr"))
            ybr = float(box.get("ybr"))

            x_center = (xtl + xbr) / (2 * width)
            y_center = (ytl + ybr) / (2 * height)
            w = (xbr - xtl) / width
            h = (ybr - ytl) / height

            frames[frame_num].append(f"{cls_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}")

    # Сохранение в .txt
    for frame_num, lines in tqdm(frames.items(), desc="Making labels"):
        filename = f"frame_{frame_num:06d}"  # frame_000000.txt
        with open(f"{output_dir}/{filename}.txt", "w") as f:
            f.write("\n".join(lines))


# Извлечение кадров из видео
def extract_frames(video_path, output_dir, frame_interval=1):
    os.makedirs(output_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    saved_count = 0

    with tqdm(desc="Extracting frames") as pbar:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                cv2.imwrite(f"{output_dir}/frame_{saved_count:06d}.jpg", frame)
                saved_count += 1

            frame_count += 1
            pbar.update(1)

    cap.release()
    print(f"Extracted {saved_count} frames")


# Создание датасета для обучения созданию псевдо-лейблов
def create_dataset_for_training_generate_pseudo_labels(all_frames_dir, labeled_data_dir, dataset_dir):
    os.makedirs(f"{dataset_dir}/train/images", exist_ok=True)
    os.makedirs(f"{dataset_dir}/train/labels", exist_ok=True)
    os.makedirs(f"{dataset_dir}/val/images", exist_ok=True)
    os.makedirs(f"{dataset_dir}/val/labels", exist_ok=True)

    labeled_files = [f.split('.')[0] for f in os.listdir(labeled_data_dir)]

    random.seed(70)
    random.shuffle(labeled_files)

    # Разделение на train/val
    split_ratio = 0.8
    train_files = labeled_files[:int(len(labeled_files) * split_ratio)]
    val_files = labeled_files[int(len(labeled_files) * split_ratio):]

    for file_base in tqdm(train_files):

        src_img = f"{all_frames_dir}/{file_base}.jpg"
        dst_img = f"{dataset_dir}/train/images/{file_base}.jpg"
        if os.path.exists(src_img):
            os.symlink(os.path.abspath(src_img), dst_img)

        src_label = f"{labeled_data_dir}/{file_base}.txt"
        dst_label = f"{dataset_dir}/train/labels/{file_base}.txt"
        shutil.copy(src_label, dst_label)

    for file_base in tqdm(val_files):

        src_img = f"{all_frames_dir}/{file_base}.jpg"
        dst_img = f"{dataset_dir}/val/images/{file_base}.jpg"
        if os.path.exists(src_img):
            os.symlink(os.path.abspath(src_img), dst_img)

        src_label = f"{labeled_data_dir}/{file_base}.txt"
        dst_label = f"{dataset_dir}/val/labels/{file_base}.txt"
        shutil.copy(src_label, dst_label)


# Генерация псевдо-лейблов
def generate_pseudo_labels(model, frames_dir, output_dir, conf=0.0):
    os.makedirs(output_dir, exist_ok=True)
    frame_paths = glob.glob(f"{frames_dir}/*.jpg")

    for frame_path in tqdm(frame_paths, desc="Generating pseudo-labels"):
        try:
            results = model.predict(frame_path, conf=conf, imgsz=640)
            results[0].save_txt(f"{output_dir}/{os.path.basename(frame_path)[:-4]}.txt")
        except Exception as e:
            print(f"Error processing {frame_path}: {str(e)}")


# Объединение лейблов и псевдо-лейблов
def combine_and_split_datasets(all_frames_dir, labeled_dir, pseudo_dir, output_dir, split_ratio=0.8, seed=70):
    dirs = {
        'train': ['images', 'labels'],
        'val': ['images', 'labels']
    }

    for dataset in dirs:
        for subdir in dirs[dataset]:
            os.makedirs(f"{output_dir}/{dataset}/{subdir}", exist_ok=True)

    all_files = []

    for f in glob.glob(f"{labeled_dir}/*.txt"):
        base_name = os.path.basename(f)[:-4]
        all_files.append(('labeled', base_name))

    for f in glob.glob(f"{pseudo_dir}/*.txt"):
        base_name = os.path.basename(f)[:-4]
        all_files.append(('pseudo', base_name))

    random.seed(seed)
    random.shuffle(all_files)

    split_idx = int(len(all_files) * split_ratio)
    train_files = all_files[:split_idx]
    val_files = all_files[split_idx:]

    def copy_files(files, dataset_type):
        for src_type, base_name in files:

            src_img = f"{all_frames_dir}/{base_name}.jpg"
            dst_img = f"{output_dir}/{dataset_type}/images/{base_name}.jpg"

            if os.path.exists(src_img):
                shutil.copy(src_img, dst_img)
            else:
                print(f"{src_img} not found")

            src_txt = f"{labeled_dir if src_type == 'labeled' else pseudo_dir}/{base_name}.txt"
            dst_txt = f"{output_dir}/{dataset_type}/labels/{base_name}.txt"
            if os.path.exists(src_txt):
                shutil.copy(src_txt, dst_txt)
            else:
                print(f"{src_txt} not found")

    copy_files(train_files, 'train')
    copy_files(val_files, 'val')

    print(f"combined_dataset create: {len(all_files)} files")


if __name__ == "__main__":

    mlflow.set_tracking_uri("file:///.../runs/mlflow")  # ... - путь до проекта

    CLASS_MAP = {'person': 0, 'canopy': 1, 'pallet': 2}
    VIDEO_PATH = "data/train.mp4"
    XML_PATH = "annotations.xml"

    # 1. Конвертация CVAT XML в YOLO-формат (.txt)
    xml_to_yolo(XML_PATH, "labeled_data", CLASS_MAP)

    # 2. Извлечение кадров
    extract_frames(VIDEO_PATH, "all_frames", frame_interval=1)

    # 3. Создание датасета для обучения созданию псевдо-лейблов
    all_frames_dir = "all_frames"
    labeled_data_dir = "labeled_data"
    dataset_dir = "datasets"

    create_dataset_for_training_generate_pseudo_labels(all_frames_dir, labeled_data_dir, dataset_dir)
    
    # 4. Первое обучение для создания псевдо-лейблов
    model = YOLO("yolov8n.pt")

    for param in model.model.parameters():
        param.requires_grad = False

    for param in model.model.model[-10:].parameters():
        param.requires_grad = True

    model.train(
        data="data0.yaml",
        epochs=10,
        imgsz=640,
        batch=16,
        device="cuda",
        optimizer="AdamW",
        lr0=0.001,
    )

    # 5. Генерация псевдо-лейблов
    best_model = YOLO("runs/detect/train/weights/best.pt")
    generate_pseudo_labels(best_model, "all_frames", "pseudo_labels", conf=0.0)

    # 6. Объединение данных
    combine_and_split_datasets(all_frames_dir="all_frames", labeled_dir="labeled_data", pseudo_dir="pseudo_labels", output_dir="combined_dataset")

    # 7. Обучение модели со всеми лейблами
    model = YOLO("runs/final_train/weights/best.pt")
    model.train(
        data="data.yaml",
        epochs=10,
        imgsz=640,
        patience=3,
        resume=False,
        device="cuda",
        optimizer="AdamW",
        lr0=0.001,
        project="runs",
        name="final_train"
    )

    print("Finish. Model in runs/final_train/weights/best.pt")

New https://pypi.org/project/ultralytics/8.3.98 available  Update with 'pip install -U ultralytics'
Ultralytics 8.3.97  Python-3.11.4 torch-2.5.1+cu118 CUDA:0 (NVIDIA GeForce RTX 2050, 4096MiB)
[34m[1mengine\trainer: [0mtask=detect, mode=train, model=runs\final_train\weights\best.pt, data=data.yaml, epochs=20, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=cuda, workers=8, project=runs, name=final_train, exist_ok=False, pretrained=True, optimizer=AdamW, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=runs\final_train\weights\best.pt, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes

[34m[1mtrain: [0mScanning C:\PyProjects\test-market\combined_dataset\train\labels.cache... 10008 images, 0 backgrounds, 0 corrupt: 100%|██████████| 10008/10008 [00:00<?, ?it/s]


[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


[34m[1mval: [0mScanning C:\PyProjects\test-market\combined_dataset\val\labels.cache... 3008 images, 0 backgrounds, 0 corrupt: 100%|██████████| 3008/3008 [00:00<?, ?it/s]


Plotting labels to runs\final_train\labels.jpg... 
[34m[1moptimizer:[0m AdamW(lr=0.001, momentum=0.937) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Resuming training runs\final_train\weights\best.pt from epoch 11 to 20 total epochs
[34m[1mMLflow: [0mlogging run_id(576821a4b5f146c791466a46c6f6a1e5) to file:///C:/PyProjects/test-market/runs/mlflow
[34m[1mMLflow: [0mdisable with 'yolo settings mlflow=False'
[34m[1mTensorBoard: [0mmodel graph visualization added 
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mruns\final_train[0m
Starting training for 20 epochs...
Closing dataloader mosaic
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      11/20      2.04G     0.3193     0.2408     0.8214         23        640: 100%|██████████| 626/626 [03:07<00:00,  3.34it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:27<00:00,  3.40it/s]

                   all       3008       5231      0.975      0.987      0.991      0.928






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      12/20      2.04G      0.299      0.209     0.8122         13        640: 100%|██████████| 626/626 [03:08<00:00,  3.33it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:25<00:00,  3.71it/s]

                   all       3008       5231      0.981      0.985      0.993      0.934






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      13/20      2.04G      0.286        0.2     0.8099         14        640: 100%|██████████| 626/626 [03:08<00:00,  3.31it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:26<00:00,  3.52it/s]

                   all       3008       5231      0.978      0.986      0.993      0.942






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      14/20      2.04G     0.2719     0.1898     0.8043         10        640: 100%|██████████| 626/626 [03:08<00:00,  3.32it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:20<00:00,  4.53it/s]

                   all       3008       5231      0.983      0.984      0.992      0.942






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      15/20      2.04G     0.2597     0.1805     0.8007         19        640: 100%|██████████| 626/626 [03:11<00:00,  3.28it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:24<00:00,  3.81it/s]

                   all       3008       5231      0.982      0.989      0.993      0.945






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      16/20      2.04G      0.247     0.1723     0.7977         19        640: 100%|██████████| 626/626 [03:09<00:00,  3.31it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:25<00:00,  3.74it/s]

                   all       3008       5231      0.986      0.988      0.994      0.953






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      17/20      2.04G     0.2387      0.167     0.7944         11        640: 100%|██████████| 626/626 [03:06<00:00,  3.35it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:19<00:00,  4.71it/s]

                   all       3008       5231      0.987      0.988      0.993      0.953






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      18/20      2.04G     0.2275     0.1592     0.7911         12        640: 100%|██████████| 626/626 [03:00<00:00,  3.47it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:20<00:00,  4.62it/s]

                   all       3008       5231      0.984      0.986      0.993      0.954






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      19/20      2.04G     0.2156     0.1522     0.7873         10        640: 100%|██████████| 626/626 [03:01<00:00,  3.46it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:20<00:00,  4.59it/s]

                   all       3008       5231      0.983      0.988      0.994      0.957






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      20/20      2.04G     0.2042     0.1447     0.7854         14        640: 100%|██████████| 626/626 [03:00<00:00,  3.46it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:19<00:00,  4.70it/s]

                   all       3008       5231      0.985      0.989      0.994      0.958






10 epochs completed in 0.595 hours.
Optimizer stripped from runs\final_train\weights\last.pt, 6.2MB
Optimizer stripped from runs\final_train\weights\best.pt, 6.2MB

Validating runs\final_train\weights\best.pt...
Ultralytics 8.3.97  Python-3.11.4 torch-2.5.1+cu118 CUDA:0 (NVIDIA GeForce RTX 2050, 4096MiB)
Model summary (fused): 72 layers, 3,006,233 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:21<00:00,  4.38it/s]


                   all       3008       5231      0.985      0.989      0.994      0.959
                person       1068       1071      0.969      0.972      0.992      0.898
                canopy       1955       1956      0.994      0.998      0.995      0.985
                pallet       1561       2204      0.991      0.996      0.995      0.993
Speed: 0.3ms preprocess, 2.7ms inference, 0.0ms loss, 0.9ms postprocess per image
Results saved to [1mruns\final_train[0m
[34m[1mMLflow: [0mresults logged to file:///C:/PyProjects/test-market/runs/mlflow
[34m[1mMLflow: [0mdisable with 'yolo settings mlflow=False'
Процесс завершен! Финальная модель сохранена в runs/final_train/weights/best.pt


In [2]:
# Проверка изображения и лейблов
from PIL import Image, ImageDraw

img = Image.open("all_frames/frame_005100.jpg")
draw = ImageDraw.Draw(img)
with open("pseudo_labels/frame_005100.txt", "r") as f:
    for line in f:
        cls, x, y, w, h = map(float, line.split())
        x1 = int((x - w / 2) * img.width)
        y1 = int((y - h / 2) * img.height)
        x2 = int((x + w / 2) * img.width)
        y2 = int((y + h / 2) * img.height)
        draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
img.show()