In [8]:
# retreieve the test result of yolo direct prediction
from ultralytics import YOLO

model = YOLO('trained_models/yolov8m_500ep.pt')
metrics_whole = model.val(data='deepscores.yaml', split='val')

Ultralytics 8.3.121 🚀 Python-3.10.16 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 4080 SUPER, 16069MiB)
Model summary (fused): 92 layers, 25,918,504 parameters, 0 gradients, 79.1 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 3747.5±749.5 MB/s, size: 384.3 KB)


[34m[1mval: [0mScanning /workspace/OMR/ds2_dense/labels/test.cache... 352 images, 0 backgrounds, 0 corrupt: 100%|██████████| 352/352 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):   5%|▍         | 1/22 [00:04<01:28,  4.21s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 22/22 [03:42<00:00, 10.12s/it]


                   all        352     244335      0.927       0.55      0.674      0.563
                 brace        177        725      0.997      0.807      0.899       0.85
            ledgerLine        342      23809          0          0          0          0
             repeatDot         69        876          1     0.0746      0.422      0.195
                 segno         43         55      0.986          1      0.995      0.974
                  coda         45         49      0.985          1      0.995      0.966
                 clefG        313       2203      0.999          1      0.995      0.992
             clefCAlto         89        255      0.997      0.984      0.991      0.981
            clefCTenor         65        167      0.995      0.994      0.995      0.985
                 clefF        294       1488      0.997      0.993      0.995      0.985
                 clef8        105        305      0.989      0.289      0.621      0.475
                clef1

In [None]:
import os
import numpy as np
from PIL import Image
from scipy.ndimage import convolve

# 参数
gray_threshold = 250
kernel_width = 100
response_thresh = 10
min_black_ratio = 0.5
min_gap_between_lines = 4

# 谱线检测
def detect_staff_lines_np(img_np):
    binary = (img_np < gray_threshold).astype(np.uint8)
    kernel = np.ones((1, kernel_width), dtype=np.uint8)
    response = convolve(binary, kernel)
    candidates = (response > response_thresh).astype(np.uint8)
    image_width = img_np.shape[1]
    min_black = int(min_black_ratio * image_width)

    valid_y = [
        y for y in range(binary.shape[0])
        if np.max(candidates[y]) > 0 and np.sum(binary[y]) >= min_black
    ]

    def deduplicate_lines(y_coords, min_gap=4):
        y_coords = sorted(y_coords)
        deduped = []
        for y in y_coords:
            if not deduped or abs(y - deduped[-1]) >= min_gap:
                deduped.append(y)
        return deduped

    return deduplicate_lines(valid_y, min_gap_between_lines)

# 分组为五线谱
def group_staff_lines(y_coords):
    y_coords = sorted(y_coords)
    groups = []
    group = []
    for y in y_coords:
        group.append(y)
        if len(group) == 5:
            groups.append(group)
            group = []
    return groups

# 裁剪图像并同步转换标签
def crop_and_adjust_labels(image_path, label_path, image_output_dir, label_output_dir):
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    img = Image.open(image_path).convert("RGB")
    gray = img.convert("L")
    np_gray = np.array(gray)
    staff_lines = detect_staff_lines_np(np_gray)
    staff_groups = group_staff_lines(staff_lines)

    # 读取原始标签
    labels = []
    if os.path.exists(label_path):
        with open(label_path, "r") as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 5:
                    cls, xc, yc, w, h = map(float, parts)
                    labels.append((cls, xc, yc, w, h))

    for i in range(0, len(staff_groups), 2):
        groups_in_pair = staff_groups[i:i+2]
        if not groups_in_pair:
            continue

        all_ys = [y for group in groups_in_pair for y in group]
        margin = int(0.25 * (max(all_ys) - min(all_ys)))  # 25% margin
        y_min = max(0, min(all_ys) - margin)
        y_max = min(img.height, max(all_ys) + margin)


        # 裁剪图像
        cropped = img.crop((0, y_min, img.width, y_max))
        pair_img_name = f"{base_name}_pair{i//2}.jpg"
        pair_lbl_name = f"{base_name}_pair{i//2}.txt"
        cropped.save(os.path.join(image_output_dir, pair_img_name))

        # 同步标签
        new_lines = []
        for cls, xc, yc, w, h in labels:
            abs_yc = yc * img.height
            abs_h = h * img.height
            box_top = abs_yc - abs_h / 2
            box_bottom = abs_yc + abs_h / 2

            if box_bottom < y_min or box_top > y_max:
                continue

            new_yc = (abs_yc - y_min) / (y_max - y_min)
            new_h = abs_h / (y_max - y_min)
            # 忽略越界坐标
            if new_yc < 0 or new_yc > 1 or new_h <= 0 or new_h > 1:
                continue
            new_line = f"{int(cls)} {xc:.6f} {new_yc:.6f} {w:.6f} {new_h:.6f}"
            new_lines.append(new_line)

        with open(os.path.join(label_output_dir, pair_lbl_name), "w") as f:
            f.write("\n".join(new_lines))


In [None]:
# 主处理流程

# 目录路径
original_image_dir = "./ds2_dense/images/test"
original_label_dir = "./ds2_dense/labels/test"
pairwise_image_dir = "./ds2_pairwise/images/test"
pairwise_label_dir = "./ds2_pairwise/labels/test"
os.makedirs(pairwise_image_dir, exist_ok=True)
os.makedirs(pairwise_label_dir, exist_ok=True)

image_files = [f for f in os.listdir(original_image_dir) if f.endswith(('.jpg', '.png'))]
for file in image_files:
    crop_and_adjust_labels(
        os.path.join(original_image_dir, file),
        os.path.join(original_label_dir, os.path.splitext(file)[0] + ".txt"),
        pairwise_image_dir,
        pairwise_label_dir
    )

print("pairwised finished, results are available at:")
print(f"images: {pairwise_image_dir}")
print(f"labels: {pairwise_label_dir}")

In [11]:
metrics_pairwise = model.val(data='deepscores_pairwise.yaml', split='val')

Ultralytics 8.3.121 🚀 Python-3.10.16 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 4080 SUPER, 16069MiB)
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1479.6±331.8 MB/s, size: 73.1 KB)


[34m[1mval: [0mScanning /workspace/OMR/ds2_pairwise/labels/test... 2014 images, 0 backgrounds, 0 corrupt: 100%|██████████| 2014/2014 [00:04<00:00, 451.77it/s]

[34m[1mval: [0mNew cache created: /workspace/OMR/ds2_pairwise/labels/test.cache



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 126/126 [00:48<00:00,  2.58it/s]


                   all       2014     255290      0.878      0.803      0.856      0.689
                 brace        726        764      0.839      0.984      0.923      0.837
            ledgerLine       1465      24433          0          0          0          0
             repeatDot        179        908      0.996       0.27      0.482      0.233
                 segno         40         42      0.926          1      0.993      0.917
                  coda         30         31      0.924      0.782      0.947      0.831
                 clefG       1512       2303      0.974      0.999      0.993      0.991
             clefCAlto        239        260      0.972          1      0.995      0.981
            clefCTenor        148        167      0.994          1      0.995      0.988
                 clefF       1276       1615      0.988      0.996      0.995      0.978
                 clef8        234        302      0.986      0.723      0.882      0.675
                clef1