In [5]:
from ultralytics import YOLO

model = YOLO('trained_models/yolov8m_500ep.pt')
metrics_whole = model.val(data='deep_scores.yaml', split='val')

Ultralytics 8.3.123 🚀 Python-3.12.7 torch-2.7.0 CPU (Apple M1)
Model summary (fused): 92 layers, 25,918,504 parameters, 0 gradients, 79.1 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 656.6±403.5 MB/s, size: 413.7 KB)


[34m[1mval: [0mScanning /Users/peiyuanli/Documents/GitHub/OMR/ds2_dense/labels/test.cache... 352 images, 0 backgrounds, 0 corrupt: 100%|██████████| 352/352 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 22/22 [08:13<00:00, 22.41s/it]


                   all        352     244335      0.927       0.55      0.674      0.563
                 brace        177        725      0.997      0.807      0.899       0.85
            ledgerLine        342      23809          0          0          0          0
             repeatDot         69        876          1     0.0746      0.422      0.194
                 segno         43         55      0.986          1      0.995      0.974
                  coda         45         49      0.985          1      0.995      0.964
                 clefG        313       2203      0.999          1      0.995      0.992
             clefCAlto         89        255      0.997      0.984      0.991      0.981
            clefCTenor         65        167      0.995      0.994      0.995      0.985
                 clefF        294       1488      0.997      0.993      0.995      0.985
                 clef8        105        305      0.989      0.289      0.621      0.475
                clef1

In [None]:
import os
import numpy as np
from PIL import Image
from scipy.ndimage import convolve

# 参数
gray_threshold = 250
kernel_width = 100
response_thresh = 10
min_black_ratio = 0.5
min_gap_between_lines = 4

# 谱线检测
def detect_staff_lines_np(img_np):
    binary = (img_np < gray_threshold).astype(np.uint8)
    kernel = np.ones((1, kernel_width), dtype=np.uint8)
    response = convolve(binary, kernel)
    candidates = (response > response_thresh).astype(np.uint8)
    image_width = img_np.shape[1]
    min_black = int(min_black_ratio * image_width)

    valid_y = [
        y for y in range(binary.shape[0])
        if np.max(candidates[y]) > 0 and np.sum(binary[y]) >= min_black
    ]

    def deduplicate_lines(y_coords, min_gap=4):
        y_coords = sorted(y_coords)
        deduped = []
        for y in y_coords:
            if not deduped or abs(y - deduped[-1]) >= min_gap:
                deduped.append(y)
        return deduped

    return deduplicate_lines(valid_y, min_gap_between_lines)

# 分组为五线谱
def group_staff_lines(y_coords):
    y_coords = sorted(y_coords)
    groups = []
    group = []
    for y in y_coords:
        group.append(y)
        if len(group) == 5:
            groups.append(group)
            group = []
    return groups

# 裁剪图像并同步转换标签
def crop_and_adjust_labels(image_path, label_path, image_output_dir, label_output_dir):
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    img = Image.open(image_path).convert("RGB")
    gray = img.convert("L")
    np_gray = np.array(gray)
    staff_lines = detect_staff_lines_np(np_gray)
    staff_groups = group_staff_lines(staff_lines)

    # 读取原始标签
    labels = []
    if os.path.exists(label_path):
        with open(label_path, "r") as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 5:
                    cls, xc, yc, w, h = map(float, parts)
                    labels.append((cls, xc, yc, w, h))

    for i in range(0, len(staff_groups), 2):
        groups_in_pair = staff_groups[i:i+2]
        if not groups_in_pair:
            continue

        all_ys = [y for group in groups_in_pair for y in group]
        margin = int(0.2 * (max(all_ys) - min(all_ys)))  # 20% margin
        y_min = max(0, min(all_ys) - margin)
        y_max = min(img.height, max(all_ys) + margin)


        # 裁剪图像
        cropped = img.crop((0, y_min, img.width, y_max))
        pair_img_name = f"{base_name}_pair{i//2}.jpg"
        pair_lbl_name = f"{base_name}_pair{i//2}.txt"
        cropped.save(os.path.join(image_output_dir, pair_img_name))

        # 同步标签
        new_lines = []
        for cls, xc, yc, w, h in labels:
            abs_yc = yc * img.height
            abs_h = h * img.height
            box_top = abs_yc - abs_h / 2
            box_bottom = abs_yc + abs_h / 2

            if box_bottom < y_min or box_top > y_max:
                continue

            new_yc = (abs_yc - y_min) / (y_max - y_min)
            new_h = abs_h / (y_max - y_min)
            new_line = f"{int(cls)} {xc:.6f} {new_yc:.6f} {w:.6f} {new_h:.6f}"
            new_lines.append(new_line)

        with open(os.path.join(label_output_dir, pair_lbl_name), "w") as f:
            f.write("\n".join(new_lines))


In [7]:
# 主处理流程

# 目录路径
original_image_dir = "./ds2_dense/images/test"
original_label_dir = "./ds2_dense/labels/test"
pairwise_image_dir = "./pairwise_ds/images"
pairwise_label_dir = "./pairwise_ds/labels"
os.makedirs(pairwise_image_dir, exist_ok=True)
os.makedirs(pairwise_label_dir, exist_ok=True)

image_files = [f for f in os.listdir(original_image_dir) if f.endswith(('.jpg', '.png'))]
for file in image_files:
    crop_and_adjust_labels(
        os.path.join(original_image_dir, file),
        os.path.join(original_label_dir, os.path.splitext(file)[0] + ".txt"),
        pairwise_image_dir,
        pairwise_label_dir
    )

print("✅ 裁剪完成。你可以使用该路径运行 model.val()：")
print(f"images: {pairwise_image_dir}")
print(f"labels: {pairwise_label_dir}")

✅ 裁剪完成。你可以使用该路径运行 model.val()：
images: ./pairwise_ds/images
labels: ./pairwise_ds/labels


In [11]:
metrics_pairwise = model.val(data='pairwise_detection.yaml', split='val')

Ultralytics 8.3.123 🚀 Python-3.12.7 torch-2.7.0 CPU (Apple M1)
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 341.9±44.1 MB/s, size: 88.9 KB)


[34m[1mval: [0mScanning /Users/peiyuanli/Documents/GitHub/OMR/pairwise_ds/labels... 2014 images, 0 backgrounds, 1037 corrupt: 100%|██████████| 2014/2014 [00:00<00:00, 2892.73it/s]

[34m[1mval: [0m/Users/peiyuanli/Documents/GitHub/OMR/pairwise_ds/images/lg-101766503886095953-aug-emmentaler--page-4_pair2.jpg: ignoring corrupt image/label: negative label values [   -0.01372]
[34m[1mval: [0m/Users/peiyuanli/Documents/GitHub/OMR/pairwise_ds/images/lg-102414375-aug-beethoven--page-3_pair0.jpg: ignoring corrupt image/label: non-normalized or out of bounds coordinates [     1.0075      1.0075]
[34m[1mval: [0m/Users/peiyuanli/Documents/GitHub/OMR/pairwise_ds/images/lg-102414375-aug-beethoven--page-3_pair2.jpg: ignoring corrupt image/label: negative label values [  -0.001348]
[34m[1mval: [0m/Users/peiyuanli/Documents/GitHub/OMR/pairwise_ds/images/lg-102414375-aug-beethoven--page-3_pair3.jpg: ignoring corrupt image/label: non-normalized or out of bounds coordinates [      1.006       1.006      1.0045]
[34m[1mval: [0m/Users/peiyuanli/Documents/GitHub/OMR/pairwise_ds/images/lg-10247684-aug-emmentaler--page-2_pair0.jpg: ignoring corrupt image/label: negative la


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 62/62 [06:27<00:00,  6.25s/it]


                   all        977      97339      0.907      0.828      0.888      0.723
                 brace        320        320      0.999      0.984      0.995      0.907
            ledgerLine        643       5703          0          0          0          0
             repeatDot         68        324          1      0.169      0.379      0.183
                 segno         10         10      0.907      0.975      0.968      0.881
                  coda         10         11      0.922      0.727      0.836      0.781
                 clefG        723       1052      0.999          1      0.995      0.995
             clefCAlto        121        127      0.994          1      0.995      0.989
            clefCTenor         92        101      0.985          1      0.995       0.99
                 clefF        591        707      0.997          1      0.995      0.984
                 clef8        108        123          1      0.686      0.929       0.68
                clef1