# 1. Define utility functions for YOLO visualization

In [1]:
from PIL import Image
import os
import cv2
import numpy as np
from ultralytics import YOLO
from PIL import Image, ImageDraw, ImageFont, ImageFilter

def get_contrast_color(bg_color):
    """Pick black or white, whichever reads better on top of ``bg_color``.

    The first three entries of ``bg_color`` are read as R, G, B on a 0-255
    scale and combined with the 0.299 / 0.587 / 0.114 luma weights (these are
    the Rec. 601 coefficients), then normalized by 255.

    Args:
        bg_color: Indexable with at least three numeric channels (R, G, B).

    Returns:
        ``(255, 255, 255)`` when the normalized luma is below 0.5 (dark
        background), otherwise ``(0, 0, 0)``.
    """
    red, green, blue = bg_color[0], bg_color[1], bg_color[2]
    luma = (0.299 * red + 0.587 * green + 0.114 * blue) / 255
    if luma < 0.5:
        # Dark background -> white text
        return (255, 255, 255)
    # Light background -> black text
    return (0, 0, 0)


# Function to generate a repeatable color for each class ID
def get_unique_color(tag):
    """Return a deterministic pseudo-random RGB color for ``tag``.

    A private ``RandomState`` seeded with ``tag`` is used, so the same tag
    always maps to the same color and the process-wide ``np.random`` state is
    left untouched. The values are the same ones the global generator yields
    right after ``np.random.seed(tag)``.

    Args:
        tag: Integer seed, e.g. a class ID. Must be a value NumPy accepts as
            a seed (negative values are rejected by NumPy).

    Returns:
        A list of three Python ints, each in ``[0, 254]`` (the upper bound 255
        passed to ``randint`` is exclusive). Different tags are not guaranteed
        to produce different colors.
    """
    rng = np.random.RandomState(tag)
    return [int(channel) for channel in rng.randint(0, 255, 3)]


# 2. Visualize YOLO direct prediction

In [2]:
# Load the YOLO model
model_path = './trained_models/yolov8m_500ep.pt'
model = YOLO(model_path)

# Directory containing images
source_dir = './sheet_samples/'
# Match extensions case-insensitively so files such as "PAGE.PNG" are not
# skipped, and sort because os.listdir returns entries in arbitrary order.
image_files = sorted(
    os.path.join(source_dir, f)
    for f in os.listdir(source_dir)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Run inference on all images; skip the model call when there is nothing to process
results = model(image_files) if image_files else []
if not image_files:
    print(f"No images found in {source_dir}")

# Directory to save annotated images
save_directory = './yolo_result'
os.makedirs(save_directory, exist_ok=True)

for result in results:
    img_path = result.path

    # Image.open raises on failure (it never returns None), so the failure
    # has to be caught here, before the image is used for drawing.
    try:
        image = Image.open(img_path).convert("RGB")
    except OSError as err:
        print(f"Failed to load image {img_path}: {err}")
        continue

    # Draw predictions on the image
    draw = ImageDraw.Draw(image)

    boxes = result.boxes.data
    confs = result.boxes.conf
    cls_ids = result.boxes.cls

    for box, conf, cls_id in zip(boxes, confs, cls_ids):
        # First four entries of each box row are the corner coordinates
        x1, y1, x2, y2 = (int(v) for v in box[:4])
        label = result.names[int(cls_id)]
        label_text = f'{label} {float(conf):.2f}'

        # Draw bounding box
        draw.rectangle([(x1, y1), (x2, y2)], outline='red', width=3)

        # Draw the label above the box; clamp so it is not placed above the
        # top edge of the image for boxes near the top.
        draw.text((x1, max(0, y1 - 40)), label_text, fill='blue')

    save_path = os.path.join(save_directory, os.path.basename(img_path))
    image.save(save_path)
    print(f"[✓] Saved: {save_path}")


0: 992x704 4 clefGs, 2 clefCAltos, 2 clefCTenors, 5 clefFs, 2 clef8s, 37 noteheadBlackOnLines, 43 noteheadBlackInSpaces, 8 noteheadHalfOnLines, 8 noteheadHalfInSpaces, 5 noteheadWholeOnLines, 1 noteheadWholeInSpace, 5 augmentationDots, 4 flag8thUps, 10 flag8thDowns, 2 accidentalNaturals, 3 accidentalSharps, 36 keyFlats, 7 restQuarters, 10 rest8ths, 4 dynamicPs, 2 dynamicMs, 2 dynamicFs, 3 stringsDownBows, 4 stringsUpBows, 13 slurs, 10 beams, 4 dynamicCrescendoHairpins, 10 dynamicDiminuendoHairpins, 12 staffs, 41.1ms
1: 992x704 1 brace, 6 clefFs, 2 clef15s, 12 noteheadBlackOnLines, 15 noteheadBlackInSpaces, 2 flag64thUps, 6 flag128thUps, 1 flag64thDown, 1 accidentalSharp, 8 keySharps, 6 restWholes, 1 restHalf, 2 restQuarters, 4 rest8ths, 4 rest16ths, 6 rest32nds, 6 rest64ths, 7 rest128ths, 15 beams, 8 staffs, 41.1ms
Speed: 9.8ms preprocess, 41.1ms inference, 77.8ms postprocess per image at shape (1, 3, 992, 704)
[✓] Saved: ./yolo_result/lg-5230237-aug-gutenberg1939--page-2.png
[✓] Save

# 3. Use Pairwise Detection

In [3]:
from ultralytics import YOLO
import os
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from scipy.ndimage import convolve

# Load the model and set the input/output directories for the pairwise run
model = YOLO('./trained_models/yolov8m_500ep.pt')
source_dir = './sheet_samples/'
output_dir = './yolo_pairwise_result/'
os.makedirs(output_dir, exist_ok=True)
# Pillow's built-in default font, used for the box labels
font = ImageFont.load_default()

# 1. Define parameters for staff line detection (read by detect_staff_lines_np)
# Pixels with a gray value below this are treated as ink
gray_threshold = 250
# Width of the 1 x N horizontal kernel convolved over the binarized image
kernel_width = 100
# A row is a candidate only if its convolution response exceeds this somewhere
response_thresh = 10
# A row must have at least this fraction of the image width in ink pixels
min_black_ratio = 0.5
# Candidate rows closer together than this are treated as the same line
min_gap_between_lines = 4

# 2. Detect horizontal staff lines in the input grayscale image
def detect_staff_lines_np(img_np, *, gray_thresh=None, line_kernel_width=None,
                          min_response=None, black_ratio=None, min_gap=None):
    """Return the y-coordinates of horizontal staff lines in a grayscale image.

    A row is accepted as (part of) a staff line when
      * somewhere along it, a 1 x ``line_kernel_width`` horizontal window over
        the binarized image sums to more than ``min_response``, and
      * at least ``black_ratio`` of the row's pixels are ink.
    Runs of accepted rows closer than ``min_gap`` to the previous accepted row
    are merged into one line, reported as the first (topmost) row of the run.

    Args:
        img_np: 2-D array of gray values (rows x columns).
        gray_thresh: Pixels below this value count as ink.
            Defaults to the module-level ``gray_threshold``.
        line_kernel_width: Width of the horizontal kernel.
            Defaults to the module-level ``kernel_width``.
        min_response: Minimum (exclusive) kernel response.
            Defaults to the module-level ``response_thresh``.
        black_ratio: Minimum fraction of ink pixels per row.
            Defaults to the module-level ``min_black_ratio``.
        min_gap: Minimum row distance between two distinct lines.
            Defaults to the module-level ``min_gap_between_lines``.

    Returns:
        Ascending list of Python ints, one y-coordinate per detected line.
    """
    # Anything not passed explicitly comes from the notebook-level configuration.
    gray_thresh = gray_threshold if gray_thresh is None else gray_thresh
    line_kernel_width = kernel_width if line_kernel_width is None else line_kernel_width
    min_response = response_thresh if min_response is None else min_response
    black_ratio = min_black_ratio if black_ratio is None else black_ratio
    min_gap = min_gap_between_lines if min_gap is None else min_gap

    binary = (img_np < gray_thresh).astype(np.uint8)
    kernel = np.ones((1, line_kernel_width), dtype=np.uint8)
    # convolve() returns the input's dtype; use int32 so the window sum cannot
    # wrap around when the kernel is wider than 255 pixels.
    response = convolve(binary.astype(np.int32), kernel)

    has_long_run = (response > min_response).any(axis=1)
    min_black = int(black_ratio * img_np.shape[1])
    enough_ink = binary.sum(axis=1) >= min_black
    candidate_rows = np.flatnonzero(has_long_run & enough_ink).tolist()

    # 3. Remove nearby duplicate lines using a minimum gap threshold.
    # The gap is measured against the previous candidate row (not the last
    # kept row), so a line thicker than ``min_gap`` pixels is still reported
    # once instead of being split into several "lines".
    lines = []
    prev_row = None
    for row in candidate_rows:
        if prev_row is None or row - prev_row >= min_gap:
            lines.append(row)
        prev_row = row
    return lines

# 4. Group every 5 staff lines into a complete staff system
def group_staff_lines(y_coords):
    """Split the line positions into consecutive groups of five.

    The coordinates are sorted in ascending order first and then chunked
    purely by count. Trailing lines that do not fill a complete group of five
    are dropped.

    Args:
        y_coords: Iterable of staff-line y-coordinates.

    Returns:
        A list of five-element lists, in ascending order.
    """
    ordered = sorted(y_coords)
    complete_groups = len(ordered) // 5
    return [ordered[5 * k:5 * k + 5] for k in range(complete_groups)]

In [4]:
# Main loop for processing each image in the source directory
for fname in sorted(os.listdir(source_dir)):
    # Case-insensitive match so e.g. "PAGE.PNG" is processed too
    if not fname.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue  # Skip non-image files

    # 1. Load and prepare the image
    img_path = os.path.join(source_dir, fname)
    full_img = Image.open(img_path).convert("RGB")  # pristine pixels, used for model input
    gray = full_img.convert("L")
    np_gray = np.array(gray)
    # Draw on a separate copy: crops for later staff pairs can overlap the
    # margin of earlier ones, and boxes/labels already drawn there must not
    # be fed back into the detector.
    annotated_img = full_img.copy()
    draw = ImageDraw.Draw(annotated_img)

    # 2. Detect and group staff lines
    staff_lines = detect_staff_lines_np(np_gray)
    staff_groups = group_staff_lines(staff_lines)

    # 3. Loop through pairs of staff groups (each pair = up to 10 lines;
    #    the last "pair" holds a single group when the count is odd)
    for i in range(0, len(staff_groups), 2):
        groups_in_pair = staff_groups[i:i+2]

        # 4. Define vertical crop boundaries with margin
        all_ys = [y for group in groups_in_pair for y in group]
        margin = int(0.25 * (max(all_ys) - min(all_ys)))  # Add 25% margin
        y_min = max(0, min(all_ys) - margin)
        y_max = min(full_img.height, max(all_ys) + margin)
        cropped_img = full_img.crop((0, y_min, full_img.width, y_max))

        # 5. Run YOLO inference on the cropped region
        results = model(cropped_img)
        result = results[0]
        boxes = result.boxes.data if result.boxes is not None else []

        # 6. Draw each detected bounding box with label
        for box in boxes:
            x1, y1, x2, y2 = [int(x) for x in box[:4]]
            cls_id = int(box[5].item()) if len(box) > 5 else -1
            label = result.names[cls_id] if cls_id in result.names else 'unknown'
            conf = float(box[4])
            label_text = f'{label} {conf:.2f}'

            # Adjust y-coordinates to match original full image
            # (x needs no shift because the crop spans the full width)
            y1 += y_min
            y2 += y_min

            # Draw bounding box and label text
            draw.rectangle([(x1, y1), (x2, y2)], outline='red', width=2)
            draw.text((x1, max(0, y1 - 15)), label_text, fill='blue', font=font)

    # 7. Save the fully annotated image
    save_path = os.path.join(output_dir, fname)
    annotated_img.save(save_path)
    print(f"[✓] Saved: {save_path}")


0: 224x992 1 clefG, 1 clefCTenor, 1 clefF, 9 noteheadBlackOnLines, 10 noteheadBlackInSpaces, 1 noteheadHalfInSpace, 1 noteheadWholeOnLine, 3 augmentationDots, 2 flag8thDowns, 1 accidentalSharp, 6 keyFlats, 5 restWholes, 3 slurs, 4 beams, 2 staffs, 82.2ms
Speed: 2.1ms preprocess, 82.2ms inference, 2.1ms postprocess per image at shape (1, 3, 224, 992)

0: 224x992 1 clefG, 1 clefCAlto, 8 noteheadBlackOnLines, 11 noteheadBlackInSpaces, 4 noteheadHalfInSpaces, 2 noteheadWholeOnLines, 1 noteheadWholeInSpace, 2 flag8thUps, 2 accidentalSharps, 6 keyFlats, 2 restQuarters, 4 rest8ths, 2 dynamicPs, 1 stringsDownBow, 2 stringsUpBows, 4 slurs, 2 beams, 1 dynamicCrescendoHairpin, 4 dynamicDiminuendoHairpins, 2 staffs, 12.3ms
Speed: 2.3ms preprocess, 12.3ms inference, 1.9ms postprocess per image at shape (1, 3, 224, 992)

0: 224x992 2 clefFs, 1 clef8, 14 noteheadBlackOnLines, 15 noteheadBlackInSpaces, 3 noteheadHalfOnLines, 1 noteheadHalfInSpace, 5 augmentationDots, 7 flag8thDowns, 6 keyFlats, 2 res

# 4. Compare the results of the two predictions

In [5]:
import os
from PIL import Image
import matplotlib.pyplot as plt

# Set paths (adjust to match your actual directories).
# NOTE: this rebinds `output_dir`, which the pairwise cells above used for
# './yolo_pairwise_result/'.
direct_dir = './yolo_result/'
pairwise_dir = './yolo_pairwise_result/'
output_dir = './comparison_visualization/'
os.makedirs(output_dir, exist_ok=True)

# Use matplotlib to show the two images side by side, each with a title
def save_side_by_side_with_titles(img1_path, img2_path, out_path,
                                  titles=('Direct YOLO', 'Pairwise YOLO'),
                                  figsize=(80, 40)):
    """Save a two-panel figure comparing two images.

    Args:
        img1_path: Path of the image shown in the left panel.
        img2_path: Path of the image shown in the right panel.
        out_path: Destination file for the rendered figure.
        titles: Panel titles as ``(left, right)``.
        figsize: Figure size in inches, passed to ``plt.subplots``.

    Raises:
        OSError: If either input image cannot be opened.
        Any error raised by matplotlib while rendering or saving; the figure
        is closed before the error propagates.
    """
    # convert() returns a new image, so the pixel data stays usable after the
    # source files are closed on leaving the with-block.
    with Image.open(img1_path) as left, Image.open(img2_path) as right:
        images = (left.convert('RGB'), right.convert('RGB'))

    fig, axes = plt.subplots(1, 2, figsize=figsize)
    try:
        for ax, img, title in zip(axes, images, titles):
            ax.imshow(img)
            ax.set_title(title, fontsize=30)
            ax.axis('off')

        fig.tight_layout()
        fig.savefig(out_path, bbox_inches='tight')
    finally:
        # Always release the figure: callers that catch the exception and keep
        # looping would otherwise accumulate very large open figures.
        plt.close(fig)

# Walk the direct-prediction outputs, pair each with its pairwise counterpart,
# and save the side-by-side comparison
for fname in os.listdir(direct_dir):
    if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
        img1_path = os.path.join(direct_dir, fname)
        img2_path = os.path.join(pairwise_dir, fname)
        save_path = os.path.join(output_dir, fname)

        if os.path.exists(img2_path):
            # Report a failure for this file and carry on with the rest
            try:
                save_side_by_side_with_titles(img1_path, img2_path, save_path)
                print(f"[✓] Saved: {save_path}")
            except Exception as e:
                print(f"⚠️ Error on {fname}: {e}")
        else:
            # No pairwise result with the same file name
            print(f"⚠️ Pairwise version missing: {fname}")

[✓] Saved: ./comparison_visualization/lg-5230237-aug-gutenberg1939--page-2.png
[✓] Saved: ./comparison_visualization/lg-16336832-aug-lilyjazz--page-10.png
