In [13]:
import os
import json
from ultralytics import YOLO
from PIL import Image

# 加载 YOLO 模型
model = YOLO("YOLOv8x_Symbols.pt")

# 设置图像路径
source_dir = "./samples/"
image_files = [os.path.join(source_dir, f) for f in os.listdir(source_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

# 定义用于 MIDI 的标签集合
note_classes = {
    'noteheadBlack', 'noteheadHalf', 'noteheadWhole',
    'restQuarter', 'restHalf', 'restWhole',
    'noteheadBlackOnLine', 'noteheadBlackInSpace',
    'noteheadHalfOnLine', 'noteheadWholeInSpace', 'rest8th'
}
# staff 也需要保留，用于 pitch 匹配
extra_classes = {'staff'}

# 推理图像
results = model(image_files)

# 结果保存路径
save_json = "./samples/midi_notes.json"
output = []

for result in results:
    img_path = result.path
    boxes = result.boxes.data
    confs = result.boxes.conf
    cls_ids = result.boxes.cls

    notes = []
    for box, conf, cls_id in zip(boxes, confs, cls_ids):
        label = result.names[int(cls_id)]
        if label not in note_classes and label not in extra_classes:
            continue  # 不是音符也不是五线谱，跳过

        x1, y1, x2, y2 = map(float, box[:4])
        note_data = {
            "label": label,
            "confidence": float(conf),
            "bbox": [x1, y1, x2, y2],
            "center": [(x1 + x2) / 2, (y1 + y2) / 2]
        }
        notes.append(note_data)

    output.append({
        "filename": os.path.basename(img_path),
        "notes": notes
    })

# 保存 JSON
with open(save_json, 'w') as f:
    json.dump(output, f, indent=2)

print(f"🎼 Done! MIDI-relevant predictions saved to {save_json}")


FileNotFoundError: [Errno 2] No such file or directory: 'YOLOv8x_Symbols.pt'

In [8]:
import os
import json
from PIL import Image, ImageDraw, ImageFont

def visualize_predictions(json_path, image_dir, save_dir):
    # 读取json
    with open(json_path, 'r') as f:
        data = json.load(f)

    os.makedirs(save_dir, exist_ok=True)

    for page in data:
        filename = page["filename"]
        image_path = os.path.join(image_dir, filename)

        if not os.path.exists(image_path):
            print(f"❗ Image {filename} not found!")
            continue

        img = Image.open(image_path).convert("RGB")
        draw = ImageDraw.Draw(img)

        try:
            font = ImageFont.truetype("arial.ttf", size=20)
        except:
            font = ImageFont.load_default()

        for note in page["notes"]:
            label = note["label"]
            conf = note["confidence"]
            bbox = note["bbox"]
            center = note["center"]

            x1, y1, x2, y2 = bbox
            cx, cy = center

            # 颜色区分
            if 'staff' in label:
                color = (0, 0, 255)  # 蓝色
            elif 'notehead' in label:
                color = (0, 255, 0)  # 绿色
            elif 'rest' in label:
                color = (255, 0, 0)  # 红色
            else:
                color = (255, 165, 0)  # 橙色其他

            # 画框
            draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=2)
            # 写label
            draw.text((x1, y1-20), f"{label} {conf:.2f}", fill=color, font=font)
            # 标记center
            r = 3
            draw.ellipse([(cx-r, cy-r), (cx+r, cy+r)], fill=color)

        save_path = os.path.join(save_dir, filename)
        img.save(save_path)
        print(f"✅ Saved visualization: {save_path}")

# 使用
visualize_predictions(
    json_path="./samples/midi_notes.json",
    image_dir="./samples/",
    save_dir="./samples/visualized/"
)


✅ Saved visualization: ./samples/visualized/lg-16969821-aug-beethoven--page-1.png


In [11]:
import json
import os
from PIL import Image, ImageDraw

def visualize_staffs(json_path, image_dir, save_dir="./samples/staff_visualization"):
    os.makedirs(save_dir, exist_ok=True)

    with open(json_path, 'r') as f:
        data = json.load(f)

    for page in data:
        filename = page["filename"]
        notes = page["notes"]
        staff_notes = [n for n in notes if n["label"] == "staff"]

        # 打开图像
        image_path = os.path.join(image_dir, filename)
        image = Image.open(image_path).convert("RGB")
        draw = ImageDraw.Draw(image)

        print(f"\n🖼 Image: {filename}")
        for i, s in enumerate(sorted(staff_notes, key=lambda n: n["center"][1])):
            x1, y1, x2, y2 = s["bbox"]
            cx, cy = s["center"]
            draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
            draw.line([(x1, cy), (x2, cy)], fill="blue", width=1)
            draw.text((x1, y1 - 10), f"staff {i+1}", fill="green")
            print(f"staff {i+1}: top={y1:.1f}, bottom={y2:.1f}, center={cy:.1f}")

        out_path = os.path.join(save_dir, f"staff_{filename}")
        image.save(out_path)
        print(f"✅ Saved visual to {out_path}")

# 示例使用
visualize_staffs(
    json_path="./samples/midi_notes.json",
    image_dir="./samples"
)


🖼 Image: lg-16969821-aug-beethoven--page-1.png
staff 1: top=784.2, bottom=851.2, center=817.7
staff 2: top=1327.3, bottom=1394.5, center=1360.9
staff 3: top=1870.0, bottom=1936.6, center=1903.3
staff 4: top=2411.9, bottom=2479.2, center=2445.5
staff 5: top=2575.8, bottom=2642.9, center=2609.3
✅ Saved visual to ./samples/staff_visualization\staff_lg-16969821-aug-beethoven--page-1.png


In [3]:
import json
from mido import Message, MidiFile, MidiTrack, MetaMessage

# 音符类型到持续时间（ticks）映射
def label_to_duration(label):
    duration_map = {
        "noteheadWhole": 1920,
        "noteheadWholeInSpace": 1920,
        "noteheadHalf": 960,
        "noteheadHalfOnLine": 960,
        "noteheadBlack": 480,
        "noteheadBlackInSpace": 480,
        "noteheadBlackOnLine": 480,
        "restWhole": 1920,
        "restHalf": 960,
        "restQuarter": 480,
        "rest8th": 240,
    }
    return duration_map.get(label, 480)

# 根据五线谱上边与下边计算 pitch，G4 为中心线，MIDI 编号 67
def y_to_midi_pitch(y_center, staff_top, staff_bottom):
    spacing = (staff_bottom - staff_top) / 4  # 五线谱有 4 个间隔
    center_line = staff_top + 2 * spacing     # 第三条线（中间线）代表 G4
    offset = round((center_line - y_center) / (spacing / 2))
    return 67 + offset

# 从 JSON 转 MIDI
def json_to_midi(json_path, midi_path):
    with open(json_path, 'r') as f:
        data = json.load(f)

    mid = MidiFile()
    track = MidiTrack()
    mid.tracks.append(track)

    track.append(MetaMessage('track_name', name="Generated by YOLO-OMR", time=0))
    track.append(MetaMessage('set_tempo', tempo=500000, time=0))  # 120 BPM

    for page in data:
        notes = page["notes"]
        staffs = [n for n in notes if n["label"] == "staff"]
        events = [n for n in notes if n["label"] != "staff"]

        # 提取所有 staff 的上下边界，排序便于匹配
        staff_ranges = sorted([
            (n["bbox"][1], n["bbox"][3]) for n in staffs
        ], key=lambda r: (r[0] + r[1]) / 2)

        time = 0
        for note in sorted(events, key=lambda n: n["center"][0]):
            y_center = note["center"][1]
            label = note["label"]
            duration = label_to_duration(label)

            # 找到最近的 staff 范围
            staff_top, staff_bottom = min(
                staff_ranges, key=lambda r: abs((r[0] + r[1]) / 2 - y_center)
            )

            if "notehead" in label:
                pitch = y_to_midi_pitch(y_center, staff_top, staff_bottom)
                track.append(Message('note_on', note=pitch, velocity=64, time=time))
                track.append(Message('note_off', note=pitch, velocity=64, time=duration))
                time = 0  # reset after note
            elif "rest" in label:
                time += duration  # skip ahead

    mid.save(midi_path)
    print(f"🎼 MIDI saved to {midi_path}")

In [None]:
json_to_midi(
    json_path="./samples/midi_notes.json",
    midi_path="./samples/output_pitch.mid"
)

ValueError: data byte must be in range 0..127

In [None]:
import pygame
import time

midi_file = "./samples/output_pitch.mid"
pygame.init()
pygame.mixer.init()
pygame.mixer.music.load(midi_file)
pygame.mixer.music.play()

while pygame.mixer.music.get_busy():
    time.sleep(0.5)
