In [1]:
# ! pip uninstall -y torch torchvision torchaudio
# # 裝含 CUDA 12.4 的官方輪子
# ! pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

In [None]:
# ! pip install -U pip
# ! pip uninstall -y ultralytics  # 若已裝舊版，建議先移除
# ! pip install git+https://github.com/sunsmarterjie/yolov12.git
# ! pip install -U pip setuptools wheel
# ! pip install huggingface_hub transformers safetensors accelerate
# ! pip install scikit-learn
# ! pip install pandas

##  前處理

In [6]:
import os, glob, subprocess, sys
from pathlib import Path
from ultralytics import YOLO


# ======= 修改成你的實際路徑 =======
TRAINING_IMAGE = r"C:\Users\307\Desktop\aicup\training_image"
TRAINING_LABEL = r"C:\Users\307\Desktop\aicup\training_label"
ALIAS_ROOT     = r"C:\Users\307\Desktop\aicup\dataset"   # 會建立 alias\images 與 alias\labels
# =================================

ALIAS_IMG = Path(ALIAS_ROOT) / "images"
ALIAS_LBL = Path(ALIAS_ROOT) / "labels"
Path(ALIAS_ROOT).mkdir(parents=True, exist_ok=True)

def safe_remove(p: Path):
    """Remove a stale directory link at ``p`` so a fresh one can be created.

    Best-effort cleanup: nothing is raised when the path is missing or when
    the directory cannot be removed; a warning is printed in the latter case.

    Args:
        p: Location of the old symlink / junction (or empty directory).
    """
    # lexists() inspects the link itself instead of its target, so a dangling
    # link (target moved or deleted) is still detected. Path.exists() follows
    # the link and reports False for those, which left the stale entry behind.
    if not os.path.lexists(p):
        return

    # Path.is_junction() only exists on Python 3.12+; older versions fall back
    # to the plain directory check below.
    is_junction = getattr(p, "is_junction", None)
    dangling_junction = bool(is_junction and is_junction())

    if p.is_symlink():
        p.unlink()
    elif p.is_dir() or dangling_junction:
        # `rmdir` without /S removes a junction (or an empty directory) and
        # never deletes the contents of the link target.
        ret = subprocess.run(
            ["cmd", "/c", "rmdir", str(p)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        if ret.returncode != 0:
            # Surface the failure instead of hiding it: the following mklink
            # would otherwise fail with a less helpful message.
            print("[警告] 無法移除舊的目錄連結：", p, ret.stderr.strip())

def make_junction(link: Path, target: Path):
    """Create a Windows directory junction ``link`` that points at ``target``.

    Runs ``mklink /J`` through ``cmd``. When the command exits with a non-zero
    status, the captured stderr and a hint are printed and the process is
    terminated with ``sys.exit(1)``.

    Args:
        link: Path of the junction to create.
        target: Existing directory the junction should resolve to.
    """
    # A junction is used instead of a symlink; per the original note it needs
    # fewer privileges.
    proc = subprocess.run(
        ["cmd", "/c", "mklink", "/J", str(link), str(target)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if proc.returncode == 0:
        return
    print("[錯誤] 建立目錄連結失敗：", proc.stderr.strip())
    print("請確認目標存在，或改用系統管理員執行。")
    sys.exit(1)

def find_patient_root(root: Path) -> Path:
    """Locate the directory that directly contains the ``patient*`` folders.

    Walks ``root`` top-down and returns the first directory that has at least
    one immediate sub-directory whose name starts with ``"patient"``. When no
    such directory is found, ``root`` itself is returned.
    """
    matches = (
        Path(current)
        for current, subdirs, _files in os.walk(root)
        if any(name.startswith("patient") for name in subdirs)
    )
    return next(matches, root)

# Validate that both source folders exist before touching any link.
for p in [TRAINING_IMAGE, TRAINING_LABEL]:
    if not Path(p).is_dir():
        raise FileNotFoundError(f"找不到資料夾：{p}")

# Resolve the directory level that actually holds the patient* folders
# (the archives may nest them one or more levels deep).
img_root = find_patient_root(Path(TRAINING_IMAGE))
lbl_root = find_patient_root(Path(TRAINING_LABEL))
print("IMG_ROOT =", img_root)
print("LBL_ROOT =", lbl_root)

# Drop any previous links, then create fresh junctions dataset\images and
# dataset\labels pointing at the resolved roots.
safe_remove(ALIAS_IMG); safe_remove(ALIAS_LBL)
make_junction(ALIAS_IMG, img_root)
make_junction(ALIAS_LBL, lbl_root)
print(f"[OK] 建立連結：{ALIAS_IMG} → {img_root}")
print(f"[OK] 建立連結：{ALIAS_LBL} → {lbl_root}")

def write_list(p_start, p_end, out_txt):
    """Write a sorted list of labelled image paths for a range of patients.

    For every patient number in ``p_start..p_end`` (inclusive) the folder
    ``patientNNNN`` is looked up under both ``ALIAS_IMG`` and ``ALIAS_LBL``;
    patients missing either folder are skipped. An image is kept only when a
    same-named ``.txt`` label exists and is non-empty (size > 0 bytes).

    Args:
        p_start: First patient number.
        p_end: Last patient number (inclusive).
        out_txt: Output text file; one image path per line, UTF-8.
    """
    seen = 0
    paired = []
    for number in range(p_start, p_end + 1):
        folder = f"patient{number:04d}"
        images_here = ALIAS_IMG / folder
        labels_here = ALIAS_LBL / folder
        if not (images_here.is_dir() and labels_here.is_dir()):
            continue
        for png in glob.glob(str(images_here / "*.png")):
            seen += 1
            label_file = labels_here / f"{Path(png).stem}.txt"
            if not label_file.exists() or label_file.stat().st_size <= 0:
                continue
            # The list stores the path on the images side, not the label path.
            paired.append(png)
    paired.sort()
    Path(out_txt).write_text("\n".join(paired), encoding="utf-8")
    print(f"[OK] {out_txt}：影像{seen}，有效配對{len(paired)}")



IMG_ROOT = C:\Users\307\Desktop\aicup\training_image_roi
LBL_ROOT = C:\Users\307\Desktop\aicup\training_label_roi
[OK] 建立連結：C:\Users\307\Desktop\aicup\dataset\images → C:\Users\307\Desktop\aicup\training_image_roi
[OK] 建立連結：C:\Users\307\Desktop\aicup\dataset\labels → C:\Users\307\Desktop\aicup\training_label_roi


In [7]:
# 1) 設定專案、資料、輸出目錄
# 2) 鎖定隨機性（可重現）
# 3) 檢查必要套件（ultralytics）是否可用
import os
import random
import json
import math
from pathlib import Path
from collections import defaultdict
import numpy as np
import pandas as pd

# --- User-tunable settings (adjust to your environment) ---
PROJECT_ROOT = Path.cwd()                 # defaults to the directory the notebook runs in
IMAGES_DIR   = r"C:\Users\307\Desktop\aicup\dataset\images"       # images: *.png (512x512 according to the original note)
LABELS_DIR   = r"C:\Users\307\Desktop\aicup\dataset\labels"       # labels: YOLO .txt files with the same stem
FOLDS_DIR    = PROJECT_ROOT / "folds"     # output directory for the 5-fold image lists
RUNS_DIR     = PROJECT_ROOT / "runs"      # training outputs (weights, logs)
SUBMIT_DIR   = PROJECT_ROOT / "submissions"
YAML_DIR   = PROJECT_ROOT / "fold_yaml"  # per-fold data.yaml files
# NOTE: IMAGES_DIR / LABELS_DIR are plain strings here; a later cell rebinds
# them to Path objects.


# ultralytics (YOLO) is optional at this point; install it before training
# (pip install ultralytics). _yolo_ok records whether the import succeeded.
try:
    from ultralytics import YOLO
    _yolo_ok = True
except Exception as e:
    print("⚠️ 未偵測到 ultralytics，訓練/推論區塊會需要它。", e)
    _yolo_ok = False

# --- Random seeds: fix every major source of randomness ---
SEED = 42
random.seed(SEED); np.random.seed(SEED)

import torch
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True  # more reproducible, possibly a bit slower
torch.backends.cudnn.benchmark = False

# --- Create the output directories ---
for d in [FOLDS_DIR, RUNS_DIR, SUBMIT_DIR, YAML_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("專案根目錄：", PROJECT_ROOT)
print("影像：", IMAGES_DIR)
print("標註：", LABELS_DIR)


專案根目錄： c:\Users\307\Desktop\aicup
影像： C:\Users\307\Desktop\aicup\dataset\images
標註： C:\Users\307\Desktop\aicup\dataset\labels


In [8]:
# === 建立影像中繼資訊表 ===
# 假設影像命名規則可解析出 patient_id 與 slice_idx
# 常見命名範例：
#   images/patient0001/000123.png  -> patient_id = patient0001, slice_idx = 123
# 若你的命名不同，改寫 parse 函式即可。

from typing import Tuple

def parse_patient_and_slice(p: Path) -> Tuple[str, int]:
    """Derive ``(patient_id, slice_idx)`` from an image path.

    Args:
        p: Image path such as ``images/patient0001/000123.png``.

    Returns:
        A tuple ``(patient_id, slice_idx)`` where ``patient_id`` is the name
        of the parent folder and ``slice_idx`` is the integer value of the
        file stem. When the stem is not a plain integer, all decimal digits in
        it are concatenated (``"CT_00123"`` -> ``123``); when it contains no
        digits at all, ``slice_idx`` is ``-1``.
    """
    patient_id = p.parent.name
    try:
        slice_idx = int(p.stem)
    except ValueError:
        # Only ValueError means "not a plain integer"; anything else should
        # surface. isdecimal() (unlike isdigit()) accepts only characters that
        # int() can actually parse, so stems containing e.g. superscript
        # digits no longer raise from inside this fallback.
        digits = "".join(ch for ch in p.stem if ch.isdecimal())
        slice_idx = int(digits) if digits else -1
    return patient_id, slice_idx

def has_label_for(image_path: Path) -> bool:
    """Tell whether an image has a non-empty YOLO label file.

    The label is expected at the same relative location under ``LABELS_DIR``
    as the image has under ``IMAGES_DIR``, with a ``.txt`` suffix, e.g.
    ``images/patient0001/000123.png`` -> ``labels/patient0001/000123.txt``.

    Args:
        image_path: Image path located under ``IMAGES_DIR``.

    Returns:
        True when the label file exists and contains something other than
        whitespace; False when it is missing, empty, or unreadable.

    Raises:
        ValueError: If ``image_path`` is not located under ``IMAGES_DIR``.
    """
    rel = Path(image_path).relative_to(IMAGES_DIR).with_suffix(".txt")
    # Path(...) keeps this working whether LABELS_DIR is still a str or has
    # already been converted to a Path.
    label_path = Path(LABELS_DIR) / rel
    try:
        # At least one non-blank character is required to count as positive.
        return label_path.read_text(encoding="utf-8").strip() != ""
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable label -> negative sample. Only I/O and
        # decoding failures are tolerated; other errors (and
        # KeyboardInterrupt) are no longer swallowed.
        return False

# Make sure IMAGES_DIR and LABELS_DIR are Path objects (they are defined as
# plain strings in the configuration cell).
IMAGES_DIR = Path(IMAGES_DIR)
LABELS_DIR = Path(LABELS_DIR)

# Scan every PNG below IMAGES_DIR (recursively) and build one record per image.
all_images = sorted(IMAGES_DIR.rglob("*.png"))
rows = []
for img in all_images:
    pid, sidx = parse_patient_and_slice(img)
    rows.append({
        "img_path": str(img),
        "patient_id": pid,
        "slice_idx": sidx,
        "has_label": has_label_for(img)
    })
# df: one row per image, ordered by patient and slice index.
df = pd.DataFrame(rows).sort_values(["patient_id", "slice_idx"]).reset_index(drop=True)

# Basic statistics: image count, positive count and positive ratio per patient.
per_patient = df.groupby("patient_id").agg(
    n_images=("img_path", "count"),
    n_pos=("has_label", "sum")
).reset_index()
per_patient["pos_ratio"] = per_patient["n_pos"] / per_patient["n_images"]

print("總影像數：", len(df))
print("總正樣本數：", df["has_label"].sum())
print("病人數：", df["patient_id"].nunique())
display(per_patient.head(10))


總影像數： 16863
總正樣本數： 2787
病人數： 50


Unnamed: 0,patient_id,n_images,n_pos,pos_ratio
0,patient0001,341,50,0.146628
1,patient0002,391,65,0.16624
2,patient0003,324,70,0.216049
3,patient0004,365,71,0.194521
4,patient0005,285,54,0.189474
5,patient0006,277,47,0.169675
6,patient0007,389,65,0.167095
7,patient0008,287,38,0.132404
8,patient0009,298,42,0.14094
9,patient0010,280,62,0.221429


In [9]:
# ===　建立病人級 5-fold ===
# 用 KFold 對「病人ID」做分割，每個 fold 互斥的病人集合。

from sklearn.model_selection import KFold

# Sorted list of unique patient ids; sorting keeps the split independent of
# row order in df.
patient_ids = sorted(df["patient_id"].unique().tolist())
# Sanity check: this dataset is expected to contain exactly 50 patients.
assert len(patient_ids) == 50, "⚠️ 病人數應為 50，請檢查資料或命名解析。"

# Split at patient level so that no patient appears in both train and val.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
fold_assign = {}  # fold_id -> {"train": [...], "val": [...]}

# Fold ids are 1-based integers (1..5).
for fold_id, (tr_idx, va_idx) in enumerate(kf.split(patient_ids), start=1):
    tr_patients = [patient_ids[i] for i in tr_idx]
    va_patients = [patient_ids[i] for i in va_idx]
    fold_assign[fold_id] = {"train": tr_patients, "val": va_patients}

# Persist the assignment as JSON for reproducibility.
# NOTE: JSON object keys are always strings, so the integer fold ids are
# written as "1".."5"; code that reloads this file must convert them back.
with open(FOLDS_DIR / "patient_folds.json", "w", encoding="utf-8") as f:
    json.dump(fold_assign, f, ensure_ascii=False, indent=2)

# Write the plain (non-augmented) train/val image lists for every fold.
for fold_id, dct in fold_assign.items():
    tr = df[df["patient_id"].isin(dct["train"])]
    va = df[df["patient_id"].isin(dct["val"])]
    (FOLDS_DIR / f"fold{fold_id}_train.txt").write_text("\n".join(tr["img_path"]), encoding="utf-8")
    (FOLDS_DIR / f"fold{fold_id}_val.txt").write_text("\n".join(va["img_path"]), encoding="utf-8")

print("✅ 已建立 5-fold 病人級分配與基本清單")

✅ 已建立 5-fold 病人級分配與基本清單


In [11]:
# === 區塊 3（取代版）：為訓練集建立「三圈帶」難負樣本（Near / Mid / Far） ===
# 目的：
# 1) 利用你的觀察：「正樣本多成段（~50 張），常落在 slice 100–350」這個先驗，
#    將「難負樣本」重點放在「段落邊界附近」，提升模型對細微差異的鑑別力。
# 2) 將負樣本依三圈帶分級抽樣（Near > Mid > Far），並維持整體「負:正 ≈ TARGET_NEG_POS_RATIO : 1」。
# 3) 只對「訓練集」做擴充；驗證/測試不動，避免高估泛化。
#
# 重要輸出（與原流程一致）：
# - folds/fold{K}_train_neighbors.txt
#
# 名詞：
# - 正段落（run）：同病人的 has_label=True 的 slice_idx 連續區間（例：121..172）。
# - 三圈帶（對每個 run）：
#   Near：緊貼邊界 ±1..4（NEAR_OFFSETS）           → 最難負樣本（最像但無標註）
#   Mid： 邊界外側 ±5..12（MID_OFFSETS）           → 中難負樣本
#   Far： 距離任一 run 邊界 > FAR_GAP（目前設定 20） → 容易負樣本（補數用）
#
# 補足策略（你剛要求的）：
#   Near/Mid/Far 都抽完後，若仍不足 → 先用 easy_all_neg 補，再用「整個訓練集中所有無標註切片」隨機補到目標數。
#
# 注意：
# - 這個區塊假設前面區塊已建立 df（含 img_path, patient_id, slice_idx, has_label）以及 fold_assign。
# - 輸出檔名與介面不變，後續區塊 4/4b/5/6/7/8 無需修改。

from collections import defaultdict
import random

# --- Tunable parameters (suggested starting values) ---
TARGET_NEG_POS_RATIO = 3         # overall negative:positive target ratio (original note suggests 2.0 ~ 3.0)
NEAR_OFFSETS = (1, 2, 3, 4)           # Near band: slices at distance 1..4 outside a run boundary
# Mid band covers distances 5..12 (range(5, 13)); widen or narrow it depending
# on how many candidates the data provides.
MID_OFFSETS  = tuple(range(5, 13)) # Mid band: distance 5..12 outside a run boundary
FAR_GAP      = 20                # Far band: more than 20 slices away from every run boundary

# Target share of each band among the negatives (shares sum to 1.0).
BAND_RATIOS = {
    "near": 0.2,
    "mid":  0.4,
    "far":  0.4,
}

def _find_positive_runs(pdf):
    """
    將單一病人的正樣本（has_label=True）的 slice_idx 依連續性切成多個 run。
    回傳：runs = [(start_idx, end_idx), ...]（皆為包含端點的整數）
    """
    pos_idx = sorted(pdf.loc[pdf["has_label"], "slice_idx"].tolist())
    runs = []
    if not pos_idx:
        return runs
    start = prev = pos_idx[0]
    for s in pos_idx[1:]:
        if s == prev + 1:
            prev = s
        else:
            runs.append((start, prev))
            start = prev = s
    runs.append((start, prev))
    return runs

def _band_candidates_for_patient(pdf):
    """Collect Near / Mid / Far negative candidates for one patient.

    Args:
        pdf: The patient's rows from the training split; needs the columns
            ``has_label``, ``slice_idx`` and ``img_path``.

    Returns:
        A dict with four sorted lists of image paths:
        ``"near"`` and ``"mid"`` hold unlabelled slices whose index lies
        ``NEAR_OFFSETS`` / ``MID_OFFSETS`` steps outside either end of a
        positive run; ``"far"`` holds unlabelled slices more than ``FAR_GAP``
        away from every run boundary; ``"easy_all_neg"`` holds every
        unlabelled slice of the patient. The bands may overlap here; the
        caller removes the overlaps. A patient without any positive slice
        yields empty bands and only ``"easy_all_neg"``.
    """
    runs = _find_positive_runs(pdf)
    unlabeled_paths = sorted(set(pdf.loc[~pdf["has_label"], "img_path"].tolist()))

    # No positive run -> no boundaries, so only the plain negative pool exists.
    if not runs:
        return {"near": [], "mid": [], "far": [], "easy_all_neg": unlabeled_paths}

    records = list(
        pdf[["slice_idx", "img_path", "has_label"]].itertuples(index=False, name=None)
    )
    # slice index -> (image path, labelled?) for constant-time neighbour lookup
    by_index = {int(idx): (path, labeled) for idx, path, labeled in records}

    def _around_runs(offsets):
        """Unlabelled slices found ``offsets`` steps outside each run end."""
        found = set()
        for first, last in runs:
            for off in offsets:
                for idx in (first - off, last + off):
                    hit = by_index.get(idx)
                    if hit is not None and not hit[1]:
                        found.add(hit[0])
        return found

    near_paths = _around_runs(NEAR_OFFSETS)
    mid_paths = _around_runs(MID_OFFSETS)

    # Boundaries are the start and end index of every run.
    edges = sorted(edge for run in runs for edge in run)
    far_paths = {
        path
        for idx, path, labeled in records
        if (not labeled) and min(abs(int(idx) - edge) for edge in edges) > FAR_GAP
    }

    return {
        "near": sorted(near_paths),
        "mid":  sorted(mid_paths),
        "far":  sorted(far_paths),
        "easy_all_neg": unlabeled_paths,
    }

def build_neighbors_for_fold(fold_id: int):
    """
    Build the band-sampled training list for one fold and write it to disk.

    - Positives: every training slice with has_label=True is included.
    - Negatives: the target count is int(TARGET_NEG_POS_RATIO * n_pos). It is
      split across the Near / Mid / Far bands according to BAND_RATIOS and
      each band is sampled up to its own target. A band that runs short is
      NOT topped up from the next band; the shortfall is filled first from the
      "easy" pool (unlabelled slices in none of the three bands) and then from
      every remaining unlabelled training slice.
    - Output: folds/fold{fold_id}_train_neighbors.txt, one image path per
      line, shuffled.

    Sampling uses the module-level ``random`` generator, so the result depends
    on its state at call time (seed and number of earlier draws).

    Args:
        fold_id: Key into ``fold_assign`` (integer 1..5 as built in this
            notebook).

    Returns:
        dict of counts: candidates and picks per band, negative target and
        number used, final list size, final neg:pos ratio and any shortfall.
    """
    # Patients of this fold's train / val split.
    tr_p = set(fold_assign[fold_id]["train"])
    va_p = set(fold_assign[fold_id]["val"])

    # Split df into train / val frames; only the train part is sampled.
    # NOTE(review): dva is not used anywhere else in this function.
    dtr = df[df["patient_id"].isin(tr_p)].copy()
    dva = df[df["patient_id"].isin(va_p)].copy()

    # Collect band candidates patient by patient.
    near_cands, mid_cands, far_cands = [], [], []
    easy_all_neg_union = set()
    for pid, pdf in dtr.groupby("patient_id"):
        pdf = pdf.sort_values("slice_idx").reset_index(drop=True)
        bands = _band_candidates_for_patient(pdf)
        near_cands.extend(bands["near"])
        mid_cands.extend(bands["mid"])
        far_cands.extend(bands["far"])
        easy_all_neg_union.update(bands["easy_all_neg"])

    # Deduplicate and make the bands mutually exclusive (near wins over mid,
    # mid wins over far). Sorting gives the shuffles a deterministic input.
    near_cands = sorted(set(near_cands))
    mid_cands  = sorted(set(mid_cands) - set(near_cands))
    far_cands  = sorted(set(far_cands) - set(near_cands) - set(mid_cands))
    # Easy pool = all unlabelled slices that are in none of the three bands.
    easy_all_neg = sorted(easy_all_neg_union - set(near_cands) - set(mid_cands) - set(far_cands))

    # Positives: take all of them.
    pos_rows = dtr[dtr["has_label"]].copy()
    selected_pos = pos_rows["img_path"].tolist()
    n_pos = len(selected_pos)

    # Total number of negatives to aim for.
    n_neg_target = int(TARGET_NEG_POS_RATIO * n_pos)

    # Per-band targets from the configured ratios.
    target_near = int(round(BAND_RATIOS["near"] * n_neg_target))
    target_mid  = int(round(BAND_RATIOS["mid"]  * n_neg_target))
    target_far  = n_neg_target - target_near - target_mid  # remainder absorbs rounding error

    # Helper: random sample of at most k items without modifying the input.
    def _sample(lst, k):
        if k <= 0 or not lst:
            return []
        lst = lst[:]  # copy
        random.shuffle(lst)
        return lst[:min(k, len(lst))]

    # Sample in the order Near -> Mid -> Far; every band is capped both by its
    # own target and by the number of negatives still missing.
    sel_near = _sample(near_cands, target_near)
    remaining = n_neg_target - len(sel_near)

    sel_mid  = _sample(mid_cands, min(target_mid, remaining))
    remaining = n_neg_target - len(sel_near) - len(sel_mid)

    sel_far  = _sample(far_cands, min(target_far, remaining))
    remaining = n_neg_target - len(sel_near) - len(sel_mid) - len(sel_far)

    # First top-up: fill whatever is still missing from the easy pool.
    sel_easy = _sample(easy_all_neg, remaining)
    selected_negs = sel_near + sel_mid + sel_far + sel_easy
    remaining = n_neg_target - len(selected_negs)

    # Final top-up: draw from every unlabelled training slice not selected yet
    # (this includes band candidates left over above their band target).
    if remaining > 0:
        all_neg_pool = sorted(set(dtr.loc[~dtr["has_label"], "img_path"].tolist()))
        all_neg_left = sorted(set(all_neg_pool) - set(selected_negs))
        sel_rand_fallback = _sample(all_neg_left, remaining)
    else:
        sel_rand_fallback = []

    selected_negs += sel_rand_fallback
    remaining = n_neg_target - len(selected_negs)  # still > 0 means the data has too few negatives

    # Merge positives and negatives and shuffle the final training list.
    final_train = selected_pos + selected_negs
    random.shuffle(final_train)

    # Write the list (the file name is what the training cell expects).
    out_path = FOLDS_DIR / f"fold{fold_id}_train_neighbors.txt"
    out_path.write_text("\n".join(final_train), encoding="utf-8")

    # Summary: usage per band and whether the target was reached.
    stats = {
        "fold": fold_id,
        "n_train_total": len(dtr),
        "n_pos_all": int(dtr["has_label"].sum()),
        "n_pos_used": len(selected_pos),

        "n_near_cands": len(near_cands),
        "n_mid_cands": len(mid_cands),
        "n_far_cands": len(far_cands),
        "n_easy_pool": len(easy_all_neg),

        "n_near_used": len(sel_near),
        "n_mid_used": len(sel_mid),
        "n_far_used": len(sel_far),
        "n_easy_used": len(sel_easy),
        "n_rand_fallback_used": len(sel_rand_fallback),

        "n_neg_target": n_neg_target,
        "n_neg_used": len(selected_negs),
        "n_final_train": len(final_train),
        "neg_pos_ratio_final": round(len(selected_negs) / max(1, len(selected_pos)), 3),
        "n_shortfall": max(0, remaining)  # > 0 reflects a data limit, not an error
    }
    return stats

# Build the band-sampled training list for each of the 5 folds and collect
# the per-fold statistics. The folds share the global `random` state, so the
# lists depend on the order in which the folds are processed.
all_stats = []
for k in range(1, 6):
    stats_k = build_neighbors_for_fold(k)
    all_stats.append(stats_k)

# Display the statistics as a table (one row per fold).
pd.DataFrame(all_stats)


Unnamed: 0,fold,n_train_total,n_pos_all,n_pos_used,n_near_cands,n_mid_cands,n_far_cands,n_easy_pool,n_near_used,n_mid_used,n_far_used,n_easy_used,n_rand_fallback_used,n_neg_target,n_neg_used,n_final_train,neg_pos_ratio_final,n_shortfall
0,1,13820,2251,2251,320,640,9969,640,320,640,2701,640,2452,6753,6753,9004,3.0,0
1,2,13266,2152,2152,320,640,9514,640,320,640,2583,640,2273,6456,6456,8608,3.0,0
2,3,13551,2250,2250,320,640,9701,640,320,640,2700,640,2450,6750,6750,9000,3.0,0
3,4,13344,2232,2232,320,640,9512,640,320,640,2679,640,2417,6696,6696,8928,3.0,0
4,5,13471,2263,2263,320,640,9608,640,320,640,2715,640,2474,6789,6789,9052,3.0,0


##  訓練模型

In [12]:
# === 資料 YAML 覆寫策略 ===
# 你的 aortic_val.yaml 依然存在，但 train/val 清單由 Notebook 動態替換。
# 這裡組裝一個臨時 YAML 字串，供 YOLO 調用。

import yaml

def make_data_yaml(train_list_path: Path, val_list_path: Path,
                   dataset_root=r"C:\Users\307\Desktop\aicup\dataset") -> Path:
    """Write a per-fold dataset YAML for YOLO and return its path.

    Args:
        train_list_path: Text file listing the training images.
        val_list_path: Text file listing the validation images.
        dataset_root: Value stored under the ``path`` key (dataset root
            directory). Defaults to the location this notebook was written
            for; pass another directory to reuse the helper elsewhere.

    Returns:
        Path of the YAML file, ``YAML_DIR/<train list stem>.yaml``. The file
        declares one class, ``aortic_valve``.
    """
    data_dict = {
        "path": str(dataset_root),          # dataset root directory
        "train": str(train_list_path),
        "val": str(val_list_path),
        "nc": 1,
        "names": {0: "aortic_valve"},
    }
    # Ensure the output directory exists even when the configuration cell's
    # mkdir loop was skipped or the folder was deleted afterwards.
    YAML_DIR.mkdir(parents=True, exist_ok=True)
    tmp_yaml = YAML_DIR / f"{Path(train_list_path).stem}.yaml"
    with open(tmp_yaml, "w", encoding="utf-8") as f:
        yaml.safe_dump(data_dict, f, allow_unicode=True, sort_keys=False)
    return tmp_yaml

print("✅ 資料 YAML 會在每個 fold 動態產生")


✅ 資料 YAML 會在每個 fold 動態產生


In [13]:
# === Global training parameters ===
# Augmentation overrides passed to model.train(); the commented-out entries
# are disabled experiments kept for reference.
AUG_ARGS = dict(
    # degrees=5,            # rotation of +/-5 degrees
    # translate=0.05,       # 5% translation
    scale=0.2,            # scale gain 0.2; presumably a random 0.8~1.2x resize (the earlier note said 0.9~1.1, i.e. scale=0.1) - confirm against the Ultralytics docs
    # shear=0.03,           # slight shear
    fliplr=0.0,           # horizontal flip off (A/B test with <= 0.1 only if validation shows no harm)
    flipud=0.0,           # vertical flip off
    mosaic=0.0,           # mosaic off
    mixup=0.0,            # mixup off
    hsv_h=0.0,            # HSV jitter off (CT images are grayscale)
    hsv_s=0.0,
    hsv_v=0.0
)

# Single argument bundle for train(), including the augmentation settings.
TRAIN_ARGS = dict(
    imgsz=640,
    epochs=40,
    warmup_epochs=3.0,    # moderate warm-up
    batch=8,
    lr0=5e-4,
    weight_decay=5e-4,
    cos_lr=True,          # cosine learning-rate schedule
    amp=True,
    patience=15,          # early-stopping patience (epochs)
    device=0 ,
    optimizer='AdamW',
    cache="disk",    # caches images on disk: faster epochs, more disk space
    **AUG_ARGS
)


In [14]:
# === 5-fold 訓練（帶入增強參數） ===
# 說明：
# - 僅需確認 model.train(...) 時有帶入 **TRAIN_ARGS**（其中包含我們的增強鍵）
# - 其他流程（fold 清單、鄰近片難負樣本、輸出路徑）維持不變

if not _yolo_ok:
    raise SystemExit("請先安裝 ultralytics 才能執行訓練。")

# Base checkpoint for every fold; switch the size suffix (n/s/m/l/x) to fit
# the available resources.
MODEL_PATH = "yolov12m.pt"

fold_results = []
for fold_id in range(1, 6):
    # Train on the band-sampled list, validate on the untouched fold list.
    train_list = FOLDS_DIR / f"fold{fold_id}_train_neighbors.txt"
    val_list   = FOLDS_DIR / f"fold{fold_id}_val.txt"
    data_yaml  = make_data_yaml(train_list, val_list)

    save_dir = RUNS_DIR / f"fold{fold_id}"
    save_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n=== 開始訓練：Fold {fold_id} ===")
    print("Train 清單：", train_list)
    print("Val 清單：", val_list)

    # A fresh model per fold so no weights leak between folds; TRAIN_ARGS
    # carries the augmentation settings as well.
    model = YOLO(MODEL_PATH)
    res = model.train(
        data=str(data_yaml),
        project=str(save_dir),
        name="exp",
        **TRAIN_ARGS
    )

    # Resolve the directory this run actually wrote to. When "exp" already
    # exists from an earlier run, Ultralytics increments the name (exp2, ...),
    # so a fixed save_dir/exp path would return the stale weights of the old
    # run.
    trainer = getattr(model, "trainer", None)
    run_dir = getattr(trainer, "save_dir", None)
    if run_dir:
        best_weight = Path(run_dir) / "weights" / "best.pt"
    else:
        best_weight = save_dir / "exp" / "weights" / "best.pt"
    if not best_weight.exists():
        # Fallback: newest best.pt below this fold's directory.
        best_candidates = sorted(save_dir.rglob("best.pt"), key=lambda w: w.stat().st_mtime)
        best_weight = best_candidates[-1] if best_candidates else None

    fold_results.append({
        "fold": fold_id,
        "best_weight": str(best_weight) if best_weight else ""
    })

pd.DataFrame(fold_results)

      23/40      6.56G     0.7513     0.4157      0.875          2        640: 100%|██████████| 1132/1132 [05:35<00:00,  3.37it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:30<00:00,  6.92it/s]

                   all       3392        524      0.926      0.857      0.931      0.693






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      24/40      6.56G     0.7395     0.3918      0.878          1        640: 100%|██████████| 1132/1132 [05:30<00:00,  3.43it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.08it/s]

                   all       3392        524        0.9      0.863      0.921      0.669






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      25/40       6.6G       0.73     0.3863      0.872          0        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:30<00:00,  7.05it/s]

                   all       3392        524      0.911      0.857      0.925      0.672






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      26/40      6.55G      0.715     0.3698     0.8649          1        640: 100%|██████████| 1132/1132 [05:32<00:00,  3.41it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:30<00:00,  6.86it/s]

                   all       3392        524        0.9      0.847      0.927      0.692






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      27/40       6.6G     0.6816     0.3552     0.8569          2        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.09it/s]

                   all       3392        524      0.917      0.872      0.932      0.694






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      28/40      6.56G     0.6826     0.3575     0.8541          1        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.09it/s]

                   all       3392        524      0.933      0.851      0.927      0.694






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      29/40      6.59G     0.6728     0.3454     0.8462          1        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.44it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.11it/s]

                   all       3392        524       0.92      0.863      0.923       0.69






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      30/40      6.56G     0.6384       0.33     0.8355          2        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.09it/s]

                   all       3392        524       0.93      0.844      0.914      0.688





Closing dataloader mosaic
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, method='weighted_average', num_output_channels=3), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


  A.ImageCompression(quality_lower=75, p=0.0),



      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      31/40      6.59G     0.6389     0.3243     0.8391          2        640: 100%|██████████| 1132/1132 [05:30<00:00,  3.43it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.08it/s]

                   all       3392        524      0.908      0.864      0.927      0.689






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      32/40      6.55G     0.6237     0.3175     0.8326          1        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:30<00:00,  7.06it/s]

                   all       3392        524      0.915      0.863      0.926       0.69






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      33/40      6.57G     0.6011     0.3024     0.8086          1        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.10it/s]

                   all       3392        524       0.92      0.859      0.924      0.695






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      34/40      6.57G     0.5938     0.3057     0.8224          0        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.44it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.10it/s]

                   all       3392        524      0.913      0.841      0.914      0.683






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      35/40       6.6G     0.5879     0.2965     0.8142          1        640: 100%|██████████| 1132/1132 [05:27<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.09it/s]

                   all       3392        524       0.92      0.859      0.919      0.687






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      36/40      6.56G     0.5752     0.2884     0.7972          0        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.09it/s]

                   all       3392        524      0.907      0.863      0.921      0.688






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      37/40      6.59G      0.574     0.2921     0.8042          0        640: 100%|██████████| 1132/1132 [05:27<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.07it/s]

                   all       3392        524      0.912      0.863      0.924      0.693






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      38/40      6.55G     0.5649     0.2869     0.7956          3        640: 100%|██████████| 1132/1132 [05:27<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.10it/s]

                   all       3392        524      0.911      0.857      0.924      0.694






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      39/40      6.59G     0.5599     0.2849     0.7948          1        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:30<00:00,  6.98it/s]

                   all       3392        524      0.906       0.86       0.92      0.691






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      40/40      6.55G     0.5555     0.2812     0.7967          1        640: 100%|██████████| 1132/1132 [05:28<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:29<00:00,  7.07it/s]

                   all       3392        524      0.909      0.859       0.92       0.69






40 epochs completed in 4.040 hours.
Optimizer stripped from c:\Users\307\Desktop\aicup\runs\fold5\exp\weights\last.pt, 39.7MB
Optimizer stripped from c:\Users\307\Desktop\aicup\runs\fold5\exp\weights\best.pt, 39.7MB

Validating c:\Users\307\Desktop\aicup\runs\fold5\exp\weights\best.pt...
Ultralytics 8.3.63  Python-3.12.10 torch-2.6.0+cu124 CUDA:0 (NVIDIA GeForce RTX 2080 Ti, 11264MiB)
YOLOv12m summary (fused): 402 layers, 19,577,299 parameters, 0 gradients, 59.5 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 212/212 [00:27<00:00,  7.74it/s]


                   all       3392        524      0.918      0.859      0.924      0.695
Speed: 0.2ms preprocess, 6.8ms inference, 0.0ms loss, 0.3ms postprocess per image
Results saved to [1mc:\Users\307\Desktop\aicup\runs\fold5\exp[0m


Unnamed: 0,fold,best_weight
0,1,c:\Users\307\Desktop\aicup\runs\fold1\exp\weig...
1,2,c:\Users\307\Desktop\aicup\runs\fold2\exp\weig...
2,3,c:\Users\307\Desktop\aicup\runs\fold3\exp\weig...
3,4,c:\Users\307\Desktop\aicup\runs\fold4\exp\weig...
4,5,c:\Users\307\Desktop\aicup\runs\fold5\exp\weig...


## 預測

In [11]:
# import os
# import shutil

# src_root = r"C:\Users\307\Desktop\aicup\testing_image"
# dst_root = r"C:\Users\307\Desktop\aicup\test\images"

# os.makedirs(dst_root, exist_ok=True)

# # 收集所有圖片路徑
# all_files = []
# for patient_folder in os.listdir(src_root):
#     patient_path = os.path.join(src_root, patient_folder)
#     if os.path.isdir(patient_path) and patient_folder.startswith("patient"):
#         for fname in os.listdir(patient_path):
#             if fname.endswith(".png"):
#                 all_files.append(os.path.join(patient_path, fname))

# # 按照檔名排序，方便重現結果
# all_files.sort()

# for f in all_files:
#     dst_file = os.path.join(dst_root, os.path.basename(f))
#     shutil.move(f, dst_file)

# print(f"完成移動！總共 {len(all_files)} 張，放到 images")

In [41]:
# Manual override of fold_results: points every fold at a previously saved
# best.pt so that inference can run without re-executing the training cell.
# This replaces the list produced by the training loop.
fold_results = [
    {"fold": 1,"best_weight": r"C:\Users\307\Desktop\aicup\目前最好模型\runs\fold1\exp\weights\best.pt"},
    {"fold": 2,"best_weight": r"C:\Users\307\Desktop\aicup\目前最好模型\runs\fold2\exp\weights\best.pt"},
    {"fold": 3,"best_weight": r"C:\Users\307\Desktop\aicup\目前最好模型\runs\fold3\exp\weights\best.pt"},
    {"fold": 4,"best_weight": r"C:\Users\307\Desktop\aicup\目前最好模型\runs\fold4\exp\weights\best.pt"},
    {"fold": 5,"best_weight": r"C:\Users\307\Desktop\aicup\目前最好模型\runs\fold5\exp\weights\best.pt"}]

In [42]:
from pathlib import Path
import os

# Inference image size passed to predict(); 640 matches the imgsz used for
# training in TRAIN_ARGS.
IMG_SIZE = 640

# ---- Test folders to run inference on (currently a single folder). Raw
# ---- strings (r"...") avoid backslash-escape problems in Windows paths.
TEST_DIRS = [
    Path(r"C:\Users\307\Desktop\aicup\test\images"),
]

# ---- Choice of weights ----
# False: use the best.pt of one fold only (simplest and fastest).
# True:  merge the predictions of all folds (simple merge defined below).
USE_ENSEMBLE = True     # currently the multi-fold ensemble is enabled

# Fold to use when USE_ENSEMBLE is False; with the ensemble enabled the
# best.pt of every fold is picked up instead.
SINGLE_FOLD_ID = 3

# ---- 輔助：列出資料夾內所有 .png 影像（遞迴掃描） ----
def list_pngs(folder: Path):
    """Return every ``.png`` below ``folder`` (recursive), sorted by path.

    Raises:
        FileNotFoundError: If ``folder`` does not exist.
    """
    if folder.exists():
        return sorted(folder.rglob("*.png"))
    raise FileNotFoundError(f"找不到測試資料夾：{folder}")

# ---- 輔助：把圖片路徑清單轉成 stem 清單（不含副檔名），之後提交檔會用到 ----
def stems_from_paths(paths):
    """Return the file stem (name without extension) of every path, in order."""
    stems = []
    for path in paths:
        stems.append(path.stem)
    return stems

# ---- 推論函式（沿用你前面區塊的 run_inference，但這裡接受「路徑清單」而非 .txt） ----
def run_inference_on_paths(weight_path: Path, img_paths: list, conf: float = 0.01):
    """
    Run one YOLO checkpoint over a list of image paths, one predict call per image.

    Args:
        weight_path: checkpoint file handed to ``YOLO`` (converted with ``str``).
        img_paths: image paths to run prediction on.
        conf: confidence threshold forwarded to ``predict``.

    Returns:
        dict mapping each image stem to a list of ``(cls, score, x1, y1, x2, y2)``
        tuples: class index, confidence score, and the xyxy corners rounded to
        integers. An image without boxes maps to ``[]``. Results are keyed by
        stem only, so two paths sharing a stem overwrite each other.
    """
    from ultralytics import YOLO

    model = YOLO(str(weight_path))
    detections_by_stem = {}

    for image_path in img_paths:
        outputs = model.predict(source=str(image_path), imgsz=IMG_SIZE, conf=conf, verbose=False)
        result = outputs[0]

        found = []
        # Store an empty list when the result is falsy or carries no boxes.
        if not result or result.boxes is None or len(result.boxes) <= 0:
            detections_by_stem[Path(image_path).stem] = found
            continue

        for box in result.boxes:
            # Each per-box tensor has a leading dimension of one; take its first row.
            corners = box.xyxy.cpu().numpy().astype(float)[0]
            score = float(box.conf.cpu().numpy()[0])
            label = int(box.cls.cpu().numpy()[0])
            left, top, right, bottom = (int(round(value)) for value in corners)
            found.append((label, score, left, top, right, bottom))
        detections_by_stem[Path(image_path).stem] = found
    return detections_by_stem

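# ---- Ensemble mode: lightweight box fusion by greedy IoU grouping and averaging (demo version) ----
# To avoid depending on an external WBF (Weighted Box Fusion) package, a simplified fusion is used:
# - for each image, collect the boxes predicted by every model;
# - group same-class boxes whose IoU with the group's first box is >= iou_thr (0.5 by default),
#   then average the scores and the coordinates within each group;
# - this is less rigorous than WBF (and is not a true NMS), but it is easy to integrate and has
#   no extra dependencies; for stronger results switch to ensemble-boxes / WBF.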
import math

def iou_xyxy(a, b):
    """Intersection-over-union of two boxes given as ``(x1, y1, x2, y2)``.

    Widths and heights are clamped at zero, so disjoint or inverted boxes
    contribute no area. Returns 0.0 when the union area is not positive.
    """
    (a_left, a_top, a_right, a_bottom), (b_left, b_top, b_right, b_bottom) = a, b

    def clamped_area(x1, y1, x2, y2):
        return max(0, (x2 - x1)) * max(0, (y2 - y1))

    # Overlap rectangle: innermost edges of the two boxes.
    overlap_w = max(0, min(a_right, b_right) - max(a_left, b_left))
    overlap_h = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
    overlap = overlap_w * overlap_h

    total = (
        clamped_area(a_left, a_top, a_right, a_bottom)
        + clamped_area(b_left, b_top, b_right, b_bottom)
        - overlap
    )
    if total > 0:
        return overlap / total
    return 0.0

def _average_group(group):
    """Collapse a group of same-class boxes into one box with mean score and mean corners."""
    if len(group) == 1:
        return group[0]
    count = len(group)
    mean_score = sum(box[1] for box in group) / count
    x1, y1, x2, y2 = (
        int(round(sum(box[k] for box in group) / count)) for k in (2, 3, 4, 5)
    )
    # The class of the group is the class of its first (seed) box.
    return (group[0][0], mean_score, x1, y1, x2, y2)


def _fuse_boxes(boxes, iou_thr):
    """Greedily group same-class boxes that overlap a seed box, then average each group."""
    taken = [False] * len(boxes)
    fused = []
    for i, seed in enumerate(boxes):
        if taken[i]:
            continue
        taken[i] = True
        seed = tuple(seed)
        seed_cls, seed_xyxy = seed[0], seed[2:6]
        group = [seed]
        for j in range(i + 1, len(boxes)):
            if taken[j]:
                continue
            other = tuple(boxes[j])
            # Only boxes of the same class whose IoU with the seed reaches the
            # threshold are absorbed; IoU is always measured against the seed.
            if seed_cls == other[0] and iou_xyxy(seed_xyxy, other[2:6]) >= iou_thr:
                group.append(other)
                taken[j] = True
        fused.append(_average_group(group))
    return fused


def simple_ensemble_merge(list_of_preds_dicts, iou_thr=0.5):
    """
    Fuse the detections of several models, image by image.

    For every image the boxes of all models are pooled in model order; each
    not-yet-used box seeds a group that absorbs later same-class boxes whose
    IoU with the seed is >= ``iou_thr``. A group of several boxes is replaced
    by one box with the mean score and the rounded mean corners; a lone box is
    kept unchanged.

    Args:
        list_of_preds_dicts: [preds_dict_model1, preds_dict_model2, ...], each
            mapping img_stem -> [(cls, score, x1, y1, x2, y2), ...].
        iou_thr: IoU threshold for joining a group.

    Returns:
        dict in the same format as the inputs. Keys appear in first-seen order
        across the input dicts, so the result (and any file written from it)
        is reproducible between runs instead of following set iteration order.
    """
    # Ordered union of the stems: a dict keeps insertion order, a set does not.
    ordered_stems = {}
    for preds in list_of_preds_dicts:
        for stem in preds:
            ordered_stems.setdefault(stem, None)

    merged = {}
    for stem in ordered_stems:
        # Pool this image's boxes from every model.
        pooled = []
        for preds in list_of_preds_dicts:
            pooled.extend(preds.get(stem, []))
        merged[stem] = _fuse_boxes(pooled, iou_thr)
    return merged

# ---- 1) Collect the PNG files of every directory listed in TEST_DIRS ----
# list_pngs raises FileNotFoundError if a directory is missing.
all_test_paths = []
for td in TEST_DIRS:
    imgs = list_pngs(td)
    print(f"測試資料夾：{td}，找到 {len(imgs)} 張 PNG")
    all_test_paths.extend(imgs)

# Total over all configured directories (the message text still says "two folders").
print(f"兩資料夾合計：{len(all_test_paths)} 張")

# ---- 2) Run inference (single fold or all folds) ----
# NOTE: relies on fold_results, which is defined in a separate cell.
if not USE_ENSEMBLE:
    # Single fold: SINGLE_FOLD_ID is 1-based, hence the -1 when indexing fold_results.
    weight_path = Path(fold_results[SINGLE_FOLD_ID-1]['best_weight'])
    assert weight_path.exists(), f"找不到指定 fold 的權重：{weight_path}"
    preds_merged = run_inference_on_paths(weight_path, all_test_paths, conf=0.01)
else:
    # Ensemble: run every fold's best.pt over the same images, then fuse the results.
    preds_list = []
    for fr in fold_results:
        w = Path(fr["best_weight"])
        if not w.exists():
            # Missing checkpoints are skipped with a warning instead of aborting the run.
            print("⚠️ 跳過不存在的權重：", w)
            continue
        print("Ensemble 推論 using:", w)
        preds_list.append(run_inference_on_paths(w, all_test_paths, conf=0.01))
    # At least one checkpoint must have been usable.
    assert len(preds_list) > 0, "沒有可用的權重可做 Ensemble。"
    preds_merged = simple_ensemble_merge(preds_list, iou_thr=0.5)

# Format of preds_merged: {img_stem: [(cls, score, x1, y1, x2, y2), ...], ...}
# Print the first item as a quick sanity check.
print("示例：", list(preds_merged.items())[:1])


測試資料夾：C:\Users\307\Desktop\aicup\test\images，找到 16620 張 PNG
兩資料夾合計：16620 張
Ensemble 推論 using: C:\Users\307\Desktop\aicup\目前最好模型\runs\fold1\exp\weights\best.pt
Ensemble 推論 using: C:\Users\307\Desktop\aicup\目前最好模型\runs\fold2\exp\weights\best.pt
Ensemble 推論 using: C:\Users\307\Desktop\aicup\目前最好模型\runs\fold3\exp\weights\best.pt
Ensemble 推論 using: C:\Users\307\Desktop\aicup\目前最好模型\runs\fold4\exp\weights\best.pt
Ensemble 推論 using: C:\Users\307\Desktop\aicup\目前最好模型\runs\fold5\exp\weights\best.pt
示例： [('patient0065_0005', [])]


In [None]:
# === 區塊 8（改版）：把兩資料夾的推論結果合併輸出成一個提交檔 ===
# 官規：每行
#   影像檔名(無 .png) 類別 分數 左上x 左上y 右下x 右下y
# 注意：
# - 沒有偵測的影像，不輸出任何行（不要寫空行）
# - 這裡直接把 preds_merged（兩資料夾合併後）寫成一個 .txt

def write_submission(preds: dict, out_path: Path):
    """
    Write detections to a submission text file, one detection per line:
    ``<img_stem> <cls> <score> <x1> <y1> <x2> <y2>``.

    The class index is written as given, the score is formatted with four
    decimals, and the coordinates are written as stored. Images with no
    detections produce no line, and lines are joined with ``\\n`` without a
    trailing newline.

    Args:
        preds: mapping img_stem -> [(cls, score, x1, y1, x2, y2), ...].
        out_path: destination file (``Path`` or path string); missing parent
            directories are created.

    Returns:
        The number of detection lines written.
    """
    out_path = Path(out_path)
    # Create the destination folder on demand so a fresh output location does not fail.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    lines = [
        f"{stem} {cls_id} {score:.4f} {x1} {y1} {x2} {y2}"
        for stem, dets in preds.items()
        for (cls_id, score, x1, y1, x2, y2) in dets
    ]
    out_path.write_text("\n".join(lines), encoding="utf-8")
    return len(lines)

# Output file name can be customised here, e.g. submission.txt.
# NOTE(review): SUBMIT_DIR is not defined in this cell; presumably it is set in an
# earlier cell — confirm it exists before running on a fresh kernel.
out_file = SUBMIT_DIR / "submission.txt"
n = write_submission(preds_merged, out_file)

# Report the output path and the number of detection lines written.
print(f"✅ 已輸出提交檔：{out_file}")
print(f"   行數（偵測結果數量）＝ {n}")
print("   規則提醒：只輸出有偵測的影像，無偵測不寫任何東西。")


In [None]:
import re
from collections import defaultdict

# === Basic settings ===
# Submission file to read and the filtered/sorted file to write.
INPUT_FILE = r"C:\Users\307\Desktop\aicup\submissions\submission.txt"
OUTPUT_FILE = r"C:\Users\307\Desktop\aicup\submissions\submission_sorted.txt"

# One submission line: "<patientNNN>_<frame> 0 <score> <x1> <y1> <x2> <y2>".
# The class field is matched literally as 0 and the coordinates as unsigned integers;
# group 1 is the patient id and group 2 the frame number.
pattern = re.compile(r"^(patient\d+)_(\d+)\s+0\s+([\d.]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)")
# patient id -> list of (frame number, stripped original line), one entry per detection line.
patients = defaultdict(list)

# === Read the file and group lines by patient ===
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        m = pattern.match(line.strip())
        if not m:
            # Lines that do not match the pattern (blank lines, other classes,
            # malformed rows) are skipped silently and never reach the output.
            continue
        pid = m.group(1)
        frame = int(m.group(2))
        patients[pid].append((frame, line.strip()))

# === 工具：找出連續群集 ===
def find_clusters(frames, max_gap=5):
    """
    Split frame numbers into runs of near-consecutive values.

    The frames are sorted, then a new cluster is started whenever the
    difference between neighbouring values exceeds ``max_gap``. Duplicate
    frame numbers (gap 0) stay in the same cluster.

    Args:
        frames: iterable of frame numbers, in any order.
        max_gap: largest difference between neighbouring frames that still
            counts as the same cluster (default 5).

    Returns:
        List of clusters, each a sorted list of frame numbers; ``[]`` when
        ``frames`` is empty.
    """
    ordered = sorted(frames)
    if not ordered:
        return []

    clusters = [[ordered[0]]]
    for previous, current in zip(ordered, ordered[1:]):
        if current - previous <= max_gap:
            clusters[-1].append(current)
        else:
            clusters.append([current])
    return clusters

# === Main logic ===
# For each patient keep only the detection lines that fall inside the patient's
# largest cluster of near-consecutive frames; everything outside it is dropped.
keep_lines = []
for pid, records in patients.items():
    # A patient with a single detection line has nothing to cluster: keep it as is.
    if len(records) <= 1:
        keep_lines += [r[1] for r in records]
        continue

    # One frame number per detection line, so a frame with several boxes appears several times.
    frames = [r[0] for r in records]
    clusters = find_clusters(frames)
    # Pick the longest cluster (the main continuous segment); on ties max() keeps the first one.
    main_cluster = max(clusters, key=len)

    # Frame range spanned by that cluster.
    start, end = min(main_cluster), max(main_cluster)
    print(f"{pid}: main range = {start}–{end} ({len(main_cluster)} frames)")

    # Keep every line whose frame lies inside the range (inclusive on both ends).
    for frame, line in records:
        if start <= frame <= end:
            keep_lines.append(line)

# === Output ===
# Lines are written in lexicographic order, one per line, each ending with a newline.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for l in sorted(keep_lines):
        f.write(l + "\n")

print(f"✅ 自動偵測連續主區段完成，結果已儲存：{OUTPUT_FILE}")
# The "original" count below covers only lines that matched the pattern when the file was read.
print(f"   原始行數：{sum(len(v) for v in patients.values())}，篩選後行數：{len(keep_lines)}")

# lines = open(INPUT_FILE, 'r').read().splitlines()
# print(f"總共有 {len(lines)} 筆預測結果")
# lines= sorted(lines)
# with open(OUTPUT_FILE, 'w') as f:
#     for line in lines:
#         f.write(line + '\n')
