In [3]:
# ============================================================
# master_epoch_index 作成スクリプト（統合解析対応・最終版）
#   - EEG trial index（②の出力）: derivatives/epoch_index_by_trial.csv
#   - QC trial（③の出力）      : derivatives/qc_all/qc_by_trial_all.csv
#   - design 提示順（CSV群）   : design/**/*.csv
#
# 出力:
#   - derivatives/master_epoch_index.csv
#   - derivatives/design_trial_map_all_participant_runs.csv
#   - output/qc_audit/*.csv（監査ログ）
# ============================================================

from pathlib import Path
import pandas as pd
import numpy as np
import re
import unicodedata

# ============================================================
# 0) Path configuration
# ============================================================
# Prefer the fixed project location; fall back to ./EEG_48sounds when it is absent.
ROOT_DIR = Path("/Users/shunsuke/EEG_48sounds")
ROOT_DIR = ROOT_DIR if ROOT_DIR.exists() else Path.cwd() / "EEG_48sounds"

DESIGN_DIR = ROOT_DIR.joinpath("design")
DERIV_DIR = ROOT_DIR.joinpath("derivatives")
OUT_AUDIT = ROOT_DIR.joinpath("output", "qc_audit")

# Output folders are created up front so later to_csv calls cannot fail on a missing directory.
DERIV_DIR.mkdir(parents=True, exist_ok=True)
OUT_AUDIT.mkdir(parents=True, exist_ok=True)

# Input tables read by this script.
INDEX_CSV = DERIV_DIR.joinpath("epoch_index_by_trial.csv")
QC_TRIAL_CSV = DERIV_DIR.joinpath("qc_all", "qc_by_trial_all.csv")

# Design-map cache written by this script.
DESIGN_ALL_OUT = DERIV_DIR.joinpath("design_trial_map_all_participant_runs.csv")
FORCE_REBUILD_DESIGN_MAP = True  # keeping this True is recommended: a stale cache is a source of accidents

# Final merged table.
OUT_CSV = DERIV_DIR.joinpath("master_epoch_index.csv")

# Echo the resolved locations (labels padded to a common width).
print("\n".join(
    f"{label:<12}: {location}"
    for label, location in (
        ("ROOT_DIR", ROOT_DIR),
        ("DESIGN_DIR", DESIGN_DIR),
        ("INDEX_CSV", INDEX_CSV),
        ("QC_TRIAL_CSV", QC_TRIAL_CSV),
        ("OUT_CSV", OUT_CSV),
    )
))

# ============================================================
# 1) ユーティリティ
# ============================================================

def normalize_run(x) -> str:
    """Canonicalize a run label to the form ``run<N>``.

    The value is stringified, trimmed, lower-cased and stripped of spaces;
    a leading ``run`` is dropped and the first run of digits found is
    re-emitted as ``run<int>`` (so leading zeros disappear).

    Returns:
        ``run<N>`` when digits are present; otherwise the cleaned text
        (without the ``run`` prefix). Missing input yields ``np.nan``,
        despite the ``str`` return annotation.
    """
    if pd.isna(x):
        return np.nan

    label = str(x).strip().lower().replace(" ", "").removeprefix("run")
    digits = re.search(r"(\d+)", label)
    if digits is None:
        # No number anywhere: hand back the cleaned text unchanged.
        return label
    return f"run{int(digits.group(1))}"

def to_int_safe(x):
    """Convert ``x`` to ``int`` (via ``float``), or return ``np.nan`` if impossible.

    Going through ``float`` lets strings such as ``"3.0"`` convert; fractional
    values are truncated toward zero by ``int``.

    Args:
        x: Scalar value (number, numeric string, or missing value).

    Returns:
        The integer value, or ``np.nan`` when ``x`` is missing or cannot be
        interpreted as a finite number.
    """
    if pd.isna(x):
        return np.nan
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        # TypeError: not number-like; ValueError: unparsable text;
        # OverflowError: infinity or a value too large for float.
        # Anything else is unexpected and is allowed to propagate.
        return np.nan

def extract_participant_id(text: str) -> str | None:
    """どこかに数字があれば拾って Pxx を作る"""
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return None
    s = str(text)
    m = re.search(r"[Pp](\d{1,3})", s)
    if m:
        return f"P{int(m.group(1)):02d}"
    m2 = re.search(r"(\d{1,3})", s)
    if m2:
        return f"P{int(m2.group(1)):02d}"
    return None

def normalize_filename_like_hcu(x: str) -> str:
    """Bring a design-file sound name into the plain ``FileName`` form (e.g. ``flute.wav``).

    Steps: NFKC-normalize and trim; drop a leading ``<digits>:`` label;
    collapse a trailing ``_<level>dBA...`` suffix before ``.wav`` (matched
    case-insensitively) so that only ``.wav`` remains. Missing input yields
    ``np.nan``.
    """
    if pd.isna(x):
        return np.nan

    text = unicodedata.normalize("NFKC", str(x)).strip()
    substitutions = (
        (r"^\s*\d+\s*[:：]\s*", "", 0),                       # leading "12:" style label
        (r"(_-?\d+dBA.*)\.wav$", ".wav", re.IGNORECASE),      # sound-level suffix
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text

def assert_unique(df: pd.DataFrame, keys: list[str], name: str):
    """Raise ``ValueError`` when ``keys`` do not uniquely identify the rows of ``df``.

    The message carries ``name``, the key list, the number of surplus rows and
    a preview (up to 30 rows, sorted by key) of every row involved in a
    duplicate. Returns ``None`` when the keys are unique.
    """
    n_dup = df.duplicated(subset=keys).sum()
    if n_dup == 0:
        return

    # keep=False marks all members of each duplicate group, not only the repeats.
    involved = df.loc[df.duplicated(subset=keys, keep=False)]
    preview = involved.sort_values(keys).head(30)
    raise ValueError(f"[{name}] キー重複: keys={keys}, dup_rows={n_dup}\n例:\n{preview}")

def standardize_trial_order_df(d: pd.DataFrame, fname: str, n_trials: int = 48) -> pd.DataFrame:
    """Normalize one design/trial-order table and validate its contents.

    Header spelling variants (see ``rename_map``) are mapped onto the
    canonical columns ``number``, ``FileName``, ``category`` and
    ``trial_in_run``; whitespace around header names is ignored. When the
    table has no trial column, the row order (1..N) is used instead.

    Args:
        d: Raw table as read from a design CSV. The caller's frame is not
            modified (all changes happen on renamed copies).
        fname: File name used to identify the table in warnings and errors.
        n_trials: Expected number of rows, and upper bound of both
            ``trial_in_run`` and ``number``. Defaults to 48.

    Returns:
        A new DataFrame with exactly the columns
        ``number, FileName, category, trial_in_run``.

    Raises:
        ValueError: If two headers map to the same canonical column, a
            required column is missing, the row count differs from
            ``n_trials``, ``trial_in_run`` or ``number`` is not a permutation
            of 1..``n_trials``, or ``FileName`` has missing/duplicate values.
    """
    rename_map = {
        "No": "number", "番号": "number", "num": "number",
        "FileName": "FileName", "filename": "FileName", "ファイル名": "FileName",
        "カテゴリー": "category", "カテゴリ": "category", "Category": "category", "category": "category",
        "Trial": "trial_in_run", "trial": "trial_in_run", "TRIAL": "trial_in_run",
        "提示順": "trial_in_run",
        "Trial(提示順)": "trial_in_run",
        "Trial（提示順）": "trial_in_run",
        "Order": "trial_in_run", "order": "trial_in_run",
    }
    out_cols = ["number", "FileName", "category", "trial_in_run"]

    # Headers such as "No " or " FileName" should still be recognised.
    d = d.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)
    d = d.rename(columns={c: rename_map.get(c, c) for c in d.columns})

    # Two source headers collapsing onto one canonical name (e.g. "Trial" and
    # "Order") would make d[col] a DataFrame; stop with a clear message instead.
    ambiguous = [c for c in out_cols if (d.columns == c).sum() > 1]
    if ambiguous:
        raise ValueError(f"{fname}: 同じ意味の列が複数あります {ambiguous} / columns={list(d.columns)}")

    need = ["number", "FileName", "category"]
    miss = [c for c in need if c not in d.columns]
    if miss:
        raise ValueError(f"{fname}: 必須列不足 {miss} / columns={list(d.columns)}")

    if "trial_in_run" not in d.columns:
        print(f"⚠️ [WARN] {fname}: Trial列が無いので行順(1..N)を trial_in_run として補完します。")
        d["trial_in_run"] = np.arange(1, len(d) + 1)

    d["number"] = d["number"].map(to_int_safe)
    d["trial_in_run"] = d["trial_in_run"].map(to_int_safe)

    if len(d) != n_trials:
        raise ValueError(f"{fname}: {n_trials}行ではありません: {len(d)}")

    expected = set(range(1, n_trials + 1))

    if not (set(d["trial_in_run"]) == expected and d["trial_in_run"].is_unique):
        raise ValueError(f"{fname}: trial_in_run が 1..{n_trials}でない/重複あり")

    if not (set(d["number"]) == expected and d["number"].is_unique):
        raise ValueError(f"{fname}: number が 1..{n_trials}でない/重複あり")

    if d["FileName"].isna().any():
        raise ValueError(f"{fname}: FileName に欠損があります")
    if d["FileName"].duplicated().any():
        raise ValueError(f"{fname}: FileName が重複しています（同一runで同音2回扱いになる）")

    return d[out_cols].copy()

def parse_run_from_path(p: Path) -> str | None:
    """ファイル名＋親フォルダ名から run を推定"""
    stem = p.stem.lower()
    m = re.search(r"(run[123])", stem)
    if m:
        return m.group(1)

    for parent in p.parents:
        name = parent.name.lower()
        m2 = re.search(r"(run[123])", name)
        if m2:
            return m2.group(1)

    m3 = re.search(r"(?:_|-)([123])(?:_|-|$)", stem)
    if m3:
        return f"run{m3.group(1)}"

    return None

def build_design_map(design_dir: Path) -> pd.DataFrame:
    """Collect every trial-order CSV under ``design_dir`` into one design map.

    Each CSV is standardized with ``standardize_trial_order_df`` and tagged
    with the run (from the path) and participant (from the file name).
    Files located in hidden folders *below* ``design_dir`` (including
    ``.ipynb_checkpoints``), hidden files, and files whose name contains
    ``template``/``sample``/``example`` are skipped.

    When several files provide the same (participant, run, trial_in_run):
      * audit CSVs listing the duplicates are written to ``OUT_AUDIT``;
      * if the rows disagree (file name, number or category, with a missing
        category counted as a distinct value) a ``ValueError`` is raised;
      * otherwise one copy is kept, preferring the shallowest path.

    Args:
        design_dir: Root folder that holds the design CSV files.

    Returns:
        DataFrame with columns participant, run, trial_in_run, number,
        FileName, FileName_norm, category, source_csv; unique on
        (participant, run, trial_in_run).

    Raises:
        FileNotFoundError: ``design_dir`` is missing, contains no CSV, or
            contains no CSV that survives the filters.
        ValueError: A file fails validation, run/participant cannot be
            inferred, or duplicates conflict.
    """
    if not design_dir.exists():
        raise FileNotFoundError(f"design フォルダが見つかりません: {design_dir}")

    csvs = sorted(design_dir.rglob("*.csv"))  # sub-folders are included
    if not csvs:
        raise FileNotFoundError(f"design フォルダ配下にcsvがありません: {design_dir}")

    rows = []
    for p in csvs:
        rel = p.relative_to(design_dir)

        # 1) Skip Jupyter checkpoints and other hidden entries. Only the part
        #    of the path below design_dir is inspected: a dot-folder (or "..")
        #    above the design root must not cause every file to be dropped.
        if any(part.startswith(".") for part in rel.parts):
            continue

        # 2) Skip template-like files only (e.g. 01_run1_trial_order is kept).
        low = p.name.lower()
        if any(k in low for k in ("template", "sample", "example")):
            continue

        # Load and standardize.
        df_raw = pd.read_csv(p)
        df_std = standardize_trial_order_df(df_raw, p.name)

        run = parse_run_from_path(p)
        if run is None:
            raise ValueError(
                f"{p}: run を推定できません。ファイル名or親フォルダ名に run1/run2/run3 を入れてください。"
            )

        participant = extract_participant_id(p.name)
        if participant is None:
            raise ValueError(
                f"{p}: participant(Pxx) を推定できません。ファイル名に P01 や 01 など数字を含めてください。"
            )

        df_std["participant"] = participant
        df_std["run"] = run
        df_std["FileName_norm"] = df_std["FileName"].map(normalize_filename_like_hcu)
        df_std["source_csv"] = str(rel)  # relative path keeps the audit logs readable

        rows.append(df_std[["participant","run","trial_in_run","number","FileName","FileName_norm","category","source_csv"]])

    if not rows:
        # Every CSV was filtered out; fail clearly instead of letting
        # pd.concat raise "No objects to concatenate".
        raise FileNotFoundError(
            f"design フォルダ配下に有効なcsvがありません（隠しファイル/テンプレート除外後）: {design_dir}"
        )

    design_all = pd.concat(rows, ignore_index=True)

    # 3) Safe handling of duplicated keys:
    #    - identical content  -> de-duplicate
    #    - differing content  -> stop (dangerous)
    keys = ["participant","run","trial_in_run"]
    dup_mask = design_all.duplicated(subset=keys, keep=False)

    if dup_mask.any():
        dups = design_all[dup_mask].copy()

        def _n_distinct(values: pd.Series) -> int:
            # NaN counts as its own value so "missing vs. present" is a conflict.
            return values.nunique(dropna=False)

        # A key holding more than one distinct value in any column is a conflict.
        conflict = (
            dups.groupby(keys)
            .agg(
                n_files=("FileName_norm", _n_distinct),
                n_num=("number", _n_distinct),
                n_cat=("category", _n_distinct),
            )
            .reset_index()
        )
        conflict_bad = conflict[(conflict["n_files"]>1) | (conflict["n_num"]>1) | (conflict["n_cat"]>1)]

        # Write logs so the duplicates can be inspected afterwards.
        dups.sort_values(keys + ["source_csv"]).to_csv(
            OUT_AUDIT / "design_duplicates_all_rows.csv",
            index=False, encoding="utf-8-sig"
        )
        conflict.to_csv(
            OUT_AUDIT / "design_duplicates_summary.csv",
            index=False, encoding="utf-8-sig"
        )

        if len(conflict_bad) > 0:
            # Duplicates with differing content are dangerous: stop here.
            conflict_bad.head(50).to_csv(
                OUT_AUDIT / "design_duplicates_conflicts_head50.csv",
                index=False, encoding="utf-8-sig"
            )
            raise ValueError(
                "design_map に同一キーで内容が食い違う重複があります（危険なので停止）。\n"
                f"ログ: {OUT_AUDIT/'design_duplicates_conflicts_head50.csv'}"
            )

        # Only identical duplicates remain: keep the copy with the shortest
        # relative path (fewest components), ties broken by path name.
        design_all["_path_len"] = design_all["source_csv"].astype(str).map(lambda s: len(Path(s).parts))
        design_all = (
            design_all.sort_values(keys + ["_path_len", "source_csv"])
            .drop_duplicates(subset=keys, keep="first")
            .drop(columns=["_path_len"])
            .reset_index(drop=True)
        )

    # Final uniqueness guarantee.
    assert_unique(design_all, keys, "design_all(participant,run,trial)")

    return design_all


# ============================================================
# 2) Load inputs (EEG trial index / QC table)
# ============================================================
df_idx = pd.read_csv(INDEX_CSV)
df_qc  = pd.read_csv(QC_TRIAL_CSV)

print("\n--- LOAD OK ---")
print("epoch_index_by_trial:", df_idx.shape)
print("qc_by_trial_all     :", df_qc.shape)

# Both tables must carry the three join-key columns; fail early otherwise.
for k in ["subject","run","trial_in_run"]:
    if k not in df_idx.columns:
        raise ValueError(f"INDEX_CSV に列がありません: {k}")
    if k not in df_qc.columns:
        raise ValueError(f"QC_TRIAL_CSV に列がありません: {k}")

# Normalize the key columns identically in both tables so the joins below line up:
# subject -> trimmed string, run -> "run<N>", trial_in_run -> int (NaN if unparsable).
df_idx["subject"] = df_idx["subject"].astype(str).str.strip()
df_idx["run"] = df_idx["run"].map(normalize_run)
df_idx["trial_in_run"] = df_idx["trial_in_run"].map(to_int_safe)

df_qc["subject"] = df_qc["subject"].astype(str).str.strip()
df_qc["run"] = df_qc["run"].map(normalize_run)
df_qc["trial_in_run"] = df_qc["trial_in_run"].map(to_int_safe)

# Derive the participant id (Pxx) from the subject label; it is the key used for the design join.
df_idx["participant"] = df_idx["subject"].apply(extract_participant_id)
df_qc["participant"]  = df_qc["subject"].apply(extract_participant_id)

# Every EEG index row needs a participant id; log the unparsable subjects and stop.
# (Only df_idx is checked here; the QC join further down is keyed on subject.)
if df_idx["participant"].isna().any():
    bad = df_idx[df_idx["participant"].isna()][["subject"]].drop_duplicates()
    bad.to_csv(OUT_AUDIT / "bad_subject_cannot_parse_participant.csv", index=False, encoding="utf-8-sig")
    raise ValueError("EEG index の subject から participant を作れない行があります。ログを確認してください。")

# The one-to-one merges below require unique keys on both sides.
assert_unique(df_idx, ["participant","run","trial_in_run"], "df_idx(participant,run,trial)")
assert_unique(df_qc, ["subject","run","trial_in_run"], "df_qc(subject,run,trial)")

# ============================================================
# 3) Build or load the design map (keyed on participant)
# ============================================================
# A forced rebuild removes the cached map first so it can never be reused.
if FORCE_REBUILD_DESIGN_MAP and DESIGN_ALL_OUT.exists():
    print("delete old design map cache:", DESIGN_ALL_OUT)
    DESIGN_ALL_OUT.unlink()

if (not DESIGN_ALL_OUT.exists()) or (FORCE_REBUILD_DESIGN_MAP is not False):
    # No usable cache: rebuild from the design CSVs and store the result.
    df_design = build_design_map(DESIGN_DIR)
    df_design.to_csv(DESIGN_ALL_OUT, index=False, encoding="utf-8-sig")
    print("Saved design map:", DESIGN_ALL_OUT)
else:
    # Reuse the cache, re-normalizing the key columns after the CSV round trip.
    df_design = pd.read_csv(DESIGN_ALL_OUT)
    df_design = df_design.assign(
        run=df_design["run"].map(normalize_run),
        trial_in_run=df_design["trial_in_run"].map(to_int_safe),
    )

print("\n--- DESIGN MAP ---")
print("design rows:", df_design.shape)

# Coverage audit: which participant x run blocks of the EEG index lack a design file?
block_keys = ["participant", "run"]
need_blocks = df_idx[block_keys].drop_duplicates()
have_blocks = df_design[block_keys].drop_duplicates()
cov = pd.merge(need_blocks, have_blocks, on=block_keys, how="left", indicator=True)
missing_blocks = cov.loc[cov["_merge"] == "left_only", block_keys].sort_values(["run", "participant"])
missing_blocks.to_csv(
    OUT_AUDIT / "missing_design_participant_run.csv",
    index=False,
    encoding="utf-8-sig",
)
print("missing participant×run blocks:", len(missing_blocks))
print("saved:", OUT_AUDIT / "missing_design_participant_run.csv")

# ============================================================
# 4) JOIN: EEG index <- design
# ============================================================
# Left join keeps every EEG trial; validate= makes pandas reject duplicated keys.
df = pd.merge(
    df_idx,
    df_design.drop(columns=["source_csv"], errors="ignore"),
    on=["participant", "run", "trial_in_run"],
    how="left",
    validate="one_to_one",
)

# A trial without FileName after the join found no design row.
missing_design = int(df["FileName"].isna().sum())
print("\n[CHECK] design join missing rows:", missing_design)

if missing_design > 0:
    # Log up to 500 unmatched trials, then stop.
    df.loc[
        df["FileName"].isna(),
        ["participant", "subject", "run", "trial_in_run"],
    ].head(500).to_csv(
        OUT_AUDIT / "missing_design_rows_head500.csv",
        index=False,
        encoding="utf-8-sig",
    )
    raise ValueError(
        f"design（提示順→音）が付与できていません: missing={missing_design}。\n"
        f"監査ログ: {OUT_AUDIT / 'missing_design_rows_head500.csv'} と "
        f"{OUT_AUDIT / 'missing_design_participant_run.csv'} を確認してください。"
    )

# ============================================================
# 5) JOIN: QC metrics (keyed on subject)
# ============================================================
qc_cols = [
    "n_eeg_channels",
    "max_ptp_uv",
    "mean_ptp_uv",
    "median_ptp_uv",
    "n_flat",
    "n_extreme",
    "bad_ratio",
    "qc_pass",
    "qc_reason",
]
missing = [col for col in qc_cols if col not in df_qc.columns]
if missing:
    raise ValueError(f"QCファイルに列が不足しています: {missing}")

# Keep only the join keys and QC metrics, and prefix the metrics so their
# origin (amplitude QC) stays recognisable in the merged table.
df_qc_small = df_qc.loc[:, ["subject", "run", "trial_in_run", *qc_cols]].rename(
    columns={
        "n_eeg_channels": "qc_n_eeg_channels",
        "max_ptp_uv":     "qc_max_ptp_uv",
        "mean_ptp_uv":    "qc_mean_ptp_uv",
        "median_ptp_uv":  "qc_median_ptp_uv",
        "n_flat":         "qc_n_flat",
        "n_extreme":      "qc_n_extreme",
        "bad_ratio":      "qc_bad_ratio",
        "qc_pass":        "qc_amp_pass",
        "qc_reason":      "qc_amp_reason",
    }
)

df = pd.merge(
    df,
    df_qc_small,
    on=["subject", "run", "trial_in_run"],
    how="left",
    validate="one_to_one",
)

# A missing qc_amp_pass after the left join is treated as "no QC row matched".
missing_qc = int(df["qc_amp_pass"].isna().sum())
print("\n[CHECK] qc join missing rows:", missing_qc)

if missing_qc > 0:
    df.loc[
        df["qc_amp_pass"].isna(),
        ["participant", "subject", "run", "trial_in_run"],
    ].head(500).to_csv(
        OUT_AUDIT / "missing_qc_rows_head500.csv",
        index=False,
        encoding="utf-8-sig",
    )
    raise ValueError(
        f"QC が付与できていない行があります: missing={missing_qc}\n"
        f"監査ログ: {OUT_AUDIT / 'missing_qc_rows_head500.csv'}"
    )

# With no missing values left, fix the final flag columns.
# NOTE(review): astype(bool) turns any non-empty string into True; this assumes
# qc_pass was parsed as boolean/numeric by read_csv — confirm against the QC file.
df = df.assign(
    qc_amp_pass=lambda t: t["qc_amp_pass"].astype(bool),
    qc_pass=lambda t: t["qc_amp_pass"],
    FileName_norm=lambda t: t["FileName"].map(normalize_filename_like_hcu),
)

# ============================================================
# 6) Audit output
# ============================================================
# Distinct trial count per participant x run.
audit_trials = (
    df.groupby(["participant", "run"])
    .agg(n_trials=("trial_in_run", "nunique"))
    .reset_index()
)
audit_trials.to_csv(
    OUT_AUDIT / "audit_trials_per_participant_run.csv",
    index=False,
    encoding="utf-8-sig",
)

# Share of trials passing QC per participant x run.
audit_qc = (
    df.groupby(["participant", "run"])
    .agg(qc_pass_rate=("qc_pass", "mean"))
    .reset_index()
)
audit_qc.to_csv(
    OUT_AUDIT / "audit_qc_pass_rate_per_participant_run.csv",
    index=False,
    encoding="utf-8-sig",
)

print("\n--- AUDIT SUMMARY ---")
print("trial count min/max:", audit_trials["n_trials"].min(), audit_trials["n_trials"].max())
print("qc_pass rate by run:")
print(df.groupby("run")["qc_pass"].mean())

# ============================================================
# 7) Reorder columns & save
# ============================================================
# Identification, stimulus and QC columns go first (when present);
# every remaining column follows in its existing order.
front_cols = [
    col
    for col in (
        "participant", "subject", "run", "trial_in_run",
        "number", "FileName", "FileName_norm", "category",
        "qc_pass", "qc_amp_pass", "qc_amp_reason",
    )
    if col in df.columns
]
df = df[[*front_cols, *(col for col in df.columns if col not in front_cols)]]

df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

print(
    "\n=== DONE ===",
    f"Saved: {OUT_CSV}",
    f"rows: {len(df)}",
    f"qc_pass True: {int(df['qc_pass'].sum())}",
    f"Design map saved: {DESIGN_ALL_OUT}",
    f"Audit logs: {OUT_AUDIT}",
    sep="\n",
)


ROOT_DIR    : /Users/shunsuke/EEG_48sounds
DESIGN_DIR  : /Users/shunsuke/EEG_48sounds/design
INDEX_CSV   : /Users/shunsuke/EEG_48sounds/derivatives/epoch_index_by_trial.csv
QC_TRIAL_CSV: /Users/shunsuke/EEG_48sounds/derivatives/qc_all/qc_by_trial_all.csv
OUT_CSV     : /Users/shunsuke/EEG_48sounds/derivatives/master_epoch_index.csv

--- LOAD OK ---
epoch_index_by_trial: (1728, 17)
qc_by_trial_all     : (1728, 14)
delete old design map cache: /Users/shunsuke/EEG_48sounds/derivatives/design_trial_map_all_participant_runs.csv
Saved design map: /Users/shunsuke/EEG_48sounds/derivatives/design_trial_map_all_participant_runs.csv

--- DESIGN MAP ---
design rows: (1728, 8)
missing participant×run blocks: 0
saved: /Users/shunsuke/EEG_48sounds/output/qc_audit/missing_design_participant_run.csv

[CHECK] design join missing rows: 0

[CHECK] qc join missing rows: 0

--- AUDIT SUMMARY ---
trial count min/max: 48 48
qc_pass rate by run:
run
run1    0.909722
run2    0.977431
run3    0.869792
Name: qc_pa