In [25]:
from __future__ import annotations

from pathlib import Path
import pandas as pd
import numpy as np
import re
import unicodedata


# =========================================================
# 0. Basic settings (normally the only section that needs adapting)
# =========================================================

# Prefer the absolute project folder; when it does not exist on this machine,
# fall back to an "EEG_48sounds" folder under the current working directory.
ROOT_DIR = Path("/Users/shunsuke/EEG_48sounds")
ROOT_DIR = ROOT_DIR if ROOT_DIR.exists() else Path.cwd() / "EEG_48sounds"

DERIV_DIR = ROOT_DIR / "derivatives"
OUT_DIR = ROOT_DIR / "output"
AUDIT_DIR = OUT_DIR / "integration_audit"
MASTER_DIR = DERIV_DIR / "master_tables"

# Create every working directory up front (a no-op when it already exists).
for _work_dir in (DERIV_DIR, OUT_DIR, AUDIT_DIR, MASTER_DIR):
    _work_dir.mkdir(parents=True, exist_ok=True)

# ---- master outputs ----
MASTER_SOUND_CSV = MASTER_DIR / "master_sound_level.csv"
MASTER_PARTICIPANT_CSV = MASTER_DIR / "master_participant_level.csv"
MASTER_PARTICIPANT_SOUND_CSV = MASTER_DIR / "master_participant_sound_level.csv"
# ---- EEG participant x sound aggregate, saved as an intermediate product ----
EEG_PARTICIPANT_SOUND_CSV = DERIV_DIR / "eeg_features_participant_sound.csv"


# Echo the resolved locations so a wrong root is noticed immediately.
for _label, _location in (
    ("ROOT_DIR :", ROOT_DIR),
    ("DERIV_DIR:", DERIV_DIR),
    ("AUDIT_DIR:", AUDIT_DIR),
    ("MASTER_DIR:", MASTER_DIR),
):
    print(_label, _location)


# =========================================================
# 1. 小ユーティリティ（読み込み・保存・正規化）
# =========================================================

def load_csv_candidates(candidates: list[Path], desc: str, required: bool = False) -> pd.DataFrame | None:
    """Read the first existing CSV among ``candidates``.

    The candidates are probed in order so that a file can live in any of
    several known locations. Every probe and the final outcome are printed.

    Args:
        candidates: Paths to try, in priority order.
        desc: Human-readable label used in the log and in the error message.
        required: When True, a missing file is an error instead of ``None``.

    Returns:
        The loaded DataFrame, or ``None`` when nothing was found and
        ``required`` is False.

    Raises:
        FileNotFoundError: No candidate exists and ``required`` is True.
    """
    print(f"\n=== LOAD: {desc} ===")
    for path in candidates:
        print("  try:", path)
        if not path.exists():
            continue
        table = pd.read_csv(path, low_memory=False)
        print(f"  -> FOUND: {path}  shape={table.shape}")
        return table

    print("  -> NOT FOUND")
    if not required:
        return None
    tried = [str(path) for path in candidates]
    raise FileNotFoundError(f"[MISSING] {desc} が見つかりません。候補: {tried}")


def save_csv(df: pd.DataFrame, path: Path, desc: str):
    """Write ``df`` to ``path`` as CSV and log the destination and shape.

    The parent directory is created when needed. The file is written without
    the index and encoded as utf-8-sig (UTF-8 with BOM).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    print(
        f"\n=== SAVE: {desc} ===",
        f"  path : {path}",
        f"  shape: {df.shape}",
        sep="\n",
    )
    df.to_csv(path, index=False, encoding="utf-8-sig")


def normalize_run(x) -> str:
    """Canonicalise a run label (1, run1, Run1, RUN 1, ...) to ``run<N>``.

    Missing values come back as NaN. A value without any digits is returned
    as ``str(x)``, unchanged.
    """
    if pd.isna(x):
        return np.nan

    compact = str(x).strip().lower().replace(" ", "")
    if compact[:3] == "run":
        compact = compact[3:]

    digits = re.search(r"(\d+)", compact)
    if digits is None:
        return str(x)
    return f"run{int(digits.group(1))}"


def to_int_safe(x):
    """Convert ``x`` to int via float (so "1.0" works); NaN on any failure.

    Missing input also yields NaN. Fractional values are truncated by
    ``int()``.
    """
    if pd.isna(x):
        return np.nan
    try:
        number = int(float(x))
    except Exception:
        number = np.nan
    return number


def normalize_filename_like_hcu(x: str) -> str:
    """Normalise a sound file name into the key used to join the tables.

    Examples:
        "1：flute_-9dBA_final.wav" -> "flute.wav"
        "flute_-9dBA.wav"          -> "flute.wav"

    Full-width/half-width variants are folded with NFKC first. Missing input
    is returned as NaN.
    """
    if pd.isna(x):
        return np.nan

    name = unicodedata.normalize("NFKC", str(x)).strip()
    rewrites = (
        # Leading ordinal such as "1:" (ASCII or full-width colon).
        (r"^\s*\d+\s*[:：]\s*", "", 0),
        # Level suffix such as "_-9dBA..." in front of the .wav extension.
        (r"(_-?\d+dBA.*)\.wav$", ".wav", re.IGNORECASE),
    )
    for pattern, replacement, flags in rewrites:
        name = re.sub(pattern, replacement, name, flags=flags)
    return name


def subject_to_participant(subject: str) -> str | None:
    """Map an EEG subject label to the ``Pxx`` participant form.

    Handles labels such as '1_高見', '01_高見', 'P01' and 'sub-01'. Returns
    ``None`` for missing input or when the label contains no digits.
    """
    if subject is None:
        return None
    if isinstance(subject, float) and np.isnan(subject):
        return None

    text = str(subject).strip()

    # A number directly after P/p wins; otherwise take the first run of
    # digits in the label (which covers labels that start with the number).
    for pattern in (r"[Pp](\d{1,3})", r"(\d{1,3})"):
        hit = re.search(pattern, text)
        if hit is not None:
            return f"P{int(hit.group(1)):02d}"

    return None


def make_subject_id_series(participant_series: pd.Series) -> pd.Series:
    """Derive a numeric ``subject_id`` from participant labels (P01, P02, ...).

    The first run of digits in each label becomes the id. When fewer than
    half of the rows (and fewer than two rows, unless the series itself has
    a single row) contain digits, ids are assigned from the sorted distinct
    labels instead, starting at 1.

    Args:
        participant_series: Participant labels; may contain missing values.

    Returns:
        A nullable ``Int64`` Series aligned with the input index. Rows whose
        label is missing (or, in the digit branch, has no digits) are ``<NA>``.
    """
    missing = participant_series.isna()
    labels = participant_series.astype(str)
    # Mask missing rows explicitly so the stringified "nan"/"None" placeholder
    # can never be mistaken for a real label.
    digits = labels.str.extract(r"(\d+)", expand=False).where(~missing)

    n_rows = len(labels)
    n_ok = int(digits.notna().sum())

    # Same rule as before (at least 2 hits and at least half of the rows),
    # except that a one-row series only needs its single row to match;
    # otherwise "P07" alone would be renumbered to 1.
    needed = max(min(2, n_rows), int(0.5 * n_rows))
    if n_ok >= needed:
        # to_numeric handles digit strings mixed with NaN on every pandas
        # version, unlike a direct object -> Int64 cast.
        return pd.to_numeric(digits, errors="coerce").astype("Int64")

    # Digits are mostly unavailable: number the distinct labels 1..k.
    # Missing labels get category code -1, so they are blanked out again
    # instead of receiving an id of their own.
    codes = labels.where(~missing).astype("category").cat.codes + 1
    return codes.where(~missing).astype("Int64")


def assert_unique(df: pd.DataFrame, keys: list[str], name: str):
    """Stop the pipeline when ``keys`` do not identify rows of ``df`` uniquely.

    On failure, up to 50 of the offending rows (sorted by the keys) are
    written to ``AUDIT_DIR / dup_<name>.csv`` before raising.

    Raises:
        ValueError: At least one row repeats an earlier key combination.
    """
    dup = df.duplicated(subset=keys).sum()
    if dup == 0:
        return

    log_path = AUDIT_DIR / f"dup_{name}.csv"
    offenders = df[df.duplicated(subset=keys, keep=False)]
    offenders.sort_values(keys).head(50).to_csv(log_path, index=False, encoding="utf-8-sig")
    raise ValueError(f"[{name}] キー重複があります: keys={keys}, dup_rows={dup}\nログ: {log_path}")


# =========================================================
# 2. Candidate locations of the input CSVs (tolerates files being moved)
# =========================================================

# ---- HCU / physical features ----
HCU_CANDIDATES = [
    base / "HCU400_48sounds_subset.csv"
    for base in (ROOT_DIR, DERIV_DIR)
]
SOUND_FEATURE_CANDIDATES = [
    base / "sound_features_48.csv"
    for base in (ROOT_DIR, DERIV_DIR, DERIV_DIR / "sound_features")
]

# ---- subjective ratings: several directories are searched, in this order ----
RATINGS_DIRS = [
    ROOT_DIR / "output" / "ratings",
    DERIV_DIR / "behavioral",  # where the files actually live in this project
    DERIV_DIR / "ratings",
    ROOT_DIR,
    DERIV_DIR,
]

def cand(*names):
    """Return ``directory / name`` for every ratings directory and file name.

    The result is ordered directory-first: all ``names`` under the first entry
    of ``RATINGS_DIRS``, then all ``names`` under the second, and so on.
    """
    return [folder / filename for folder in RATINGS_DIRS for filename in names]

# ---- subjective rating tables (searched in every ratings directory) ----
SOUND_SUMMARY_CANDIDATES = cand("sound_summary.csv")
PARTICIPANT_SOUND_LONG_CANDIDATES = cand("participant_sound_long.csv")
PARTICIPANT_SUMMARY_CANDIDATES = cand("participant_summary.csv")
RATINGS_LONG_CANDIDATES = cand("ratings_long.csv")

# ---- ambiguity lists ----
AMB_APPR_CANDIDATES = cand("ambiguous_sounds_by_Approach_sd_top10.csv")
AMB_NEG_CANDIDATES = cand("ambiguous_negative_sounds.csv")


# ---- EEG features (expected outputs of step 5; either directly under
# ---- derivatives/ or under derivatives/eeg_features/) ----
EEG_SOUND_CANDIDATES = [
    base / "eeg_features_sound.csv"
    for base in (DERIV_DIR, DERIV_DIR / "eeg_features")
]
EEG_SUBJECT_CANDIDATES = [
    base / "eeg_features_subject.csv"
    for base in (DERIV_DIR, DERIV_DIR / "eeg_features")
]
EEG_TRIAL_CANDIDATES = [
    base / "eeg_features_trial.csv"
    for base in (DERIV_DIR, DERIV_DIR / "eeg_features")
]

# ---- EEG epoch index (the join backbone used in step 6); the second name
# ---- rescues an index produced by an earlier version of the pipeline ----
MASTER_EPOCH_INDEX_CANDIDATES = [
    DERIV_DIR / filename
    for filename in ("master_epoch_index.csv", "master_epoch_index_run1_with_sound.csv")
]


# =========================================================
# 3. main（ここから本体）
# =========================================================

def main():
    """Build and save the sound-, participant- and participant x sound-level master tables.

    Pipeline (section numbers match the inline headers):
      3. Load every input CSV and normalise the join keys
         (``FileName_norm``, ``number``, ``participant``).
      4. master_sound_level: sound_summary + HCU + physical features +
         ambiguity flags + optional sound-level EEG features.
      5. master_participant_level: participant_summary + optional
         subject-level EEG features.
      6. master_participant_sound_level: participant_sound_long + HCU +
         physical features + ambiguity flags + optional trial-level EEG
         features averaged per participant x sound.

    Side effects:
        Writes the master CSVs (plus the aggregated EEG participant x sound
        CSV when trial features exist), writes audit CSVs into ``AUDIT_DIR``
        and prints progress to stdout.

    Raises:
        FileNotFoundError: A required ratings CSV is missing.
        ValueError: An input violates an integration precondition (row count
            other than 48, duplicated join keys, missing key columns, no
            usable EEG feature columns, ...).
    """
    # -----------------------------------------------------
    # 3-1) Load inputs
    # -----------------------------------------------------
    df_hcu = load_csv_candidates(HCU_CANDIDATES, "HCU subset (48 sounds)", required=False)
    df_sound_feat = load_csv_candidates(SOUND_FEATURE_CANDIDATES, "Physical sound features (48 sounds)", required=False)

    df_sound_sum = load_csv_candidates(SOUND_SUMMARY_CANDIDATES, "Subjective summary per sound", required=True)
    df_ps_long = load_csv_candidates(PARTICIPANT_SOUND_LONG_CANDIDATES, "participant x sound ratings (canonical)", required=True)
    df_ps_summary = load_csv_candidates(PARTICIPANT_SUMMARY_CANDIDATES, "participant summary", required=True)
    # Loaded for reference only: its presence/shape appears in the log, but
    # it is not merged into any master table below.
    df_ratings_long = load_csv_candidates(RATINGS_LONG_CANDIDATES, "ratings_long (for reference)", required=False)

    df_amb_appr = load_csv_candidates(AMB_APPR_CANDIDATES, "ambiguous (Approach SD top10)", required=False)
    df_amb_neg = load_csv_candidates(AMB_NEG_CANDIDATES, "ambiguous_negative", required=False)

    df_eeg_sound = load_csv_candidates(EEG_SOUND_CANDIDATES, "EEG sound-level features (optional)", required=False)
    df_eeg_subject = load_csv_candidates(EEG_SUBJECT_CANDIDATES, "EEG subject-level features (optional)", required=False)
    df_eeg_trial = load_csv_candidates(EEG_TRIAL_CANDIDATES, "EEG trial-level features (optional)", required=False)

    df_master_epoch = load_csv_candidates(MASTER_EPOCH_INDEX_CANDIDATES, "master_epoch_index (highly recommended)", required=False)

    # -----------------------------------------------------
    # 3-2) Minimal normalisation (FileName / number / participant)
    # -----------------------------------------------------
    # sound_summary is the base of the sound-level table and must contain
    # exactly the 48 sounds, so verify that before anything is merged.
    if len(df_sound_sum) != 48:
        (df_sound_sum.head(200)).to_csv(AUDIT_DIR / "sound_summary_head200.csv", index=False, encoding="utf-8-sig")
        raise ValueError(f"sound_summary が48行ではありません: {len(df_sound_sum)}行。ログ: {AUDIT_DIR/'sound_summary_head200.csv'}")

    # Make sure both rating tables carry FileName_norm (sound_summary may
    # have only FileName_norm; other tables may have FileName or filename).
    for _df, name in [
        (df_sound_sum, "sound_summary"),
        (df_ps_long, "participant_sound_long"),
    ]:
        # An existing FileName_norm is trusted as-is.
        if "FileName_norm" not in _df.columns:
            if "FileName" in _df.columns:
                _df["FileName_norm"] = _df["FileName"].map(normalize_filename_like_hcu)
            elif "filename" in _df.columns:
                # Also accept the lower-case column name.
                _df["FileName_norm"] = _df["filename"].map(normalize_filename_like_hcu)
            else:
                raise ValueError(f"{name} に FileName/FileName_norm/filename がありません。列名を確認してください。")

        # Safety net: later logging/audit code reads FileName, so guarantee it.
        if "FileName" not in _df.columns:
            _df["FileName"] = _df["FileName_norm"]

    if df_hcu is not None:
        # The HCU table often uses "filename"; align it to "FileName".
        if "filename" in df_hcu.columns and "FileName" not in df_hcu.columns:
            df_hcu = df_hcu.rename(columns={"filename": "FileName"})
        if "FileName" in df_hcu.columns:
            df_hcu["FileName_norm"] = df_hcu["FileName"].map(normalize_filename_like_hcu)

    if df_sound_feat is not None:
        # The physical-feature key may be called sound_no; rename to number.
        if "sound_no" in df_sound_feat.columns and "number" not in df_sound_feat.columns:
            df_sound_feat = df_sound_feat.rename(columns={"sound_no": "number"})
        if "number" in df_sound_feat.columns:
            df_sound_feat["number"] = df_sound_feat["number"].map(to_int_safe)

    # The participant column is the key of everything downstream.
    if "participant" not in df_ps_long.columns:
        raise ValueError("participant_sound_long.csv に participant 列がありません。主観評価スクリプトの出力を確認してください。")
    if "participant" not in df_ps_summary.columns:
        raise ValueError("participant_summary.csv に participant 列がありません。主観評価スクリプトの出力を確認してください。")

    # Cast number to int on the rating side as well.
    if "number" in df_ps_long.columns:
        df_ps_long["number"] = df_ps_long["number"].map(to_int_safe)
    if "number" in df_sound_sum.columns:
        df_sound_sum["number"] = df_sound_sum["number"].map(to_int_safe)

    # -----------------------------------------------------
    # 3-3) Ambiguity flags (empty sets when the lists are absent)
    # -----------------------------------------------------
    amb_appr_files = set(df_amb_appr["FileName_norm"]) if (df_amb_appr is not None and "FileName_norm" in df_amb_appr.columns) else set()
    amb_neg_files = set(df_amb_neg["FileName_norm"]) if (df_amb_neg is not None and "FileName_norm" in df_amb_neg.columns) else set()

    # Rescue: when FileName_norm yielded nothing, derive the names from FileName.
    if df_amb_appr is not None and len(amb_appr_files) == 0 and "FileName" in df_amb_appr.columns:
        amb_appr_files = set(df_amb_appr["FileName"].map(normalize_filename_like_hcu))
    if df_amb_neg is not None and len(amb_neg_files) == 0 and "FileName" in df_amb_neg.columns:
        amb_neg_files = set(df_amb_neg["FileName"].map(normalize_filename_like_hcu))

    # =====================================================
    # 4. master_sound_level (one row per sound)
    #   - base : subjective sound_summary (48 rows, checked above)
    #   - added: HCU, physical features, ambiguity flags, EEG sound features
    # =====================================================
    df_sound_master = df_sound_sum.copy()

    # HCU join on FileName_norm (robust against file-name spelling variants).
    if df_hcu is not None and "FileName_norm" in df_hcu.columns:
        # Duplicated keys on the HCU side would multiply rows; stop first.
        assert_unique(df_hcu, ["FileName_norm"], "hcu(FileName_norm)")
        df_sound_master = df_sound_master.merge(df_hcu, on="FileName_norm", how="left", suffixes=("", "_hcu"))
        print("Merged HCU:", df_hcu.shape)
    else:
        print("[INFO] HCUが無い or FileName_normが無い → HCU統合をスキップ")

    # Physical-feature join on number. Both sides must have the key;
    # otherwise the merge would fail with a KeyError.
    if df_sound_feat is None or "number" not in df_sound_feat.columns:
        print("[INFO] 物理特徴が無い → 統合をスキップ")
    elif "number" not in df_sound_master.columns:
        print("[INFO] sound_summary 側に number 列が無い → 物理特徴の統合をスキップ")
    else:
        assert_unique(df_sound_feat, ["number"], "sound_features(number)")
        df_sound_master = df_sound_master.merge(df_sound_feat, on="number", how="left", suffixes=("", "_phys"))
        print("Merged physical:", df_sound_feat.shape)

    # Ambiguity flags.
    df_sound_master["is_ambiguous_approach_sd_top10"] = df_sound_master["FileName_norm"].isin(amb_appr_files)
    df_sound_master["is_ambiguous_negative"] = df_sound_master["FileName_norm"].isin(amb_neg_files)

    # Optional sound-level EEG features, joined on number.
    if df_eeg_sound is not None:
        # The step-5 output may call the key sound_id.
        if "sound_id" in df_eeg_sound.columns and "number" not in df_eeg_sound.columns:
            df_eeg_sound = df_eeg_sound.rename(columns={"sound_id": "number"})

        if "number" not in df_eeg_sound.columns:
            print("[INFO] eeg_features_sound に number/sound_id が無い → 音レベルEEG統合をスキップ")
        elif "number" not in df_sound_master.columns:
            print("[INFO] sound_summary 側に number 列が無い → 音レベルEEG統合をスキップ")
        else:
            df_eeg_sound["number"] = df_eeg_sound["number"].map(to_int_safe)
            # Duplicated numbers (e.g. one row per run) must be aggregated
            # upstream; merging them here would multiply rows.
            if df_eeg_sound.duplicated(subset=["number"]).any():
                df_eeg_sound.to_csv(AUDIT_DIR / "eeg_sound_dup_number.csv", index=False, encoding="utf-8-sig")
                raise ValueError("EEG sound-level features に number 重複があります。先に⑤側で集約してください。")
            df_sound_master = df_sound_master.merge(df_eeg_sound, on="number", how="left", suffixes=("", "_eeg"))
            print("Merged EEG sound-level:", df_eeg_sound.shape)
    else:
        print("[INFO] eeg_features_sound が無い → 音レベルEEG統合をスキップ")

    # Audit: the merges must not have added or dropped sounds.
    if len(df_sound_master) != 48:
        df_sound_master.to_csv(AUDIT_DIR / "sound_master_not_48.csv", index=False, encoding="utf-8-sig")
        raise ValueError("master_sound_level が48行になっていません。ログを確認してください。")

    save_csv(df_sound_master, MASTER_SOUND_CSV, "master_sound_level (48 sounds)")

    # =====================================================
    # 5. master_participant_level (one row per participant)
    #   - base : participant_summary (subjective)
    #   - added: EEG subject-level features (when available)
    # =====================================================
    df_participant_master = df_ps_summary.copy()

    # Numeric subject_id, convenient for statistical models.
    df_participant_master["subject_id"] = make_subject_id_series(df_participant_master["participant"])

    if df_eeg_subject is not None:
        # The EEG id column name varies (subject / subj_id / ...).
        id_col = None
        for c in ["participant", "subject", "subj_id", "subject_id"]:
            if c in df_eeg_subject.columns:
                id_col = c
                break

        if id_col is None:
            print("[INFO] eeg_features_subject にID列が見当たらない → 統合スキップ")
        else:
            if id_col != "participant":
                df_eeg_subject["participant"] = df_eeg_subject[id_col].apply(subject_to_participant)
            df_eeg_subject = df_eeg_subject.dropna(subset=["participant"]).copy()

            # Several rows per participant (e.g. one per run): average the
            # numeric columns down to one row per participant before merging.
            if df_eeg_subject.duplicated(subset=["participant"]).any():
                # The grouping column itself must not be selected, even if it
                # happens to be numeric, or reset_index() would collide with it.
                num_cols = [
                    c for c in df_eeg_subject.columns
                    if c != "participant" and pd.api.types.is_numeric_dtype(df_eeg_subject[c])
                ]
                df_eeg_subject = df_eeg_subject.groupby("participant")[num_cols].mean().reset_index()

            df_participant_master = df_participant_master.merge(
                df_eeg_subject,
                on="participant",
                how="left",
                suffixes=("", "_eeg"),
            )
            print("Merged EEG subject-level:", df_eeg_subject.shape)
    else:
        print("[INFO] eeg_features_subject が無い → participant-level EEG統合をスキップ")

    save_csv(df_participant_master, MASTER_PARTICIPANT_CSV, "master_participant_level")

    # =====================================================
    # 6. master_participant_sound_level (one row per participant x sound)
    #   - base : participant_sound_long (subjective)
    #   - added: HCU / physical features / ambiguity flags
    #   - added: EEG trial features averaged per participant x sound
    #
    # master_epoch_index, when available, supplies the sound identity of
    # each EEG trial via (participant, run, trial_in_run).
    # =====================================================
    df_ps_master = df_ps_long.copy()

    # Numeric subject_id.
    df_ps_master["subject_id"] = make_subject_id_series(df_ps_master["participant"])

    # Sound-level information is repeated on every participant x sound row.
    if df_hcu is not None and "FileName_norm" in df_hcu.columns:
        assert_unique(df_hcu, ["FileName_norm"], "hcu(FileName_norm)")
        df_ps_master = df_ps_master.merge(df_hcu, on="FileName_norm", how="left", suffixes=("", "_hcu"))

    if df_sound_feat is not None and "number" in df_sound_feat.columns and "number" in df_ps_master.columns:
        assert_unique(df_sound_feat, ["number"], "sound_features(number)")
        df_ps_master = df_ps_master.merge(df_sound_feat, on="number", how="left", suffixes=("", "_phys"))

    df_ps_master["is_ambiguous_approach_sd_top10"] = df_ps_master["FileName_norm"].isin(amb_appr_files)
    df_ps_master["is_ambiguous_negative"] = df_ps_master["FileName_norm"].isin(amb_neg_files)

    print("\nBase df_ps_master (behavior + HCU + physical):", df_ps_master.shape)

    # -----------------------------------------------------
    # 6-1) Optional EEG trial features -> participant x sound
    # -----------------------------------------------------
    if df_eeg_trial is None:
        print("[INFO] eeg_features_trial が無い → participant×sound EEG統合をスキップ")
        save_csv(df_ps_master, MASTER_PARTICIPANT_SOUND_CSV, "master_participant_sound_level (no EEG trial)")
        print("\nSaved masters.")
        return

    # Keep only trials whose features were computed successfully.
    if "feature_ok" in df_eeg_trial.columns:
        df_eeg_trial = df_eeg_trial[df_eeg_trial["feature_ok"].eq(True)].copy()

    # Normalise run / trial_in_run when present.
    if "run" in df_eeg_trial.columns:
        df_eeg_trial["run"] = df_eeg_trial["run"].map(normalize_run)
    if "trial_in_run" in df_eeg_trial.columns:
        df_eeg_trial["trial_in_run"] = df_eeg_trial["trial_in_run"].map(to_int_safe)

    # Derive participant from a subject-like column when it is not present.
    if "participant" not in df_eeg_trial.columns:
        id_col = None
        for c in ["subject", "subj_id", "subject_id"]:
            if c in df_eeg_trial.columns:
                id_col = c
                break
        if id_col is not None:
            df_eeg_trial["participant"] = df_eeg_trial[id_col].apply(subject_to_participant)

    # Without any id column the trials cannot be attributed to a participant;
    # fail with an explicit message instead of a bare KeyError from dropna().
    if "participant" not in df_eeg_trial.columns:
        df_eeg_trial.head(200).to_csv(AUDIT_DIR / "eeg_trial_no_participant_id.csv", index=False, encoding="utf-8-sig")
        raise ValueError("EEG trial に participant を作れる列がありません（participant/subject/subj_id/subject_id が無い）。ログを確認してください。")

    df_eeg_trial = df_eeg_trial.dropna(subset=["participant"]).copy()

    # -----------------------------------------------------
    # 6-2) Attach sound identity from master_epoch_index
    # With the index, number / FileName_norm / qc_pass are attached to each
    # trial through the (participant, run, trial_in_run) key.
    # -----------------------------------------------------
    if df_master_epoch is not None:
        # Normalise the index side; missing columns are only warned about.
        need_cols = ["participant", "run", "trial_in_run", "number", "FileName_norm", "qc_pass"]
        for c in need_cols:
            if c not in df_master_epoch.columns:
                print("[WARN] master_epoch_index に列が足りない:", c)

        if "run" in df_master_epoch.columns:
            df_master_epoch["run"] = df_master_epoch["run"].map(normalize_run)
        if "trial_in_run" in df_master_epoch.columns:
            df_master_epoch["trial_in_run"] = df_master_epoch["trial_in_run"].map(to_int_safe)
        if "number" in df_master_epoch.columns:
            df_master_epoch["number"] = df_master_epoch["number"].map(to_int_safe)
        if "FileName_norm" not in df_master_epoch.columns and "FileName" in df_master_epoch.columns:
            df_master_epoch["FileName_norm"] = df_master_epoch["FileName"].map(normalize_filename_like_hcu)

        if "participant" not in df_master_epoch.columns and "subject" in df_master_epoch.columns:
            df_master_epoch["participant"] = df_master_epoch["subject"].apply(subject_to_participant)

        need_join_keys = ["participant", "run", "trial_in_run"]
        ok_trial_side = all(k in df_eeg_trial.columns for k in need_join_keys)
        ok_master_side = all(k in df_master_epoch.columns for k in need_join_keys)

        if ok_trial_side and ok_master_side:
            attach_cols = [c for c in ["number", "FileName_norm", "qc_pass"] if c in df_master_epoch.columns]

            # The index is de-duplicated on the join key (first row wins) so
            # that the many_to_one validation holds.
            df_eeg_trial = df_eeg_trial.merge(
                df_master_epoch[need_join_keys + attach_cols].drop_duplicates(subset=need_join_keys),
                on=need_join_keys,
                how="left",
                validate="many_to_one",
                suffixes=("", "_epoch"),
            )

            # When a column exists on both sides, the epoch-index value
            # replaces the trial-side one and the helper column is dropped.
            for c in ["number", "FileName_norm", "qc_pass"]:
                ce = f"{c}_epoch"
                if ce in df_eeg_trial.columns:
                    df_eeg_trial[c] = df_eeg_trial[ce]
                    df_eeg_trial.drop(columns=[ce], inplace=True)
        else:
            print("[INFO] master_epoch_index と結合する3キーが不足 → 音同定の精度が落ちます（推奨：列を揃える）")
    else:
        print("[INFO] master_epoch_index が無い → EEG trialから音同定できない可能性あり（推奨：先に作る）")

    # When qc_pass is available, keep only passing trials. Rows without a
    # qc_pass value (e.g. unmatched in the index) are dropped as well.
    if "qc_pass" in df_eeg_trial.columns:
        before = len(df_eeg_trial)
        df_eeg_trial = df_eeg_trial[df_eeg_trial["qc_pass"].eq(True)].copy()
        print(f"[EEG trial] qc_pass filter: {before} -> {len(df_eeg_trial)}")

    # -----------------------------------------------------
    # 6-3) Aggregate per participant x sound (number or FileName_norm)
    # -----------------------------------------------------
    # Pick the best sound key that actually has values.
    group_keys = ["participant"]
    if "number" in df_eeg_trial.columns and df_eeg_trial["number"].notna().any():
        group_keys.append("number")
    elif "FileName_norm" in df_eeg_trial.columns and df_eeg_trial["FileName_norm"].notna().any():
        group_keys.append("FileName_norm")
    elif "FileName" in df_eeg_trial.columns:
        df_eeg_trial["FileName_norm"] = df_eeg_trial["FileName"].map(normalize_filename_like_hcu)
        group_keys.append("FileName_norm")
    else:
        df_eeg_trial.head(200).to_csv(AUDIT_DIR / "eeg_trial_cannot_identify_sound.csv", index=False, encoding="utf-8-sig")
        raise ValueError("EEG trialから音を同定する列がありません（number/FileNameが無い）。ログを確認してください。")

    def is_meta(c: str) -> bool:
        """Return True for id/bookkeeping columns that are not EEG features."""
        base_meta = {
            "subject", "subj_id", "subject_id", "participant",
            "run", "run_id", "trial_index", "trial_in_run",
            "FileName", "FileName_norm", "category",
            "qc_pass", "feature_ok", "number", "sound_id",
        }
        if c in base_meta:
            return True
        # Suffixed leftovers of merges (number_x, qc_pass_x, ...) are meta too.
        return c.startswith(("number_", "qc_pass_", "FileName_", "FileName_norm_"))

    # EEG features = numeric columns that are not meta columns.
    num_cols = df_eeg_trial.select_dtypes(include=[np.number]).columns
    eeg_feat_cols = [c for c in num_cols if not is_meta(c)]

    if len(eeg_feat_cols) == 0:
        df_eeg_trial.head(200).to_csv(AUDIT_DIR / "eeg_trial_no_numeric_features.csv", index=False, encoding="utf-8-sig")
        raise ValueError("EEG trial特徴量として使える数値列がありません。⑤の出力を確認してください。")

    print(f"\nEEG feature columns n={len(eeg_feat_cols)} (first 15): {eeg_feat_cols[:15]}")

    print("[STEP] start groupby mean ...")
    grouped = df_eeg_trial.groupby(group_keys)
    df_eeg_psound = grouped[eeg_feat_cols].mean().reset_index()
    # mean() and size() come from the same grouping, so their rows line up.
    df_eeg_psound["n_trials_eeg"] = grouped.size().values
    print("[STEP] done groupby mean")

    # Drop participant x sound pairs that do not exist on the rating side.
    before = len(df_eeg_psound)

    if ("number" in df_eeg_psound.columns) and ("number" in df_ps_master.columns):
        key_keep = df_ps_master[["participant", "number"]].drop_duplicates()
        df_eeg_psound = df_eeg_psound.merge(key_keep, on=["participant", "number"], how="inner")
        print(f"[CHECK] df_eeg_psound rows (trim by number): {before} -> {len(df_eeg_psound)}")

    elif ("FileName_norm" in df_eeg_psound.columns) and ("FileName_norm" in df_ps_master.columns):
        key_keep = df_ps_master[["participant", "FileName_norm"]].drop_duplicates()
        df_eeg_psound = df_eeg_psound.merge(key_keep, on=["participant", "FileName_norm"], how="inner")
        print(f"[CHECK] df_eeg_psound rows (trim by FileName_norm): {before} -> {len(df_eeg_psound)}")

    else:
        print("[INFO] df_eeg_psound trim skipped (no common key: number/FileName_norm)")

    # Save the aggregated EEG table on its own as well (a step-6 product).
    save_csv(df_eeg_psound, EEG_PARTICIPANT_SOUND_CSV, "eeg_features_participant_sound (aggregated from trials)")

    # -----------------------------------------------------
    # 6-4) Merge into the subjective df_ps_master
    # -----------------------------------------------------
    join_keys = ["participant"]
    if ("number" in df_eeg_psound.columns) and ("number" in df_ps_master.columns):
        join_keys.append("number")
    elif ("FileName_norm" in df_eeg_psound.columns) and ("FileName_norm" in df_ps_master.columns):
        join_keys.append("FileName_norm")
    else:
        df_eeg_psound.to_csv(AUDIT_DIR / "eeg_psound_join_key_missing.csv", index=False, encoding="utf-8-sig")
        raise ValueError("participant×sound EEG集約のJOINキーが主観側と合いません。ログを確認してください。")

    # Same suffix convention as every other EEG merge in this function: on a
    # name collision the existing column keeps its name and the EEG column
    # gets "_eeg" (instead of pandas' default _x/_y renaming of both).
    df_ps_master = df_ps_master.merge(df_eeg_psound, on=join_keys, how="left", suffixes=("", "_eeg"))

    # Audit: share of rows that received no EEG data.
    no_eeg = df_ps_master["n_trials_eeg"].isna()
    miss_eeg = no_eeg.mean()
    print(f"\n[CHECK] EEG merge missing rate (n_trials_eeg is NaN): {miss_eeg:.3f}")
    # Export only the id columns that exist; "number" is absent when the join
    # ran on FileName_norm, and selecting it would raise a KeyError.
    audit_cols = [c for c in ["participant", "number", "FileName", "FileName_norm"] if c in df_ps_master.columns]
    df_ps_master.loc[no_eeg, audit_cols].head(500).to_csv(
        AUDIT_DIR / "missing_eeg_psound_rows_head500.csv",
        index=False, encoding="utf-8-sig"
    )
    print("missing EEG rows log:", AUDIT_DIR / "missing_eeg_psound_rows_head500.csv")

    # -----------------------------------------------------
    # 6-5) Save
    # -----------------------------------------------------
    save_csv(df_ps_master, MASTER_PARTICIPANT_SOUND_CSV, "master_participant_sound_level")

    print("\n=== ALL DONE ===")
    print(" sound-level       :", MASTER_SOUND_CSV)
    print(" participant-level :", MASTER_PARTICIPANT_CSV)
    print(" participant×sound :", MASTER_PARTICIPANT_SOUND_CSV)


# Entry point: run the integration only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()


ROOT_DIR : /Users/shunsuke/EEG_48sounds
DERIV_DIR: /Users/shunsuke/EEG_48sounds/derivatives
AUDIT_DIR: /Users/shunsuke/EEG_48sounds/output/integration_audit
MASTER_DIR: /Users/shunsuke/EEG_48sounds/derivatives/master_tables

=== LOAD: HCU subset (48 sounds) ===
  try: /Users/shunsuke/EEG_48sounds/HCU400_48sounds_subset.csv
  -> FOUND: /Users/shunsuke/EEG_48sounds/HCU400_48sounds_subset.csv  shape=(48, 10103)

=== LOAD: Physical sound features (48 sounds) ===
  try: /Users/shunsuke/EEG_48sounds/sound_features_48.csv
  try: /Users/shunsuke/EEG_48sounds/derivatives/sound_features_48.csv
  try: /Users/shunsuke/EEG_48sounds/derivatives/sound_features/sound_features_48.csv
  -> FOUND: /Users/shunsuke/EEG_48sounds/derivatives/sound_features/sound_features_48.csv  shape=(48, 71)

=== LOAD: Subjective summary per sound ===
  try: /Users/shunsuke/EEG_48sounds/output/ratings/sound_summary.csv
  try: /Users/shunsuke/EEG_48sounds/derivatives/behavioral/sound_summary.csv
  -> FOUND: /Users/shunsuke/