In [16]:
from pathlib import Path
import numpy as np
import pandas as pd
import mne
import math

# =========================
# PATHS (edit only this section if your layout differs)
# =========================
# Project root; "~" is expanded to the current user's home directory.
ROOT = Path("~/EEG_48sounds").expanduser()
# Epochs file read with mne.read_epochs below.
EPOCHS_PATH   = ROOT / "derivatives/epochs_all/epochs_all-epo.fif"
# Trial-level feature table; its (subject, run, trial) keys define which epochs are kept.
TRIAL_FEAT    = ROOT / "moduleB_outputs/tables/moduleB_trial_eeg_features.csv"
MASTER_PC     = ROOT / "derivatives/master_tables/master_sound_level_with_PC.csv"   # <- adjust to your actual path
# Output directory for the result CSV; created (with parents) if it does not exist.
OUT_DIR       = ROOT / "moduleB_outputs/tables/ERP_onset_under1000ms"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# =========================
# util
# =========================
def _to_int_like(s: pd.Series) -> pd.Series:
    """Coerce a column of IDs to pandas' nullable ``Int64`` dtype.

    The values are first parsed as plain numbers. If more than half of them
    fail to parse, the first run of digits in each value's string form is
    used instead (e.g. ``"sub-01"`` -> ``1``). Anything that still cannot be
    parsed becomes ``<NA>``.
    """
    numeric = pd.to_numeric(s, errors="coerce")
    mostly_unparsed = numeric.isna().mean() > 0.5
    if mostly_unparsed:
        # Fall back to the first digit run embedded in the text.
        first_digits = s.astype(str).str.extract(r"(\d+)")[0]
        numeric = pd.to_numeric(first_digits, errors="coerce")
    return numeric.astype("Int64")

def ensure_key(df: pd.DataFrame, dst: str, cands: list) -> pd.DataFrame:
    """Guarantee that ``df`` has an integer-like key column named ``dst``.

    If ``dst`` already exists it is converted in place with ``_to_int_like``;
    otherwise the first name in ``cands`` that is a column of ``df`` is
    converted and stored under ``dst``. ``df`` itself is modified and
    returned.

    Raises:
        RuntimeError: neither ``dst`` nor any candidate is a column of ``df``.
    """
    if dst in df.columns:
        source = dst
    else:
        # First alias present in the frame, or None when nothing matches.
        source = next((name for name in cands if name in df.columns), None)

    if source is None:
        raise RuntimeError(f"'{dst}' を作れません。候補 {cands} がありません: cols={list(df.columns)}")

    df[dst] = _to_int_like(df[source])
    return df

def add_highlow(df):
    """Add median-split indicator columns for the ``emo_*`` ratings.

    For each of ``emo_arousal``, ``emo_approach`` and ``emo_valence`` that is
    a column of ``df``, a ``<col>_high`` column is added to a copy of ``df``:

    * ``1`` when the value is >= the column median (ties go to "high"),
    * ``0`` when it is below the median,
    * ``<NA>`` when the value is missing or not numeric.

    The new columns use the nullable ``Int64`` dtype so that unrated rows are
    not silently counted as "low". The input frame is not modified.

    Returns:
        A copy of ``df`` with the extra ``*_high`` columns.
    """
    df = df.copy()
    for col in ["emo_arousal", "emo_approach", "emo_valence"]:
        if col not in df.columns:
            continue
        v = pd.to_numeric(df[col], errors="coerce")
        med = v.median()
        high = (v >= med).astype("Int64")
        # A comparison with NaN is False, which would label missing ratings
        # as "low"; blank those rows out instead.
        df[f"{col}_high"] = high.mask(v.isna())
    return df

def add_is_ambiguous(df):
    """Return a copy of ``df`` that has an integer ``is_ambiguous`` column.

    An existing ``is_ambiguous`` column is coerced to int (unparseable or
    missing values become 0). Otherwise the first fallback column that is
    present is used as the source; when none is present the column is
    filled with 0.
    """
    out = df.copy()

    def _as_flag(values):
        # Numeric coercion; anything missing or unparseable counts as 0.
        return pd.to_numeric(values, errors="coerce").fillna(0).astype(int)

    if "is_ambiguous" in out.columns:
        out["is_ambiguous"] = _as_flag(out["is_ambiguous"])
        return out

    fallbacks = (
        "is_ambiguous_approach_sd_top10",
        "ambiguous",
        "is_ambiguous_flag",
        "is_ambiguous_top10",
    )
    source = next((name for name in fallbacks if name in out.columns), None)

    if source is None:
        out["is_ambiguous"] = 0
    elif out[source].dtype == bool:
        out["is_ambiguous"] = out[source].astype(int)
    else:
        out["is_ambiguous"] = _as_flag(out[source])
    return out

# =========================
# 1) Load epochs (preload is required)
# =========================
# Read the saved epochs with their data loaded into memory; only errors are logged.
epochs = mne.read_epochs(EPOCHS_PATH, preload=True, verbose="ERROR")
print("Loaded epochs:", len(epochs))
# Every later step joins on metadata columns, so stop early when there is none.
if epochs.metadata is None:
    raise RuntimeError("epochs.metadata が None です。")

# Work on a copy with a fresh 0..n-1 index.
meta = epochs.metadata.reset_index(drop=True).copy()

# Cope with varying metadata column names: build integer-like
# subject_id / run_id / trial_in_run / number columns from the first alias present.
meta = ensure_key(meta, "subject_id", ["subject_id","participant","subject","sub","sub_id","participant_id"])
meta = ensure_key(meta, "run_id", ["run_id","run","block","session"])
meta = ensure_key(meta, "trial_in_run", ["trial_in_run","trial","trial_index","trial_num"])
meta = ensure_key(meta, "number", ["number","sound_id","stim_id","stimulus_id"])
# Write the normalised table back onto the epochs object.
epochs.metadata = meta

# =========================
# 2) Align to the "keep" list (ModuleB trials are the reference)
# =========================
# Trial-level feature table; only its key columns are used here.
df_feat = pd.read_csv(TRIAL_FEAT)
df_feat = ensure_key(df_feat, "subject_id", ["subject_id","participant","subject"])
df_feat = ensure_key(df_feat, "run_id", ["run_id","run"])
df_feat = ensure_key(df_feat, "trial_in_run", ["trial_in_run","trial"])

# One row per (subject, run, trial) that should be retained.
keys3 = ["subject_id","run_id","trial_in_run"]
keep = df_feat[keys3].drop_duplicates().copy()
keep["_keep"] = True

# Left merge: `keep` is de-duplicated on the join keys, so each epoch row
# appears once; epochs without a match get NaN in "_keep".
meta2 = epochs.metadata.merge(keep, on=keys3, how="left")
keep_mask = meta2["_keep"].fillna(False).to_numpy(dtype=bool)

# Subset the epochs and the merged metadata with the same mask so they stay
# row-aligned, then drop the helper column before storing the metadata.
epochs = epochs[keep_mask]
meta2 = meta2.loc[keep_mask].reset_index(drop=True)
epochs.metadata = meta2.drop(columns=["_keep"], errors="ignore")

print("Aligned epochs after keep:", len(epochs))
print(epochs.metadata[keys3 + ["number"]].head())

# =========================
# 3) Normalise to 19 channels
# =========================
# Canonical channel labels, in the order channels are arranged after picking.
CANON_19 = ["Fp1","Fp2","F7","F3","Fz","F4","F8","T7","C3","Cz","C4","T8","P7","P3","Pz","P4","P8","O1","O2"]

def _norm_ch(name: str) -> str:
    """Reduce a raw channel label to an upper-case comparison key.

    The label is upper-cased and then a fixed sequence of substring
    replacements is applied in order: the ``EEG`` prefix and spaces are
    removed, ``-REF``/``REF`` and hyphens are removed, the tokens
    ``A1``/``A2``/``M1``/``M2`` are removed, and the labels T3/T4/T5/T6 are
    mapped to T7/T8/P7/P8.
    """
    # Order matters: each replacement sees the output of the previous one.
    rewrites = (
        ("EEG", ""), (" ", ""),
        ("-REF", ""), ("REF", ""),
        ("-", ""),
        ("A1", ""), ("A2", ""), ("M1", ""), ("M2", ""),
        ("T3", "T7"), ("T4", "T8"), ("T5", "P7"), ("T6", "P8"),
    )
    label = name.upper()
    for old, new in rewrites:
        label = label.replace(old, new)
    return label

def pick_and_rename_19ch(ep: mne.Epochs) -> mne.Epochs:
    """Return a copy of ``ep`` restricted to the canonical channels.

    EEG channels are selected, each channel name is normalised with
    ``_norm_ch`` and matched against ``CANON_19``. Matching channels are
    kept, renamed to their canonical label and arranged in ``CANON_19``
    order; canonical channels that are absent are simply skipped.

    Args:
        ep: Epochs to process. The caller's object is not modified.

    Returns:
        A new Epochs object holding only the matched channels.

    Raises:
        RuntimeError: no channel matches any entry of ``CANON_19``.
    """
    # Single working copy: the pick/rename/reorder calls below act in place
    # on this local object, so further copies would only duplicate the data.
    ep = ep.copy().pick_types(eeg=True, eog=False, stim=False, misc=False)
    cur = list(ep.ch_names)
    # NOTE: if two channels normalise to the same key, the later one wins.
    norm_map = {_norm_ch(c): c for c in cur}

    # (original name, canonical name) pairs in CANON_19 order.
    matched = [
        (norm_map[canon.upper()], canon)
        for canon in CANON_19
        if canon.upper() in norm_map
    ]
    if not matched:
        raise RuntimeError(f"[channels] 19ch が1つも一致しません: {cur}")

    keep_ch = [orig for orig, _ in matched]
    ep.pick_channels(keep_ch, ordered=True)
    ep.rename_channels(dict(matched))
    ep.reorder_channels([c for c in CANON_19 if c in ep.ch_names])
    return ep

# Reduce the recording to the canonical channel set.
epochs = pick_and_rename_19ch(epochs)
print("EEG ch_names:", epochs.ch_names)

# =========================
# 4) Crop + baseline (<1000 ms)
# =========================
# Keep -0.2 s up to (but excluding) 1.0 s, then baseline-correct on -0.2..0 s.
epochs = (
    epochs.copy()
    .crop(tmin=-0.2, tmax=1.0, include_tmax=False)
    .apply_baseline((-0.2, 0.0))
)
print("After crop/baseline:", epochs.tmin, epochs.tmax, "n=", len(epochs))

# =========================
# 5) New in this version: attach master_sound labels via `number`
# =========================
ms = pd.read_csv(MASTER_PC)

# Build the `number` key column of master_sound from the first alias present.
ms = ensure_key(ms, "number", ["number","sound_id","SoundID","stim_id","stimulus_id","sound"])
ms["number"] = _to_int_like(ms["number"])

# Fallback names for the emo_* columns (in case your file uses other names).
# A column that already has the expected name is left alone; otherwise the
# first matching candidate is renamed to it.
rename_map = {}
for dst, cands in {
    "emo_arousal":  ["emo_arousal","arousal","Arousal","PC_arousal","arousal_score"],
    "emo_approach":["emo_approach","approach","Approach","PC_approach","approach_score"],
    "emo_valence": ["emo_valence","valence","Valence","PC_valence","valence_score"],
}.items():
    if dst not in ms.columns:
        for c in cands:
            if c in ms.columns:
                rename_map[c] = dst
                break
if rename_map:
    ms = ms.rename(columns=rename_map)

# Derive the *_high median-split columns and the is_ambiguous flag.
ms = add_highlow(ms)
ms = add_is_ambiguous(ms)

# Only label columns that actually exist are attached.
label_cols = [c for c in [
    "emo_arousal","emo_approach","emo_valence",
    "emo_arousal_high","emo_approach_high","emo_valence_high",
    "is_ambiguous"
] if c in ms.columns]

if len(label_cols) == 0:
    raise RuntimeError(f"master_sound にラベル列が見つかりません: cols={list(ms.columns)}")

# Attach to epochs.metadata: one master row per `number`, left-joined so every
# epoch row is kept.
meta3 = epochs.metadata.merge(ms[["number"] + label_cols].drop_duplicates("number"), on="number", how="left")

# NaN check (a high NaN rate here suggests `number` is defined differently in the two tables).
nan_rate = meta3[label_cols].isna().mean().sort_values(ascending=False)
print("NaN rate in attached labels (top):")
print(nan_rate.head(10))

epochs.metadata = meta3

# Cast the *_high / is_ambiguous columns to nullable integers (NaN is preserved as <NA>).
for c in ["emo_arousal_high","emo_approach_high","emo_valence_high","is_ambiguous"]:
    if c in epochs.metadata.columns:
        epochs.metadata[c] = pd.to_numeric(epochs.metadata[c], errors="coerce").astype("Int64")

# List of contrasts: (metadata column, "high" value, "low" value, display label).
cond_specs = [
    ("emo_arousal_high", 1, 0, "Arousal High vs Low"),
    ("emo_approach_high", 1, 0, "Approach High vs Low"),
    ("emo_valence_high", 1, 0, "Valence High vs Low"),
    ("is_ambiguous", 1, 0, "Ambiguous vs Non-ambiguous"),
]
available = []
for col, hi, lo, label in cond_specs:
    if col in epochs.metadata.columns:
        # Print the per-value epoch counts (including missing) for inspection.
        vc = epochs.metadata[col].value_counts(dropna=False)
        print(col, dict(vc))
        # Use the contrast only when both the high and the low value occur.
        u = set(epochs.metadata[col].dropna().astype(int).unique().tolist())
        if hi in u and lo in u:
            available.append((col, hi, lo, label))

print("Available conds:", [a[0] for a in available])
if len(available) == 0:
    raise RuntimeError("High/Low比較が成立する条件がありません（0/1の片方しか無い可能性）。")

# =========================
# 6) ROI
# =========================
# Nominal channel membership of each region of interest.
ROI_DEF = {
    "frontal": ["Fp1", "Fp2", "F7", "F3", "Fz", "F4", "F8"],
    "central": ["C3", "Cz", "C4"],
    "parietal": ["P7", "P3", "Pz", "P4", "P8"],
    "occipital": ["O1", "O2"],
}
# Restrict each ROI to the channels actually present and drop ROIs that end up empty.
rois = {
    roi: picked
    for roi, picked in (
        (name, [ch for ch in wanted if ch in epochs.ch_names])
        for name, wanted in ROI_DEF.items()
    )
    if picked
}
print("ROIs used:", rois)

# =========================
# 7) windows (<1000ms)
# =========================
# Analysis windows as (start_ms, end_ms) pairs.
windows_ms = [
    (0, 80),
    (80, 140),
    (140, 220),
    (220, 350),
    (350, 500),
    (500, 800),
    (800, 1000),
]

# =========================
# 8) Mean amplitude (within-subject difference) -> sign-flip permutation
# =========================
def mean_amp_by_mask(ep: mne.Epochs, mask: np.ndarray, chs: list, t0_ms: int, t1_ms: int) -> float:
    """Mean amplitude over selected epochs, ROI channels and a time window.

    Args:
        ep: Epochs to read from (not modified).
        mask: Boolean array with one entry per epoch; True selects the epoch.
        chs: Channel names forming the ROI.
        t0_ms: Window start in milliseconds (inclusive).
        t1_ms: Window end in milliseconds (exclusive).

    Returns:
        The grand mean of the ROI-averaged signal over all selected epochs and
        all samples with ``t0_ms <= t < t1_ms``, in whatever unit
        ``get_data`` returns (no rescaling is done here). ``nan`` when no
        epoch is selected or no sample falls inside the window.
    """
    if not mask.any():
        return np.nan

    # Check the window first so no data is extracted for an empty window.
    times_ms = ep.times * 1000.0
    tm = (times_ms >= t0_ms) & (times_ms < t1_ms)
    if not tm.any():
        return np.nan

    # Read only the ROI channels of the selected epochs; this avoids copying
    # and channel-picking a whole Epochs object on every call.
    data = ep[mask].get_data(picks=chs)   # (n_epochs, n_channels, n_times)
    roi = data.mean(axis=1)               # (n_epochs, n_times)
    return float(roi[:, tm].mean())

# Unit heuristic: look at the first channel only. If its largest absolute value
# is below 1e-3 the data are taken to be in volts and are later multiplied by
# 1e6; otherwise no rescaling is applied.
sample = epochs.get_data(picks=[epochs.ch_names[0]]).ravel()
scale_to_uV = 1e6 if np.nanmax(np.abs(sample)) < 1e-3 else 1.0
print("Scale factor to uV:", scale_to_uV)

def signflip_perm_test(diff: np.ndarray, n_perm: int = 5000, seed: int = 2026):
    """Two-sided sign-flip permutation test of paired differences against zero.

    Non-finite entries of ``diff`` are discarded. The observed one-sample t
    statistic is compared with the statistics obtained after multiplying each
    difference by a random sign (+1/-1), ``n_perm`` times, using a generator
    seeded with ``seed``.

    Returns:
        ``(t_obs, p, dz, n)`` where ``dz`` is mean/sd of the differences,
        ``p`` is ``(count + 1) / (n_perm + 1)`` with ``count`` the number of
        permutations whose |t| is at least |t_obs|, and ``n`` is the number of
        finite differences. The first three are ``nan`` when ``n < 2`` or the
        differences have zero standard deviation.
    """
    values = np.asarray(diff, dtype=float)
    values = values[np.isfinite(values)]
    n = len(values)
    undefined = (np.nan, np.nan, np.nan, n)
    if n < 2:
        return undefined

    sd = values.std(ddof=1)
    if sd == 0:
        return undefined

    root_n = math.sqrt(n)
    mean_obs = values.mean()
    t_obs = mean_obs / (sd / root_n)
    effect = mean_obs / sd
    threshold = abs(t_obs)

    rng = np.random.default_rng(seed)
    hits = 0
    for _ in range(n_perm):
        flipped = values * rng.choice([-1.0, 1.0], size=n)
        spread = flipped.std(ddof=1)
        # A zero-variance permutation has no defined t and is not counted as a hit.
        if spread != 0 and abs(flipped.mean() / (spread / root_n)) >= threshold:
            hits += 1

    p_value = (hits + 1) / (n_perm + 1)
    return float(t_obs), float(p_value), float(effect), int(n)

# One result row is collected per (contrast, ROI, time window).
rows = []
# Sorted unique subject ids across all epochs.
subs = np.sort(epochs.metadata["subject_id"].dropna().astype(int).unique())

for cond_col, hi_val, lo_val, cond_label in available:
    # Drop epochs whose label for this contrast is missing.
    valid = epochs.metadata[cond_col].notna().to_numpy()
    epc = epochs[valid]
    mdc = epc.metadata.reset_index(drop=True)

    for roi, chs in rois.items():
        for (t0, t1) in windows_ms:
            # Per-subject high-minus-low differences and the two condition means.
            diffs, his, los = [], [], []

            for sid in subs:
                sid_mask = (mdc["subject_id"].astype(int).to_numpy() == int(sid))
                if sid_mask.sum() == 0:
                    continue

                # Labels of this subject's epochs and their positions within `epc`.
                y = mdc.loc[sid_mask, cond_col].astype(int).to_numpy()
                idx = np.where(sid_mask)[0]

                # Full-length boolean masks selecting this subject's high / low epochs.
                hi_mask = np.zeros(len(mdc), dtype=bool)
                lo_mask = np.zeros(len(mdc), dtype=bool)
                hi_mask[idx[y == hi_val]] = True
                lo_mask[idx[y == lo_val]] = True

                hi_uV = mean_amp_by_mask(epc, hi_mask, chs, t0, t1)
                lo_uV = mean_amp_by_mask(epc, lo_mask, chs, t0, t1)

                # A subject contributes only when both conditions yielded a value.
                if np.isfinite(hi_uV) and np.isfinite(lo_uV):
                    hi_uV *= scale_to_uV
                    lo_uV *= scale_to_uV
                    diffs.append(hi_uV - lo_uV)
                    his.append(hi_uV)
                    los.append(lo_uV)

            if len(diffs) == 0:
                continue

            # Test the subject-level differences against zero; the same seed is
            # used for every test.
            diff = np.array(diffs, dtype=float)
            t_obs, p_perm, dz, n_sub = signflip_perm_test(diff, n_perm=5000, seed=2026)

            # NOTE(review): p_perm is stored as returned; no correction across the
            # contrast x ROI x window tests is applied in this block.
            rows.append({
                "cond": cond_col,
                "contrast": cond_label,
                "roi": roi,
                "window_ms": f"{t0}-{t1}",
                "n_subjects": n_sub,
                "hi_uV_mean": float(np.nanmean(his)),
                "lo_uV_mean": float(np.nanmean(los)),
                "mean_diff_uV": float(np.nanmean(diff)),
                "sd_diff_uV": float(np.nanstd(diff, ddof=1)) if len(diff) > 1 else np.nan,
                "T_obs": t_obs,
                "p_perm": p_perm,
                "dz": dz,
            })

# Collect the per-test rows into a table; an empty table means no contrast
# produced any subject-level difference.
res = pd.DataFrame(rows)
if res.empty:
    raise RuntimeError(
        "結果が0件です。\n"
        "→ master_sound ラベル付与が失敗（NaN率が高い）か、High/Lowが被験者内で成立していない可能性。"
    )

# "window_ms" is a string such as "80-140"; sorting on it directly is
# lexicographic ("140-220" before "80-140"). Sort on the numeric window start
# instead so that windows appear in chronological order within each cond/roi.
res["_t0_ms"] = res["window_ms"].str.extract(r"^(-?\d+)")[0].astype(int)
res = (
    res.sort_values(["cond", "roi", "_t0_ms"])
    .drop(columns="_t0_ms")
    .reset_index(drop=True)
)

out_csv = OUT_DIR / "ERP_onset_under1000ms_perm.csv"
res.to_csv(out_csv, index=False)

print("\n[SAVED]", out_csv)
print(res.head(30))


Loaded epochs: 1728
Aligned epochs after keep: 1728
   subject_id  run_id  trial_in_run  number
0           1       1             1      37
1           1       1             2      44
2           1       1             3       7
3           1       1             4      25
4           1       1             5       6
EEG ch_names: ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'T7', 'C3', 'Cz', 'C4', 'T8', 'P7', 'P3', 'Pz', 'P4', 'P8', 'O1', 'O2']
After crop/baseline: -0.2 0.998 n= 1728
NaN rate in attached labels (top):
emo_arousal          0.0
emo_approach         0.0
emo_valence          0.0
emo_arousal_high     0.0
emo_approach_high    0.0
emo_valence_high     0.0
is_ambiguous         0.0
dtype: float64
emo_arousal_high {np.int64(0): np.int64(864), np.int64(1): np.int64(864)}
emo_approach_high {np.int64(0): np.int64(864), np.int64(1): np.int64(864)}
emo_valence_high {np.int64(0): np.int64(864), np.int64(1): np.int64(864)}
is_ambiguous {np.int64(0): np.int64(1368), np.int64(1): np.int64(