In [3]:
# %%
from pathlib import Path
import numpy as np
import pandas as pd
import mne

# =========================
# Path configuration
# =========================

# Project root.
# Example: the notebooks ①–⑥ (.ipynb) are expected to sit directly under
# /Users/owner/EEG_48sounds.
ROOT_DIR = Path("/Users/shunsuke/EEG_48sounds")  # ★ change this to match your environment

# Derivatives directory and the per-trial epoch index CSV read by the QC.
DERIV_DIR        = ROOT_DIR / "derivatives"
INDEX_TRIAL_CSV  = DERIV_DIR / "epoch_index_by_trial.csv"

# Output directories: one for all-subject tables, one with per-subject folders.
# NOTE: both are created as a side effect when this module/cell is executed.
QC_ALL_DIR   = DERIV_DIR / "qc_all"
QC_SUBJ_DIR  = DERIV_DIR / "qc_per_subject"
QC_ALL_DIR.mkdir(parents=True, exist_ok=True)
QC_SUBJ_DIR.mkdir(parents=True, exist_ok=True)

# =========================
# EEG channels to analyse
# =========================
# 19 channels of the 10-20 system + the two earlobe channels (A1/A2).
# Names must match the channel names stored in the epochs files exactly.
EEG_CHANNELS = [
    "EEG Fp1-Ref", "EEG Fp2-Ref",
    "EEG F3-Ref",  "EEG F4-Ref",
    "EEG C3-Ref",  "EEG C4-Ref",
    "EEG P3-Ref",  "EEG P4-Ref",
    "EEG O1-Ref",  "EEG O2-Ref",
    "EEG F7-Ref",  "EEG F8-Ref",
    "EEG T3-Ref",  "EEG T4-Ref",
    "EEG T5-Ref",  "EEG T6-Ref",
    "EEG Fz-Ref",  "EEG Cz-Ref", "EEG Pz-Ref",
    "EEG A1-Ref",  "EEG A2-Ref",
]

# =========================
# Threshold settings (tune to the waveforms after preprocessing)
# =========================
# NOTE: the preprocessing in step ② is assumed to have already applied:
#   - band-pass (0.1–40 Hz) / notch (50/100 Hz)
#   - interpolation of globally bad channels (e.g. ptp < 5 µV / > 1000 µV)
#   - average reference & baseline correction (-0.2 to 0 s)

# Peak-to-peak amplitude [µV] below which a channel is considered "almost flat".
#  -> ptp < 5 µV within one trial carries essentially no information.
FLAT_THRESH_UV = 5.0

# Peak-to-peak amplitude [µV] above which a channel is considered "extreme".
#  -> a single channel exceeding 1 mV is typically strong EMG / movement artifact.
EXTREME_THRESH_UV = 1000.0

# Bounds [µV] on the per-trial maximum ptp (max over channels).
#  -> exceeding 1.5 mV anywhere within a trial is treated as "very extreme".
TRIAL_MAX_PTP_MIN_UV = None      # lower bound is not checked; preprocessing already guarantees it
TRIAL_MAX_PTP_MAX_UV = 1500.0

# Fraction of flat/extreme channels above which a trial is rejected.
MAX_BAD_RATIO = 0.3   # trials where more than 30% of channels are clearly abnormal are rejected

MAX_N_EXTREME        = 2         # reject when 3 or more channels exceed 1 mV

# =========================
# 補助関数
# =========================

def get_eeg_picks(raw: mne.io.BaseRaw | mne.Epochs):
    """Return the channel picks for the EEG channels to analyse.

    Channels listed in ``EEG_CHANNELS`` that are present in ``raw`` take
    priority. If none of them is present, fall back to MNE's generic
    ``eeg=True`` type-based selection.

    Parameters
    ----------
    raw : mne.io.BaseRaw | mne.Epochs
        Object exposing ``ch_names`` and ``info``.

    Returns
    -------
    picks
        Result of ``mne.pick_channels`` (named channels found) or
        ``mne.pick_types`` (fallback).
    """
    available = raw.ch_names
    wanted = [name for name in EEG_CHANNELS if name in available]

    # Fallback: channel names do not match the expected naming at all.
    if not wanted:
        return mne.pick_types(raw.info, meg=False, eeg=True, eog=False, stim=False)

    return mne.pick_channels(available, wanted)


# =========================
# 1 run 分の QC
# =========================

def _failed_trial_records(subject, run, df_run, run_rel, reason):
    """Build one failed (``qc_pass=False``) trial record per row of ``df_run``."""
    return [
        {
            "subject": subject,
            "run": run,
            "trial_in_run": int(trial),
            "fif_path": str(run_rel),
            "qc_autoreject_pass": True,
            "n_eeg_channels": 0,
            "max_ptp_uv": np.nan,
            "mean_ptp_uv": np.nan,
            "median_ptp_uv": np.nan,
            "n_flat": np.nan,
            "n_extreme": np.nan,
            "bad_ratio": np.nan,
            "qc_pass": False,
            "qc_reason": reason,
        }
        for trial in df_run["trial_in_run"]
    ]


def qc_one_run(subject: str, run: str, df_run: pd.DataFrame):
    """Run amplitude-based QC on every trial of one subject/run.

    The run-level epochs file (``run_epoch_file`` column, relative to
    ``ROOT_DIR``) is read once and all trials are evaluated from it.
    No data is dropped; each trial only receives ``qc_pass`` / ``qc_reason``.

    Parameters
    ----------
    subject : str
        Subject identifier, copied into every record.
    run : str
        Run identifier, copied into every record.
    df_run : pd.DataFrame
        Index rows of this run; must contain ``trial_in_run`` and
        ``run_epoch_file``.

    Returns
    -------
    trial_records : list[dict]
        One record per index row. Rows that could not be evaluated (file
        read error, no EEG channels, or no matching epoch) get
        ``qc_pass=False`` with an explanatory ``qc_reason``.
    channel_records : list[dict]
        One record per (evaluated trial, channel) with the peak-to-peak
        amplitude and flat/extreme flags.
    """
    channel_records = []

    if df_run.empty:
        return [], channel_records

    # Epochs are matched to index rows by position, so order rows by trial.
    df_run = df_run.sort_values("trial_in_run").reset_index(drop=True)

    # Read the run-level epochs file only once (greatly reduces I/O).
    run_rel = df_run.iloc[0]["run_epoch_file"]  # from epoch_index_by_trial.csv
    run_path = ROOT_DIR / run_rel

    try:
        epochs = mne.read_epochs(run_path, preload=True, verbose="ERROR")
    except Exception as e:
        # Best-effort: one unreadable run must not abort QC of the others,
        # so the whole run is recorded as failed instead.
        print(f"[ERROR] {subject} {run}: run epochs read_error: {e}")
        failed = _failed_trial_records(
            subject, run, df_run, run_rel, f"run_read_error:{e}"
        )
        return failed, channel_records

    picks = get_eeg_picks(epochs)

    # Exclude the earlobe channels A1/A2 when present (recommended) ...
    exclude = {"EEG A1-Ref", "EEG A2-Ref"}
    picks_no_ear = [i for i in picks if epochs.ch_names[i] not in exclude]
    # ... but keep the original picks if too few channels would remain
    # (safety net for the expected 19-channel montage).
    if len(picks_no_ear) >= 10:
        picks = picks_no_ear
    ch_names = [epochs.ch_names[i] for i in picks]

    if len(picks) == 0:
        print(f"[WARN] {subject} {run}: EEG チャンネルが見つかりません。run全trialをNG扱いにします。")
        failed = _failed_trial_records(
            subject, run, df_run, run_rel, "no_eeg_channels"
        )
        return failed, channel_records

    # data: (n_epochs, n_channels, n_times)
    data = epochs.get_data()[:, picks, :]

    n_epochs = data.shape[0]
    n_rows = len(df_run)
    if n_epochs != n_rows:
        print(f"[WARN] {subject} {run}: epochs数={n_epochs} と index行数={n_rows} が不一致です（先頭から揃う分だけ評価します）")
    n_eval = min(n_epochs, n_rows)

    # Per-trial, per-channel peak-to-peak, scaled by 1e6 (V -> µV, assuming
    # the data are in volts): shape (n_eval, n_channels).
    evaluated = data[:n_eval]
    ptp_all_uv = (evaluated.max(axis=2) - evaluated.min(axis=2)) * 1e6

    n_ch = len(ch_names)
    trial_records = []
    trial_ids = df_run["trial_in_run"].iloc[:n_eval]

    for trial, ptp_uv in zip(trial_ids, ptp_all_uv):
        trial_in_run = int(trial)

        is_flat = ptp_uv < FLAT_THRESH_UV
        is_extreme = ptp_uv > EXTREME_THRESH_UV

        n_flat = int(is_flat.sum())
        n_extreme = int(is_extreme.sum())
        bad_ratio = (n_flat + n_extreme) / n_ch

        max_ptp_uv = float(ptp_uv.max())
        mean_ptp_uv = float(ptp_uv.mean())
        median_ptp_uv = float(np.median(ptp_uv))

        reasons = []
        if TRIAL_MAX_PTP_MIN_UV is not None and max_ptp_uv < TRIAL_MAX_PTP_MIN_UV:
            reasons.append("max_ptp_too_small")
        if TRIAL_MAX_PTP_MAX_UV is not None and max_ptp_uv > TRIAL_MAX_PTP_MAX_UV:
            reasons.append("max_ptp_too_large")
        if bad_ratio > MAX_BAD_RATIO:
            reasons.append("too_many_bad_channels")
        if n_extreme > MAX_N_EXTREME:
            reasons.append("too_many_extreme_channels")

        trial_records.append({
            "subject": subject,
            "run": run,
            "trial_in_run": trial_in_run,
            "fif_path": str(run_rel),  # run-level epochs file
            # The index has no autoreject column, so this is fixed to True.
            "qc_autoreject_pass": True,
            "n_eeg_channels": n_ch,
            "max_ptp_uv": max_ptp_uv,
            "mean_ptp_uv": mean_ptp_uv,
            "median_ptp_uv": median_ptp_uv,
            "n_flat": n_flat,
            "n_extreme": n_extreme,
            "bad_ratio": bad_ratio,
            "qc_pass": not reasons,
            "qc_reason": ";".join(reasons),
        })

        channel_records.extend(
            {
                "subject": subject,
                "run": run,
                "trial_in_run": trial_in_run,
                "ch_name": ch_name,
                "ptp_uv": float(amp),
                "is_flat": bool(flat),
                "is_extreme": bool(extreme),
            }
            for ch_name, amp, flat, extreme in zip(
                ch_names, ptp_uv, is_flat, is_extreme
            )
        )

    # Index rows without a corresponding epoch must not silently vanish from
    # the QC table (that would inflate the run's pass ratio): mark them failed.
    if n_rows > n_eval:
        trial_records.extend(
            _failed_trial_records(
                subject, run, df_run.iloc[n_eval:], run_rel, "missing_epoch"
            )
        )

    return trial_records, channel_records



# =========================
# 全 run 分をまとめて QC
# =========================

def run_qc_all_trials():
    """Run QC for every subject/run in the trial index and save the results.

    Reads ``INDEX_TRIAL_CSV``, calls ``qc_one_run`` for each
    ``(subject, run)`` group, and writes three CSV tables (trial-level,
    channel-level, run-level summary) both to ``QC_ALL_DIR`` (all subjects)
    and to one sub-folder per subject under ``QC_SUBJ_DIR``.

    Returns
    -------
    None
        Results are written to disk; progress is printed to stdout.
    """
    df_index = pd.read_csv(INDEX_TRIAL_CSV)
    print("Loaded:", INDEX_TRIAL_CSV)
    print("shape:", df_index.shape)
    print("columns:", df_index.columns.tolist())

    # QC each (subject, run) group.
    group_cols = ["subject", "run"]
    all_trial_records = []
    all_channel_records = []

    for (subject, run), df_run in df_index.groupby(group_cols):
        print(f"QC for subject={subject}, run={run} ...")
        trial_records, channel_records = qc_one_run(subject, run, df_run)
        all_trial_records.extend(trial_records)
        all_channel_records.extend(channel_records)

    if not all_trial_records:
        # An empty index would otherwise fail below with a KeyError on the
        # missing "subject"/"run" columns.
        print("[WARN] no trials found in index; nothing to save.")
        return

    df_trial = pd.DataFrame(all_trial_records)

    # Explicit columns: when every run failed to load there are no channel
    # records, and a column-less frame would raise KeyError on
    # df_channel["subject"] in the per-subject split below.
    channel_cols = [
        "subject", "run", "trial_in_run",
        "ch_name", "ptp_uv", "is_flat", "is_extreme",
    ]
    df_channel = pd.DataFrame(all_channel_records, columns=channel_cols)

    # Run-level summary (how usable each run is).
    agg_funcs = {
        "qc_pass": ["sum", "count"],
        "max_ptp_uv": ["mean", "median"],
        "bad_ratio": ["mean", "median"],
    }
    df_summary = df_trial.groupby(group_cols).agg(agg_funcs).reset_index()
    # Flatten the (column, aggregation) MultiIndex; the group keys have an
    # empty second level, hence the rstrip("_").
    df_summary.columns = [
        "_".join(col).rstrip("_") for col in df_summary.columns.to_flat_index()
    ]
    # Rename to more descriptive column names.
    df_summary = df_summary.rename(
        columns={
            "qc_pass_sum": "n_pass_trials",
            "qc_pass_count": "n_total_trials",
            "max_ptp_uv_mean": "mean_max_ptp_uv",
            "max_ptp_uv_median": "median_max_ptp_uv",
            "bad_ratio_mean": "mean_bad_ratio",
            "bad_ratio_median": "median_bad_ratio",
        }
    )
    df_summary["pass_ratio"] = (
        df_summary["n_pass_trials"] / df_summary["n_total_trials"]
    )

    # ===== Save the all-subject tables =====
    qc_trial_all_path   = QC_ALL_DIR / "qc_by_trial_all.csv"
    qc_channel_all_path = QC_ALL_DIR / "qc_channels_by_run_all.csv"
    qc_summary_all_path = QC_ALL_DIR / "qc_summary_by_run_all.csv"

    df_trial.to_csv(qc_trial_all_path, index=False, encoding="utf-8-sig")
    df_channel.to_csv(qc_channel_all_path, index=False, encoding="utf-8-sig")
    df_summary.to_csv(qc_summary_all_path, index=False, encoding="utf-8-sig")

    print("Saved trial-level QC (all subjects) to:", qc_trial_all_path)
    print("Saved channel-level QC (all subjects) to:", qc_channel_all_path)
    print("Saved run-level QC summary (all subjects) to:", qc_summary_all_path)

    # ===== Also save a split copy into one folder per subject =====
    for subject, df_sub_trial in df_trial.groupby("subject"):
        sub_dir = QC_SUBJ_DIR / str(subject)
        sub_dir.mkdir(parents=True, exist_ok=True)

        df_sub_trial.to_csv(sub_dir / "qc_by_trial.csv", index=False, encoding="utf-8-sig")

        df_sub_ch = df_channel[df_channel["subject"] == subject]
        df_sub_ch.to_csv(sub_dir / "qc_channels_by_run.csv", index=False, encoding="utf-8-sig")

        df_sub_sum = df_summary[df_summary["subject"] == subject]
        df_sub_sum.to_csv(sub_dir / "qc_summary_by_run.csv", index=False, encoding="utf-8-sig")

        print(f"Saved trial-level QC for subject={subject} to:", sub_dir / "qc_by_trial.csv")
        print(f"Saved channel-level QC for subject={subject} to:", sub_dir / "qc_channels_by_run.csv")
        print(f"Saved run-level summary for subject={subject} to:", sub_dir / "qc_summary_by_run.csv")


# Entry point: run the QC over all trials when executed as the main program.
if __name__ == "__main__":
    run_qc_all_trials()


Loaded: /Users/shunsuke/EEG_48sounds/derivatives/epoch_index_by_trial.csv
shape: (1728, 17)
columns: ['subject', 'run', 'trial_in_run', 'event_sample_sound_onset', 'event_time_sound_onset_sec', 'run_epoch_file', 'single_epoch_fif_file', 'single_epoch_csv_file', 'event_time_marker_onset_sec', 'marker_to_sound_shift_sec', 'event_label_used', 'hp_freq', 'lp_freq', 'notch_freqs', 'baseline', 'tmin', 'tmax']
QC for subject=01_高見, run=run1 ...
QC for subject=01_高見, run=run2 ...
QC for subject=01_高見, run=run3 ...
QC for subject=02_相川, run=run1 ...
QC for subject=02_相川, run=run2 ...
QC for subject=02_相川, run=run3 ...
QC for subject=03_江口, run=run1 ...
QC for subject=03_江口, run=run2 ...
QC for subject=03_江口, run=run3 ...
QC for subject=04_猿谷, run=run1 ...
QC for subject=04_猿谷, run=run2 ...
QC for subject=04_猿谷, run=run3 ...
QC for subject=05_斎藤, run=run1 ...
QC for subject=05_斎藤, run=run2 ...
QC for subject=05_斎藤, run=run3 ...
QC for subject=06_高達, run=run1 ...
QC for subject=06_高達, run=run2 ..