In [85]:
####ACTUAL CODE
import os, re
import numpy as np
import pandas as pd
import mne

# ---------- PATHS ----------
# Hard-coded locations used by the rest of this script:
#   edf_file_path       - EDF recording read with mne.io.read_raw_edf
#   annotation_csv_path - annotation CSV read with pandas.read_csv
#   output_dir          - directory that receives the window-feature CSV
#   emg_ref_stats_path  - Excel workbook read with pandas.read_excel
edf_file_path      = r"C:\Users\user\OneDrive\Documents\UCC Studies Sem 1\IS6611_Applied Research in Business Analytics\Dataset\TUH EEG Dataset\aaaaaauj\s003_2003\02_tcp_le\aaaaaauj_s003_t001.edf"
annotation_csv_path= r"C:\Users\user\OneDrive\Documents\UCC Studies Sem 1\IS6611_Applied Research in Business Analytics\Dataset\TUH EEG Dataset\aaaaaauj\s003_2003\02_tcp_le\aaaaaauj_s003_t001.csv"
output_dir         = r"C:\Users\user\OneDrive\Documents\UCC Studies Sem 1\IS6611_Applied Research in Business Analytics\Dataset\Preprocessed LE Output"
emg_ref_stats_path = r"C:\Users\user\OneDrive\Documents\UCC Studies Sem 1\IS6611_Applied Research in Business Analytics\Dataset\EMG waves.xlsx"     # Excel file with EMG feature rows

# Create the output directory if it is missing; an existing one is not an error.
os.makedirs(output_dir, exist_ok=True)

# ---------- 1. LOAD REFERENCE-EMG STATS ----------
# Read the reference workbook and keep only the EMG statistic columns
# (mean / std / min / max / hjorth_*, matched case-insensitively) plus the
# 'label' column used to pick rows per class.
ref_df = pd.read_excel(emg_ref_stats_path)
_emg_stat_pattern = re.compile(r'EMG.*_(mean|std|min|max|hjorth_)', re.I)
keep_cols = list(filter(_emg_stat_pattern.search, ref_df.columns))
keep_cols.append('label')
ref_df = ref_df[keep_cols]

def random_stats(label, random_state=None):
    """Pick one reference-EMG row for *label* and summarise it.

    A single row is sampled from the rows of the module-level ``ref_df``
    whose ``'label'`` equals *label*; when no such row exists the rows with
    label ``0`` are used instead.  Each returned statistic is the mean of all
    columns of that row whose name contains the matching fragment
    (``'_mean'``, ``'_std'``, ``'_min'``, ``'_max'``, ``'hjorth_activity'``,
    ``'hjorth_mobility'``, ``'hjorth_complexity'``).

    Parameters
    ----------
    label
        Class label to look up in ``ref_df['label']``.
    random_state
        Optional seed / generator forwarded to ``DataFrame.sample`` so the
        chosen row is reproducible.  ``None`` (default) keeps the original
        unseeded behaviour.

    Returns
    -------
    dict
        Keys ``mean``, ``std``, ``min``, ``max``, ``activity``, ``mobility``
        and ``complexity``.

    Raises
    ------
    ValueError
        If neither *label* nor the fallback label ``0`` has any row.
    """
    pool = ref_df[ref_df['label'] == label]
    if pool.empty:
        # Unknown label: fall back to the label-0 rows.
        pool = ref_df[ref_df['label'] == 0]
    if pool.empty:
        raise ValueError(
            f"No reference EMG rows for label {label!r} or fallback label 0"
        )

    row = pool.sample(1, random_state=random_state).iloc[0]

    def _avg(fragment):
        # Mean over every column of the sampled row whose name contains fragment.
        return row.filter(like=fragment).mean()

    return dict(mean=_avg('_mean'),
                std=_avg('_std'),
                min=_avg('_min'),
                max=_avg('_max'),
                activity=_avg('hjorth_activity'),
                mobility=_avg('hjorth_mobility'),
                complexity=_avg('hjorth_complexity'))

# ---------- 2. HELPER FUNCTIONS ----------
def hjorth(sig):
    """Return the Hjorth parameters ``(activity, mobility, complexity)`` of *sig*.

    activity   = var(sig)
    mobility   = sqrt(var(d1) / var(sig)), or 0 when var(sig) is falsy
    complexity = sqrt(var(d2) / var(d1)) / mobility, or 0 when var(d1) or
                 mobility is falsy
    where d1 and d2 are the first and second differences of *sig*.
    """
    first_diff = np.diff(sig)
    second_diff = np.diff(first_diff)

    var_sig = np.var(sig)
    var_d1 = np.var(first_diff)
    var_d2 = np.var(second_diff)

    activity = var_sig

    # Guard against division by a zero variance (flat signal).
    if var_sig:
        mobility = np.sqrt(var_d1 / var_sig)
    else:
        mobility = 0

    # Complexity needs both a non-zero first-difference variance and mobility.
    if var_d1 and mobility:
        complexity = np.sqrt(var_d2 / var_d1) / mobility
    else:
        complexity = 0

    return activity, mobility, complexity

def synth_emg(n, target, j_std=.15, j_mean=.10, seed=None):
    """Generate *n* samples of Gaussian noise shaped by reference EMG stats.

    The mean and standard deviation from *target* are each multiplied by a
    uniform random jitter factor, and ``n`` normal samples are drawn with
    those values.  When ``target['activity']`` is positive the samples are
    then rescaled so their variance equals that activity.

    The rescale is applied about the sample mean.  Scaling the raw samples
    would multiply the mean by the same factor and throw away the target
    mean that was just drawn.

    Parameters
    ----------
    n : int
        Number of samples to generate.
    target : mapping
        Must provide ``'mean'``, ``'std'`` and ``'activity'``.
    j_std, j_mean : float
        Relative jitter half-widths: the std / mean are multiplied by a
        factor drawn uniformly from ``[1 - j, 1 + j)``.
    seed
        Seed passed to ``numpy.random.default_rng`` for reproducible output.

    Returns
    -------
    numpy.ndarray
        1-D array of length *n*.
    """
    rng = np.random.default_rng(seed)
    mu = target['mean'] * rng.uniform(1 - j_mean, 1 + j_mean)
    sd = target['std'] * rng.uniform(1 - j_std, 1 + j_std)
    sig = rng.normal(mu, sd, n)

    # An empty signal has no variance to match (and np.var would warn).
    cur_var = np.var(sig) if sig.size else 0.0
    if cur_var and target['activity'] > 0:
        centre = sig.mean()
        scale = np.sqrt(target['activity'] / cur_var)
        # Scale only the deviations so the mean is left where it was drawn.
        sig = (sig - centre) * scale + centre
    return sig

# ---------- 3. LOAD EDF ----------
# Read the full recording into memory so it can be filtered/resampled in place.
raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)
print("Raw sfreq:", raw.info['sfreq'])

# Group channels by a case-insensitive substring match on the channel name.
# A channel may match more than one group; dict.fromkeys de-duplicates while
# keeping first-seen order.
eeg=[c for c in raw.ch_names if 'EEG' in c.upper()]
emg=[c for c in raw.ch_names if 'EMG' in c.upper()]
ekg=[c for c in raw.ch_names if 'EKG' in c.upper()]
resp=[c for c in raw.ch_names if 'RESP'in c.upper()]
sel=list(dict.fromkeys(eeg+emg+ekg+resp))
print("Selected:", sel)

# --- average reference for EEG (remove if undesired)
if eeg:
    raw.set_eeg_reference('average', projection=True)
    # projection=True only *adds* the reference as a projector; it must be
    # applied explicitly, otherwise get_data() below returns un-referenced data.
    raw.apply_proj()

# --- band-pass 0.5-40 Hz
raw.filter(0.5,40.0, verbose=False)

# === RESAMPLE to 250 Hz ====================================
target_fs = 250
# Compare the rate directly: truncating with int() would treat e.g. 250.5 Hz
# as already being at the target rate.
if raw.info['sfreq'] != target_fs:
    print(f"Resampling {raw.info['sfreq']} Hz → {target_fs} Hz …")
    raw.resample(target_fs, npad="auto", verbose=False)
# ===========================================================

# data: one row per channel in `sel`; times: sample times in seconds.
data,times = raw.get_data(picks=sel, return_times=True)
sfreq = raw.info['sfreq']          # equals target_fs after the step above
print("Post-resample sfreq:", sfreq)

# ---------- 4. LOAD ANNOTATIONS ----------
# Lines starting with '#' in the CSV are skipped as comments; surrounding
# whitespace is stripped from every column name.
ann = pd.read_csv(annotation_csv_path, comment='#').rename(columns=str.strip)

# ---------- 5. WINDOW LOOP ----------
# Sliding windows of win_s seconds advanced by step_s seconds, expressed in
# samples at the current sampling rate.
win_s = 10.0
step_s = 5.0
n_win = int(win_s * sfreq)
n_step = int(step_s * sfreq)
rows = []
header = []

for start in range(0, len(times) - n_win + 1, n_step):
    stop = start + n_win
    t_first = times[start]
    t_last = times[stop - 1]
    segment = data[:, start:stop]

    # Annotations whose [start_time, stop_time] interval touches this window.
    touches_window = (ann['start_time'] <= t_last) & (ann['stop_time'] >= t_first)
    hits = ann[touches_window]

    # Window label:
    #   0 - no overlapping annotation, or every overlapping label is 'bckg'
    #   1 - any overlapping label contains 'fnsz' (focal seizure)
    #   3 - otherwise, any overlapping label contains 'cpsz' (complex partial)
    #   2 - any other non-background label
    window_label = 0
    if not hits.empty:
        hit_labels = hits['label']
        if not hit_labels.str.contains('bckg', case=False).all():
            if hit_labels.str.contains('fnsz', case=False).any():
                window_label = 1
            elif hit_labels.str.contains('cpsz', case=False).any():
                window_label = 3
            else:
                window_label = 2

    # Use the first recorded EMG channel when one exists; otherwise synthesise
    # a trace from reference statistics for this label, seeded by the window
    # start index.
    if emg:
        emg_trace = segment[sel.index(emg[0])]
    else:
        emg_trace = synth_emg(n_win, random_stats(window_label), seed=start)

    # Eight features per selected channel, in header order:
    # mean, std, energy, max, min, Hjorth activity / mobility / complexity.
    features = []
    for channel in segment:
        activity, mobility, complexity = hjorth(channel)
        features.extend([
            channel.mean(),
            channel.std(),
            np.dot(channel, channel),
            channel.max(),
            channel.min(),
            activity,
            mobility,
            complexity,
        ])

    # Seven EMG features followed by the label.
    emg_activity, emg_mobility, emg_complexity = hjorth(emg_trace)
    features.extend([
        emg_trace.mean(),
        emg_trace.std(),
        emg_trace.min(),
        emg_trace.max(),
        emg_activity,
        emg_mobility,
        emg_complexity,
        window_label,
    ])
    rows.append(features)

# ---------- 6. HEADER & SAVE ----------
# Column names mirror the per-window feature order: eight statistics for each
# selected channel, then the seven EMG statistics, then the label.
channel_stat_names = ("mean", "std", "energy", "max", "min",
                      "hjorth_activity", "hjorth_mobility",
                      "hjorth_complexity")
header.extend(f"{ch}_{stat}" for ch in sel for stat in channel_stat_names)
header.extend([
    "SyntheticEMG_mean",
    "SyntheticEMG_std",
    "SyntheticEMG_min",
    "SyntheticEMG_max",
    "SyntheticEMG_hjorth_activity",
    "SyntheticEMG_hjorth_mobility",
    "SyntheticEMG_hjorth_complexity",
    "label",
])

# One row per window; written without the pandas index column.
df = pd.DataFrame(data=rows, columns=header)
out_csv = os.path.join(output_dir, "aaaaaauj_window_features_s003_2003_t001.csv")
df.to_csv(out_csv, index=False)
print(f"✅ Saved {len(df)} windows → {out_csv}")


Raw sfreq: 250.0
Selected: ['EEG FP1-LE', 'EEG FP2-LE', 'EEG F3-LE', 'EEG F4-LE', 'EEG C3-LE', 'EEG C4-LE', 'EEG A1-LE', 'EEG A2-LE', 'EEG P3-LE', 'EEG P4-LE', 'EEG O1-LE', 'EEG O2-LE', 'EEG F7-LE', 'EEG F8-LE', 'EEG T3-LE', 'EEG T4-LE', 'EEG T5-LE', 'EEG T6-LE', 'EEG FZ-LE', 'EEG CZ-LE', 'EEG PZ-LE', 'EEG OZ-LE', 'EEG PG1-LE', 'EEG PG2-LE', 'EEG EKG-LE', 'EEG SP2-LE', 'EEG SP1-LE', 'EEG 28-LE', 'EEG 29-LE', 'EEG 30-LE', 'EEG T1-LE', 'EEG T2-LE']
EEG channel type selected for re-referencing
Adding average EEG reference projection.
1 projection items deactivated
Average reference projection was added, but has not been applied yet. Use the apply_proj method to apply it.
Post-resample sfreq: 250.0
✅ Saved 275 windows → C:\Users\user\OneDrive\Documents\UCC Studies Sem 1\IS6611_Applied Research in Business Analytics\Dataset\Preprocessed LE Output\aaaaaauj_window_features_s003_2003_t001.csv
