In [13]:
# ╔════════════════════════════════════════════════════════╗
# ║  RQ-2 |  EEG → Dream-Report Embedding (MLP / Classic) ║
# ╚════════════════════════════════════════════════════════╝
import ast, uuid, joblib, torch, numpy as np, pandas as pd
from pathlib import Path
from sklearn.preprocessing   import StandardScaler
from sklearn.metrics         import pairwise_distances, mean_squared_error
from torch.utils.data        import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

from src.utils.feature_cleaner       import clean_features
from src.utils.feature_selector      import select_features
from src.utils.trial_split           import prepare_splits
from src.utils.balance_topic_trials  import balance_trials_by_topic
from src.models.matching_model       import get_model
from src.train.model_configs         import MODEL_CONFIGS, CLASSIC_MODEL_CONFIGS

# ────────────────────────────────────────────────────────────
#            SINGLE PREFIX  (edit this only)
# ────────────────────────────────────────────────────────────
PREFIX = "/Users/seifelhadidi/Desktop/UNI/Thesis/eeg-dream-decoding-clean/"   # <- change & keep trailing slash

DATA_PATH  = f"{PREFIX}data/clean_full_dataset_with_avg_epochs.csv"
EMB_PATH   = f"{PREFIX}data/merged_embedded_reports.csv" # change path based on embedding type 
SAVE_DIR   = Path(f"{PREFIX}notebooks/demo_outputs_rq2")
SAVE_DIR.mkdir(parents=True, exist_ok=True)
# ────────────────────────────────────────────────────────────

# ───────────── Hyper-params / Flags ─────────────
TARGET_COLUMN   = "trimmed_vector"
FEATURE_GROUPS  = ["alpha"]
MODEL_TYPE      = "lightgbm_tuned"                 # "mlp" | "lightgbm_tuned" | …
EPOCH_TAG       = 1                     # 1  or  "avg_01"
BATCH_SIZE      = 64
EPOCHS          = 100
LR              = 1e-3
SEED            = 42
device          = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED);  np.random.seed(SEED)
# ────────────────────────────────────────────────

# ╭────────────── Load & merge ───────────────╮
eeg_df    = pd.read_csv(DATA_PATH)
target_df = pd.read_csv(EMB_PATH)
target_df["trial_name"] = target_df["trial_name"].str.replace(".edf", "", regex=False)

eeg_df = clean_features(eeg_df)
if isinstance(EPOCH_TAG, str) and EPOCH_TAG.startswith("avg"):
    eeg_df = eeg_df[eeg_df["epoch"] == EPOCH_TAG]
else:
    eeg_df["epoch"] = pd.to_numeric(eeg_df["epoch"], errors="coerce")
    eeg_df = eeg_df[eeg_df["epoch"] == EPOCH_TAG]

eeg_df = eeg_df.merge(target_df[["trial_name", TARGET_COLUMN]], on="trial_name", how="inner")
print("Merged dataframe:", eeg_df.shape)

# ╭────────────── Splits ───────────────╮
train_df_full, val_folds, test_df = prepare_splits(
    eeg_df, trial_column="trial_name", test_size=0.10, seed=SEED
)
(SAVE_DIR / "report_test_trials.txt").write_text("\n".join(test_df["trial_name"].unique()))

metrics = []

# ╭────────────── Cross-validation ───────────────╮
for fold_idx, val_df in enumerate(val_folds, start=1):
    print(f"\n▶ Fold {fold_idx}")
    train_df = train_df_full[~train_df_full["trial_name"].isin(val_df["trial_name"])].reset_index(drop=True)
    val_df   = val_df.reset_index(drop=True)

    X_train, _ = select_features(train_df, FEATURE_GROUPS, None, TARGET_COLUMN)
    X_val,   _ = select_features(val_df,   FEATURE_GROUPS, None, TARGET_COLUMN)

    Y_train = train_df[TARGET_COLUMN].apply(ast.literal_eval).tolist()
    Y_val   = val_df  [TARGET_COLUMN].apply(ast.literal_eval).tolist()
    Y_train, Y_val = np.asarray(Y_train, dtype=np.float32), np.asarray(Y_val, dtype=np.float32)
    emb_dim = Y_train.shape[1]

    scaler  = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val   = scaler.fit_transform(X_val)

    # ───────── model ─────────
    if MODEL_TYPE == "mlp":
        cfg   = MODEL_CONFIGS["simple_512"]
        model = get_model("mlp", cfg, input_dim=X_train.shape[1], output_dim=emb_dim).to(device)
        loss_fn, optim_ = nn.MSELoss(), optim.Adam(model.parameters(), lr=LR)

        loader = DataLoader(TensorDataset(torch.tensor(X_train), torch.tensor(Y_train)),
                            batch_size=BATCH_SIZE, shuffle=True)

        for ep in range(EPOCHS):
            model.train()
            for xb, yb in loader:
                xb, yb = xb.float().to(device), yb.float().to(device)
                optim_.zero_grad(); loss = loss_fn(model(xb), yb); loss.backward(); optim_.step()
        Y_val_hat = model(torch.tensor(X_val).float().to(device)).cpu().numpy()

    else:  # classic regressor
        cfg   = CLASSIC_MODEL_CONFIGS[MODEL_TYPE]
        model = get_model(cfg["model_type"], cfg, task_type="regression")
        model.fit(X_train, Y_train)
        Y_val_hat = model.predict(X_val)

    # ───────── metrics ─────────
    dists = pairwise_distances(Y_val_hat, Y_val, metric="manhattan")
    top1  = (np.argmin(dists, axis=1) == np.arange(len(dists))).mean()
    top3  = np.mean([ np.arange(len(dists))[i] in np.argsort(dists[i])[:3]
                      for i in range(len(dists)) ])
    mse   = mean_squared_error(Y_val, Y_val_hat)
    metrics.append({"fold":fold_idx, "mse":mse, "top1":top1, "top3":top3})
    print(f"    MSE {mse:.4f} | Top-1 {top1:.3f} | Top-3 {top3:.3f}")

# ╭────────────── Summary ───────────────╮
summary = pd.DataFrame(metrics)
print("\nCross-validation summary:\n", summary)
print("\nMean MSE :", summary['mse'].mean())
print("Mean Top-1:", summary['top1'].mean())
print("Mean Top-3:", summary['top3'].mean())

# Save artefacts
joblib.dump(scaler, SAVE_DIR / "rq2_scaler.pkl")
if MODEL_TYPE == "mlp":
    torch.save(model.state_dict(), SAVE_DIR / "rq2_mlp_weights.pt")
else:
    joblib.dump(model, SAVE_DIR / "rq2_regressor.pkl")


Merged dataframe: (238, 802)

▶ Fold 1




    MSE 0.0013 | Top-1 0.130 | Top-3 0.217

▶ Fold 2




    MSE 0.0013 | Top-1 0.087 | Top-3 0.130

▶ Fold 3




    MSE 0.0013 | Top-1 0.130 | Top-3 0.217

Cross-validation summary:
    fold       mse      top1      top3
0     1  0.001269  0.130435  0.217391
1     2  0.001334  0.086957  0.130435
2     3  0.001274  0.130435  0.217391

Mean MSE : 0.0012926150340815833
Mean Top-1: 0.11594202898550725
Mean Top-3: 0.18840579710144925


In [14]:
# ╔═════════════════════════════════════════════════════════════╗
# ║  RQ-2  ·  Test-set Evaluation  (MLP / Classic Regressor)   ║
# ╚═════════════════════════════════════════════════════════════╝
import ast, joblib, torch, numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import pairwise_distances, mean_squared_error
from sklearn.preprocessing import StandardScaler
from src.utils.feature_cleaner  import clean_features
from src.utils.feature_selector import select_features
from src.models.matching_model  import get_model
from src.train.model_configs    import MODEL_CONFIGS, CLASSIC_MODEL_CONFIGS

# ───── locations ────
TRIAL_FILE  = f"{PREFIX}notebooks/demo_outputs_rq2/report_test_trials.txt"

SCALER_PATH = f"{PREFIX}notebooks/demo_outputs_rq2/rq2_scaler.pkl"
MODEL_PATH  = (f"{PREFIX}notebooks/demo_outputs_rq2/rq2_mlp_weights.pt"
               if MODEL_TYPE == "mlp" else
               f"{PREFIX}notebooks/demo_outputs_rq2/rq2_regressor.pkl")

# ───── misc flags ─────
FEATURE_GROUPS = ["alpha"]
TARGET_COLUMN  = "trimmed_vector"
EPOCH_TAG      = 1                       # 1  or  "avg_01"
device         = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ╭────────────── Load data & trim to test trials ───────────────╮
with open(TRIAL_FILE) as fh:
    test_trials = [ln.strip() for ln in fh if ln.strip()]

eeg_df    = pd.read_csv(DATA_PATH)
emb_df    = pd.read_csv(EMB_PATH)
emb_df["trial_name"] = emb_df["trial_name"].str.replace(".edf", "", regex=False)
eeg_df    = clean_features(eeg_df)

# epoch filter
if isinstance(EPOCH_TAG, str) and EPOCH_TAG.startswith("avg"):
    eeg_df = eeg_df[eeg_df["epoch"] == EPOCH_TAG]
else:
    eeg_df["epoch"] = pd.to_numeric(eeg_df["epoch"], errors="coerce")
    eeg_df = eeg_df[eeg_df["epoch"] == EPOCH_TAG]

eeg_df = (
    eeg_df[eeg_df["trial_name"].isin(test_trials)]
          .merge(emb_df[["trial_name", TARGET_COLUMN]], on="trial_name", how="inner")
          .reset_index(drop=True)
)
print("Test samples:", len(eeg_df))

# ╭────────────── Feature matrix & targets ───────────────╮
X_test, _ = select_features(eeg_df, FEATURE_GROUPS, None, TARGET_COLUMN)
Y_test = np.asarray(eeg_df[TARGET_COLUMN].apply(ast.literal_eval).tolist(), dtype=np.float32)

# scale
scaler    = joblib.load(SCALER_PATH)              # fitted during training
X_scaled  = scaler.transform(X_test)

# ╭────────────── Load model & predict ───────────────╮
if MODEL_TYPE == "mlp":
    cfg   = MODEL_CONFIGS["simple_512"]
    model = get_model("mlp", cfg,
                      input_dim=X_scaled.shape[1],
                      output_dim=Y_test.shape[1]).to(device)
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    model.eval()
    with torch.no_grad():
        Y_hat = model(torch.tensor(X_scaled).float().to(device)).cpu().numpy()
else:
    cfg   = CLASSIC_MODEL_CONFIGS[MODEL_TYPE]
    model = get_model(cfg["model_type"], cfg, task_type="regression")
    model.load(MODEL_PATH)
    Y_hat = model.predict(X_scaled)

# ╭────────────── Metrics ───────────────╮
mse   = mean_squared_error(Y_test, Y_hat)
dists = pairwise_distances(Y_hat, Y_test, metric="manhattan")
idxs  = np.arange(len(dists))
top1  = (np.argmin(dists, axis=1) == idxs).mean()
top3  = np.mean([ idxs[i] in np.argsort(dists[i])[:3] for i in idxs ])

print("\n────────  TEST RESULTS  ────────")
print(f"MSE      : {mse:.4f}")
print(f"Top-1 L₁ : {top1:.3f}")
print(f"Top-3 L₁ : {top3:.3f}")

# ───── save per-trial predictions (optional) ─────
out = pd.DataFrame({
    "trial_name" : eeg_df["trial_name"],
    "L1_distance": dists.min(axis=1)
})
out.to_csv(f"{PREFIX}notebooks/demo_outputs_rq2/test_distances.csv", index=False)
print("Per-trial distances saved → notebooks/demo_outputs_rq2/test_distances.csv")


Test samples: 23





────────  TEST RESULTS  ────────
MSE      : 0.0013
Top-1 L₁ : 0.087
Top-3 L₁ : 0.217
Per-trial distances saved → notebooks/demo_outputs_rq2/test_distances.csv


