In [8]:
import numpy as np
import pandas as pd
from pathlib import Path

def vectorize_connectome(matrix):
    """Flatten the strict upper triangle of a 2-D array into a 1-D vector.

    Entries on the main diagonal are excluded (offset ``k=1``); the selected
    entries are returned in row-major order.

    Parameters
    ----------
    matrix : numpy.ndarray
        Two-dimensional array to vectorize.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of the entries above the main diagonal.
    """
    upper_rows, upper_cols = np.triu_indices_from(matrix, k=1)
    return matrix[upper_rows, upper_cols]

# --- Configuration: input locations and output directory --------------------
connectome_dir = Path("../data/interim/connectomes_cc200")
pheno_csv = "../data/raw/Phenotypic_V1_0b_preprocessed1.csv"
processed_dir = Path("../data/processed")

# --- Load the phenotypic table and index it by FILE_ID -----------------------
pheno = pd.read_csv(pheno_csv)

# Strip stray whitespace from the header names.
pheno.columns = pheno.columns.str.strip()

# Locate the FILE_ID column, tolerant to case and to spaces vs. underscores.
norm_map = {col: col.upper().replace(" ", "_") for col in pheno.columns}
file_id_col = next((col for col, norm in norm_map.items() if norm == "FILE_ID"), None)

# read_csv is called without index_col, so the index is an unnamed RangeIndex:
# FILE_ID can only ever come from a column, never from the index.
if file_id_col is None:
    raise KeyError(f"FILE_ID column not found in pheno CSV. Available columns: {list(pheno.columns)}")

# Normalize the identifiers to stripped strings before using them as the index.
pheno[file_id_col] = pheno[file_id_col].astype(str).str.strip()
pheno = pheno.set_index(file_id_col)

# Look subjects up through a view with unique FILE_IDs (first occurrence wins).
# With duplicated labels, .loc[label] returns a DataFrame instead of a Series
# and .loc[list_of_labels] returns extra rows, which would misalign `sites`
# with X / y / subjects.
pheno_lookup = pheno[~pheno.index.duplicated(keep="first")]

# --- Build the feature matrix and labels -------------------------------------
X, y, subjects = [], [], []
n_skipped = 0

for f in sorted(connectome_dir.glob("*.npy")):
    # The subject identifier is the part of the file stem before "_rois".
    subj_id = f.stem.split("_rois")[0]
    if subj_id not in pheno_lookup.index:
        # No phenotypic row for this connectome; count it so it is not lost silently.
        n_skipped += 1
        continue
    row = pheno_lookup.loc[subj_id]  # always a Series: the lookup index is unique
    mat = np.load(f)
    X.append(vectorize_connectome(mat))
    dx = int(row["DX_GROUP"])
    # Binary label: 1 when DX_GROUP == 1, otherwise 0.
    y.append(1 if dx == 1 else 0)
    subjects.append(subj_id)

if not subjects:
    # Fail loudly instead of writing empty arrays to disk.
    raise FileNotFoundError(
        f"No connectome file in {connectome_dir} matched a FILE_ID in {pheno_csv}"
    )
if n_skipped:
    print(f"Skipped {n_skipped} connectome file(s) with no matching FILE_ID")

# np.stack raises a ValueError if the vectorized connectomes differ in length.
X = np.stack(X)
y = np.array(y)
subjects = np.array(subjects)
# One site per retained subject, in the same order as `subjects`.
sites = pheno_lookup.loc[subjects, "SITE_ID"].values

assert len(X) == len(y) == len(subjects) == len(sites), "misaligned outputs"

# --- Persist the processed arrays --------------------------------------------
# np.save does not create missing parent directories.
processed_dir.mkdir(parents=True, exist_ok=True)
np.save(processed_dir / "X.npy", X)
np.save(processed_dir / "y.npy", y)
np.save(processed_dir / "subjects.npy", subjects)
np.save(processed_dir / "sites.npy", sites)
print("Saved", X.shape, y.shape, subjects.shape, sites.shape)


Saved (884, 19900) (884,) (884,) (884,)
