In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import plot_tree
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [23]:
from pathlib import Path

# Output folders for saved figures and tables, given relative to the
# working directory the notebook is run from.
FIG_DIR = Path("..") / "reports" / "figures"
TAB_DIR = Path("..") / "reports" / "tables"

# Create both folders up front so later cells can write into them directly;
# missing parents are created and folders that already exist are left alone.
for _report_dir in (FIG_DIR, TAB_DIR):
    _report_dir.mkdir(parents=True, exist_ok=True)

In [24]:
# Load the processed table; keep df_raw untouched and work on a copy.

df_raw = pd.read_csv("../data/processed/merged4_df.csv")
df = df_raw.copy()

# Discard cluster-label columns that may already be stored in the CSV so they
# cannot be confused with the Range_k* / Quantile_k* columns derived in this notebook.
df = df.drop(columns=[c for c in df.columns if c.startswith(("Range_k","Quantile_k","Cluster_jacc_"))],
             errors="ignore")

# A notebook cell only renders its last expression, so a bare `df.shape` in the
# middle of the cell would be evaluated and thrown away; print it explicitly.
print(df.shape)
df.head()

Unnamed: 0,id,max_corsi,corsi_minmax,conditional_correct,conditional_minmax,mental_accuracy,mental_minmax,spatial_arrag_correct,spatial_arrag_minmax,spatial_rel_correct,...,nvc_count_maxscaled,crt_correct,crt_minmax,NFC,NFC_minmax,verbal,verbal_minmax,wason_correct,wason_minmax,jacc
0,participant_1,7,0.666667,0.5,0.5,0.45,0.45,0.55,0.55,0.40625,...,0.0,0.285714,0.285714,3.75,0.458333,0.25,0.25,0.333333,0.333333,0.333333
1,participant_10,7,0.666667,0.5,0.5,0.45,0.45,0.7,0.7,0.40625,...,0.15625,0.571429,0.571429,3.5,0.416667,0.375,0.375,0.5,0.5,0.136719
2,participant_100,7,0.666667,0.5,0.5,0.0,0.0,0.6,0.6,0.40625,...,0.0,0.571429,0.571429,3.75,0.458333,0.0,0.0,0.5,0.5,0.269531
3,participant_101,6,0.555556,0.375,0.375,0.15,0.15,0.7,0.7,0.46875,...,0.09375,0.714286,0.714286,5.0,0.666667,0.5,0.5,1.0,1.0,0.260417
4,participant_102,7,0.666667,0.5,0.5,0.1,0.1,0.8,0.8,0.3125,...,0.0,0.571429,0.571429,3.5,0.416667,0.125,0.125,0.0,0.0,0.252604


In [25]:
# Two ways of binning the target score into k groups: equal-width ranges and equal-frequency quantiles


TARGET = "jacc"       # name of the score column used as the target
K_LIST = (2,3,4,5)    # bin counts for which label columns are generated


def assign_range_clusters(df, score_col = "jacc", k_list = K_LIST):
    """Add one ``Range_k{k}`` column per ``k`` with equal-width bin labels.

    The interval between the smallest and largest non-missing score is split
    into ``k`` equal-width bins; each row receives the 0-based index of the
    bin its score falls into (``0 .. k-1``, upper edges inclusive, so the
    maximum score lands in bin ``k-1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``score_col``. It is modified in place.
    score_col : str
        Name of the numeric column to bin.
    k_list : iterable of int
        Bin counts; one label column is written for each entry.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the added ``Range_k{k}`` columns.

    Notes
    -----
    Rows with a missing score get a missing (NaN) label instead of poisoning
    the bin edges for every row. When no score is missing the labels are
    plain integers; when at least one is missing the column is float so it
    can hold NaN. A frame with no non-missing score gets all-NaN labels.
    """
    # na_value makes nullable (masked) dtypes convert cleanly as well.
    scores = df[score_col].to_numpy(dtype=float, na_value=np.nan)
    missing = np.isnan(scores)
    has_missing = bool(missing.any())
    all_missing = bool(missing.all())  # also True for an empty frame

    if not all_missing:
        # Plain min()/max() would return NaN as soon as one score is missing.
        min_val, max_val = np.nanmin(scores), np.nanmax(scores)

    for k in k_list:
        if all_missing:
            labels = np.full(scores.shape, np.nan)
        else:
            edges = np.linspace(min_val, max_val, k + 1)
            # Only the upper edges are passed; with right=True a score equal
            # to an edge belongs to the bin that edge closes.
            labels = np.digitize(scores, edges[1:], right=True)
            if has_missing:
                # digitize sorts NaN past the last edge (label k); mark those
                # rows as missing instead of inventing an extra bin.
                labels = np.where(missing, np.nan, labels)
        df[f"Range_k{k}"] = labels
    return df

def assign_quantile_clusters(df, target_col = "jacc", k_list = K_LIST):
    """Add one ``Quantile_k{k}`` column per ``k`` with quantile-bin labels.

    Each column holds the integer bin indicator returned by ``pandas.qcut``
    for ``target_col`` cut into ``k`` quantile bins. Duplicate bin edges are
    dropped, so a column can end up with fewer than ``k`` distinct labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target_col``. It is modified in place.
    target_col : str
        Name of the column to bin.
    k_list : iterable of int
        Quantile counts; one label column is written for each entry.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the added ``Quantile_k{k}`` columns.
    """
    for n_bins in k_list:
        column = f"Quantile_k{n_bins}"
        df[column] = pd.qcut(
            df[target_col],
            q=n_bins,
            labels=False,
            duplicates="drop",
        )
    return df



In [26]:
# Attach the range-based labels first, then the quantile-based ones. Each helper
# returns the frame it was given, so the two calls can be chained.
df = df.pipe(assign_range_clusters).pipe(assign_quantile_clusters)

In [31]:
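# Baseline summary (leave-one-out, LOOCV)
# - global baseline: each jacc value is predicted by the mean of all other rows
# - group baselines: each jacc value is predicted by the mean of the other rows
#   in the same Range_k* / Quantile_k* group
# These are used for sanity-check / interpretation: the groups are derived from
# jacc itself, so the group baselines are optimistic reference values.
# The baseline LOOCV MAE is computed and saved in this cell.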

# Safety: ensure we are using the correct target distribution.
# NOTE(review): 95 and the reference mean are hard-coded, presumably taken from
# the current processed dataset — confirm and update them if the data changes.
assert df[TARGET].notna().sum() == 95, (
    f"expected 95 non-missing {TARGET!r} values, found {df[TARGET].notna().sum()}"
)
assert abs(df[TARGET].mean() - 0.3141776315789474) < 1e-12, (
    f"unexpected mean of {TARGET!r}: {df[TARGET].mean()!r}"
)

# baseline 

def compute_cluster_loocv_mae(df, cluster_col, target_col = TARGET):
    """Leave-one-out MAE of a within-group mean predictor.

    Rows are grouped by ``cluster_col``. Inside each group every row's target
    is predicted by the mean of the *other* rows' targets in that group; the
    function returns the mean absolute error over all those predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cluster_col`` and ``target_col``.
    cluster_col : str
        Column with the group labels.
    target_col : str
        Column with the values to predict.

    Returns
    -------
    float
        Mean absolute leave-one-out error, or NaN when no group has at least
        two rows.

    Notes
    -----
    * Groups with a single row are skipped: there is no other member to
      predict from, so those rows do not contribute to the average.
    * Rows whose group label is missing are left out by ``groupby``'s default
      handling of missing keys.
    * The leave-one-out mean is computed in closed form as
      ``(group_sum - y_i) / (n - 1)`` rather than by deleting one element at
      a time; the result matches the explicit loop up to floating-point
      rounding.
    """
    abs_errors = []
    for _, group_target in df.groupby(cluster_col)[target_col]:
        y = group_target.to_numpy()
        n = y.size
        if n <= 1:
            continue
        loo_mean = (y.sum() - y) / (n - 1)
        abs_errors.append(np.abs(y - loo_mean))

    if not abs_errors:
        return np.nan
    return float(np.concatenate(abs_errors).mean())

# 1) global baseline (loocv): predict each value by the mean of all other values
y = df[TARGET]
# sum() skips missing values, so the denominator must count non-missing rows too;
# len(y) - 1 would bias every prediction as soon as one target is missing.
# With no missing targets count() equals len() and the result is unchanged.
pred_global = (y.sum() - y) / (y.count() - 1)
mae_global = (y - pred_global).abs().mean()



# 2) group baselines (loocv): one MAE per bin count, for each binning method
range_mae = {k : compute_cluster_loocv_mae(df, f"Range_k{k}") for k in K_LIST}
quantile_mae = {k : compute_cluster_loocv_mae(df, f"Quantile_k{k}") for k in K_LIST}

# One row per baseline: the global one first, then range and quantile by k.
baseline_mae_table = pd.DataFrame({
    "method" : (["global_mean_loocv"] + 
                [f"range_k{k}" for k in K_LIST] + 
                [f"quantile_k{k}" for k in K_LIST]),
    "mae" : ([mae_global] + 
             [range_mae[k] for k in K_LIST] +
             [quantile_mae[k] for k in K_LIST])
})

# Save
baseline_mae_table.to_csv(TAB_DIR / "baseline_loocv_mae.csv", index = False)
baseline_mae_table

Unnamed: 0,method,mae
0,global_mean_loocv,0.151546
1,range_k2,0.083338
2,range_k3,0.061561
3,range_k4,0.042372
4,range_k5,0.034744
5,quantile_k2,0.088175
6,quantile_k3,0.055962
7,quantile_k4,0.039886
8,quantile_k5,0.037491


In [32]:
# Bar chart of the baseline MAE per method, smallest first.
# The figure is written to FIG_DIR and then closed.

baseline_mae_sorted = baseline_mae_table.sort_values("mae")

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(baseline_mae_sorted["method"], baseline_mae_sorted["mae"])
# Rotate the method names and right-align them so they end under their bar.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("MAE (LOOCV)")
fig.tight_layout()
fig.savefig(FIG_DIR / "baseline_loocv_mae.png", dpi=200, bbox_inches="tight")
plt.close(fig)