In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import plot_tree
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
from pathlib import Path

# Output folders for generated figures and tables, relative to the working directory.
FIG_DIR, TAB_DIR = Path("../reports/figures"), Path("../reports/tables")

# Create both folders (including missing parents); already-existing folders are accepted.
for _out_dir in (FIG_DIR, TAB_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the processed table that the rest of the notebook refers to as `final_df`.
final_df = pd.read_csv("../data/processed/merged4_df.csv")

# Only the last expression of a cell is echoed, so a bare `final_df.shape`
# in the middle of the cell shows nothing; print it explicitly instead.
print("shape:", final_df.shape)
final_df.head()

Unnamed: 0,id,max_corsi,corsi_minmax,conditional_correct,conditional_minmax,mental_accuracy,mental_minmax,spatial_arrag_correct,spatial_arrag_minmax,spatial_rel_correct,...,nvc_count_maxscaled,crt_correct,crt_minmax,NFC,NFC_minmax,verbal,verbal_minmax,wason_correct,wason_minmax,jacc
0,participant_1,7,0.666667,0.5,0.5,0.45,0.45,0.55,0.55,0.40625,...,0.0,0.285714,0.285714,3.75,0.458333,0.25,0.25,0.333333,0.333333,0.333333
1,participant_10,7,0.666667,0.5,0.5,0.45,0.45,0.7,0.7,0.40625,...,0.15625,0.571429,0.571429,3.5,0.416667,0.375,0.375,0.5,0.5,0.136719
2,participant_100,7,0.666667,0.5,0.5,0.0,0.0,0.6,0.6,0.40625,...,0.0,0.571429,0.571429,3.75,0.458333,0.0,0.0,0.5,0.5,0.269531
3,participant_101,6,0.555556,0.375,0.375,0.15,0.15,0.7,0.7,0.46875,...,0.09375,0.714286,0.714286,5.0,0.666667,0.5,0.5,1.0,1.0,0.260417
4,participant_102,7,0.666667,0.5,0.5,0.1,0.1,0.8,0.8,0.3125,...,0.0,0.571429,0.571429,3.5,0.416667,0.125,0.125,0.0,0.0,0.252604


In [12]:
# Two clustering methods: equal-width ranges and quantile-based bins of the score

def assign_range_clusters(df, score_col = "jacc", k_list = (2,3,4,5)):
    """Label every row with an equal-width bin of ``score_col`` for each ``k``.

    For each ``k`` in ``k_list`` the interval [min, max] of the observed
    scores is cut into ``k`` equal-width bins, and the 0-based bin index is
    written to a new column ``Range_k{k}``. Bins are closed on the right: a
    score lying exactly on an inner edge goes to the lower bin, and the
    maximum score gets label ``k - 1``.

    Missing scores are skipped when locating min/max and receive a NaN label
    (the label column is then float), which is how ``pd.qcut`` treats them in
    ``assign_quantile_clusters``. When no score is missing the labels stay
    plain integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``score_col``. It is modified in place.
    score_col : str
        Name of the numeric column to bin.
    k_list : iterable of int
        Numbers of bins to compute; one output column is added per entry.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``Range_k{k}`` columns added.
    """
    scores = df[score_col].to_numpy(dtype=float, na_value=np.nan)
    missing = np.isnan(scores)

    if missing.all():
        # Empty frame or all-NaN column: there is no range to split, so every
        # label is missing (plain min()/max() would raise or return NaN here).
        for k in k_list:
            df[f"Range_k{k}"] = np.nan
        return df

    # NaN-aware extremes: a single missing score must not poison the bin edges.
    min_val, max_val = np.nanmin(scores), np.nanmax(scores)
    has_missing = bool(missing.any())

    for k in k_list:
        bins = np.linspace(min_val, max_val, k + 1)
        # Dropping the lowest edge makes digitize return 0 for the first bin;
        # right=True keeps the maximum inside the last bin (label k - 1).
        labels = np.digitize(scores, bins[1:], right = True)
        if has_missing:
            # digitize sends NaN past the last edge; report it as missing instead.
            labels = labels.astype(float)
            labels[missing] = np.nan
        df[f"Range_k{k}"] = labels
    return df

def assign_quantile_clusters(df, target_col = "jacc", k_list =(2,3,4,5)):
    """Add quantile-based bin labels of ``target_col`` to ``df``.

    For each ``k`` in ``k_list`` a column ``Quantile_k{k}`` is written that
    holds the integer bin code returned by ``pd.qcut`` with ``q=k``. Because
    duplicate bin edges are dropped rather than raising, a column can end up
    with fewer than ``k`` distinct labels.

    ``df`` is modified in place and the same object is returned.
    """
    for n_bins in k_list:
        codes = pd.qcut(
            df[target_col],
            q=n_bins,
            labels=False,       # integer bin codes instead of interval categories
            duplicates="drop",  # collapse coinciding edges instead of raising
        )
        df[f"Quantile_k{n_bins}"] = codes
    return df



In [13]:
# Append the Range_k* and then the Quantile_k* label columns, both derived from "jacc".
final_df = (
    final_df
    .pipe(assign_range_clusters, score_col="jacc")
    .pipe(assign_quantile_clusters, target_col="jacc")
)

# Spot-check the score next to its k=2 labels from each method.
final_df.loc[:, ["jacc", "Range_k2", "Quantile_k2"]].head()

Unnamed: 0,jacc,Range_k2,Quantile_k2
0,0.333333,0,1
1,0.136719,0,0
2,0.269531,0,0
3,0.260417,0,0
4,0.252604,0,0


In [14]:
# Show the first rows again, now including the appended Range_k* / Quantile_k* columns.
final_df.head()

Unnamed: 0,id,max_corsi,corsi_minmax,conditional_correct,conditional_minmax,mental_accuracy,mental_minmax,spatial_arrag_correct,spatial_arrag_minmax,spatial_rel_correct,...,wason_minmax,jacc,Range_k2,Range_k3,Range_k4,Range_k5,Quantile_k2,Quantile_k3,Quantile_k4,Quantile_k5
0,participant_1,7,0.666667,0.5,0.5,0.45,0.45,0.55,0.55,0.40625,...,0.333333,0.333333,0,1,1,1,1,1,2,3
1,participant_10,7,0.666667,0.5,0.5,0.45,0.45,0.7,0.7,0.40625,...,0.5,0.136719,0,0,0,0,0,0,0,0
2,participant_100,7,0.666667,0.5,0.5,0.0,0.0,0.6,0.6,0.40625,...,0.5,0.269531,0,0,1,1,0,1,1,2
3,participant_101,6,0.555556,0.375,0.375,0.15,0.15,0.7,0.7,0.46875,...,1.0,0.260417,0,0,1,1,0,1,1,2
4,participant_102,7,0.666667,0.5,0.5,0.1,0.1,0.8,0.8,0.3125,...,0.0,0.252604,0,0,1,1,0,1,1,2


In [20]:
# baseline

TARGET = "jacc"
K_LIST = (2,3,4,5)


def _cluster_baselines(df, prefix, target = TARGET, k_list = K_LIST):
    """Per-cluster mean of ``target`` for every ``{prefix}_k{k}`` label column.

    Returns a dict mapping each ``k`` in ``k_list`` to a two-column frame:
    the label column and ``baseline_mean``, sorted by label. Rows whose label
    is missing are kept as their own group (``dropna=False``).
    """
    tables = {}
    for k in k_list:
        col = f"{prefix}_k{k}"
        tables[k] = (
            df.groupby(col, dropna = False)[target]
            .mean()
            .reset_index(name = "baseline_mean")
            .sort_values(col)
        )
    return tables


# 1) global baseline: a single mean of the target over all rows
global_baseline = final_df[TARGET].mean()

# NOTE(review): the Range_k* / Quantile_k* labels are themselves computed from
# "jacc", i.e. from TARGET, so the per-cluster means below are in-sample
# summaries of the target -- confirm that this is the intended baseline.

# 2) range clustering baseline
range_baselines = _cluster_baselines(final_df, "Range")

# 3) quantile clustering baseline
quantile_baselines = _cluster_baselines(final_df, "Quantile")

print("global_baseline:", global_baseline)
print("n_groups(range):", {k : t.shape[0] for k, t in range_baselines.items()})
print("n_groups(quantile):", {k: t.shape[0] for k, t in quantile_baselines.items()})

global_baseline: 0.3141776315789474
n_groups(range): {2: 2, 3: 3, 4: 4, 5: 5}
n_groups(quantile): {2: 2, 3: 3, 4: 4, 5: 5}
