In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import plot_tree
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
from pathlib import Path

# Output folders for generated figures and tables (relative paths).
FIG_DIR, TAB_DIR = Path("../reports/figures"), Path("../reports/tables")

# Create both folders up front; exist_ok makes re-running this cell harmless.
FIG_DIR.mkdir(parents=True, exist_ok=True)
TAB_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the merged dataset. `df_raw` keeps the file contents as read;
# all later processing happens on the copy `df`.

df_raw = pd.read_csv("../data/processed/merged4_df.csv")
df = df_raw.copy()
# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly or it is silently discarded.
print("shape:", df.shape)
df.head()

In [None]:
# Remove cluster-label columns left over in the input so they can be
# recomputed below from the current target values.
drop_cols = [
    name
    for name in df.columns
    if name.startswith(("Range_k", "Quantile_k", "Cluster_jacc_"))
]
df = df.drop(columns=drop_cols, errors="ignore")

In [None]:
# Two clustering methods: equal-width range bins and quantile bins

def assign_range_clusters(df, score_col = "jacc", k_list = (2,3,4,5)):
    """Add equal-width bin labels of ``score_col`` to ``df``.

    For every ``k`` in ``k_list`` the interval [min, max] of the scores is
    split into ``k`` equal-width bins and the bin index of each row is stored
    in a new column ``Range_k{k}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``score_col``. It is modified in place.
    score_col : str
        Name of the column whose values are binned.
    k_list : iterable of int
        Numbers of bins to generate labels for.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one ``Range_k{k}`` column per ``k``.
    """
    scores = df[score_col].values
    lowest = scores.min()
    highest = scores.max()

    for k in k_list:
        # Only the k upper edges are passed to digitize. With right=True a
        # value equal to an edge belongs to the bin that edge closes, so the
        # minimum gets label 0 and the maximum gets label k-1.
        upper_edges = np.linspace(lowest, highest, k + 1)[1:]
        df[f"Range_k{k}"] = np.digitize(scores, upper_edges, right = True)

    return df

def assign_quantile_clusters(df, target_col = "jacc", k_list =(2,3,4,5)):
    """Add quantile-bin labels of ``target_col`` to ``df``.

    For every ``k`` in ``k_list`` the values of ``target_col`` are cut into
    ``k`` quantile bins with ``pd.qcut`` and the integer bin index is stored
    in a new column ``Quantile_k{k}``. Duplicate bin edges are dropped, so a
    column can end up with fewer than ``k`` distinct labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target_col``. It is modified in place.
    target_col : str
        Name of the column whose values are binned.
    k_list : iterable of int
        Numbers of quantile bins to generate labels for.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one ``Quantile_k{k}`` column per ``k``.
    """
    for k in k_list :
        bin_index = pd.qcut(
            df[target_col],
            q = k,
            labels = False,        # integer codes instead of interval labels
            duplicates = "drop",   # tolerate repeated quantile edges
        )
        df[f"Quantile_k{k}"] = bin_index
    return df



In [None]:
# Add the Range_k* columns first, then the Quantile_k* columns; each helper
# returns the frame it was given, so the calls can be nested.
df = assign_quantile_clusters(
    assign_range_clusters(df, score_col = "jacc"),
    target_col = "jacc",
)

# Preview the target next to the two k=2 labellings.
df[["jacc", "Range_k2", "Quantile_k2"]].head()

In [None]:
# Preview the first rows of the full frame (all columns).
df.head()

In [None]:
# Descriptive baseline summary (this is NOT the LOOCV evaluation):
#   - the global mean of the target
#   - the per-group target mean for every Range_k* and Quantile_k* labelling
# These numbers serve as a sanity check / interpretation aid only.
# The actual baseline performance is computed later with LOOCV.

TARGET = "jacc"
K_LIST = (2,3,4,5)

# 1) global baseline
global_baseline = df[TARGET].mean()

# 2) range clustering baseline: one table of group means per k
range_baselines = {
    k: (
        df.groupby(f"Range_k{k}", dropna = False)[TARGET]
        .mean()
        .reset_index(name = "baseline_mean")
        .sort_values(f"Range_k{k}")
    )
    for k in K_LIST
}

# 3) quantile clustering baseline: one table of group means per k
quantile_baselines = {
    k: (
        df.groupby(f"Quantile_k{k}", dropna = False)[TARGET]
        .mean()
        .reset_index(name = "baseline_mean")
        .sort_values(f"Quantile_k{k}")
    )
    for k in K_LIST
}

# Number of rows in each table = number of groups actually present for that k.
print("jacc mean (summary):", global_baseline)
print("n_groups(range):", {k: len(table) for k, table in range_baselines.items()})
print("n_groups(quantile):", {k: len(table) for k, table in quantile_baselines.items()})

In [None]:
# Summary statistics of the target column, written out as a one-row table.

target_summary = pd.DataFrame([
    {
        "mean" : df[TARGET].mean(),
        "std" : df[TARGET].std(),
        "min" : df[TARGET].min(),
        "max" : df[TARGET].max(),
        # number of non-missing target values
        "n" : df[TARGET].notna().sum(),
    }
])
target_summary.to_csv(TAB_DIR / "target_summary.csv", index = False)

In [None]:
# Distribution of the target: histogram with KDE (left) and boxplot (right).
# The figure is written to FIG_DIR and then closed.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize = (12, 5))

sns.histplot(df[TARGET], bins = 15, kde = True, ax = ax_hist)
ax_hist.set_title("Distribution of Jaccard Score")
# The scores lie on the x-axis and the bin counts on the y-axis
# (the two labels were previously swapped).
ax_hist.set_xlabel("Jaccard Score")
ax_hist.set_ylabel("Frequency")

sns.boxplot(x = df[TARGET], ax = ax_box)
ax_box.set_title("Boxplot of Jaccard Score")
ax_box.set_xlabel("Jaccard Score")

fig.tight_layout()
fig.savefig(FIG_DIR / "target_jacc_distribution.png", dpi = 200, bbox_inches = "tight")
# Close this specific figure so it does not stay open in the kernel.
plt.close(fig)

In [None]:
# LOOCV baselines

# Global LOOCV baseline: every subject is predicted by the mean of all the
# other subjects, i.e. (total - own value) / (n - 1).
x = df[TARGET]
overall_loocv_pred = (x.sum() - x) / (len(x) - 1)

# Mean absolute error of those held-out predictions.
overall_loocv_mae = (overall_loocv_pred - x).abs().mean()

def compute_cluster_loocv_mae(df, cluster_col, target_col = None):
    """Leave-one-out MAE of the within-cluster mean predictor.

    Every row is predicted by the mean target of the *other* rows in its
    cluster. The absolute errors of all rows, across all clusters, are pooled
    and averaged. Clusters with a single member are skipped because no
    leave-one-out prediction exists for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cluster_col`` and ``target_col``.
    cluster_col : str
        Column holding the cluster label of each row.
    target_col : str, optional
        Column holding the target values. When omitted, the notebook-level
        ``TARGET`` constant is used (looked up at call time).

    Returns
    -------
    float
        Pooled leave-one-out mean absolute error, or ``nan`` when no cluster
        has at least two members.
    """
    if target_col is None:
        target_col = TARGET

    errors = []
    for _, group in df.groupby(cluster_col):
        y = group[target_col].values
        n = len(y)
        if n <= 1:
            continue
        # Mean of the other members of the cluster: (total - own value) / (n - 1).
        loo_pred = (y.sum() - y) / (n - 1)
        errors.extend(abs(y - loo_pred))

    # Return only after ALL clusters were visited; returning inside the loop
    # would report the error of the first cluster alone.
    return float(np.mean(errors)) if errors else np.nan


In [None]:
# Group LOOCV baselines (within-cluster mean), one value per k and method.

range_mae = {k: compute_cluster_loocv_mae(df, f"Range_k{k}") for k in K_LIST}
quantile_mae = {k: compute_cluster_loocv_mae(df, f"Quantile_k{k}") for k in K_LIST}

# One row per baseline: the global mean first, then range, then quantile.
baseline_mae_table = pd.DataFrame(
    [("global_mean_loocv", overall_loocv_mae)]
    + [(f"range_k{k}", range_mae[k]) for k in K_LIST]
    + [(f"quantile_k{k}", quantile_mae[k]) for k in K_LIST],
    columns = ["method", "mae"],
)

baseline_mae_table.to_csv(TAB_DIR / "baseline_loocv_mae.csv", index = False)
baseline_mae_table