In [8]:
from pathlib import Path

import pandas as pd
import numpy as np

from datetime import datetime

from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

In [9]:

def filter_df(df, filter_type):
    """Return the rows of ``df`` whose ``Data`` value contains ``filter_type``.

    Matching is case-insensitive and is delegated to ``Series.str.contains``,
    so ``filter_type`` is treated as a pattern by that method. Rows whose
    ``Data`` value is missing never match.
    """
    matches = df["Data"].str.contains(filter_type, case=False, na=False)
    return df[matches]


def concat_filter_analysis_frames(root_dir: str, output_path: str, filter_type: str):
    """Combine every ``analysis_frame.xlsx`` under ``root_dir`` into one workbook.

    Files are discovered recursively. A file is used only when the folder three
    levels above it has a name starting with ``"cluster_"``; all other matches
    are ignored. Each frame that is read gets two identifying columns:

    * ``cluster_method`` - name of the folder two levels above the file
      (e.g. ``"gmm_credit_risk"``, ``"kmeans_credit_risk"``).
    * ``datetime_str`` - the text after the last underscore of the
      ``cluster_*`` folder name (e.g. ``"17022025125138"`` for
      ``"cluster_experiment_results_17022025125138"``).

    The frames are concatenated, filtered with ``filter_df`` and written to
    ``output_path`` without the index. When no file qualifies, nothing is
    written and a message is printed instead.

    Args:
        root_dir: Directory that is searched recursively.
        output_path: Destination of the combined Excel workbook.
        filter_type: Pattern handed to ``filter_df`` to select rows by their
            ``Data`` value.

    Returns:
        None.
    """
    # rglob yields matches in filesystem order, which differs between machines
    # and runs. Sorting makes the row order of the combined workbook (and any
    # order-dependent step downstream, such as drop_duplicates) reproducible.
    analysis_files = sorted(Path(root_dir).rglob("analysis_frame.xlsx"))

    frames = []
    for analysis_file in analysis_files:
        method_folder = analysis_file.parent.parent
        experiment_folder = method_folder.parent

        # Only results that live under a "cluster_*" experiment folder count.
        if not experiment_folder.name.startswith("cluster_"):
            continue

        frame = pd.read_excel(analysis_file)
        frame["cluster_method"] = method_folder.name
        # The name starts with "cluster_", so it always contains an underscore
        # and the last piece is well defined.
        frame["datetime_str"] = experiment_folder.name.rsplit("_", 1)[-1]
        frames.append(frame)

    if not frames:
        print("No matching analysis_frame.xlsx files found under the given root directory.")
        return

    combined_df = pd.concat(frames, ignore_index=True)
    filtered_df = filter_df(combined_df, filter_type)
    filtered_df.to_excel(output_path, index=False)
    print(f"Combined dataframe saved to: {output_path}")

# Build three workbooks from the same experiment tree: train rows, test rows,
# and every row whose "Data" value contains an underscore.
for combined_path, data_pattern in (
    ("../experiments/train_combined_analysis_frames.xlsx", 'train'),
    ("../experiments/test_combined_analysis_frames.xlsx", 'test'),
    ("../experiments/combined_analysis_frames.xlsx", '_'),
):
    concat_filter_analysis_frames(
        root_dir="../experiments",
        output_path=combined_path,
        filter_type=data_pattern,
    )

Combined dataframe saved to: train_combined_analysis_frames.xlsx
Combined dataframe saved to: test_combined_analysis_frames.xlsx
Combined dataframe saved to: combined_analysis_frames.xlsx


In [10]:
# Load the combined workbook produced above and preview its first rows.
df = pd.read_excel(io="../experiments/combined_analysis_frames.xlsx")
df.head(5)

Unnamed: 0,AUC Score,Accuracy,Precision,Recall (TPR),F1 Score,Log Loss,Brier Score,Cluster Size,Pure Cluster,Cluster ID,Data,cluster_method,datetime_str
0,0.999996,0.998765,0.997175,0.997175,0.997175,0.005907,0.001093,1620,0.0,0.0,train_xper_scores,kmedoids_credit_risk,17022025212918
1,1.0,1.0,1.0,1.0,1.0,0.00378,0.00037,1580,0.0,1.0,train_xper_scores,kmedoids_credit_risk,17022025212918
2,0.86241,0.764068,0.891304,0.195238,0.320312,0.490886,0.158693,1475,0.0,0.0,train_feature_scores,kmedoids_credit_risk,17022025212918
3,0.899706,0.871884,0.0,0.0,0.0,0.315117,0.091416,1725,0.0,1.0,train_feature_scores,kmedoids_credit_risk,17022025212918
4,0.999848,0.988758,1.0,0.965679,0.98254,0.06174,0.012057,1957,0.0,0.0,train_epsilon_scores,kmedoids_credit_risk,17022025212918


In [11]:
# Quick size check of the combined frame: (number of rows, number of columns).
df.shape

(300, 13)

In [12]:

# Label each row as a baseline or a clustering result based on its "Data" text.
# The vectorised, NaN-safe match replaces a per-row lambda that called
# x.lower() and therefore raised AttributeError on a missing "Data" value;
# missing values are now labelled 'Clustering', in line with the na=False
# handling used by every other filter in this notebook.
df['Experiment_Type'] = np.where(
    df['Data'].str.contains('baseline', case=False, na=False),
    'Baseline',
    'Clustering',
)

# Rows holding weighted aggregates ("Data" contains "weighted").
weighted_df = df[df["Data"].str.contains("weighted", case=False, na=False)]


In [13]:
# Rows whose "Data" label mentions "weighted" (case-insensitive, NaN never matches).
df1 = df.loc[df["Data"].str.contains("weighted", case=False, na=False)]

# 1 Visualizations

In [14]:
# AUC Score spread of the weighted rows, one box per cluster method.
fig1 = px.box(
    data_frame=df1,
    x="cluster_method",
    y="AUC Score",
    title="Distribution of AUC Score by Cluster Method",
)
fig1.show()


In [15]:
# Recall (TPR) spread of the weighted rows, one box per cluster method.
fig2 = px.box(
    data_frame=df1,
    x="cluster_method",
    y="Recall (TPR)",
    title="Distribution of Recall (TPR) by Cluster Method",
)
fig2.show()


In [16]:
# Log Loss spread of the weighted rows, one box per cluster method.
fig3 = px.box(
    data_frame=df1,
    x="cluster_method",
    y="Log Loss",
    title="Distribution of Log Loss by Cluster Method",
)
fig3.show()


In [17]:
# Brier Score spread of the weighted rows, one box per cluster method.
fig4 = px.box(
    data_frame=df1,
    x="cluster_method",
    y="Brier Score",
    title="Distribution of Brier Score by Cluster Method",
)
fig4.show()


In [18]:
# Test rows only, with the weighted aggregate rows excluded.
df2 = df.loc[
    df["Data"].str.contains("test", case=False, na=False)
    & ~df["Data"].str.contains("weighted", case=False, na=False)
]

In [19]:
# Cluster size against AUC Score for the non-weighted test rows.
fig5 = px.scatter(
    data_frame=df2,
    x="Cluster Size",
    y="AUC Score",
    color="cluster_method",
    title="Cluster Size vs. AUC Score by Cluster Method",
)
fig5.show()


In [20]:
# Cluster size against Recall (TPR) for the non-weighted test rows.
fig6 = px.scatter(
    data_frame=df2,
    x="Cluster Size",
    y="Recall (TPR)",
    color="cluster_method",
    title="Cluster Size vs. Recall (TPR) by Cluster Method",
)
fig6.show()


In [21]:
# Cluster size against Log Loss for the non-weighted test rows.
fig7 = px.scatter(
    data_frame=df2,
    x="Cluster Size",
    y="Log Loss",
    color="cluster_method",
    title="Cluster Size vs. Log Loss by Cluster Method",
)
fig7.show()


In [22]:
# Cluster size against Brier Score for the non-weighted test rows.
fig8 = px.scatter(
    data_frame=df2,
    x="Cluster Size",
    y="Brier Score",
    color="cluster_method",
    title="Cluster Size vs. Brier Score by Cluster Method",
)
fig8.show()


In [23]:
# Despite the name, this holds the mean of the "Pure Cluster" flag per cluster
# method, i.e. the share of rows flagged as pure, not a count.
pure_counts = (
    df2.groupby("cluster_method")["Pure Cluster"]
    .agg("mean")
    .reset_index()
)
fig9 = px.bar(
    data_frame=pure_counts,
    x="cluster_method",
    y="Pure Cluster",
    title="Proportion of Pure Clusters by Cluster Method",
    labels={"Pure Cluster": "Proportion of Pure Clusters"},
)
fig9.show()


In [24]:
# Compare AUC Score between baseline and clustering rows, keeping only the
# first row for every ("Data", "datetime_str") pair.
# NOTE(review): rows from different cluster methods or cluster IDs that share
# the same pair are dropped by this de-duplication - confirm that is intended.
fig10 = px.box(
    data_frame=df.drop_duplicates(subset=['Data', 'datetime_str']),
    x="Experiment_Type",
    y="AUC Score",
    color="Experiment_Type",
    title="AUC Score Distribution: Baseline vs. Clustering",
)
fig10.show()


In [25]:
# Pairwise scatter plots of the four metrics for the non-weighted test rows;
# the diagonal panels are hidden.
fig11 = px.scatter_matrix(
    data_frame=df2,
    dimensions=["AUC Score", "Recall (TPR)", "Log Loss", "Brier Score"],
    color="cluster_method",
    title="Scatter Matrix of Performance Metrics",
)
fig11.update_traces(diagonal_visible=False)
fig11.show()


# 2 Visualizations

In [26]:


def extract_experiment_type(x):
    """Map a ``Data`` label to its experiment family.

    The value is converted to a lower-cased string and checked against the
    keyword groups below in order; the first group with a keyword contained in
    the string decides the result. Returns ``"xper"``, ``"feature"``,
    ``"error"`` (for "epsilon" or "error"), ``"baseline"``, or ``"other"``
    when nothing matches.
    """
    label = str(x).lower()
    # Order matters: earlier entries win when several keywords are present.
    rules = (
        (("xper",), "xper"),
        (("feature",), "feature"),
        (("epsilon", "error"), "error"),
        (("baseline",), "baseline"),
    )
    for keywords, experiment_type in rules:
        if any(keyword in label for keyword in keywords):
            return experiment_type
    return "other"

# Tag every row with its experiment family (xper / feature / error / baseline / other).
df["Exp_Type"] = df["Data"].apply(extract_experiment_type)

# Subsets used by the analyses below.
#  - Direct experiment comparison: not weighted, not a pure cluster, and one of
#    the three experiment families.
df_clean = df.loc[
    ~df["Data"].str.contains("weighted", case=False, na=False)
    & df["Pure Cluster"].eq(False)
    & df["Exp_Type"].isin(["xper", "feature", "error"])
].copy()

#  - Baseline rows, with Exp_Type recomputed from their "Data" label.
df_baseline = df.loc[
    df["Data"].str.contains("baseline", case=False, na=False)
].assign(Exp_Type=lambda rows: rows["Data"].apply(extract_experiment_type))

#  - Weighted rows, with Exp_Type recomputed from their "Data" label.
df_weighted = df.loc[
    df["Data"].str.contains("weighted", case=False, na=False)
].assign(Exp_Type=lambda rows: rows["Data"].apply(extract_experiment_type))

#  - Weighted test averages only (exact "Data" labels), for analyses keyed on
#    datetime_str.
weighted_vals = [
    "weighted_average_test_epsilon_scores",
    "weighted_average_test_feature_scores",
    "weighted_average_test_xper_scores",
]
df_weighted_sel = df.loc[df["Data"].isin(weighted_vals)].copy()


In [27]:
# Number of rows flagged as pure clusters for each cluster method and
# experiment family, shown as grouped bars.
pure_counts = (
    df.loc[df["Pure Cluster"].eq(True)]
    .groupby(["cluster_method", "Exp_Type"])
    .size()
    .reset_index(name="Count")
)
fig1 = px.bar(
    data_frame=pure_counts,
    x="cluster_method",
    y="Count",
    color="Exp_Type",
    barmode="group",
    title="Count of Pure Clusters per Cluster Method & Experiment Type",
)
fig1.show()


In [28]:
# Mean cluster size of the cleaned rows for each cluster method and experiment
# family, shown as grouped bars.
cluster_size_grp = (
    df_clean.groupby(["cluster_method", "Exp_Type"])["Cluster Size"]
    .agg("mean")
    .reset_index()
)
fig2 = px.bar(
    data_frame=cluster_size_grp,
    x="cluster_method",
    y="Cluster Size",
    color="Exp_Type",
    barmode="group",
    title="Average Cluster Size by Cluster Method & Experiment Type",
)
fig2.show()
