In [1]:
import pandas as pd
import config as cfg
from sklearn.preprocessing import MinMaxScaler
from pingouin import friedman
import numpy as np
from scikit_posthocs import posthoc_nemenyi_friedman
from scipy.stats import rankdata

In [2]:
def normalize_proximity_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Min-max scale the proximity metrics (L1, L2, DTW) to [0, 1] within each Dataset.

    The scaled values REPLACE the original proximity columns in the returned
    copy; no additional ``*_norm`` columns are created, so downstream code
    keeps addressing the metrics by their original names.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe containing 'Dataset' and the proximity metric columns:
        - 'Proximity L1'
        - 'Proximity L2'
        - 'Proximity DTW'

    Returns
    -------
    pd.DataFrame
        Copy of the dataframe in which the three proximity columns hold the
        per-Dataset min-max scaled values. The input dataframe is not modified.

    Raises
    ------
    KeyError
        If 'Dataset' or any of the proximity columns is missing from ``df``.
    """
    proximity_cols = ["Proximity L1", "Proximity L2", "Proximity DTW"]

    # Fail early with one message naming every missing column instead of the
    # first groupby lookup that happens to break.
    missing = [name for name in ["Dataset", *proximity_cols] if name not in df.columns]
    if missing:
        raise KeyError(f"normalize_proximity_metrics: missing required column(s): {missing}")

    def _scale_group(values: pd.Series):
        # MinMaxScaler works on 2-D (n_samples, n_features) input, so the
        # group's values are reshaped to a single column and flattened back.
        column = values.to_numpy().reshape(-1, 1)
        return MinMaxScaler().fit_transform(column).ravel()

    data = df.copy()
    for col in proximity_cols:
        # Group on the untouched input so every column is scaled from its
        # original values, independent of assignments made to `data`.
        data[col] = df.groupby("Dataset")[col].transform(_scale_group)
    return data


In [3]:
def print_values_with_ranks(metric: str, metric_df: pd.DataFrame, ranked_df: pd.DataFrame) -> None:
    """
    Print a table of metric values annotated with their ranks, plus mean ranks.

    Every cell is rendered as ``value (rank)`` using three decimals for the
    value and one for the rank. A second table shows the column-wise mean of
    ``ranked_df`` with two decimals. Row labels are omitted from both tables.

    Parameters
    ----------
    metric : str
        Name printed in the ``=== ... ===`` banner.
    metric_df : pd.DataFrame
        Metric values; its columns define which columns are printed.
    ranked_df : pd.DataFrame
        Ranks, looked up by the same column labels as ``metric_df``.
    """
    as_value = "{:.3f}".format
    as_rank = "{:.1f}".format
    as_mean = "{:.2f}".format

    # Start from a copy so the caller's frame keeps its numeric values.
    combined = metric_df.copy()
    for name in metric_df.columns:
        value_text = metric_df[name].map(as_value)
        rank_text = ranked_df[name].map(as_rank)
        combined[name] = value_text + " (" + rank_text + ")"

    # One-row frame labelled "Avg Rank" holding the formatted column means.
    avg_ranks = ranked_df.mean(axis=0).map(as_mean).to_frame("Avg Rank").T

    print(f"\n=== {metric} ===")
    print(combined.to_string(index=False))
    print("\nAverage ranks:")
    print(avg_ranks.to_string(index=False))

In [4]:
def analyze_results(
    results: pd.DataFrame,
    metric: str,
    direction: str = "low",
    alpha: float = 0.05,
) -> None:
    """
    Rank explainers per dataset on one metric and test whether they differ.

    The function pivots ``results`` into a Dataset x Explainer table, ranks the
    explainers within every dataset, prints the values with their ranks, runs
    a Friedman test (F variant) and, when it is significant, prints every
    explainer pair that the Nemenyi post-hoc test also flags as significant.

    Parameters
    ----------
    results : pd.DataFrame
        Long-format table with 'Dataset', 'Explainer' and the ``metric`` column.
    metric : str
        Name of the column to analyse.
    direction : {"low", "high"}, default "low"
        "low" gives rank 1 to the smallest value in a dataset, "high" to the
        largest.
    alpha : float, default 0.05
        Significance level applied to both the Friedman and the Nemenyi
        p-values.

    Raises
    ------
    ValueError
        If ``direction`` is neither "low" nor "high". Without this check a
        typo such as "High" would silently be ranked as "low".
    """
    if direction not in ("low", "high"):
        raise ValueError(f"direction must be 'low' or 'high', got {direction!r}")

    # One row per Dataset, one column per Explainer. pivot_table aggregates
    # duplicate (Dataset, Explainer) rows with its default aggfunc.
    metric_df = results.pivot_table(index="Dataset", columns="Explainer", values=metric)

    # Negate "higher is better" metrics once, so ranking and both statistical
    # tests all operate on the same "smaller is better" orientation.
    oriented_df = -metric_df if direction == "high" else metric_df

    ranked_df = pd.DataFrame(
        rankdata(oriented_df, axis=1),
        columns=metric_df.columns,
        index=metric_df.index,
    )
    average_ranks = ranked_df.mean(axis=0)

    # The printed table shows the original (un-negated) values.
    print_values_with_ranks(metric, metric_df, ranked_df)

    stats = friedman(oriented_df, method="f")
    p = stats.loc["Friedman", "p-unc"]
    f = stats.loc["Friedman", "F"]

    # A NaN p-value (e.g. every explainer tied on every dataset) is reported
    # as "no significant differences", never as significant.
    if not np.isnan(p) and p < alpha:
        print(f"Significant differences found in {metric} (p = {p:.4f}, F = {f:.3f}).")
        nemenyi_matrix = posthoc_nemenyi_friedman(oriented_df)

        # Walk the upper triangle only, so each unordered pair is reported once.
        n_methods = len(nemenyi_matrix)
        for i in range(n_methods):
            for j in range(i + 1, n_methods):
                p_value = nemenyi_matrix.iloc[i, j]
                if p_value < alpha:
                    method1 = nemenyi_matrix.index[i]
                    method2 = nemenyi_matrix.columns[j]
                    print(
                        f"  {method1} (Avg. Rank: {average_ranks[method1]:.2f}) vs "
                        f"{method2} (Avg. Rank: {average_ranks[method2]:.2f}): p = {p_value:.4f}"
                    )
    else:
        print(f"No significant differences found in {metric} (p = {p:.4f}, F = {f:.3f}).")

In [5]:
# Load the experiment summary CSV and replace the proximity columns with
# their per-dataset min-max scaled values.
results = (
    pd.read_csv(cfg.EXPERIMENT_PROXIMITY_METRIC / "proximity_metric_summary.csv")
    .pipe(normalize_proximity_metrics)
)

metrics = [
    'Coverage',
    'Validity',
    'Sparsity',
    'Confidence',
    'Proximity L1',
    'Proximity L2',
    'Proximity DTW',
    'yNN',
]
# Coverage: direction="high" gives rank 1 to the largest value in each dataset.
analyze_results(results, 'Coverage', direction="high")


=== Coverage ===
         Ctw.          Dtw. Dtw_medium_window. Dtw_small_window.    Euclidean.      Softdtw.
100.000 (3.5) 100.000 (3.5)      100.000 (3.5)     100.000 (3.5) 100.000 (3.5) 100.000 (3.5)
100.000 (3.5) 100.000 (3.5)      100.000 (3.5)     100.000 (3.5) 100.000 (3.5) 100.000 (3.5)
100.000 (3.5) 100.000 (3.5)      100.000 (3.5)     100.000 (3.5) 100.000 (3.5) 100.000 (3.5)
100.000 (3.5) 100.000 (3.5)      100.000 (3.5)     100.000 (3.5) 100.000 (3.5) 100.000 (3.5)
100.000 (3.5) 100.000 (3.5)      100.000 (3.5)     100.000 (3.5) 100.000 (3.5) 100.000 (3.5)
100.000 (3.5) 100.000 (3.5)      100.000 (3.5)     100.000 (3.5) 100.000 (3.5) 100.000 (3.5)
100.000 (3.5) 100.000 (3.5)      100.000 (3.5)     100.000 (3.5) 100.000 (3.5) 100.000 (3.5)

Average ranks:
Ctw. Dtw. Dtw_medium_window. Dtw_small_window. Euclidean. Softdtw.
3.50 3.50               3.50              3.50       3.50     3.50
No significant differences found in Coverage (p = nan, F = nan).


  W = (12 * ssbn - 3 * n**2 * k * (k + 1) ** 2) / (n**2 * k * (k - 1) * (k + 1) - n * ties)


In [6]:
# Validity: direction="high" gives rank 1 to the largest value in each dataset.
analyze_results(results, 'Validity', direction="high")


=== Validity ===
       Ctw.        Dtw. Dtw_medium_window. Dtw_small_window.  Euclidean.    Softdtw.
1.000 (3.5) 1.000 (3.5)        1.000 (3.5)       1.000 (3.5) 1.000 (3.5) 1.000 (3.5)
1.000 (3.5) 1.000 (3.5)        1.000 (3.5)       1.000 (3.5) 1.000 (3.5) 1.000 (3.5)
1.000 (3.5) 1.000 (3.5)        1.000 (3.5)       1.000 (3.5) 1.000 (3.5) 1.000 (3.5)
1.000 (3.5) 1.000 (3.5)        1.000 (3.5)       1.000 (3.5) 1.000 (3.5) 1.000 (3.5)
1.000 (3.5) 1.000 (3.5)        1.000 (3.5)       1.000 (3.5) 1.000 (3.5) 1.000 (3.5)
1.000 (3.5) 1.000 (3.5)        1.000 (3.5)       1.000 (3.5) 1.000 (3.5) 1.000 (3.5)
1.000 (3.5) 1.000 (3.5)        1.000 (3.5)       1.000 (3.5) 1.000 (3.5) 1.000 (3.5)

Average ranks:
Ctw. Dtw. Dtw_medium_window. Dtw_small_window. Euclidean. Softdtw.
3.50 3.50               3.50              3.50       3.50     3.50
No significant differences found in Validity (p = nan, F = nan).


  W = (12 * ssbn - 3 * n**2 * k * (k + 1) ** 2) / (n**2 * k * (k - 1) * (k + 1) - n * ties)


In [7]:
# Sparsity: direction="high" gives rank 1 to the largest value in each dataset.
analyze_results(results, 'Sparsity', direction="high")


=== Sparsity ===
       Ctw.        Dtw. Dtw_medium_window. Dtw_small_window.  Euclidean.    Softdtw.
0.895 (3.5) 0.895 (3.5)        0.895 (3.5)       0.895 (3.5) 0.917 (1.0) 0.894 (6.0)
0.845 (3.0) 0.845 (3.0)        0.845 (3.0)       0.845 (3.0) 0.798 (6.0) 0.845 (3.0)
0.916 (3.5) 0.916 (3.5)        0.916 (3.5)       0.916 (3.5) 0.920 (1.0) 0.915 (6.0)
0.817 (2.5) 0.817 (2.5)        0.817 (2.5)       0.817 (2.5) 0.799 (6.0) 0.806 (5.0)
0.819 (2.5) 0.819 (2.5)        0.819 (2.5)       0.819 (2.5) 0.801 (5.0) 0.799 (6.0)
0.864 (4.5) 0.864 (4.5)        0.864 (4.5)       0.864 (4.5) 0.876 (1.0) 0.868 (2.0)
0.916 (3.0) 0.916 (3.0)        0.916 (3.0)       0.916 (3.0) 0.892 (6.0) 0.916 (3.0)

Average ranks:
Ctw. Dtw. Dtw_medium_window. Dtw_small_window. Euclidean. Softdtw.
3.21 3.21               3.21              3.21       3.71     4.43
No significant differences found in Sparsity (p = 0.5839, F = 0.753).


In [8]:
# Confidence: direction="high" gives rank 1 to the largest value in each dataset.
analyze_results(results, 'Confidence', direction="high")


=== Confidence ===
       Ctw.        Dtw. Dtw_medium_window. Dtw_small_window.  Euclidean.    Softdtw.
0.815 (3.5) 0.815 (3.5)        0.815 (3.5)       0.815 (3.5) 0.729 (6.0) 0.816 (1.0)
0.667 (3.0) 0.667 (3.0)        0.667 (3.0)       0.667 (3.0) 0.629 (6.0) 0.667 (3.0)
0.821 (3.5) 0.821 (3.5)        0.821 (3.5)       0.821 (3.5) 0.781 (6.0) 0.823 (1.0)
0.684 (2.5) 0.684 (2.5)        0.684 (2.5)       0.684 (2.5) 0.647 (6.0) 0.658 (5.0)
0.731 (3.5) 0.731 (3.5)        0.731 (3.5)       0.731 (3.5) 0.707 (6.0) 0.745 (1.0)
0.711 (3.5) 0.711 (3.5)        0.711 (3.5)       0.711 (3.5) 0.708 (6.0) 0.721 (1.0)
0.720 (3.0) 0.720 (3.0)        0.720 (3.0)       0.720 (3.0) 0.718 (6.0) 0.720 (3.0)

Average ranks:
Ctw. Dtw. Dtw_medium_window. Dtw_small_window. Euclidean. Softdtw.
3.21 3.21               3.21              3.21       6.00     2.14
Significant differences found in Confidence (p = 0.0000, F = 19.038).
  Euclidean. (Avg. Rank: 6.0) vs Softdtw. (Avg. Rank: 2.142857142857143): p = 0.

In [9]:
# Proximity L1: default direction ("low") gives rank 1 to the smallest value in each dataset.
analyze_results(results, 'Proximity L1')


=== Proximity L1 ===
       Ctw.        Dtw. Dtw_medium_window. Dtw_small_window.  Euclidean.    Softdtw.
0.919 (3.5) 0.919 (3.5)        0.919 (3.5)       0.919 (3.5) 0.000 (1.0) 1.000 (6.0)
1.000 (4.0) 1.000 (4.0)        1.000 (4.0)       1.000 (4.0) 0.000 (1.0) 1.000 (4.0)
0.797 (3.5) 0.797 (3.5)        0.797 (3.5)       0.797 (3.5) 0.000 (1.0) 1.000 (6.0)
0.000 (2.5) 0.000 (2.5)        0.000 (2.5)       0.000 (2.5) 0.636 (5.0) 1.000 (6.0)
1.000 (4.5) 1.000 (4.5)        1.000 (4.5)       1.000 (4.5) 0.000 (1.0) 0.504 (2.0)
1.000 (4.5) 1.000 (4.5)        1.000 (4.5)       1.000 (4.5) 0.000 (1.0) 0.877 (2.0)
0.000 (3.0) 0.000 (3.0)        0.000 (3.0)       0.000 (3.0) 1.000 (6.0) 0.000 (3.0)

Average ranks:
Ctw. Dtw. Dtw_medium_window. Dtw_small_window. Euclidean. Softdtw.
3.64 3.64               3.64              3.64       2.29     4.14
No significant differences found in Proximity L1 (p = 0.2933, F = 1.298).


In [10]:
# Proximity L2: default direction ("low") gives rank 1 to the smallest value in each dataset.
analyze_results(results, 'Proximity L2')


=== Proximity L2 ===
       Ctw.        Dtw. Dtw_medium_window. Dtw_small_window.  Euclidean.    Softdtw.
0.885 (3.5) 0.885 (3.5)        0.885 (3.5)       0.885 (3.5) 0.000 (1.0) 1.000 (6.0)
1.000 (4.0) 1.000 (4.0)        1.000 (4.0)       1.000 (4.0) 0.000 (1.0) 1.000 (4.0)
0.847 (3.5) 0.847 (3.5)        0.847 (3.5)       0.847 (3.5) 0.000 (1.0) 1.000 (6.0)
0.468 (3.5) 0.468 (3.5)        0.468 (3.5)       0.468 (3.5) 0.000 (1.0) 1.000 (6.0)
1.000 (4.5) 1.000 (4.5)        1.000 (4.5)       1.000 (4.5) 0.000 (1.0) 0.382 (2.0)
0.923 (3.5) 0.923 (3.5)        0.923 (3.5)       0.923 (3.5) 0.000 (1.0) 1.000 (6.0)
0.000 (3.0) 0.000 (3.0)        0.000 (3.0)       0.000 (3.0) 1.000 (6.0) 0.000 (3.0)

Average ranks:
Ctw. Dtw. Dtw_medium_window. Dtw_small_window. Euclidean. Softdtw.
3.64 3.64               3.64              3.64       1.71     4.71
Significant differences found in Proximity L2 (p = 0.0044, F = 4.500).
  Euclidean. (Avg. Rank: 1.7142857142857142) vs Softdtw. (Avg. Rank: 4.714285

In [11]:
# Proximity DTW: default direction ("low") gives rank 1 to the smallest value in each dataset.
analyze_results(results, 'Proximity DTW')


=== Proximity DTW ===
       Ctw.        Dtw. Dtw_medium_window. Dtw_small_window.  Euclidean.    Softdtw.
0.840 (3.5) 0.840 (3.5)        0.840 (3.5)       0.840 (3.5) 0.000 (1.0) 1.000 (6.0)
1.000 (4.0) 1.000 (4.0)        1.000 (4.0)       1.000 (4.0) 0.000 (1.0) 1.000 (4.0)
0.921 (3.5) 0.921 (3.5)        0.921 (3.5)       0.921 (3.5) 0.000 (1.0) 1.000 (6.0)
0.000 (2.5) 0.000 (2.5)        0.000 (2.5)       0.000 (2.5) 0.460 (5.0) 1.000 (6.0)
1.000 (4.5) 1.000 (4.5)        1.000 (4.5)       1.000 (4.5) 0.000 (1.0) 0.717 (2.0)
0.793 (3.5) 0.793 (3.5)        0.793 (3.5)       0.793 (3.5) 0.000 (1.0) 1.000 (6.0)
0.000 (3.0) 0.000 (3.0)        0.000 (3.0)       0.000 (3.0) 1.000 (6.0) 0.000 (3.0)

Average ranks:
Ctw. Dtw. Dtw_medium_window. Dtw_small_window. Euclidean. Softdtw.
3.50 3.50               3.50              3.50       2.29     4.71
No significant differences found in Proximity DTW (p = 0.0880, F = 2.178).


In [12]:
# yNN: default direction ("low") gives rank 1 to the smallest value in each dataset.
# NOTE(review): yNN is commonly reported as a higher-is-better score; if that
# holds for this experiment the call needs direction="high" — TODO confirm
# against the metric's definition before relying on these ranks.
analyze_results(results, 'yNN')


=== yNN ===
       Ctw.        Dtw. Dtw_medium_window. Dtw_small_window.  Euclidean.    Softdtw.
0.996 (4.5) 0.996 (4.5)        0.996 (4.5)       0.996 (4.5) 0.995 (1.0) 0.996 (2.0)
0.997 (4.0) 0.997 (4.0)        0.997 (4.0)       0.997 (4.0) 0.995 (1.0) 0.997 (4.0)
0.990 (3.5) 0.990 (3.5)        0.990 (3.5)       0.990 (3.5) 0.990 (6.0) 0.990 (1.0)
0.997 (2.5) 0.997 (2.5)        0.997 (2.5)       0.997 (2.5) 0.997 (6.0) 0.997 (5.0)
0.990 (4.5) 0.990 (4.5)        0.990 (4.5)       0.990 (4.5) 0.988 (2.0) 0.987 (1.0)
0.989 (3.5) 0.989 (3.5)        0.989 (3.5)       0.989 (3.5) 0.989 (6.0) 0.989 (1.0)
0.983 (3.0) 0.983 (3.0)        0.983 (3.0)       0.983 (3.0) 0.985 (6.0) 0.983 (3.0)

Average ranks:
Ctw. Dtw. Dtw_medium_window. Dtw_small_window. Euclidean. Softdtw.
3.64 3.64               3.64              3.64       4.00     2.43
No significant differences found in yNN (p = 0.4749, F = 0.926).
