# Notebook for creation of tables used in the paper.

In [1]:
import pandas as pd


# Load the combined evaluation results. The cells below read the 'Model', 'Explainer' and
# 'Dataset' columns plus one column per metric (e.g. 'Coverage', 'Validity', 'Sparsity').
results = pd.read_csv('../benchmark/evaluations/all_evaluation_results.csv')
# Disabled helper for manual inspection; the result was never assigned, so it had no effect on `results`:
#results.sort_values(by=['Explainer', 'Model', 'Dataset'], ascending=[True, True,True])

In [2]:
# Restrict the evaluation results to the FCN classifier.
fcn_results = results[results['Model'] == 'fcn'].copy()

# Raw explainer name -> display label (Greek symbols for α and θ) of the Confetti variants to report.
# One mapping drives both the filter and the renaming, so a variant can never be selected
# without also receiving its own label.
confetti_labels = {
    'Confetti Optimized (alpha=0.5)': 'Confetti α=0.5',
    'Confetti Optimized (theta=0.95)': 'Confetti θ=0.95',
}
allowed_confetti_methods = list(confetti_labels)

# Keep only the selected Confetti variants and relabel them.
confetti_mask = fcn_results['Explainer'].isin(allowed_confetti_methods)
confetti_optimized_df = fcn_results[confetti_mask].copy()
confetti_optimized_df['Explainer'] = confetti_optimized_df['Explainer'].map(confetti_labels)

# All other explainers: rows whose name does not contain 'Confetti' (case-insensitive).
# na=False makes rows with a missing explainer name count as "not Confetti", so they are kept.
non_confetti_df = fcn_results[~fcn_results['Explainer'].str.contains('Confetti', case=False, na=False)]

# Non-Confetti explainers first, then the Confetti variants.
final_df = pd.concat([non_confetti_df, confetti_optimized_df], ignore_index=True)

## Metrics

### Coverage
Coverage refers to the percentage of the sample set for which a counterfactual exists.

In [3]:
# Long-format Coverage values, without the rows where Coverage is missing.
coverage_df = final_df[['Dataset', 'Explainer', 'Coverage']].dropna(subset=['Coverage'])

# Wide format: one row per dataset, one column per explainer.
# NOTE: aggfunc='first' keeps only the first value when a (Dataset, Explainer) pair occurs more than once.
pivot_table_coverage = pd.pivot_table(
    coverage_df, index='Dataset', columns='Explainer', values='Coverage', aggfunc='first'
).reset_index()

# Column order: Dataset, then the non-Confetti explainers (sorted), then the Confetti variants (sorted).
columns = pivot_table_coverage.columns.tolist()
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(("Confetti α", "Confetti θ"))
]
non_confetti_cols = [col for col in columns if col != 'Dataset' and col not in confetti_cols]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Rebind the metric-specific name, as the other metric cells do, and clear the name of the column axis.
pivot_table_coverage = pivot_table_coverage[ordered_cols]
pivot_table_coverage.columns.name = None
pivot_table = pivot_table_coverage  # alias: the reordered table used to be stored under this generic name

# Round to 3 decimal places
coverage_table = pivot_table_coverage.round(3)
coverage_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,100.0,100.0,100.0,100.0,100.0
1,BasicMotions,100.0,100.0,100.0,100.0,100.0
2,ERing,100.0,75.0,100.0,100.0,100.0
3,Epilepsy,100.0,100.0,100.0,95.833,91.667
4,Libras,100.0,90.0,100.0,100.0,100.0
5,NATOPS,100.0,,100.0,100.0,100.0
6,RacketSports,100.0,100.0,100.0,100.0,100.0


### Validity
Validity refers to the proportion of counterfactuals that fulfill the requirement $f(X_{i}) \neq f(X_{CE})$.

In [4]:
# Long-format Validity values; rows with a missing Validity are discarded first.
validity_df = final_df.dropna(subset=['Validity'])[['Dataset', 'Explainer', 'Validity']]

# Wide layout: datasets as rows, explainers as columns.
# aggfunc='first' takes the first value found for each (Dataset, Explainer) pair.
pivot_table_validity = pd.pivot_table(
    validity_df, index='Dataset', columns='Explainer', values='Validity', aggfunc='first'
).reset_index()

# Column order: Dataset, then the non-Confetti explainers (sorted), then the Confetti variants (sorted).
columns = pivot_table_validity.columns.tolist()
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(("Confetti α", "Confetti θ"))
]
non_confetti_cols = [col for col in columns if col != 'Dataset' and col not in confetti_cols]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Apply the order and clear the name of the column axis.
pivot_table_validity = pivot_table_validity[ordered_cols].rename_axis(columns=None)

# Round to 3 decimal places
validity_table = pivot_table_validity.round(3)
validity_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,0.92,0.92,0.92,1.0,1.0
1,BasicMotions,1.0,0.475,0.525,1.0,1.0
2,ERing,0.833,0.778,0.806,1.0,1.0
3,Epilepsy,0.833,0.792,0.792,1.0,1.0
4,Libras,0.9,0.778,0.933,1.0,1.0
5,NATOPS,1.0,,0.778,1.0,1.0
6,RacketSports,1.0,0.875,0.875,1.0,1.0


### Sparsity
Sparsity refers to the proportion of timesteps that are modified in the counterfactual. Here we evaluate the average sparsity across all counterfactuals.

In [5]:
# Long-format Sparsity values; rows with a missing Sparsity are discarded first.
sparsity_df = final_df.dropna(subset=['Sparsity'])[['Dataset', 'Explainer', 'Sparsity']]

# Wide layout: datasets as rows, explainers as columns.
# aggfunc='first' takes the first value found for each (Dataset, Explainer) pair.
pivot_table_sparsity = pd.pivot_table(
    sparsity_df, index='Dataset', columns='Explainer', values='Sparsity', aggfunc='first'
).reset_index()

# Column order: Dataset, then the non-Confetti explainers (sorted), then the Confetti variants (sorted).
columns = pivot_table_sparsity.columns.tolist()
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(("Confetti α", "Confetti θ"))
]
non_confetti_cols = [col for col in columns if col != 'Dataset' and col not in confetti_cols]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Apply the order and clear the name of the column axis.
pivot_table_sparsity = pivot_table_sparsity[ordered_cols].rename_axis(columns=None)

# Round to 3 decimal places
sparsity_table = pivot_table_sparsity.round(3)
sparsity_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,0.729,0.002,0.002,0.912,0.875
1,BasicMotions,0.48,0.004,0.003,0.791,0.766
2,ERing,0.59,0.025,0.028,0.909,0.876
3,Epilepsy,0.503,0.011,0.011,0.804,0.776
4,Libras,0.255,0.045,0.033,0.787,0.722
5,NATOPS,0.699,,0.001,0.872,0.83
6,RacketSports,0.531,0.014,0.012,0.883,0.82


### Prediction Confidence
Prediction Confidence refers to the class probability predicted by a classifier $f$ for $X_{CE}$. Here we evaluate the average confidence across all counterfactuals.

In [6]:
# Long-format Confidence values (dataset, explainer, metric).
confidence_df = final_df.loc[:, ['Dataset', 'Explainer', 'Confidence']]

# Wide layout: datasets as rows, explainers as columns.
# aggfunc='first' takes the first value found for each (Dataset, Explainer) pair.
pivot_table_confidence = pd.pivot_table(
    confidence_df, index='Dataset', columns='Explainer', values='Confidence', aggfunc='first'
).reset_index()

# Column order: Dataset, then the non-Confetti explainers (sorted), then the Confetti variants (sorted).
columns = pivot_table_confidence.columns.tolist()
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(("Confetti α", "Confetti θ"))
]
non_confetti_cols = [col for col in columns if col != 'Dataset' and col not in confetti_cols]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Apply the order and clear the name of the column axis.
pivot_table_confidence = pivot_table_confidence[ordered_cols].rename_axis(columns=None)

# Round to 3 decimal places
confidence_table = pivot_table_confidence.round(3)
confidence_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,0.929,0.92,0.922,0.745,0.98
1,BasicMotions,0.893,0.475,0.551,0.623,0.967
2,ERing,0.826,0.772,0.81,0.775,0.98
3,Epilepsy,0.84,0.771,0.78,0.629,0.971
4,Libras,0.932,0.789,0.903,0.705,0.976
5,NATOPS,0.693,,0.802,0.692,0.973
6,RacketSports,0.9,0.863,0.857,0.687,0.979


### Plausibility
Plausibility measures whether $X_{CE}$ is realistic and feasible within the domain or the original data distribution. Here we evaluate the average plausibility across all counterfactuals.

In [7]:
# Long-format plausibility values; the metric is stored in the 'yNN' column.
plausibility_df = final_df.loc[:, ['Dataset', 'Explainer', 'yNN']]

# Wide layout: datasets as rows, explainers as columns.
# aggfunc='first' takes the first value found for each (Dataset, Explainer) pair.
pivot_table_plausibility = pd.pivot_table(
    plausibility_df, index='Dataset', columns='Explainer', values='yNN', aggfunc='first'
).reset_index()

# Column order: Dataset, then the non-Confetti explainers (sorted), then the Confetti variants (sorted).
columns = pivot_table_plausibility.columns.tolist()
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(("Confetti α", "Confetti θ"))
]
non_confetti_cols = [col for col in columns if col != 'Dataset' and col not in confetti_cols]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Apply the order and clear the name of the column axis.
pivot_table_plausibility = pivot_table_plausibility[ordered_cols].rename_axis(columns=None)

# Round to 3 decimal places
plausibility_table = pivot_table_plausibility.round(3)
plausibility_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,0.996,0.994,0.993,0.995,0.996
1,BasicMotions,0.995,0.993,0.993,0.995,0.996
2,ERing,0.993,0.989,0.987,0.99,0.99
3,Epilepsy,0.998,0.997,0.997,0.997,0.997
4,Libras,0.992,0.983,0.98,0.988,0.99
5,NATOPS,0.991,,0.983,0.989,0.991
6,RacketSports,0.984,0.975,0.981,0.985,0.988


### Proximity ($l_{1}$)
Proximity refers to the distance between the original instance $X_{i}$ and the counterfactual $X_{CE}$. Here we evaluate the average proximity across all counterfactuals.
In this case, we use the $l_{1}$ norm to capture how much the counterfactual deviates in total without emphasizing where or how large the changes are.

In [8]:
# Long-format 'Proximity L1' values (dataset, explainer, metric).
proximity_l1_df = final_df.loc[:, ['Dataset', 'Explainer', 'Proximity L1']]

# Wide layout: datasets as rows, explainers as columns.
# aggfunc='first' takes the first value found for each (Dataset, Explainer) pair.
pivot_table_proximity_l1 = pd.pivot_table(
    proximity_l1_df, index='Dataset', columns='Explainer', values='Proximity L1', aggfunc='first'
).reset_index()

# Column order: Dataset, then the non-Confetti explainers (sorted), then the Confetti variants (sorted).
columns = pivot_table_proximity_l1.columns.tolist()
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(("Confetti α", "Confetti θ"))
]
non_confetti_cols = [col for col in columns if col != 'Dataset' and col not in confetti_cols]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Apply the order and clear the name of the column axis.
pivot_table_proximity_l1 = pivot_table_proximity_l1[ordered_cols].rename_axis(columns=None)

# Round to 3 decimal places
proximity_l1_table = pivot_table_proximity_l1.round(3)
proximity_l1_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,276.511,1474.847,1476.536,84.442,120.438
1,BasicMotions,894.162,2270.143,2246.87,377.515,483.876
2,ERing,92.974,272.845,277.081,26.71,34.253
3,Epilepsy,236.819,510.458,529.534,89.568,101.912
4,Libras,7.663,15.245,16.428,2.024,2.837
5,NATOPS,134.42,,1146.729,72.724,80.684
6,RacketSports,342.294,997.926,988.59,125.964,192.165


### Proximity ($l_{2}$)
Proximity refers to the distance between the original instance $X_{i}$ and the counterfactual $X_{CE}$. Here we evaluate the average proximity across all counterfactuals.
In this case, we use the $l_{2}$ norm to quantify the overall magnitude of deviation, where larger individual differences are penalized more.

In [9]:
# Long-format 'Proximity L2' values (dataset, explainer, metric).
proximity_l2_df = final_df.loc[:, ['Dataset', 'Explainer', 'Proximity L2']]

# Wide layout: datasets as rows, explainers as columns.
# aggfunc='first' takes the first value found for each (Dataset, Explainer) pair.
pivot_table_proximity_l2 = pd.pivot_table(
    proximity_l2_df, index='Dataset', columns='Explainer', values='Proximity L2', aggfunc='first'
).reset_index()

# Column order: Dataset, then the non-Confetti explainers (sorted), then the Confetti variants (sorted).
columns = pivot_table_proximity_l2.columns.tolist()
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(("Confetti α", "Confetti θ"))
]
non_confetti_cols = [col for col in columns if col != 'Dataset' and col not in confetti_cols]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Apply the order and clear the name of the column axis.
pivot_table_proximity_l2 = pivot_table_proximity_l2[ordered_cols].rename_axis(columns=None)

# Round to 3 decimal places
proximity_l2_table = pivot_table_proximity_l2.round(3)
proximity_l2_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,19.043,50.731,50.796,9.516,11.694
1,BasicMotions,76.843,126.678,138.706,48.377,55.831
2,ERing,11.014,22.02,22.34,6.188,6.881
3,Epilepsy,14.853,26.302,27.33,9.721,10.524
4,Libras,1.065,2.048,2.18,0.508,0.651
5,NATOPS,9.361,,41.923,7.102,6.985
6,RacketSports,58.482,112.975,112.224,36.265,45.608


### Proximity ($DTW$)
Proximity refers to the distance between the original instance $X_{i}$ and the counterfactual $X_{CE}$. Here we evaluate the average proximity across all counterfactuals.
In this case, we use the Dynamic Time Warping (DTW) distance to assess similarity in the temporal structure.

In [10]:
# Long-format 'Proximity DTW' values (dataset, explainer, metric).
proximity_dtw_df = final_df.loc[:, ['Dataset', 'Explainer', 'Proximity DTW']]

# Wide layout: datasets as rows, explainers as columns.
# aggfunc='first' takes the first value found for each (Dataset, Explainer) pair.
pivot_table_proximity_dtw = pd.pivot_table(
    proximity_dtw_df, index='Dataset', columns='Explainer', values='Proximity DTW', aggfunc='first'
).reset_index()

# Column order: Dataset, then the non-Confetti explainers (sorted), then the Confetti variants (sorted).
columns = pivot_table_proximity_dtw.columns.tolist()
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(("Confetti α", "Confetti θ"))
]
non_confetti_cols = [col for col in columns if col != 'Dataset' and col not in confetti_cols]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Apply the order and clear the name of the column axis.
pivot_table_proximity_dtw = pivot_table_proximity_dtw[ordered_cols].rename_axis(columns=None)

# Round to 3 decimal places
proximity_dtw_table = pivot_table_proximity_dtw.round(3)
proximity_dtw_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,18.613,41.683,41.349,8.827,10.835
1,BasicMotions,75.832,113.433,127.036,47.283,54.39
2,ERing,9.903,15.294,15.555,5.079,5.662
3,Epilepsy,13.005,18.983,19.839,8.92,9.721
4,Libras,0.835,0.994,1.155,0.373,0.501
5,NATOPS,9.361,,41.699,7.05,6.93
6,RacketSports,57.876,98.584,97.692,34.969,44.297


### All Results

#### FCN

In [11]:
# Mean of every numeric column per explainer over all rows of final_df, rounded to 3 decimals.
# 'Param Config' is removed when it is present among the averaged columns.
overall_results = (
    final_df.groupby("Explainer")
    .mean(numeric_only=True)
    .round(3)
    .reset_index()
    .drop(columns=['Param Config'], errors='ignore')
)

# Flag the Confetti variants once and reuse the mask for both halves.
is_confetti_row = overall_results['Explainer'].str.startswith("Confetti")
non_confetti_rows = overall_results[~is_confetti_row]
confetti_rows = overall_results[is_confetti_row]

# Non-Confetti explainers on top, Confetti variants at the bottom.
final_results = pd.concat([non_confetti_rows, confetti_rows], ignore_index=True)
final_results


Unnamed: 0,Explainer,Confidence,Coverage,Proximity DTW,Proximity L1,Proximity L2,Sparsity,Validity,yNN
0,Comte,0.859,100.0,26.489,283.549,27.238,0.541,0.927,0.993
1,Sets,0.765,94.167,48.162,923.577,56.792,0.017,0.77,0.988
2,TSEvo,0.804,100.0,49.189,954.538,56.5,0.013,0.804,0.988
3,Confetti α=0.5,0.694,99.405,16.072,111.278,16.811,0.851,1.0,0.991
4,Confetti θ=0.95,0.975,98.81,18.905,145.166,19.739,0.809,1.0,0.993


#### ResNet

In [12]:
# Restrict the evaluation results to the ResNet classifier.
resnet_results = results.loc[results['Model'] == 'resnet'].copy()

# Raw names of the Confetti Optimized variants that are reported.
allowed_confetti_methods_resnet = ['Confetti Optimized (alpha=0.5)', 'Confetti Optimized (theta=0.95)']


def _confetti_label_resnet(raw_name):
    """Return the display label (Greek α / θ) for a raw Confetti explainer name."""
    if "alpha=0.5" in raw_name:
        return "Confetti α=0.5"
    return "Confetti θ=0.95"


# Keep only the selected Confetti variants and relabel them.
confetti_mask_resnet = resnet_results['Explainer'].isin(allowed_confetti_methods_resnet)
confetti_optimized_df_resnet = resnet_results.loc[confetti_mask_resnet].copy()
confetti_optimized_df_resnet['Explainer'] = confetti_optimized_df_resnet['Explainer'].apply(
    _confetti_label_resnet
)

# All other explainers: rows whose name does not contain 'Confetti' (case-insensitive).
mentions_confetti_resnet = resnet_results['Explainer'].str.contains('Confetti', case=False, na=False)
non_confetti_df_resnet = resnet_results[~mentions_confetti_resnet]

# Non-Confetti explainers first, then the Confetti variants.
final_df_resnet = pd.concat([non_confetti_df_resnet, confetti_optimized_df_resnet], ignore_index=True)

# Mean of every numeric column per explainer, rounded to 3 decimals; 'Param Config' is removed if present.
overall_results_resnet = (
    final_df_resnet.groupby("Explainer")
    .mean(numeric_only=True)
    .round(3)
    .reset_index()
    .drop(columns=['Param Config'], errors='ignore')
)

# Split the summary rows with a single mask.
starts_with_confetti_resnet = overall_results_resnet['Explainer'].str.startswith("Confetti")
confetti_rows_resnet = overall_results_resnet[starts_with_confetti_resnet]
non_confetti_rows_resnet = overall_results_resnet[~starts_with_confetti_resnet]

# Non-Confetti explainers on top, Confetti variants at the bottom.
final_results_resnet = pd.concat([non_confetti_rows_resnet, confetti_rows_resnet], ignore_index=True)
final_results_resnet


Unnamed: 0,Explainer,Confidence,Coverage,Proximity DTW,Proximity L1,Proximity L2,Sparsity,Validity,yNN
0,Comte,0.869,100.0,25.254,269.816,25.896,0.57,0.908,0.993
1,Sets,0.759,93.241,47.722,920.462,56.983,0.016,0.759,0.989
2,TSEvo,0.795,100.0,48.617,954.391,56.216,0.013,0.794,0.988
3,Confetti α=0.5,0.714,100.0,14.79,106.075,15.614,0.851,1.0,0.991
4,Confetti θ=0.95,0.975,100.0,15.676,121.784,16.467,0.818,1.0,0.991
