# Notebook for creation of tables used in the paper.

In [1]:
import pandas as pd


# Load the benchmark evaluation results. The code below relies on the
# 'Model' and 'Explainer' columns of this file.
results_raw = pd.read_csv('../benchmark/evaluations/all_evaluation_results.csv')

# Obtain results separated by model
fcn_raw = results_raw[results_raw['Model'] == 'fcn'].copy()
resnet_raw = results_raw[results_raw['Model'] == 'resnet'].copy()

# The exact Confetti Optimized variants to keep, mapped from their raw
# 'Explainer' label to the short label used in the tables.
confetti_method_map = {
    'Confetti Optimized (alpha=0.5)': 'Confetti α=0.5',
    'Confetti Optimized (theta=0.95)': 'Confetti θ=0.95',
    'Confetti Optimized (alpha=0.0)': 'Confetti α=0.0'
}


def _split_by_confetti(results):
    """Split ``results`` into ``(non_confetti, confetti)`` frames.

    ``non_confetti`` holds every row whose 'Explainer' does not contain
    'Confetti' (case-insensitive); rows with a missing 'Explainer' count as
    non-Confetti. ``confetti`` holds only the rows whose 'Explainer' is one of
    the keys of ``confetti_method_map``, with that label replaced by the mapped
    short name. Any other Confetti variant ends up in neither frame.

    Both frames are independent copies that keep the original row index.
    """
    explainer = results['Explainer']
    mentions_confetti = explainer.str.contains('Confetti', case=False, na=False)
    non_confetti = results[~mentions_confetti].copy()
    confetti = results[explainer.isin(confetti_method_map.keys())].copy()
    confetti['Explainer'] = confetti['Explainer'].map(confetti_method_map)
    return non_confetti, confetti


# Apply the same selection to the full results and to each model's subset.
non_confetti_all, confetti_all = _split_by_confetti(results_raw)
non_confetti_fcn, confetti_fcn = _split_by_confetti(fcn_raw)
non_confetti_resnet, confetti_resnet = _split_by_confetti(resnet_raw)

# Combine both parts: non-Confetti rows first, then the relabelled Confetti
# rows, with a fresh 0..n-1 index.
all_final = pd.concat([non_confetti_all, confetti_all], ignore_index=True)
fcn_final = pd.concat([non_confetti_fcn, confetti_fcn], ignore_index=True)
resnet_final = pd.concat([non_confetti_resnet, confetti_resnet], ignore_index=True)

## Metrics

### Coverage
Coverage refers to the proportion of the sample set for which a counterfactual exists. In the table below it is reported on a 0–100 scale (percentage).

In [2]:
# Coverage per (dataset, explainer); rows with no Coverage value are discarded.
coverage_df = all_final[['Dataset', 'Explainer', 'Coverage']].dropna(subset=['Coverage'])

# Wide layout: one row per dataset, one column per explainer. Repeated
# (dataset, explainer) pairs are averaged, not dropped.
pivot_table_coverage = pd.pivot_table(
    coverage_df,
    index='Dataset',
    columns='Explainer',
    values='Coverage',
    aggfunc='mean',
).reset_index()

# Column order: 'Dataset', then the non-Confetti explainers, then the
# Confetti α/θ variants; each group is sorted alphabetically.
columns = list(pivot_table_coverage.columns)
confetti_prefixes = ("Confetti α", "Confetti θ")
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(confetti_prefixes)
]
non_confetti_cols = [
    col for col in columns
    if not (col == 'Dataset' or col in confetti_cols)
]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Reorder and clear the 'Explainer' name left on the column axis by the pivot.
pivot_table = pivot_table_coverage[ordered_cols].rename_axis(columns=None)

# Round numeric values to 3 decimal places for display.
coverage_table = pivot_table.round(3)
coverage_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.0,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,100.0,100.0,100.0,100.0,100.0,100.0
1,BasicMotions,100.0,100.0,100.0,100.0,100.0,100.0
2,ERing,100.0,72.222,100.0,100.0,100.0,100.0
3,Epilepsy,100.0,100.0,100.0,100.0,100.0,100.0
4,Libras,100.0,90.0,100.0,100.0,100.0,100.0
5,NATOPS,100.0,,100.0,100.0,100.0,100.0
6,RacketSports,100.0,100.0,100.0,100.0,100.0,100.0


### Validity
Validity refers to the proportion of counterfactuals that fulfill the requirement $f(X_{i}) \neq f(X_{CE})$

In [4]:
# Validity per (dataset, explainer); rows with no Validity value are discarded.
validity_df = all_final[['Dataset', 'Explainer', 'Validity']].dropna(subset=['Validity'])

# Wide layout: one row per dataset, one column per explainer. Repeated
# (dataset, explainer) pairs are averaged, not dropped.
pivot_table_validity = pd.pivot_table(
    validity_df,
    index='Dataset',
    columns='Explainer',
    values='Validity',
    aggfunc='mean',
).reset_index()

# Column order: 'Dataset', then the non-Confetti explainers, then the
# Confetti α/θ variants; each group is sorted alphabetically.
columns = list(pivot_table_validity.columns)
confetti_prefixes = ("Confetti α", "Confetti θ")
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(confetti_prefixes)
]
non_confetti_cols = [
    col for col in columns
    if not (col == 'Dataset' or col in confetti_cols)
]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Reorder and clear the 'Explainer' name left on the column axis by the pivot.
pivot_table_validity = pivot_table_validity[ordered_cols].rename_axis(columns=None)

# Round numeric values to 3 decimal places for display.
validity_table = pivot_table_validity.round(3)
validity_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.0,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,0.96,0.94,0.94,1.0,1.0,1.0
1,BasicMotions,1.0,0.488,0.512,1.0,1.0,1.0
2,ERing,0.694,0.769,0.819,1.0,1.0,1.0
3,Epilepsy,0.833,0.792,0.771,1.0,1.0,1.0
4,Libras,0.933,0.815,0.933,1.0,1.0,1.0
5,NATOPS,1.0,,0.806,1.0,1.0,1.0
6,RacketSports,1.0,0.781,0.812,1.0,1.0,1.0


### Sparsity
Sparsity refers to the proportion of timesteps that are modified in the counterfactual. Here we evaluate the average sparsity across all counterfactuals.

In [5]:
# Sparsity per (dataset, explainer); rows with no Sparsity value are discarded.
sparsity_df = all_final[['Dataset', 'Explainer', 'Sparsity']].dropna(subset=['Sparsity'])

# Exclude rows whose explainer label is exactly 'SETS'.
# NOTE(review): the rendered table for this cell still shows a 'Sets' column,
# so this case-sensitive comparison does not appear to remove anything.
# Confirm whether Sets is meant to be excluded and which label the data uses.
keep_rows = sparsity_df['Explainer'] != 'SETS'

# Wide layout: one row per dataset, one column per explainer. Repeated
# (dataset, explainer) pairs are averaged, not dropped.
pivot_table_sparsity = pd.pivot_table(
    sparsity_df[keep_rows],
    index='Dataset',
    columns='Explainer',
    values='Sparsity',
    aggfunc='mean',
).reset_index()

# Column order: 'Dataset', then the non-Confetti explainers, then the
# Confetti α/θ variants; each group is sorted alphabetically.
columns = list(pivot_table_sparsity.columns)
confetti_prefixes = ("Confetti α", "Confetti θ")
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(confetti_prefixes)
]
non_confetti_cols = [
    col for col in columns
    if not (col == 'Dataset' or col in confetti_cols)
]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Reorder and clear the 'Explainer' name left on the column axis by the pivot.
pivot_table_sparsity = pivot_table_sparsity[ordered_cols].rename_axis(columns=None)

# Round numeric values to 3 decimal places for display.
sparsity_table = pivot_table_sparsity.round(3)
sparsity_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.0,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,0.731,0.002,0.002,0.926,0.912,0.877
1,BasicMotions,0.486,0.004,0.003,0.822,0.798,0.754
2,ERing,0.681,0.024,0.029,0.913,0.876,0.849
3,Epilepsy,0.461,0.011,0.011,0.822,0.802,0.781
4,Libras,0.247,0.043,0.033,0.85,0.794,0.732
5,NATOPS,0.719,,0.001,0.88,0.861,0.825
6,RacketSports,0.562,0.013,0.012,0.942,0.912,0.872


### Prediction Confidence
Prediction Confidence refers to the class probability predicted by a classifier $f$ for $X_{CE}$. Here we evaluate the average confidence across all counterfactuals.

In [6]:
# Prediction confidence per (dataset, explainer).
confidence_df = all_final[['Dataset', 'Explainer', 'Confidence']]

# Rows labelled 'TSEvo' are left out of this table.
keep_rows = confidence_df['Explainer'] != 'TSEvo'

# Wide layout: one row per dataset, one column per explainer. Repeated
# (dataset, explainer) pairs are averaged, not dropped.
pivot_table_confidence = pd.pivot_table(
    confidence_df[keep_rows],
    index='Dataset',
    columns='Explainer',
    values='Confidence',
    aggfunc='mean',
).reset_index()

# Column order: 'Dataset', then the non-Confetti explainers, then the
# Confetti α/θ variants; each group is sorted alphabetically.
columns = list(pivot_table_confidence.columns)
confetti_prefixes = ("Confetti α", "Confetti θ")
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(confetti_prefixes)
]
non_confetti_cols = [
    col for col in columns
    if not (col == 'Dataset' or col in confetti_cols)
]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Reorder and clear the 'Explainer' name left on the column axis by the pivot.
pivot_table_confidence = pivot_table_confidence[ordered_cols].rename_axis(columns=None)

# Round numeric values to 3 decimal places for display.
confidence_table = pivot_table_confidence.round(3)
confidence_table

Unnamed: 0,Dataset,Comte,Sets,Confetti α=0.0,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,0.953,0.94,0.613,0.726,0.978
1,BasicMotions,0.917,0.487,0.533,0.611,0.965
2,ERing,0.701,0.766,0.637,0.77,0.981
3,Epilepsy,0.837,0.778,0.531,0.636,0.972
4,Libras,0.952,0.82,0.616,0.719,0.973
5,NATOPS,0.755,,0.558,0.709,0.976
6,RacketSports,0.932,0.78,0.629,0.756,0.98


### Plausibility
Plausibility measures whether $X_{CE}$ is realistic and feasible with respect to the domain or the original data distribution. Here we evaluate the average plausibility (the `yNN` column of the results) across all counterfactuals.

In [7]:
# Plausibility is read from the 'yNN' column, per (dataset, explainer).
plausibility_df = all_final[['Dataset', 'Explainer', 'yNN']]

# Wide layout: one row per dataset, one column per explainer. Repeated
# (dataset, explainer) pairs are averaged, not dropped.
pivot_table_plausibility = pd.pivot_table(
    plausibility_df,
    index='Dataset',
    columns='Explainer',
    values='yNN',
    aggfunc='mean',
).reset_index()

# Column order: 'Dataset', then the non-Confetti explainers, then the
# Confetti α/θ variants; each group is sorted alphabetically.
columns = list(pivot_table_plausibility.columns)
confetti_prefixes = ("Confetti α", "Confetti θ")
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(confetti_prefixes)
]
non_confetti_cols = [
    col for col in columns
    if not (col == 'Dataset' or col in confetti_cols)
]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Reorder and clear the 'Explainer' name left on the column axis by the pivot.
pivot_table_plausibility = pivot_table_plausibility[ordered_cols].rename_axis(columns=None)

# Round numeric values to 3 decimal places for display.
plausibility_table = pivot_table_plausibility.round(3)
plausibility_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.0,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,0.996,0.994,0.993,0.995,0.995,0.995
1,BasicMotions,0.995,0.993,0.993,0.994,0.995,0.995
2,ERing,0.994,0.989,0.987,0.99,0.99,0.99
3,Epilepsy,0.998,0.997,0.997,0.997,0.997,0.997
4,Libras,0.992,0.984,0.98,0.987,0.988,0.989
5,NATOPS,0.99,,0.985,0.988,0.989,0.99
6,RacketSports,0.986,0.976,0.978,0.983,0.984,0.986


### Proximity ($l_{1}$)
Proximity refers to the distance between the original instance $X_{i}$ and the counterfactual $X_{CE}$. Here we evaluate the average proximity across all counterfactuals.
In this case, we use the $l_{1}$ norm to capture how much the counterfactual deviates in total without emphasizing where or how large the changes are.

In [8]:
# L1 proximity per (dataset, explainer).
proximity_l1_df = all_final[['Dataset', 'Explainer', 'Proximity L1']]

# Wide layout: one row per dataset, one column per explainer. Repeated
# (dataset, explainer) pairs are averaged, not dropped.
pivot_table_proximity_l1 = pd.pivot_table(
    proximity_l1_df,
    index='Dataset',
    columns='Explainer',
    values='Proximity L1',
    aggfunc='mean',
).reset_index()

# Column order: 'Dataset', then the non-Confetti explainers, then the
# Confetti α/θ variants; each group is sorted alphabetically.
columns = list(pivot_table_proximity_l1.columns)
confetti_prefixes = ("Confetti α", "Confetti θ")
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(confetti_prefixes)
]
non_confetti_cols = [
    col for col in columns
    if not (col == 'Dataset' or col in confetti_cols)
]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Reorder and clear the 'Explainer' name left on the column axis by the pivot.
pivot_table_proximity_l1 = pivot_table_proximity_l1[ordered_cols].rename_axis(columns=None)

# Round numeric values to 3 decimal places for display.
proximity_l1_table = pivot_table_proximity_l1.round(3)
proximity_l1_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.0,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,268.207,1473.604,1474.186,76.822,86.569,115.747
1,BasicMotions,897.204,2272.87,2260.23,342.754,367.774,451.877
2,ERing,70.959,273.898,278.388,27.131,34.182,39.391
3,Epilepsy,251.347,510.458,525.109,85.659,90.78,102.315
4,Libras,7.629,15.176,16.14,1.617,1.908,2.606
5,NATOPS,125.148,,1155.708,69.242,76.79,84.95
6,RacketSports,316.283,986.113,971.492,83.684,104.561,141.009


### Proximity ($l_{2}$)
Proximity refers to the distance between the original instance $X_{i}$ and the counterfactual $X_{CE}$. Here we evaluate the average proximity across all counterfactuals.
In this case, we use the $l_{2}$ norm to quantify the overall magnitude of deviation, where larger individual differences are penalized more

In [9]:
# L2 proximity per (dataset, explainer).
proximity_l2_df = all_final[['Dataset', 'Explainer', 'Proximity L2']]

# Wide layout: one row per dataset, one column per explainer. Repeated
# (dataset, explainer) pairs are averaged, not dropped.
pivot_table_proximity_l2 = pd.pivot_table(
    proximity_l2_df,
    index='Dataset',
    columns='Explainer',
    values='Proximity L2',
    aggfunc='mean',
).reset_index()

# Column order: 'Dataset', then the non-Confetti explainers, then the
# Confetti α/θ variants; each group is sorted alphabetically.
columns = list(pivot_table_proximity_l2.columns)
confetti_prefixes = ("Confetti α", "Confetti θ")
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(confetti_prefixes)
]
non_confetti_cols = [
    col for col in columns
    if not (col == 'Dataset' or col in confetti_cols)
]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Reorder and clear the 'Explainer' name left on the column axis by the pivot.
pivot_table_proximity_l2 = pivot_table_proximity_l2[ordered_cols].rename_axis(columns=None)

# Round numeric values to 3 decimal places for display.
proximity_l2_table = pivot_table_proximity_l2.round(3)
proximity_l2_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.0,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,18.703,50.669,50.682,9.408,9.868,11.395
1,BasicMotions,76.706,126.827,138.002,44.04,45.14,50.774
2,ERing,8.746,21.892,22.326,6.062,6.673,7.226
3,Epilepsy,15.293,26.302,27.183,9.422,9.664,10.404
4,Libras,1.079,2.029,2.138,0.457,0.485,0.597
5,NATOPS,8.972,,42.227,6.964,7.271,7.358
6,RacketSports,56.47,113.607,111.945,31.961,34.467,39.128


### Proximity ($DTW$)
Proximity refers to the distance between the original instance $X_{i}$ and the counterfactual $X_{CE}$. Here we evaluate the average proximity across all counterfactuals.
In this case, we use the Dynamic Time Warping (DTW) distance to assess similarity in the temporal structure.

In [10]:
# DTW proximity per (dataset, explainer).
proximity_dtw_df = all_final[['Dataset', 'Explainer', 'Proximity DTW']]

# Wide layout: one row per dataset, one column per explainer. Repeated
# (dataset, explainer) pairs are averaged, not dropped.
pivot_table_proximity_dtw = pd.pivot_table(
    proximity_dtw_df,
    index='Dataset',
    columns='Explainer',
    values='Proximity DTW',
    aggfunc='mean',
).reset_index()

# Column order: 'Dataset', then the non-Confetti explainers, then the
# Confetti α/θ variants; each group is sorted alphabetically.
columns = list(pivot_table_proximity_dtw.columns)
confetti_prefixes = ("Confetti α", "Confetti θ")
confetti_cols = [
    col for col in columns
    if isinstance(col, str) and col.startswith(confetti_prefixes)
]
non_confetti_cols = [
    col for col in columns
    if not (col == 'Dataset' or col in confetti_cols)
]
ordered_cols = ['Dataset', *sorted(non_confetti_cols), *sorted(confetti_cols)]

# Reorder and clear the 'Explainer' name left on the column axis by the pivot.
pivot_table_proximity_dtw = pivot_table_proximity_dtw[ordered_cols].rename_axis(columns=None)

# Round numeric values to 3 decimal places for display.
proximity_dtw_table = pivot_table_proximity_dtw.round(3)
proximity_dtw_table

Unnamed: 0,Dataset,Comte,Sets,TSEvo,Confetti α=0.0,Confetti α=0.5,Confetti θ=0.95
0,ArticularyWordRecognition,18.392,41.592,41.317,8.724,9.032,10.355
1,BasicMotions,75.81,113.373,125.962,42.919,43.888,49.421
2,ERing,7.969,15.072,15.468,4.917,5.263,5.855
3,Epilepsy,13.257,18.983,19.62,8.678,8.827,9.557
4,Libras,0.805,0.976,1.113,0.343,0.356,0.453
5,NATOPS,8.972,,41.987,6.929,7.22,7.305
6,RacketSports,55.895,97.657,96.853,31.177,33.499,38.19


## All Results

#### FCN

In [12]:
# Mean of every numeric metric per explainer for the FCN rows, rounded to
# 3 decimals. 'Param Config' is removed if it is among the averaged columns.
overall_results_fcn = (
    fcn_final
    .groupby("Explainer")
    .mean(numeric_only=True)
    .round(3)
    .reset_index()
    .drop(columns=['Param Config'], errors='ignore')
)

# Flag the Confetti variants so they can be listed after the other explainers.
is_confetti_row = overall_results_fcn['Explainer'].str.startswith("Confetti")
confetti_rows = overall_results_fcn[is_confetti_row]
non_confetti_rows = overall_results_fcn[~is_confetti_row]

# Non-Confetti explainers first, Confetti variants last, with a fresh index.
final_results = pd.concat([non_confetti_rows, confetti_rows], ignore_index=True)
final_results


Unnamed: 0,Explainer,Confidence,Coverage,Proximity DTW,Proximity L1,Proximity L2,Sparsity,Validity,yNN
0,Comte,0.859,100.0,26.489,283.549,27.238,0.541,0.927,0.993
1,Sets,0.765,94.167,48.162,923.577,56.792,0.017,0.77,0.988
2,TSEvo,0.804,100.0,49.189,954.538,56.5,0.013,0.804,0.988
3,Confetti α=0.0,0.587,100.0,15.4,99.762,16.008,0.88,1.0,0.991
4,Confetti α=0.5,0.693,100.0,16.092,111.8,16.834,0.85,1.0,0.991
5,Confetti θ=0.95,0.975,100.0,18.934,146.186,19.785,0.808,1.0,0.993


#### ResNet

In [13]:
# Mean of every numeric metric per explainer for the ResNet rows, rounded to
# 3 decimals. 'Param Config' is removed if it is among the averaged columns.
overall_results_resnet = (
    resnet_final
    .groupby("Explainer")
    .mean(numeric_only=True)
    .round(3)
    .reset_index()
    .drop(columns=['Param Config'], errors='ignore')
)

# Flag the Confetti variants so they can be listed after the other explainers.
is_confetti_row_resnet = overall_results_resnet['Explainer'].str.startswith("Confetti")
confetti_rows_resnet = overall_results_resnet[is_confetti_row_resnet]
non_confetti_rows_resnet = overall_results_resnet[~is_confetti_row_resnet]

# Non-Confetti explainers first, Confetti variants last, with a fresh index.
final_results_resnet = pd.concat(
    [non_confetti_rows_resnet, confetti_rows_resnet],
    ignore_index=True,
)
final_results_resnet


Unnamed: 0,Explainer,Confidence,Coverage,Proximity DTW,Proximity L1,Proximity L2,Sparsity,Validity,yNN
0,Comte,0.869,100.0,25.254,269.816,25.896,0.57,0.908,0.993
1,Sets,0.759,93.241,47.722,920.462,56.983,0.016,0.759,0.989
2,TSEvo,0.795,100.0,48.617,954.391,56.216,0.013,0.794,0.988
3,Confetti α=0.0,0.589,100.0,14.225,96.498,14.938,0.879,1.0,0.99
4,Confetti α=0.5,0.714,100.0,14.79,106.075,15.614,0.851,1.0,0.991
5,Confetti θ=0.95,0.975,100.0,15.676,121.784,16.467,0.818,1.0,0.991
