In [None]:
import collections
import datetime

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score

In [None]:
# get mlflow runs
experiment_id = '673705349750158929' # TODO update mlflow experiment ID if it changes (check mlruns directory)

mlflow.set_tracking_uri("../../mlruns")

runs = mlflow.search_runs(experiment_ids=[experiment_id])

# Everything below (including the percentage printed here, which divides by len(runs))
# needs at least one run. Stop with an actionable message instead of an opaque
# KeyError / ZeroDivisionError when the search comes back empty.
if len(runs) == 0:
    raise ValueError(
        "No MLflow runs found for experiment_id={!r}; check the experiment ID "
        "and the tracking URI.".format(experiment_id)
    )

# number of runs whose MLflow status is FAILED
failed_runs = int((runs['status'] == 'FAILED').sum())
print("{} experiment runs failed ({}% of total)".format(failed_runs, failed_runs/len(runs)*100))

In [None]:
# Wall-clock tag (YYYYMMDD-HHMMSS) embedded in the file names of the figures,
# tables and other outputs written by this experiment run.
run_started_at = datetime.datetime.now()
timestamp = format(run_started_at, "%Y%m%d-%H%M%S")
print(timestamp)

In [None]:
# Snapshot the raw MLflow run table, tagged with this run's timestamp.
# index=False (a real bool rather than None): the DataFrame index is not written.
runs.to_csv(f'results-main-{timestamp}.csv', index=False)

In [None]:
# cleanup the data fields for analysis
# C and trial are the 3rd and 5th underscore-separated fields of params.datafile
datafile_parts = runs['params.datafile'].str.split('_')
runs['C'] = datafile_parts.str[2]
runs['trial'] = datafile_parts.str[4]
# missing parameters become '' so the string concatenation below never yields NaN
runs['params.causal_distance'] = runs['params.causal_distance'].fillna('')
runs['params.inference_type'] = runs['params.inference_type'].fillna('')
runs['method'] = runs['params.model'] + runs['params.causal_distance'] + runs['params.inference_type']

# get best result for each trial, based on performance for validation set
# NOTE: groupby(...).first() returns the first non-null value *per column*, so the test
# RMSE could come from a different run than the one with the best validation RMSE.
# Keeping the whole best-validation row per (method, C, trial) avoids mixing runs.
trial_keys = ['method', 'C', 'trial']
results = (
    runs[trial_keys + ['metrics.RMSE_avg_val', 'metrics.RMSE_avg_test']]
    .dropna(subset=trial_keys)                        # groupby also ignored rows with missing keys
    .sort_values(by='metrics.RMSE_avg_val')           # NaN validation scores sort last
    .drop_duplicates(subset=trial_keys, keep='first')
    .sort_values(by=trial_keys)                       # same row order groupby produced
    .reset_index(drop=True)
)
results['C'] = results['C'].astype(float)
results['metrics.RMSE_avg_test'] = results['metrics.RMSE_avg_test'].astype(float)

# raw method identifier -> display name used in figures; unlisted methods map to NaN
method_names = {'global_bnn_baseline':'Global BNN baseline', 
                'individual_bnn_baseline':'Local BNNs baseline', 
                'bayesian_maml_baseline':'Meta-learning baseline',
                'our_methodground_truth':'Ground truth reference',
                'our_methodOD':'Our method (OD)',
                'our_methodSHD':'Our method (SHD)',
                'our_methodID':'Our method (ID)',
                'our_methodSID':'Our method (SID)',
                'our_methodobservational':'Our method (OP)',
                'our_methodinterventional':'Our method (IP)'}
results['method'] = results['method'].map(method_names)

results

In [None]:
# create plot: test RMSE against C, one panel per comparison
# (seaborn / matplotlib are imported in the notebook's top import cell)

sns.set_style("whitegrid")

fig, axes = plt.subplots(1, 3, figsize=(16,4))

methods1 = ['Global BNN baseline', 'Local BNNs baseline', 'Meta-learning baseline', 'Our method (OD)', 'Our method (SHD)', 'Our method (ID)', 'Our method (SID)']
methods2 = ['Global BNN baseline', 'Local BNNs baseline', 'Meta-learning baseline', 'Our method (OP)', 'Our method (IP)']
methods3 = ['Ground truth reference', 'Our method (OD)', 'Our method (SHD)', 'Our method (ID)', 'Our method (SID)', 'Our method (OP)', 'Our method (IP)']

# one fixed colour per method so a method looks the same in every panel
colors = {'Global BNN baseline':'tab:blue', 
          'Local BNNs baseline':'tab:pink', 
          'Meta-learning baseline':'tab:olive', 
          'Ground truth reference':'tab:red', 
          'Our method (OD)':'tab:cyan', 
          'Our method (SHD)':'tab:brown', 
          'Our method (ID)':'tab:orange', 
          'Our method (SID)':'tab:gray', 
          'Our method (OP)':'tab:green', 
          'Our method (IP)':'tab:purple'}

# (title, methods shown, log-scaled y axis?) for each panel, left to right
panels = [
    ('Baseline comparison (known CGMs)', methods1, True),
    ('Baseline comparison (unknown CGMs)', methods2, True),
    ('Causal distance and proxy comparison', methods3, False),
]

for ax, (title, methods, log_y) in zip(axes, panels):
    sns.lineplot(data=results[results['method'].isin(methods)],
                 x='C', y='metrics.RMSE_avg_test',
                 hue='method', style='method',
                 ax=ax, palette={m: colors[m] for m in methods})

sns.despine(left=True)

for ax, (title, methods, log_y) in zip(axes, panels):
    ax.set_title(title)
    ax.set_ylabel('RMSE of test tasks')
    if log_y:
        ax.set_yscale('log')
        ax.yaxis.set_major_locator(ticker.LogLocator(base=10, subs=[0.01,0.02,0.03]))
        # plain numbers instead of powers of ten on the log axis
        ax.yaxis.set_major_formatter(ticker.ScalarFormatter())
    # Rebuilding the legend here also leaves it without a title, so no separate
    # legend().set_title('') call is needed beforehand.
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2, borderpad=0.1, columnspacing=0.5)

plt.savefig(f'results-main2-{timestamp}.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# calculate percentage difference in performance between meta-learning baseline and other causal distances
baseline_method = 'Meta-learning baseline'
compared_methods = ['Our method (SHD)','Our method (SID)','Our method (OD)','Our method (ID)','Our method (OP)','Our method (IP)']

# one row per (C, trial), one column per method, holding the test RMSE.
# Indexing on (C, trial) directly keeps C numeric, so the final table is ordered
# numerically rather than by the string form of C, and avoids writing a helper
# column onto a filtered slice of `results`.
rmse_by_method = (
    results[results['method'].isin([baseline_method] + compared_methods)]
    .set_index(['C', 'trial', 'method'])['metrics.RMSE_avg_test']
    .unstack('method')
)

# relative change with respect to the baseline of the same (C, trial), in percent
baseline_rmse = rmse_by_method[baseline_method]
pct_diff = (
    rmse_by_method.drop(columns=[baseline_method])
    .sub(baseline_rmse, axis=0)
    .div(baseline_rmse, axis=0)
    .mul(100)
)

# average over trials for each C; only numeric method columns remain at this point
pct_diff.groupby(level='C').mean()

### Accuracy of causal group assignments

In [None]:
# calculate accuracy for each trial (model selection using hyperparameter tuning on validation set)
runs['causal_dist'] = runs['params.causal_distance'] + runs['params.inference_type']

# Keep the whole row of the run with the lowest validation RMSE per (method, C, trial).
# groupby(...).first() takes the first non-null value per column, which could pair
# params.outprefix / test RMSE from one run with the validation RMSE of another.
selection_keys = ['method', 'C', 'trial']
results = (
    runs[selection_keys + ['causal_dist','metrics.RMSE_avg_val','metrics.RMSE_avg_test','params.outprefix','params.datafile']]
    .dropna(subset=selection_keys)                        # groupby also ignored rows with missing keys
    .sort_values(by='metrics.RMSE_avg_val')               # NaN validation scores sort last
    .drop_duplicates(subset=selection_keys, keep='first')
    .sort_values(by=selection_keys)                       # same row order groupby produced
    .reset_index(drop=True)
)
results['C'] = results['C'].astype(float)
results['metrics.RMSE_avg_test'] = results['metrics.RMSE_avg_test'].astype(float)

# only our method's runs are kept, and the ground-truth variant is excluded
is_our_method = results['method'].str.startswith('our_method')
is_ground_truth = results['causal_dist'] == 'ground_truth'
results = results[is_our_method & ~is_ground_truth]

In [None]:
def match_ground_truth_label(labels, ground_truth_labels, num_causal_groups):
    """Build a lookup that renames inferred cluster labels to ground-truth labels.

    Every (inferred cluster, ground-truth label) pair is scored by the fraction of
    the cluster's members that carry that ground-truth label. Pairs are accepted
    greedily from the highest fraction down; a pair is skipped when its cluster or
    its ground-truth label has already been used, so the matching is one-to-one.
    Equal fractions are resolved in favour of the lower inferred label, then the
    ground-truth label seen first within that cluster.

    Args:
        labels: inferred cluster label per task (array-like). Only values in
            ``range(num_causal_groups)`` are considered.
        ground_truth_labels: ground-truth label per task (array-like), same shape
            as ``labels``. Values must be convertible with ``int()``.
        num_causal_groups: number of inferred clusters; length of the result.

    Returns:
        Float ``np.ndarray`` of length ``num_causal_groups`` whose entry ``p`` is the
        ground-truth label matched to inferred cluster ``p``. Clusters left without
        a match (no members, or all of their labels already taken) keep the
        initial value 0.

    Raises:
        ValueError: if ``labels`` and ``ground_truth_labels`` differ in shape.
    """
    # accept lists / Series as well as arrays
    labels = np.asarray(labels)
    ground_truth_labels = np.asarray(ground_truth_labels)
    if labels.shape != ground_truth_labels.shape:
        raise ValueError(
            "labels and ground_truth_labels must have the same shape, got {} and {}".format(
                labels.shape, ground_truth_labels.shape))

    correction = np.zeros(num_causal_groups)

    # calculate how much of each ground truth label is in each inferred cluster
    candidates = []  # (fraction of cluster, inferred label, ground-truth label)
    for pred_label in range(num_causal_groups):
        members_gt = ground_truth_labels[labels == pred_label]
        for gt_label, count in collections.Counter(members_gt.tolist()).items():
            candidates.append((count / len(members_gt), pred_label, gt_label))

    # Highest fraction first. list.sort is stable (also with reverse=True), so ties
    # keep their construction order and the result is deterministic. With no
    # candidates at all the loop below is skipped and the zero lookup is returned.
    candidates.sort(key=lambda candidate: candidate[0], reverse=True)

    # match a ground truth label to each inferred cluster,
    # giving priority to the cluster with the highest share of a ground truth label
    assigned_gt = set()
    assigned_pred = set()
    for _, pred_label, gt_label in candidates:
        if gt_label in assigned_gt or pred_label in assigned_pred:
            continue
        correction[pred_label] = int(gt_label)
        assigned_gt.add(gt_label)
        assigned_pred.add(pred_label)

    return correction

In [None]:
# Macro-F1 of the inferred causal group assignments against the ground truth,
# for every selected run in `results`.
NUM_TRAIN_TASKS = 200  # the first 200 rows of each assignment file are treated as train tasks

# row ranges scored for each reported metric
f1_splits = {
    'f1_overall': slice(None),
    'f1_train': slice(0, NUM_TRAIN_TASKS),
    'f1_val_test': slice(NUM_TRAIN_TASKS, None),
}

data = []

results['casualassignment_path'] = '../../' + results['params.outprefix'] + '_causal_assignments.csv'
# swaps the last 8 characters of the datafile path for 'task_metadata.csv'
results['groundtruth_path'] = results['params.datafile'].str[0:-8] + 'task_metadata.csv'

for idx, row in results.iterrows():
    num_groups = int(row['C'])
    method = row['causal_dist']

    pred_groups = pd.read_csv(row['casualassignment_path'])['predicted_groups'].to_numpy()
    gt_groups = pd.read_csv(row['groundtruth_path'])['ground_truth'].to_numpy()

    assert len(pred_groups)==len(gt_groups), "predicted and ground-truth assignments differ in length"

    # label alignment function (only use train data for this)
    correction = match_ground_truth_label(pred_groups[:NUM_TRAIN_TASKS], gt_groups[:NUM_TRAIN_TASKS], num_groups)

    # Relabel by direct lookup. np.choose would treat `correction` as a sequence of
    # choice arrays, which NumPy caps at a small fixed number, so it breaks for large C.
    # Plain indexing would silently wrap negative labels, so keep the range check explicit.
    if pred_groups.size and (pred_groups.min() < 0 or pred_groups.max() >= num_groups):
        raise ValueError(
            "predicted group labels in {} fall outside [0, {})".format(row['casualassignment_path'], num_groups))
    pred_groups = correction[pred_groups]

    # evaluate accuracy with f1 score
    # get overall score, and also stratify into 1. train and 2. val+test
    for metric, rows in f1_splits.items():
        data.append({'C':num_groups, 'method':method, 'metric':metric,
                     'f1':f1_score(gt_groups[rows], pred_groups[rows], average='macro')})

f1_data = pd.DataFrame(data)
f1_data


In [None]:
# format for publication: one row per (C, metric), one column per method,
# each cell the mean and standard deviation of F1 over trials
f1_stats = f1_data.groupby(['C','method','metric'])['f1'].agg(['mean', 'std']).reset_index()
# "\\pm" yields the LaTeX \pm; a bare "\p" is an invalid escape sequence in a normal f-string
f1_stats['f1'] = [f"{mean:.2f}$\\pm${std:.2f}" for mean, std in zip(f1_stats['mean'], f1_stats['std'])]

# Reshape on (C, metric) directly so C stays numeric and rows sort as 2, 3, ..., 10
# rather than by the string form of C; the overall score is left out of the table.
tmp = (
    f1_stats[f1_stats['metric']!='f1_overall']
    .set_index(['C', 'metric', 'method'])['f1']
    .unstack('method')
    .reset_index()
    .sort_values(by=['C', 'metric'])
)
tmp = tmp[['metric','C','ID','OD','SHD','SID','observational','interventional']]
tmp.to_csv(f'results-accuracycausalgroups-{timestamp}.csv', index=False)
tmp