Compare both generalisability and accuracy at recovering true causal groups

In [None]:
import mlflow
import datetime

experiment_id = '625001992703083237' # TODO update ID if it changes

# Use the local file-based tracking store, addressed relative to this notebook.
mlflow.set_tracking_uri("../../mlruns")

# Table of all runs recorded for the experiment; the cells below read its
# 'status', 'metrics.*' and 'params.*' columns.
runs = mlflow.search_runs(experiment_ids=[experiment_id])

# Count failed runs directly from the boolean mask.
failed_runs = int((runs['status'] == 'FAILED').sum())
total_runs = len(runs)
# An experiment with no runs would otherwise raise ZeroDivisionError here.
failed_pct = failed_runs / total_runs * 100 if total_runs else 0.0
print("{} experiment runs failed ({}% of total)".format(failed_runs, failed_pct))

In [None]:
# Timestamp (local time, second resolution) used to tag the CSV file names.
timestamp = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"

# Dump the unfiltered run table next to the notebook.
raw_dump_path = 'results-main-' + timestamp + '.csv'
runs.to_csv(raw_dump_path, index=None)
print(timestamp)

In [None]:
# Columns of the run table that the rest of the analysis needs.
result_columns = [
    'metrics.RMSE_avg_test',
    'metrics.RMSE_avg_val',
    'metrics.total_num_epochs',
    'metrics.runtime_initialisation',
    'metrics.runtime_main_training',
    'params.outprefix',
    'params.datafile',
]
# Explicit copy: columns are inserted below and more are added in later cells,
# so `results` must be an independent frame rather than a selection of `runs`
# (avoids pandas' SettingWithCopyWarning).
results = runs[result_columns].copy()

# The last path component of params.outprefix is split on '_' once; fields
# 5, 2 and 6 are used as trial, dataset and model respectively.
# NOTE(review): assumes every outprefix basename has at least 7 '_'-separated
# fields - shorter names silently yield NaN here; confirm against the run naming.
prefix_fields = results['params.outprefix'].str.split('/').str[-1].str.split('_')
# Inserting at position 0 each time gives the column order: model, dataset, trial, ...
results.insert(0, 'trial', prefix_fields.str[5])
results.insert(0, 'dataset', prefix_fields.str[2])
results.insert(0, 'model', prefix_fields.str[6])

# Keep only runs that logged a test RMSE.
results = results[results['metrics.RMSE_avg_test'].notna()].copy()
# Cell output: runs ordered from lowest to highest test RMSE (not stored).
results.sort_values(by='metrics.RMSE_avg_test')

In [None]:
# Mean test/validation RMSE per (model, dataset), averaged over the rows of `results`.
# Only the group keys and the numeric metrics are selected: 'trial' holds strings,
# and averaging a string column raises TypeError on pandas >= 2.0 (older versions
# silently dropped it, so the resulting table is the same).
final_results = (
    results[['model', 'dataset', 'metrics.RMSE_avg_test', 'metrics.RMSE_avg_val']]
    .groupby(['model', 'dataset'])
    .mean()
)
# Cell output: within each dataset, models ordered by mean test RMSE (not stored).
final_results.sort_values(by=['dataset','metrics.RMSE_avg_test'])

In [None]:
import sys
sys.path.append("..")
from f1_score_utils import get_f1_scores

# Models for which no F1 score is computed; their F1 entries are left as None.
models_without_f1 = ('metalearner', 'tsamaml', 'ourmethodknowncausalmodels')

# Per-run input paths for get_f1_scores: the final causal-assignment CSV is derived
# from params.outprefix, the ground-truth CSV from params.datafile with its last
# 8 characters replaced by 'task_metadata.csv'.
assignment_suffix = '_causal_assignments_final.csv'
results['casualassignment_final_path'] = '../../' + results['params.outprefix'] + assignment_suffix
results['groundtruth_path'] = results['params.datafile'].str.slice(0, -8) + 'task_metadata.csv'

f1_final_train_list = []
f1_final_val_test_list = []

# Walk the three needed columns in lockstep, one tuple per run.
run_fields = zip(
    results['model'],
    results['casualassignment_final_path'],
    results['groundtruth_path'],
)
for model_name, assignment_csv, groundtruth_csv in run_fields:
    if model_name in models_without_f1:
        train_f1, val_test_f1 = None, None
    else:
        # The third argument (200) is passed through unchanged; its meaning is
        # defined by get_f1_scores and is not visible in this notebook.
        train_f1, val_test_f1 = get_f1_scores(assignment_csv, groundtruth_csv, 200)

    f1_final_train_list.append(train_f1)
    f1_final_val_test_list.append(val_test_f1)

results['f1_final_train'] = f1_final_train_list
results['f1_final_val_test'] = f1_final_val_test_list

In [None]:
# Write the per-run results table to a CSV tagged with the notebook's timestamp.
summary_csv = 'results-summary-{}.csv'.format(timestamp)
results.to_csv(summary_csv, index=None)

In [None]:
# Metrics reported in the publication tables.
table_metrics = ['metrics.RMSE_avg_test', 'f1_final_val_test', 'f1_final_train']

# Mean and standard error of the mean per (dataset, model) over the rows of `results`.
# The metric columns are selected with a list: tuple-style selection
# (groupby(...)['a', 'b']) raises ValueError on pandas >= 2.0.
tmp = results.groupby(['dataset','model'])[table_metrics].agg(['mean', 'sem']).reset_index()

# Replace each (metric, 'mean') / (metric, 'sem') column pair by a single
# "<mean> (<sem>)" string column named '<metric>_fmt'.
for metric in table_metrics:
    # `metric` is bound as a default argument so the lambda does not rely on
    # the loop variable's value at call time.
    tmp["{}_fmt".format(metric)] = tmp.apply(
        lambda row, metric=metric: f"{row[(metric,'mean')]:.4f} ({row[(metric,'sem')]:.3f})",
        axis=1,
    )
    tmp = tmp.drop(columns=[(metric,'mean'),(metric,'sem')])


# Models included in the tables, mapped to their row position.
order_map = {'ourmethodunknowncausalmodels':0, 'ourmethodunknowncausalmodelsnolatent':1, 'ourmethodunknowncausalmodelsnocausal':2, 'metalearner':3, 'ourmethodunknowncausalmodelsnoglobal':4}
# Copy after filtering so the column added below is written to an independent
# frame (avoids pandas' SettingWithCopyWarning).
tmp = tmp[tmp['model'].isin(list(order_map))].copy()
tmp['method_order'] = tmp['model'].map(order_map)

# One CSV per dataset. Only the formatted columns are written, so rows are
# identifiable solely by their order, which follows order_map.
fmt_columns = ['metrics.RMSE_avg_test_fmt','f1_final_train_fmt','f1_final_val_test_fmt']
for dataset in ['C','M','intervp']:
    dataset_rows = tmp[tmp['dataset']==dataset].sort_values(by=['dataset','method_order'])
    dataset_rows[fmt_columns].to_csv(f'table_for_publication_{dataset}.csv', index=None)