In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import numpy as np

In [None]:
# Run identifier in YYYYMMDD-HHMMSS form, taken from the local clock when this cell runs.
# It is embedded in the names of the figure and CSV files saved further down.
timestamp = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
print(timestamp)

In [None]:
def load_causal_distances(trials, group_counts, output_dir='../../data/synthetic/output'):
    """Load the causal-distance tables for every (trial, C) combination.

    For each combination two CSV files are read from ``output_dir``:

    * ``datavalidation_C_{C}_trial_{trial}_causal_sim.csv`` -- the columns used here are
      ``task1``, ``task2``, ``SHD``, ``SID``, ``OD`` and ``ID``;
    * ``datavalidation_C_{C}_trial_{trial}_task_metadata.csv`` -- the columns used here are
      ``task`` and ``ground_truth``.

    Loading is best-effort: if anything goes wrong for one combination the error is
    printed and the loop moves on. Because the distance rows are collected before the
    metadata file is read, a combination whose metadata file is unusable still
    contributes to the first returned frame but not to the second.

    Parameters
    ----------
    trials : iterable
        Trial identifiers to load.
    group_counts : iterable
        Values of C (number of causal groups) to load.
    output_dir : str
        Directory that holds the CSV files.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data, data_groups)``. ``data`` has columns ``SHD, SID, OD, ID, C``.
        ``data_groups`` has columns ``same_group, SHD, SID, OD, ID, C, trial`` where
        ``same_group`` is True when both tasks of a row map to the same ground truth.
        Both frames are empty (but keep these columns) when nothing could be loaded.
    """
    metric_cols = ['SHD', 'SID', 'OD', 'ID']
    data_cols = metric_cols + ['C']
    group_cols = ['same_group'] + metric_cols + ['C', 'trial']

    # collect the per-file frames and concatenate once at the end instead of
    # re-copying an ever-growing DataFrame on every iteration
    data_frames = []
    group_frames = []

    for trial in trials:
        for C in group_counts:
            filepath = f'{output_dir}/datavalidation_C_{C}_trial_{trial}_causal_sim.csv'
            filepath_gt = f'{output_dir}/datavalidation_C_{C}_trial_{trial}_task_metadata.csv'
            try:
                # get the causal distances for the dataset
                df = pd.read_csv(filepath)
                df['C'] = C
                df['trial'] = trial
                data_frames.append(df[data_cols])

                # flag whether the two tasks of each row belong to the same causal group
                df_gt = pd.read_csv(filepath_gt)
                gt_map = dict(zip(df_gt['task'], df_gt['ground_truth']))
                df['gt1'] = df['task1'].map(gt_map)
                df['gt2'] = df['task2'].map(gt_map)
                # a task missing from the metadata maps to NaN, and NaN == NaN is False,
                # so such rows are counted as "different group"
                df['same_group'] = df['gt1'] == df['gt2']
                group_frames.append(df[group_cols])
            except Exception as exc:
                # keep going with the remaining files, but say what actually went wrong
                print("error with file {}: {!r}".format(filepath, exc))

    # ignore_index gives the combined frames a unique row index
    data = pd.concat(data_frames, ignore_index=True) if data_frames else pd.DataFrame(columns=data_cols)
    data_groups = pd.concat(group_frames, ignore_index=True) if group_frames else pd.DataFrame(columns=group_cols)
    return data, data_groups


data, data_groups = load_causal_distances(trials=[1, 2, 3], group_counts=[1, 2, 3, 4, 5, 6, 7, 8])

In [None]:
# One panel per causal distance: number of causal groups (C) on the x-axis, the distance
# on the y-axis. seaborn aggregates the rows sharing a C value into a line, and
# errorbar='sd' draws the standard deviation around it.
sns.set_style("whitegrid")

fig, axes = plt.subplots(1, 4, figsize=(16,4))

for panel, distance in zip(axes, ('SHD', 'SID', 'OD', 'ID')):
    sns.lineplot(data=data, x='C', y=distance, errorbar='sd', ax=panel)

sns.despine(left=True)
plt.tight_layout()

# save before show() so the written file contains the rendered figure
plt.savefig(f'results-{timestamp}.png', dpi=300)
plt.show()

In [None]:
# Mean of every causal distance for each value of C, written to
# results-<timestamp>.csv without the row index and displayed as the cell output.
tmp = data.groupby('C', as_index=False).mean()
tmp.to_csv(f'results-{timestamp}.csv', index=False)
tmp

In [None]:
# compare the causal distances between tasks in the same causal group vs. between tasks in different causal groups
# we expect the causal distances to be lower for tasks in the same causal group

metrics = ['SHD','SID','OD','ID']

# step 1: average within each (same_group, C, trial) so every trial contributes one value per cell
per_trial = data_groups.groupby(['same_group','C','trial'], as_index=False)[metrics].mean()
per_trial = per_trial[per_trial['C'].isin([2, 4, 6, 8])]

# step 2: mean and standard deviation of those per-trial averages across trials
across_trials = per_trial.groupby(['C','same_group'])[metrics].agg(['mean', 'std'])

# step 3: render "mean (std)" strings into a table with plain (single-level) column names,
# so the CSV gets exactly one header row
tmp = pd.DataFrame(
    {
        "{}_fmt".format(metric): [
            f"{mean_val:.3f} ({std_val:.3f})"
            for mean_val, std_val in zip(across_trials[(metric,'mean')], across_trials[(metric,'std')])
        ]
        for metric in metrics
    },
    index=across_trials.index,
).reset_index()
tmp.to_csv(f'results-groups-{timestamp}.csv', index=False)
tmp

In [None]:
# pearson correlation between pairs of causal distances (C == 4 only),
# reported as mean (standard deviation) across trials

metrics = ['SHD','SID','OD','ID']
tmp = data_groups[data_groups['C']==4]

# one correlation matrix per trial that is actually present in the data; a trial whose
# files failed to load is skipped instead of turning every cell of the table into NaN
per_trial_corr = [trial_rows[metrics].corr().values for _, trial_rows in tmp.groupby('trial')]
if per_trial_corr:
    corr_stack = np.array(per_trial_corr)
else:
    # no rows for C == 4 at all: keep the table shape and report NaN everywhere
    corr_stack = np.full((1, len(metrics), len(metrics)), np.nan)

mean_vals = corr_stack.mean(axis=0).round(3)
# NOTE(review): NumPy's std defaults to ddof=0 (population), whereas the same-group table
# uses pandas' 'std' aggregation (sample, ddof=1) -- confirm which one is intended.
std_vals = corr_stack.std(axis=0).round(3)

# fixed three-decimal formatting, the same "mean (std)" layout as the same-group table
results = pd.DataFrame(
    [
        [f"{mean_vals[i][j]:.3f} ({std_vals[i][j]:.3f})" for j in range(len(metrics))]
        for i in range(len(metrics))
    ],
    index=metrics,
    columns=metrics,
)
# keep the row labels in the file so each row of the matrix is identified
results.to_csv(f'results-corr-{timestamp}.csv', index=True)
results