# Preprocess Experiments in Single Dataframes

In [1]:
import glob
import json
import os
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

# Seaborn configuration for the NeurIPS paper figures
sns.set_context("paper")
sns.set_style("whitegrid")
sns.set_palette("colorblind")

# Marker styling; presumably passed as `flierprops` to box plots -- it is not
# used in the cells visible here, confirm against the plotting code.
flier_props = {
    "markerfacecolor": "0.75",
    "markersize": 2,
    "linestyle": "none",
}

## Alaa et al. synthetic data (Setups A and B)

In [2]:
# Load a batch of result CSVs into per-file DataFrames
def process_csv_files(file_list, setup_label, p_values=True):
    """Read every CSV in ``file_list`` into its own tagged DataFrame.

    Each frame gets two extra columns: ``setup`` (the given label) and
    ``sim_nb`` (the third ``_``-separated field of the file name, as a string).

    Parameters
    ----------
    file_list : list of str
        Paths of the CSV files to read.
    setup_label : str
        Value written to the ``setup`` column of every frame.
    p_values : bool, default True
        When True, the ``p_values_y0``, ``p_values_y1`` and ``p_values_ite``
        columns are treated as strings holding (possibly nested) lists and
        are replaced by their first element, converted to float.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input file, in input order; empty for empty input.

    Raises
    ------
    IndexError
        If a file name has fewer than three ``_``-separated fields.
    """
    # Up to two opening brackets, then capture everything before the next
    # comma or bracket, i.e. the first element of the stringified list.
    first_element_pattern = r"\[{0,2}([^,\]\[]+)"
    p_value_columns = ("p_values_y0", "p_values_y1", "p_values_ite")

    list_df = []
    for filepath in tqdm(file_list):
        df = pd.read_csv(filepath)
        df["setup"] = setup_label
        # basename() honours the platform's path separator; splitting on "/"
        # alone leaves the parent directory attached on Windows, which shifts
        # the "_"-separated fields and yields a wrong simulation number.
        df["sim_nb"] = os.path.basename(filepath).split("_")[2]

        if p_values:
            for column in p_value_columns:
                df[column] = (
                    df[column]
                    .str.extract(first_element_pattern, expand=False)
                    .astype(float)
                )

        # The frame is freshly read and not shared, so no defensive copy is needed.
        list_df.append(df)
    return list_df

In [3]:
# Locate the evaluation CSVs of both setups (sorted by path)
csv_files_setupA = glob.glob("../../results/outputs/alaa/setupA/eval_dist/*.csv")
csv_files_setupA.sort()
csv_files_setupB = glob.glob("../../results/outputs/alaa/setupB/eval_dist/*.csv")
csv_files_setupB.sort()

# Load one DataFrame per file, tagged with its setup label
list_df_A = process_csv_files(csv_files_setupA, setup_label="A")
list_df_B = process_csv_files(csv_files_setupB, setup_label="B")

  0%|          | 0/99 [00:19<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Stack both setups, drop the "Unnamed: 0" column and rename the
# method / setup columns to their capitalised names.
df_alaa = (
    pd.concat(list_df_A + list_df_B)
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)

In [None]:
# Write the combined Alaa et al. frame to the summary folder
alaa_summary_path = "../../results/outputs/summary/alaa_dist_summary.csv"
df_alaa.to_csv(alaa_summary_path)

## EDU

In [5]:
# Locate the EDU evaluation CSVs (sorted by path)
csv_files_edu = glob.glob("../../results/outputs/edu/eval_dist/*.csv")
csv_files_edu.sort()

# Load one DataFrame per file; p-value columns are parsed (p_values defaults to True)
list_df_edu = process_csv_files(csv_files_edu, setup_label="EDU")

100%|██████████| 100/100 [03:06<00:00,  1.86s/it]


In [6]:
# Combine the EDU frames, tidy the columns and write the summary file
df_edu = (
    pd.concat(list_df_edu)
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)
df_edu.to_csv("../../results/outputs/summary/edu_dist_summary.csv")

## IHDP

In [19]:
# Locate the IHDP evaluation CSVs (sorted by path)
csv_files_ihdp = glob.glob("../../results/outputs/ihdp/eval_dist/*.csv")
csv_files_ihdp.sort()

# Load one DataFrame per file; p-value parsing is switched off for this dataset
list_df_ihdp = process_csv_files(csv_files_ihdp, setup_label="IHDP", p_values=False)

100%|██████████| 100/100 [00:00<00:00, 229.85it/s]


In [20]:
# Combine the IHDP frames, tidy the columns and write the summary file
df_ihdp = (
    pd.concat(list_df_ihdp)
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)
df_ihdp.to_csv("../../results/outputs/summary/ihdp_dist_summary.csv")

## Nie and Wager (non-heteroscedastic)

In [27]:
# One sorted file list per Nie & Wager setup (A-D); the glob is not recursive,
# so only CSVs sitting directly in each setup folder are matched.
csv_files_nw_A, csv_files_nw_B, csv_files_nw_C, csv_files_nw_D = (
    sorted(glob.glob(pattern))
    for pattern in (
        "../../results/outputs/nie_wager/setupA/*.csv",
        "../../results/outputs/nie_wager/setupB/*.csv",
        "../../results/outputs/nie_wager/setupC/*.csv",
        "../../results/outputs/nie_wager/setupD/*.csv",
    )
)

In [28]:
# Build one tidy frame per setup: load (without p-value parsing), stack,
# drop the "Unnamed: 0" column and rename the method / setup columns.
df_nw_A = (
    pd.concat(process_csv_files(csv_files_nw_A, "A", p_values=False))
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)
df_nw_B = (
    pd.concat(process_csv_files(csv_files_nw_B, "B", p_values=False))
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)
df_nw_C = (
    pd.concat(process_csv_files(csv_files_nw_C, "C", p_values=False))
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)
df_nw_D = (
    pd.concat(process_csv_files(csv_files_nw_D, "D", p_values=False))
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)

100%|██████████| 20/20 [00:00<00:00, 169.57it/s]
100%|██████████| 20/20 [00:00<00:00, 254.44it/s]
100%|██████████| 20/20 [00:00<00:00, 196.77it/s]
100%|██████████| 20/20 [00:00<00:00, 240.60it/s]


In [29]:
# Write each Nie & Wager setup frame to its own summary file
for frame, out_path in (
    (df_nw_A, "../../results/outputs/summary/nw_A_dist_summary.csv"),
    (df_nw_B, "../../results/outputs/summary/nw_B_dist_summary.csv"),
    (df_nw_C, "../../results/outputs/summary/nw_C_dist_summary.csv"),
    (df_nw_D, "../../results/outputs/summary/nw_D_dist_summary.csv"),
):
    frame.to_csv(out_path)

## Nie and Wager (heteroscedastic)

In [31]:
# One sorted file list per setup (A-D), taken from each setup's
# "heteroscedastic" sub-folder.
csv_files_nw_A_het, csv_files_nw_B_het, csv_files_nw_C_het, csv_files_nw_D_het = (
    sorted(glob.glob(pattern))
    for pattern in (
        "../../results/outputs/nie_wager/setupA/heteroscedastic/*.csv",
        "../../results/outputs/nie_wager/setupB/heteroscedastic/*.csv",
        "../../results/outputs/nie_wager/setupC/heteroscedastic/*.csv",
        "../../results/outputs/nie_wager/setupD/heteroscedastic/*.csv",
    )
)

In [32]:
# Build one tidy frame per heteroscedastic setup: load (without p-value
# parsing), stack, drop the "Unnamed: 0" column and rename the method / setup
# columns.
# The rename is chained onto the frame that was just loaded, so every *_het
# variable holds heteroscedastic data; the previous version renamed the
# non-heteroscedastic df_nw_A..D instead and discarded the loaded frames.
df_nw_A_het = (
    pd.concat(process_csv_files(csv_files_nw_A_het, "A", p_values=False))
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)
df_nw_B_het = (
    pd.concat(process_csv_files(csv_files_nw_B_het, "B", p_values=False))
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)
df_nw_C_het = (
    pd.concat(process_csv_files(csv_files_nw_C_het, "C", p_values=False))
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)
df_nw_D_het = (
    pd.concat(process_csv_files(csv_files_nw_D_het, "D", p_values=False))
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"approach": "Method", "setup": "Setup"})
)

100%|██████████| 20/20 [00:00<00:00, 157.24it/s]
100%|██████████| 20/20 [00:00<00:00, 258.88it/s]
100%|██████████| 20/20 [00:00<00:00, 157.09it/s]
100%|██████████| 20/20 [00:00<00:00, 211.15it/s]


In [33]:
# Write each heteroscedastic setup frame to its own summary file
for frame, out_path in (
    (df_nw_A_het, "../../results/outputs/summary/nw_A_het_dist_summary.csv"),
    (df_nw_B_het, "../../results/outputs/summary/nw_B_het_dist_summary.csv"),
    (df_nw_C_het, "../../results/outputs/summary/nw_C_het_dist_summary.csv"),
    (df_nw_D_het, "../../results/outputs/summary/nw_D_het_dist_summary.csv"),
):
    frame.to_csv(out_path)

## ACIC 2016

In [23]:
# Collect the ACIC 2016 results: one tidy frame per setup folder that
# actually contains result files, then stack them with a fresh index.
nb_setups = 77
list_df_acic2016 = []
for i in range(1, nb_setups + 1):
    csv_files_setup = glob.glob(f"../../results/outputs/acic2016/setup{i}/eval_dist/*.csv")
    csv_files_setup.sort()
    list_df_setup = process_csv_files(csv_files_setup, "ACIC2016", p_values=False)
    if list_df_setup:
        df_setup = (
            pd.concat(list_df_setup)
            .drop(columns=["Unnamed: 0"])
            .rename(columns={"approach": "Method", "setup": "Setup"})
        )
        # NOTE(review): this replaces the file-name-derived sim_nb with the
        # setup index, while "Setup" stays the constant "ACIC2016" -- confirm
        # that downstream code expects the setup number in sim_nb.
        df_setup["sim_nb"] = i
        list_df_acic2016.append(df_setup)

df_acic2016 = pd.concat(list_df_acic2016, ignore_index=True)

100%|██████████| 19/19 [00:00<00:00, 139.71it/s]
100%|██████████| 19/19 [00:00<00:00, 193.36it/s]
100%|██████████| 20/20 [00:00<00:00, 143.33it/s]
100%|██████████| 20/20 [00:00<00:00, 182.30it/s]
100%|██████████| 20/20 [00:00<00:00, 319.12it/s]
100%|██████████| 20/20 [00:00<00:00, 181.93it/s]
100%|██████████| 18/18 [00:00<00:00, 146.32it/s]
100%|██████████| 10/10 [00:00<00:00, 191.78it/s]
100%|██████████| 10/10 [00:00<00:00, 175.17it/s]
100%|██████████| 9/9 [00:00<00:00, 252.64it/s]
100%|██████████| 10/10 [00:00<00:00, 296.97it/s]
100%|██████████| 8/8 [00:00<00:00, 266.08it/s]
100%|██████████| 9/9 [00:00<00:00, 130.88it/s]
100%|██████████| 9/9 [00:00<00:00, 132.28it/s]
100%|██████████| 7/7 [00:00<00:00, 267.83it/s]
100%|██████████| 10/10 [00:00<00:00, 208.74it/s]
100%|██████████| 10/10 [00:00<00:00, 272.60it/s]
100%|██████████| 10/10 [00:00<00:00, 151.15it/s]
100%|██████████| 9/9 [00:00<00:00, 270.61it/s]
100%|██████████| 9/9 [00:00<00:00, 221.36it/s]
100%|██████████| 10/10 [00:00<00:0

In [25]:
# Write the combined ACIC 2016 frame to the summary folder
acic2016_summary_path = "../../results/outputs/summary/ACIC2016_dist_summary.csv"
df_acic2016.to_csv(acic2016_summary_path)