In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Generate Data

In [3]:
# Ensure the parent directory is in the Python path
import sys
import os
sys.path.append(os.path.abspath(".."))

from data import SyntheticDataGeneratorPlus

# Scenario labels generated for every experiment setup.
SCENARIOS = ['A', 'B', 'C', 'D', 'E']

### informative censoring
info_censor_baseline=0.1
info_censor_alpha=0.05


def generate_and_store(h5_stem, scenarios=SCENARIOS, n_samples=50000, random_state=2025,
                       **generator_kwargs):
    """Generate one dataset per scenario and write them all to ``<h5_stem>.h5``.

    For each scenario label a ``SyntheticDataGeneratorPlus`` is built with
    ``dataset_name`` set to ``"<h5_stem>_scenario_<label>"``; the ``'data'`` entry
    returned by ``generate_datasets()`` is stored under the HDF key
    ``"scenario_<label>/data"`` and the ``'metadata'`` entry is attached to that
    node as the ``metadata`` attribute.

    Parameters
    ----------
    h5_stem : str
        File name without the ``.h5`` extension; also the dataset-name prefix.
    scenarios : iterable of str
        Scenario labels forwarded to the generator as ``scenario``.
    n_samples : int
        Forwarded to the generator as ``n_samples``.
    random_state : int
        Forwarded to the generator as ``random_state``.
    **generator_kwargs
        Remaining keyword arguments forwarded unchanged to
        ``SyntheticDataGeneratorPlus`` (e.g. ``RCT``, ``unobserved``, ``overlap``).
    """
    # The store is opened with pandas' default mode, exactly as before.
    # NOTE(review): the default is presumably append mode, so re-running rewrites
    # these keys in the existing file rather than recreating it - confirm.
    with pd.HDFStore(f"{h5_stem}.h5") as store:
        for label in scenarios:
            key = f"scenario_{label}/data"
            gen = SyntheticDataGeneratorPlus(scenario=label,
                                             dataset_name=f'{h5_stem}_scenario_{label}',
                                             n_samples=n_samples, random_state=random_state,
                                             **generator_kwargs)
            dsets = gen.generate_datasets()
            store[key] = dsets['data']
            store.get_storer(key).attrs.metadata = dsets['metadata']


# Flags shared by the three informative-censoring setups.
_INFO_CENSOR_KWARGS = dict(informative_censoring=True,
                           info_censor_baseline=info_censor_baseline,
                           info_censor_alpha=info_censor_alpha)

# One entry per output file: file stem -> generator flags.
GENERATION_CONFIGS = {
    ### RCT with treatment rate 0.5
    "RCT_0_5": dict(RCT=True, treatment_proportion=0.5, unobserved=False, overlap=True),
    ### RCT with treatment rate 0.05
    "RCT_0_05": dict(RCT=True, treatment_proportion=0.05, unobserved=False, overlap=True),
    ### non-RCT with ignorability and overlap held - propensity score e(X)
    "e_X": dict(RCT=False, unobserved=False, overlap=True),
    ### non-RCT with ignorability violated and overlap held - propensity score e(X, U)
    "e_X_U": dict(RCT=False, unobserved=True, overlap=True),
    ### non-RCT with ignorability held but overlap violated - propensity score e(X)_no_overlap
    "e_X_no_overlap": dict(RCT=False, unobserved=False, overlap=False),
    ### informative_censoring and non-RCT with ignorability and overlap held - propensity score e(X)
    "e_X_info_censor": dict(RCT=False, unobserved=False, overlap=True,
                            **_INFO_CENSOR_KWARGS),
    ### informative_censoring and non-RCT with ignorability violated and overlap held - propensity score e(X, U)
    "e_X_U_info_censor": dict(RCT=False, unobserved=True, overlap=True,
                              **_INFO_CENSOR_KWARGS),
    ### informative_censoring and non-RCT with ignorability held but overlap violated - propensity score e(X)
    "e_X_no_overlap_info_censor": dict(RCT=False, unobserved=False, overlap=False,
                                       **_INFO_CENSOR_KWARGS),
}

# Files are written in the same order as the original hand-written stanzas.
for h5_stem, generator_kwargs in GENERATION_CONFIGS.items():
    generate_and_store(h5_stem, **generator_kwargs)

# Data Characteristics

In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

def load_scenario_data(h5_file_path, scenario_num):
    """Read one scenario's dataset and metadata from an HDF5 store.

    Parameters
    ----------
    h5_file_path : str
        Path of the HDF5 file, opened read-only.
    scenario_num : str or int
        Scenario label; the node looked up is ``"scenario_<scenario_num>/data"``.

    Returns
    -------
    dict or None
        ``{"dataset": <stored object>, "metadata": <node's metadata attribute>}``,
        or ``None`` when the store has no node for that scenario.
    """
    node_key = "scenario_{}/data".format(scenario_num)
    with pd.HDFStore(h5_file_path, mode='r') as hdf:
        if node_key in hdf:
            # Read the frame first, then the attribute attached to the same node.
            frame = hdf[node_key]
            node_metadata = hdf.get_storer(node_key).attrs.metadata
            return {"dataset": frame, "metadata": node_metadata}
    # Scenario not found in this file.
    return None

In [10]:
# HDF5 files to load, one per experiment setup.
store_files = [
    "../synthetic_data/RCT_0_5.h5",
    "../synthetic_data/RCT_0_05.h5",
    "../synthetic_data/e_X.h5",
    "../synthetic_data/e_X_U.h5",
    "../synthetic_data/e_X_no_overlap.h5",
    "../synthetic_data/e_X_info_censor.h5",
    "../synthetic_data/e_X_U_info_censor.h5",
    "../synthetic_data/e_X_no_overlap_info_censor.h5"
]

# Setup name -> {"scenario_<label>": {"dataset": ..., "metadata": ...}}.
experiment_setups = {}

for path in store_files:
    # The setup name is the file name without directory or extension, e.g. RCT_0_5.
    file_name = os.path.basename(path)
    base_name = os.path.splitext(file_name)[0]
    scenario_dict = {}
    for scenario in "ABCDE":
        try:
            result = load_scenario_data(path, scenario)
        except Exception as e:
            # Best effort: report the failure and move on to the next scenario.
            print(f"Error loading scenario {scenario} from {path}: {e}")
        else:
            # A None result means the file has no node for this scenario.
            if result is not None:
                scenario_dict[f"scenario_{scenario}"] = result
    experiment_setups[base_name] = scenario_dict

In [11]:
def summarize_dataset(dataset_df):
    """Return the descriptive statistics reported for one generated dataset.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Must contain the columns ``event``, ``W``, ``T``, ``C``, ``T1`` and ``T0``.
        # NOTE(review): `event` is presumably a 0/1 observed-event indicator and
        # `W` a 0/1 treatment indicator - confirm against the generator.

    Returns
    -------
    dict
        Keys, in order: ``censoring_rate`` (1 - mean of ``event``),
        ``treatment_rate`` (mean of ``W``), min/median/max/mean/std of ``T``
        (``event_time_*``) and of ``C`` (``censoring_time_*``), then ``ate`` and
        ``cate_min``/``cate_median``/``cate_max`` computed from ``T1 - T0``.
    """
    # Per-row difference between the two potential outcomes; computed once and
    # reused for all four effect statistics.
    effect = dataset_df['T1'] - dataset_df['T0']

    stats = {'censoring_rate': 1 - dataset_df['event'].mean(),
             'treatment_rate': dataset_df['W'].mean()}
    for prefix, column in (('event_time', 'T'), ('censoring_time', 'C')):
        times = dataset_df[column]
        stats[f'{prefix}_min'] = times.min()
        stats[f'{prefix}_median'] = times.median()
        stats[f'{prefix}_max'] = times.max()
        stats[f'{prefix}_mean'] = times.mean()
        stats[f'{prefix}_std'] = times.std()
    stats['ate'] = effect.mean()
    stats['cate_min'] = effect.min()
    stats['cate_median'] = effect.median()
    stats['cate_max'] = effect.max()
    return stats


summary_characteristics = {}

# The loop variable names `setup_name` and `scenario_key` are kept as-is because
# another cell in this notebook reads them after the loop has finished.
for setup_name, setup_dict in tqdm(experiment_setups.items(), desc="Experiment Setups"):
    summary_characteristics[setup_name] = {}
    for scenario_key in tqdm(setup_dict, desc=f"{setup_name} Scenarios", leave=False):
        cur_dataset_df = setup_dict[scenario_key]["dataset"]
        summary_characteristics[setup_name][scenario_key] = summarize_dataset(cur_dataset_df)

# Flatten to one row per (setup, scenario) pair; statistics become the columns.
summary_df = pd.DataFrame.from_dict({(setup, scenario): stats
                                       for setup, per_scenario in summary_characteristics.items()
                                       for scenario, stats in per_scenario.items()},
                                      orient='index').round(6)

summary_df

Experiment Setups: 100%|██████████| 8/8 [00:00<00:00, 50.30it/s]


Unnamed: 0,Unnamed: 1,censoring_rate,treatment_rate,event_time_min,event_time_median,event_time_max,event_time_mean,event_time_std,censoring_time_min,censoring_time_median,censoring_time_max,censoring_time_mean,censoring_time_std,ate,cate_min,cate_median,cate_max
RCT_0_5,scenario_A,0.20318,0.5022,0.0,0.17196,116.487493,0.951277,2.692759,7.5e-05,1.497342,2.999932,1.498318,0.863964,0.163441,-80.551757,-8.8e-05,116.227561
RCT_0_5,scenario_B,0.07288,0.5022,0.001912,0.207141,21.857791,0.400223,0.643571,0.008333,1.612869,10.224601,1.856696,1.168527,0.124969,-16.622922,0.037219,21.793959
RCT_0_5,scenario_C,0.39204,0.5022,0.0,7.0,21.0,7.20312,2.768201,1.0,inf,inf,inf,,0.74996,-16.0,1.0,20.0
RCT_0_5,scenario_D,0.9126,0.5022,0.013666,1.672461,151.249695,3.155972,4.954743,0.001382,0.28535,3.303999,0.357987,0.275921,0.723925,-122.234845,0.096283,150.788241
RCT_0_5,scenario_E,0.79418,0.5022,0.0,8.0,23.0,8.1983,2.937569,0.0,5.0,17.0,4.72428,2.227554,0.7537,-18.0,1.0,20.0
RCT_0_05,scenario_A,0.19976,0.04924,0.0,0.173173,81.226428,0.878312,2.270277,7.5e-05,1.497342,2.999932,1.498318,0.863964,0.163441,-80.551757,-8.8e-05,116.227561
RCT_0_05,scenario_B,0.03638,0.04924,0.002733,0.187343,17.00862,0.342527,0.509784,0.016299,2.171893,10.224601,2.338421,1.269447,0.124969,-16.622922,0.037219,21.793959
RCT_0_05,scenario_C,0.39036,0.04924,0.0,7.0,20.0,6.86734,2.676768,1.0,inf,inf,inf,,0.74996,-16.0,1.0,20.0
RCT_0_05,scenario_D,0.88072,0.04924,0.027282,1.628126,125.041743,2.819971,3.954887,0.001644,0.380938,3.303999,0.451989,0.312118,0.723925,-122.234845,0.096283,150.788241
RCT_0_05,scenario_E,0.76988,0.04924,0.0,8.0,23.0,7.84842,2.832434,0.0,5.0,17.0,4.72428,2.227554,0.7537,-18.0,1.0,20.0


In [12]:
# Map each (setup, scenario) row label of the summary table to its rounded `ate` value.
ate_column = summary_df['ate']
TRUE_ATE_DICT = ate_column.to_dict()
TRUE_ATE_DICT

{('RCT_0_5', 'scenario_A'): 0.163441,
 ('RCT_0_5', 'scenario_B'): 0.124969,
 ('RCT_0_5', 'scenario_C'): 0.74996,
 ('RCT_0_5', 'scenario_D'): 0.723925,
 ('RCT_0_5', 'scenario_E'): 0.7537,
 ('RCT_0_05', 'scenario_A'): 0.163441,
 ('RCT_0_05', 'scenario_B'): 0.124969,
 ('RCT_0_05', 'scenario_C'): 0.74996,
 ('RCT_0_05', 'scenario_D'): 0.723925,
 ('RCT_0_05', 'scenario_E'): 0.7537,
 ('e_X', 'scenario_A'): 0.163441,
 ('e_X', 'scenario_B'): 0.124969,
 ('e_X', 'scenario_C'): 0.74996,
 ('e_X', 'scenario_D'): 0.723925,
 ('e_X', 'scenario_E'): 0.7537,
 ('e_X_U', 'scenario_A'): 0.003744,
 ('e_X_U', 'scenario_B'): 0.131728,
 ('e_X_U', 'scenario_C'): 0.74036,
 ('e_X_U', 'scenario_D'): 0.830668,
 ('e_X_U', 'scenario_E'): 0.74032,
 ('e_X_no_overlap', 'scenario_A'): 0.163441,
 ('e_X_no_overlap', 'scenario_B'): 0.124969,
 ('e_X_no_overlap', 'scenario_C'): 0.74996,
 ('e_X_no_overlap', 'scenario_D'): 0.723925,
 ('e_X_no_overlap', 'scenario_E'): 0.7537,
 ('e_X_info_censor', 'scenario_A'): 0.163441,
 ('e_X_i

In [None]:
# Hard-coded ATE per (setup, scenario) pair.
# The values match the rounded `ate` column printed from the summary table in this notebook.
TRUE_ATE = {
    # RCT, treatment rate 0.5
    ('RCT_0_5', 'scenario_B'): 0.124969,
    ('RCT_0_5', 'scenario_A'): 0.163441,
    ('RCT_0_5', 'scenario_C'): 0.74996,
    ('RCT_0_5', 'scenario_E'): 0.7537,
    ('RCT_0_5', 'scenario_D'): 0.723925,
    # RCT, treatment rate 0.05
    ('RCT_0_05', 'scenario_B'): 0.124969,
    ('RCT_0_05', 'scenario_A'): 0.163441,
    ('RCT_0_05', 'scenario_C'): 0.74996,
    ('RCT_0_05', 'scenario_E'): 0.7537,
    ('RCT_0_05', 'scenario_D'): 0.723925,
    # e(X)
    ('e_X', 'scenario_B'): 0.124969,
    ('e_X', 'scenario_A'): 0.163441,
    ('e_X', 'scenario_C'): 0.74996,
    ('e_X', 'scenario_E'): 0.7537,
    ('e_X', 'scenario_D'): 0.723925,
    # e(X, U)
    ('e_X_U', 'scenario_B'): 0.131728,
    ('e_X_U', 'scenario_A'): 0.003744,
    ('e_X_U', 'scenario_C'): 0.74036,
    ('e_X_U', 'scenario_E'): 0.74032,
    ('e_X_U', 'scenario_D'): 0.830668,
    # e(X), no overlap
    ('e_X_no_overlap', 'scenario_B'): 0.124969,
    ('e_X_no_overlap', 'scenario_A'): 0.163441,
    ('e_X_no_overlap', 'scenario_C'): 0.74996,
    ('e_X_no_overlap', 'scenario_E'): 0.7537,
    ('e_X_no_overlap', 'scenario_D'): 0.723925,
    # e(X), informative censoring
    ('e_X_info_censor', 'scenario_B'): 0.124969,
    ('e_X_info_censor', 'scenario_A'): 0.163441,
    ('e_X_info_censor', 'scenario_C'): 0.74996,
    ('e_X_info_censor', 'scenario_E'): 0.7537,
    ('e_X_info_censor', 'scenario_D'): 0.723925,
    # e(X, U), informative censoring
    ('e_X_U_info_censor', 'scenario_B'): 0.131728,
    ('e_X_U_info_censor', 'scenario_A'): 0.003744,
    ('e_X_U_info_censor', 'scenario_C'): 0.74036,
    ('e_X_U_info_censor', 'scenario_E'): 0.74032,
    ('e_X_U_info_censor', 'scenario_D'): 0.830668,
    # e(X), no overlap, informative censoring
    ('e_X_no_overlap_info_censor', 'scenario_B'): 0.124969,
    ('e_X_no_overlap_info_censor', 'scenario_A'): 0.163441,
    ('e_X_no_overlap_info_censor', 'scenario_C'): 0.74996,
    ('e_X_no_overlap_info_censor', 'scenario_E'): 0.7537,
    ('e_X_no_overlap_info_censor', 'scenario_D'): 0.723925,
}

In [9]:
# NOTE: `setup_name` and `scenario_key` are not defined in this cell; they are the
# loop variables left behind by the summary-statistics loop, so the result depends
# on which cells ran last. An unknown pair falls back to 0.
ate_lookup_key = (setup_name, scenario_key)
TRUE_ATE.get(ate_lookup_key, 0)

0.723925