In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Generate Data

In [2]:
from data import SyntheticDataGeneratorPlus

### Generate one HDF5 file per experimental setup under synthetic_data/.
### Every setup shares the same scenarios, sample size and seed; only the
### keyword arguments handed to SyntheticDataGeneratorPlus differ.
import os

OUTPUT_DIR = "synthetic_data"

# Scenarios 3, 4, 6, 7 and 10 are skipped in every setup.
SKIPPED_SCENARIOS = (3, 4, 6, 7, 10)
SCENARIOS = [i for i in range(1, 11) if i not in SKIPPED_SCENARIOS]

N_SAMPLES = 50000
RANDOM_STATE = 2025

### informative censoring
info_censor_baseline=0.1
info_censor_alpha=0.05

# Keyword arguments shared by the three informative-censoring setups.
INFO_CENSOR_KWARGS = dict(informative_censoring=True,
                          info_censor_baseline=info_censor_baseline,
                          info_censor_alpha=info_censor_alpha)

# Maps setup name -> generator keyword arguments. The setup name is used both
# as the HDF5 file stem and as the prefix of each dataset_name.
SETUPS = {
    ### RCT with treatment rate 0.5
    "RCT_0_5": dict(RCT=True, treatment_proportion=0.5,
                    unobserved=False, overlap=True),
    ### RCT with treatment rate 0.05
    "RCT_0_05": dict(RCT=True, treatment_proportion=0.05,
                     unobserved=False, overlap=True),
    ### non-RCT with ignorability and overlap held - propensity score e(X)
    "e_X": dict(RCT=False, unobserved=False, overlap=True),
    ### non-RCT with ignorability violated and overlap held - propensity score e(X, U)
    "e_X_U": dict(RCT=False, unobserved=True, overlap=True),
    ### non-RCT with ignorability held but overlap violated - propensity score e(X)_no_overlap
    "e_X_no_overlap": dict(RCT=False, unobserved=False, overlap=False),
    ### informative_censoring and non-RCT with ignorability and overlap held - propensity score e(X)
    "e_X_info_censor": dict(RCT=False, unobserved=False, overlap=True,
                            **INFO_CENSOR_KWARGS),
    ### informative_censoring and non-RCT with ignorability violated and overlap held - propensity score e(X, U)
    "e_X_U_info_censor": dict(RCT=False, unobserved=True, overlap=True,
                              **INFO_CENSOR_KWARGS),
    ### informative_censoring and non-RCT with ignorability held but overlap violated - propensity score e(X)
    "e_X_no_overlap_info_censor": dict(RCT=False, unobserved=False, overlap=False,
                                       **INFO_CENSOR_KWARGS),
}


def _generate_setup(setup_name, **generator_kwargs):
    """Generate every non-skipped scenario of one setup into its HDF5 file.

    Parameters
    ----------
    setup_name : str
        Stem of the output file (``synthetic_data/<setup_name>.h5``) and
        prefix of each ``dataset_name`` (``<setup_name>_scenario_<i>``).
    **generator_kwargs
        Forwarded unchanged to ``SyntheticDataGeneratorPlus`` on top of the
        shared ``scenario``, ``dataset_name``, ``n_samples`` and
        ``random_state`` arguments.

    For each scenario the frame returned under ``'data'`` is stored at key
    ``scenario_<i>/data`` and the ``'metadata'`` entry is attached to that
    node's ``attrs.metadata``.
    """
    # Make sure the target directory exists so a fresh checkout does not fail
    # when the store file is opened.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with pd.HDFStore(f"{OUTPUT_DIR}/{setup_name}.h5") as store:
        for i in SCENARIOS:
            dataset_name = f'{setup_name}_scenario_{i}'
            gen = SyntheticDataGeneratorPlus(scenario=i, dataset_name=dataset_name,
                                             n_samples=N_SAMPLES, random_state=RANDOM_STATE,
                                             **generator_kwargs)
            dsets = gen.generate_datasets()
            key = f"scenario_{i}/data"
            store[key] = dsets['data']
            store.get_storer(key).attrs.metadata = dsets['metadata']


for setup_name, generator_kwargs in SETUPS.items():
    _generate_setup(setup_name, **generator_kwargs)

# Data Characteristics

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

def load_scenario_data(h5_file_path, scenario_num):
    """Read one scenario's dataset and metadata from an HDF5 store.

    Parameters
    ----------
    h5_file_path : str
        Path of the HDF5 file, opened read-only.
    scenario_num : int
        Scenario number; the node looked up is ``scenario_<scenario_num>/data``.

    Returns
    -------
    dict or None
        ``{"dataset": <stored object>, "metadata": <node attrs.metadata>}``,
        or ``None`` when the store has no node for that scenario.
    """
    node_key = f"scenario_{scenario_num}/data"
    with pd.HDFStore(h5_file_path, mode='r') as h5:
        if node_key in h5:
            frame = h5[node_key]
            # The metadata lives on the storer's attributes, next to the data.
            meta = h5.get_storer(node_key).attrs.metadata
            return {"dataset": frame, "metadata": meta}
    # Scenario not found
    return None

In [4]:
# HDF5 files to load, one per experimental setup.
store_files = [
    "synthetic_data/RCT_0_5.h5",
    "synthetic_data/RCT_0_05.h5",
    "synthetic_data/e_X.h5",
    "synthetic_data/e_X_U.h5",
    "synthetic_data/e_X_no_overlap.h5",
    "synthetic_data/e_X_info_censor.h5",
    "synthetic_data/e_X_U_info_censor.h5",
    "synthetic_data/e_X_no_overlap_info_censor.h5"
]

# Maps setup name (file stem) -> {"scenario_<n>": loaded scenario dict}.
experiment_setups = {}

for h5_path in store_files:
    file_name = os.path.basename(h5_path)
    setup_stem, _extension = os.path.splitext(file_name)  # e.g. RCT_0_5

    loaded_scenarios = {}
    for scenario_id in range(1, 11):
        try:
            payload = load_scenario_data(h5_path, scenario_id)
        except Exception as err:
            # Best-effort load: report the failure and move on to the next scenario.
            print(f"Error loading scenario {scenario_id} from {h5_path}: {err}")
            continue
        if payload is None:
            # Scenario is absent from this store; nothing to record.
            continue
        loaded_scenarios[f"scenario_{scenario_id}"] = payload

    experiment_setups[setup_stem] = loaded_scenarios

In [5]:
def _summarize_dataset(df):
    """Return the summary statistics of one scenario dataset as a dict.

    Reads the columns 'event', 'W', 'T', 'C', 'T1' and 'T0' of ``df`` and
    reports the censoring rate (1 - mean of 'event'), the treatment rate
    (mean of 'W'), min/median/max/mean/std of 'T' and of 'C', and the
    mean/min/median/max of the per-row difference ``T1 - T0``.
    """
    event_time = df['T']
    censoring_time = df['C']
    # Per-row difference, computed once and reused for the ate / cate_* entries.
    effect = df['T1'] - df['T0']
    return {
        'censoring_rate': 1 - df['event'].mean(),
        'treatment_rate': df['W'].mean(),
        'event_time_min': event_time.min(),
        'event_time_median': event_time.median(),
        'event_time_max': event_time.max(),
        'event_time_mean': event_time.mean(),
        'event_time_std': event_time.std(),
        # NOTE: when 'C' contains inf (the recorded output shows this for
        # scenario_5) the mean is inf and the std comes out as NaN, with a
        # RuntimeWarning emitted during the computation.
        'censoring_time_min': censoring_time.min(),
        'censoring_time_median': censoring_time.median(),
        'censoring_time_max': censoring_time.max(),
        'censoring_time_mean': censoring_time.mean(),
        'censoring_time_std': censoring_time.std(),
        'ate': effect.mean(),
        'cate_min': effect.min(),
        'cate_median': effect.median(),
        'cate_max': effect.max(),
    }


# Nested mapping: setup name -> scenario key -> statistics dict.
summary_characteristics = {}

for setup_name, setup_dict in tqdm(experiment_setups.items(), desc="Experiment Setups"):
    summary_characteristics[setup_name] = {}
    for scenario_key in tqdm(setup_dict, desc=f"{setup_name} Scenarios", leave=False):
        summary_characteristics[setup_name][scenario_key] = _summarize_dataset(
            setup_dict[scenario_key]["dataset"])


# Convert the summary_characteristics dictionary to a DataFrame indexed by
# (setup name, scenario key), rounded to two decimals for display.
summary_df = pd.DataFrame.from_dict(
    {(setup, scenario): stats
     for setup, scenarios in summary_characteristics.items()
     for scenario, stats in scenarios.items()},
    orient='index').round(2)

summary_df

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
Experiment Setups: 100%|██████████| 8/8 [00:00<00:00, 40.01it/s]


Unnamed: 0,Unnamed: 1,censoring_rate,treatment_rate,event_time_min,event_time_median,event_time_max,event_time_mean,event_time_std,censoring_time_min,censoring_time_median,censoring_time_max,censoring_time_mean,censoring_time_std,ate,cate_min,cate_median,cate_max
RCT_0_5,scenario_1,0.07,0.5,0.0,0.21,21.86,0.4,0.64,0.01,1.61,10.22,1.86,1.17,0.12,-16.62,0.04,21.79
RCT_0_5,scenario_2,0.2,0.5,0.0,0.17,116.49,0.95,2.69,0.0,1.5,3.0,1.5,0.86,0.16,-80.55,-0.0,116.23
RCT_0_5,scenario_5,0.39,0.5,0.0,7.0,21.0,7.2,2.77,1.0,inf,inf,inf,,0.75,-16.0,1.0,20.0
RCT_0_5,scenario_8,0.79,0.5,0.0,8.0,23.0,8.2,2.94,0.0,5.0,17.0,4.72,2.23,0.75,-18.0,1.0,20.0
RCT_0_5,scenario_9,0.91,0.5,0.01,1.67,151.25,3.16,4.95,0.0,0.29,3.3,0.36,0.28,0.72,-122.23,0.1,150.79
RCT_0_05,scenario_1,0.04,0.05,0.0,0.19,17.01,0.34,0.51,0.02,2.17,10.22,2.34,1.27,0.12,-16.62,0.04,21.79
RCT_0_05,scenario_2,0.2,0.05,0.0,0.17,81.23,0.88,2.27,0.0,1.5,3.0,1.5,0.86,0.16,-80.55,-0.0,116.23
RCT_0_05,scenario_5,0.39,0.05,0.0,7.0,20.0,6.87,2.68,1.0,inf,inf,inf,,0.75,-16.0,1.0,20.0
RCT_0_05,scenario_8,0.77,0.05,0.0,8.0,23.0,7.85,2.83,0.0,5.0,17.0,4.72,2.23,0.75,-18.0,1.0,20.0
RCT_0_05,scenario_9,0.88,0.05,0.03,1.63,125.04,2.82,3.95,0.0,0.38,3.3,0.45,0.31,0.72,-122.23,0.1,150.79
