### Label Confounding on X-ray data - Asymptotic Normality

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
from scipy.stats import norm

# ATE estimation
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression, LinearRegression
from causalml.inference.meta import BaseSRegressor
from doubleml import DoubleMLData, DoubleMLPLR
import statsmodels.api as sm

# Custom modules
from feature_extraction.pretrained_models_xrv import load_torchxrayvision_model, extract_features_from_folder
from utils.project import set_root
from utils.io import save_results, load_results

In [None]:
# Set working directory, dataset directory and directory for saving results
# NOTE(review): set_root() is imported from utils.project; presumably it
# switches the working directory to the project root so that the relative
# paths used in this notebook resolve — confirm against its implementation.
set_root()
# Relative locations of the dataset folder and of this experiment's outputs
dataset_dir = "data/xray/processed/all_unique"
results_dir = "results/asym_normality/xray/label"

# Name of the pretrained model; it also names the representation folder
model_name = "densenet121-res224-all"
save_dir_rep = f"data/xray/representations/{model_name}"

# Cached latent features and labels sit side by side in that folder
features_path, labels_path = (
    os.path.join(save_dir_rep, file_name)
    for file_name in ("latent_features.npy", "labels.npy")
)

In [None]:
# Reuse the cached representations when both files are present; otherwise run
# the pretrained model over the dataset folder (the extractor is handed
# save_dir_rep as its save_path).
if all(os.path.exists(path) for path in (features_path, labels_path)):
    print(f"Features already exist in {save_dir_rep}. Skipping extraction.")
else:
    print(f"Extracting features using model '{model_name}'...")

    # Load the pretrained network and extract features on CPU in batches of 32
    model = load_torchxrayvision_model(model_name)
    all_features, labels = extract_features_from_folder(
        dataset_dir, model, device='cpu', batch_size=32, save_path=save_dir_rep
    )

    print(f"Features extracted and saved to: {save_dir_rep}")

# Always read the arrays back from disk, whichever branch ran above
all_features = np.load(features_path)
labels = np.load(labels_path)

### Confounding Simulation and ATE Estimation

In [None]:
# 1. Parameters of the simulated treatment/outcome mechanism
beta_true = 2.0   # true effect of treatment A on outcome Y
gamma_true = -1   # effect of pneumonia on Y
p_treat_given_pneu = 0.7    # P(A = 1 | pneumonia)
p_treat_given_normal = 0.3  # P(A = 1 | normal)

# 2. General simulation settings
pneumonia_label = labels.astype(int)  # integer labels (expected to be 0/1)
n_samples = pneumonia_label.shape[0]
n_runs = 200           # number of simulation runs
ci_alpha_level = 0.05  # confidence intervals have level 1 - ci_alpha_level
z_score = norm.ppf(1 - ci_alpha_level / 2)  # two-sided normal critical value

# 3. Per-method containers: one list of point estimates, plus lists for the
#    standard error and the lower/upper confidence bounds
methods = ['Naive', 'Oracle', 'S-Learner (Linear)', 'DML (Linear)']
estimates_dict = {name: [] for name in methods}
cis_dict = {
    name: {stat: [] for stat in ('se', 'lower', 'upper')}
    for name in methods
}

# Base seed; the simulation loop derives each run's seed from it
seed = 42

def _record_result(method, estimate, se, ci_lower, ci_upper):
    """Append one run's point estimate, standard error and CI bounds for `method`."""
    estimates_dict[method].append(estimate)
    ci_store = cis_dict[method]
    ci_store['se'].append(se)
    ci_store['lower'].append(ci_lower)
    ci_store['upper'].append(ci_upper)


def _record_failure(method):
    """Append NaN placeholders so every per-run list keeps the same length."""
    _record_result(method, np.nan, np.nan, np.nan, np.nan)


# Loop-invariant inputs, built once instead of on every run.
# Treatment probability per sample, determined only by the pneumonia label.
pA = pneumonia_label * p_treat_given_pneu + (1 - pneumonia_label) * p_treat_given_normal
# Feature columns for DoubleML; Y and A are attached per run further below.
features_df = pd.DataFrame(
    all_features,
    columns=[f"feat_{i}" for i in range(all_features.shape[1])]
)

# 4. Simulation Loop
for run in range(n_runs):
    print(f"\n--- Simulation Run {run + 1} ---")
    # Each run gets its own seed, derived deterministically from the base seed
    seed = seed + 2
    np.random.seed(seed)

    # 4.1. Simulate Treatment A (confounded by the pneumonia label through pA)
    A = np.random.binomial(1, pA)

    # 4.2. Simulate Outcome Y with standard normal noise
    noise = np.random.normal(loc=0, scale=1, size=n_samples)
    Y = beta_true * A + gamma_true * pneumonia_label + noise

    # 4.3. Package into DataFrame
    df = pd.DataFrame({
        'Y': Y,
        'A': A,
        'pneumonia': pneumonia_label
    })

    # 4.4 / 4.5. OLS using statsmodels: Naive regresses Y on A only (unadjusted),
    # Oracle additionally adjusts for the true confounder.
    for method, regressors in (('Naive', ['A']), ('Oracle', ['A', 'pneumonia'])):
        ols_fit = sm.OLS(df['Y'], sm.add_constant(df[regressors])).fit()
        beta_ols = ols_fit.params['A']
        se_ols = ols_fit.bse['A']
        _record_result(
            method,
            beta_ols,
            se_ols,
            beta_ols - z_score * se_ols,
            beta_ols + z_score * se_ols,
        )
        print(f"{method} OLS: β = {beta_ols:.3f}, SE = {se_ols:.3f}")

    # 4.6.1 S-Learner (Linear)
    try:
        # Tie the learner's CI level to ci_alpha_level so that backing the SE
        # out of the upper bound with z_score is consistent.
        s_learner_linear = BaseSRegressor(LinearRegression(), ate_alpha=ci_alpha_level)
        s_ate_linear, s_ci_lower_linear, s_ci_upper_linear = s_learner_linear.estimate_ate(
            all_features, A, Y, return_ci=True
        )
        # estimate_ate returns arrays; keep scalars like the other methods do
        beta_slearner = s_ate_linear[0]
        ci_lower_slearner = s_ci_lower_linear[0]
        ci_upper_slearner = s_ci_upper_linear[0]
        se_slearner = (ci_upper_slearner - beta_slearner) / z_score
    except Exception as e:
        # Best-effort: a failed run is logged and stored as NaN
        print(f"Run {run+1}: S-Learner (Linear) failed with error: {e}")
        _record_failure('S-Learner (Linear)')
    else:
        # Record only after everything was computed, so a mid-way failure can
        # never leave the per-run lists misaligned.
        _record_result('S-Learner (Linear)', beta_slearner, se_slearner,
                       ci_lower_slearner, ci_upper_slearner)
        print(f"S-Learner (Linear): β = {beta_slearner:.3f}")

    # 4.7. DoubleML with Linear Nuisance Estimators
    # Attach this run's outcome and treatment to a copy of the feature frame
    X_dml_df = features_df.assign(Y=Y, A=A)

    # Create DoubleMLData (outcome column "Y", treatment column "A")
    data_dml = DoubleMLData(X_dml_df, "Y", "A")

    # 4.7.1. DoubleML with Linear Models Estimators
    try:
        # Define nuisance models with linear models
        ml_g_linear = LinearRegression()    # Outcome model
        ml_m_linear = LogisticRegression()  # Treatment model

        # Instantiate and fit DoubleMLPLR
        dml_plr_linear = DoubleMLPLR(data_dml, ml_g_linear, ml_m_linear, n_folds=2)
        dml_plr_linear.fit()
        beta_dml_linear = dml_plr_linear.coef[0]
        se_dml_linear = dml_plr_linear.se[0]
    except Exception as e:
        print(f"Run {run+1}: DML (Linear) failed with error: {e}")
        _record_failure('DML (Linear)')
    else:
        # (1 - ci_alpha_level) confidence interval
        _record_result(
            'DML (Linear)',
            beta_dml_linear,
            se_dml_linear,
            beta_dml_linear - z_score * se_dml_linear,
            beta_dml_linear + z_score * se_dml_linear,
        )
        print(f"DML (Linear): β = {beta_dml_linear:.3f}, SE = {se_dml_linear:.3f}")

In [None]:
# 5. Save the estimates and confidence intervals under a timestamped path:
#    <results_dir>/<model_name>/<experiment_name>/<timestamp>
experiment_name = "exp_results"
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
experiment_dir = os.path.join(results_dir, model_name, experiment_name, timestamp)
save_results(experiment_dir, estimates_dict, cis_dict)

### Plotting of Results

In [14]:
# Load the results from the previous experiment
# NOTE: experiment_dir is the timestamped path built in the saving cell, so
# that cell must have run in this session; to plot an earlier experiment,
# point experiment_dir at its folder before running this cell.
estimates_dict, cis_dict = load_results(experiment_dir)

In [None]:
def _standardized(method):
    """Return (estimate - beta_true) / se for every run of `method` as an array."""
    centred = np.array(estimates_dict[method]) - beta_true
    return centred / np.array(cis_dict[method]['se'])


# Standardize ATE estimates
dml_est_normalized = _standardized('DML (Linear)')
naive_est_normalized = _standardized('Naive')
oracle_est_normalized = _standardized('Oracle')

In [None]:
# 6. Plotting: density histograms of the standardized estimates per method,
#    overlaid with the standard normal density
sns.set(style="whitegrid")
face_colors = sns.color_palette('pastel')
edge_colors = sns.color_palette('dark')

fig_orth_nosplit, ax = plt.subplots(constrained_layout=True)

n_bins = 20
# Draw explicitly on `ax` rather than relying on the implicit current axes
sns.histplot(naive_est_normalized,
             color=face_colors[1], edgecolor=edge_colors[1],
             stat='density', bins=n_bins, label='Naive', ax=ax)
sns.histplot(oracle_est_normalized,
             color=face_colors[0], edgecolor=edge_colors[0],
             stat='density', bins=n_bins, label='Oracle', ax=ax)
sns.histplot(dml_est_normalized,
             color=face_colors[2], edgecolor=edge_colors[2],
             stat='density', bins=n_bins, label='DML', ax=ax)

# Reference curve: standard normal density
x_val_norm = np.arange(-15, 15, 0.001)
y_val_norm = norm.pdf(x_val_norm)
ax.plot(x_val_norm, y_val_norm, color='k', label=r'$\mathcal{N}(0, 1)$', linewidth=2)
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.0))
ax.set_xlim([-15, 5])
# Raw string: the LaTeX backslashes are not valid Python escape sequences
ax.set_xlabel(r'$(\widehat{ATE} - ATE)/\hat{\sigma}$')

# Store the figure next to the saved results of this experiment
plot_path = os.path.join(experiment_dir, 'plot_asym_norm.pdf')
plt.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.show()