### Label Confounding on X-ray data

In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from scipy.stats import norm

# ATE estimation
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from causalml.inference.meta import BaseSRegressor
from doubleml import DoubleMLData, DoubleMLPLR

# Custom modules
from utils.project import set_root
from utils.io import save_results, load_results
from visualization.plotting import plot_ate_estimates
from feature_extraction.pretrained_models_xrv import load_torchxrayvision_model, extract_features_from_folder


In [None]:
# Anchor the session at the project root before any relative path is used.
# NOTE(review): set_root is a project helper; presumably it changes the
# working directory — confirm.
set_root()

# Raw X-ray images (input) and the folder that receives experiment results.
dataset_dir, results_dir = (
    "data/xray/raw/all_unique",
    "results/comparison_learners/xray/label",
)

# Name of the pretrained network; it also names the representation folder.
model_name = "densenet121-res224-all"
save_dir_rep = f"data/xray/representations/{model_name}"

# Locations of the cached feature matrix and label vector.
features_path, labels_path = (
    os.path.join(save_dir_rep, file_name)
    for file_name in ("latent_features.npy", "labels.npy")
)

In [None]:
# Run the pretrained network over the image folder only when the cached
# arrays are incomplete; otherwise reuse what is already on disk.
if all(os.path.exists(path) for path in (features_path, labels_path)):
    print(f"Features already exist in {save_dir_rep}. Skipping extraction.")
else:
    print(f"Extracting features using model '{model_name}'...")

    # NOTE(review): extract_features_from_folder is expected to write
    # latent_features.npy / labels.npy under save_path — confirm in the helper.
    model = load_torchxrayvision_model(model_name)
    all_features, labels = extract_features_from_folder(
        dataset_dir, model, device='cpu', batch_size=32, save_path=save_dir_rep
    )

    print(f"Features extracted and saved to: {save_dir_rep}")

# Both branches end by reading the arrays back from disk, so downstream cells
# always work with the persisted representation.
all_features, labels = (np.load(path) for path in (features_path, labels_path))

### Confounding Simulation and ATE Estimation

In [None]:
# 1. Parameters of the simulated data-generating process
beta_true = 2.0              # coefficient of treatment A in the outcome
gamma_true = -2              # coefficient of the pneumonia label in the outcome
p_treat_given_pneu = 0.7     # P(A = 1 | pneumonia)
p_treat_given_normal = 0.3   # P(A = 1 | normal)

# 2. General simulation settings
pneumonia_label = labels.astype(int)  # labels cast to integers (used as a 0/1 indicator)
n_samples = len(pneumonia_label)
n_runs = 5                   # number of independent simulation runs
ci_alpha_level = 0.05        # intervals are reported at level 1 - alpha
z_score = norm.ppf(1 - ci_alpha_level / 2)  # two-sided normal critical value

# 3. Per-method containers: one point estimate and one (lower, upper) pair per run
methods = [
    'Naive',
    'Oracle',
    'S-Learner (Linear)',
    'S-Learner (RF)',
    'S-Learner (Lasso)',
    'DML (Linear)',
    'DML (RF)',
    'DML (Lasso)',
]
estimates_dict = {name: [] for name in methods}
cis_dict = {name: {bound: [] for bound in ('lower', 'upper')} for name in methods}

# Base seed; the simulation loop advances it before every run
seed = 42

# 4. Simulation Loop
#
# Every run redraws treatment A and outcome Y from the fixed pneumonia labels
# and then estimates the effect of A on Y with each method in `methods`.
# Exactly one estimate and one (lower, upper) pair is stored per method per
# run; a learner that fails contributes NaN, so all result lists stay aligned.


def _wald_ci(estimate, std_error):
    """Return (lower, upper) of the normal-approximation CI built from z_score."""
    half_width = z_score * std_error
    return estimate - half_width, estimate + half_width


def _record(method, estimate, ci_lower, ci_upper):
    """Append one run's estimate and CI bounds for `method` to the result dicts."""
    estimates_dict[method].append(estimate)
    cis_dict[method]['lower'].append(ci_lower)
    cis_dict[method]['upper'].append(ci_upper)


def _run_ols(method, data, regressors):
    """Regress Y on `regressors` plus an intercept and record the coefficient of A.

    Errors are deliberately not caught here: unlike the ML learners below,
    the OLS baselines are allowed to abort the simulation.
    """
    design = sm.add_constant(data[regressors])
    fit = sm.OLS(data['Y'], design).fit()
    beta, std_error = fit.params['A'], fit.bse['A']
    _record(method, beta, *_wald_ci(beta, std_error))
    print(f"{method} OLS: β = {beta:.3f}, SE = {std_error:.3f}")


def _fit_s_learner(make_learner, features, treatment, outcome):
    """Fit a causalml S-learner and return (ate, ci_lower, ci_upper, None).

    The last element is the standard error slot, which the S-learner does not
    expose. `ate_alpha` is set from `ci_alpha_level` so these intervals use
    the same level as the z_score-based ones.
    """
    s_learner = BaseSRegressor(make_learner(), ate_alpha=ci_alpha_level)
    ate, ci_lower, ci_upper = s_learner.estimate_ate(
        features, treatment, outcome, return_ci=True
    )
    return ate[0], ci_lower[0], ci_upper[0], None


def _fit_dml_plr(make_learners, dml_data):
    """Fit a 2-fold DoubleMLPLR and return (beta, ci_lower, ci_upper, std_error)."""
    outcome_learner, treatment_learner = make_learners()
    plr = DoubleMLPLR(dml_data, outcome_learner, treatment_learner, n_folds=2)
    plr.fit()
    beta, std_error = plr.coef[0], plr.se[0]
    return (beta, *_wald_ci(beta, std_error), std_error)


def _run_guarded(method, run_number, fit, *fit_args):
    """Run `fit(*fit_args)` and record its result, or NaN if it raises.

    The three result lists are appended to only once, after the try block, so
    a failure at any point cannot leave them with different lengths.
    """
    try:
        estimate, ci_lower, ci_upper, std_error = fit(*fit_args)
        summary = f"{method}: β = {estimate:.3f}"
        if std_error is not None:
            summary += f", SE = {std_error:.3f}"
        print(summary)
    except Exception as e:
        # Best effort: one failing learner must not abort the whole simulation.
        print(f"Run {run_number}: {method} failed with error: {e}")
        estimate = ci_lower = ci_upper = np.nan
    _record(method, estimate, ci_lower, ci_upper)


# Factories (not instances) so that every run gets freshly constructed
# estimators and construction errors are caught by _run_guarded as well.
# 4.6. S-Learner outcome models
s_learner_factories = {
    'S-Learner (Linear)': LinearRegression,
    'S-Learner (RF)': lambda: RandomForestRegressor(n_estimators=50, random_state=42),
    'S-Learner (Lasso)': lambda: LassoCV(cv=5, n_jobs=-1),
}

# 4.7. DoubleML nuisance models as (outcome model, treatment model) pairs:
# linear, random forest and lasso.
dml_learner_factories = {
    'DML (Linear)': lambda: (LinearRegression(), LogisticRegression()),
    'DML (RF)': lambda: (
        RandomForestRegressor(n_estimators=50, random_state=42),
        RandomForestClassifier(n_estimators=50, random_state=42),
    ),
    'DML (Lasso)': lambda: (
        LassoCV(cv=5, n_jobs=-1, random_state=42),
        LogisticRegressionCV(penalty='l1', cv=5, n_jobs=-1,
                             random_state=42, solver='saga', max_iter=10000),
    ),
}

# The image features do not change between runs, so build their frame once.
features_df = pd.DataFrame(
    all_features,
    columns=[f"feat_{i}" for i in range(all_features.shape[1])],
)

for run in range(n_runs):
    print(f"\n--- Simulation Run {run + 1} ---")
    # Advance the seed so every run draws different treatments and noise
    seed = seed + 2
    np.random.seed(seed)

    # 4.1. Simulate Treatment A (treatment probability depends on the label)
    pA = pneumonia_label * p_treat_given_pneu + (1 - pneumonia_label) * p_treat_given_normal
    A = np.random.binomial(1, pA)

    # 4.2. Simulate Outcome Y (linear in A and the label, unit-variance noise)
    noise = np.random.normal(loc=0, scale=1, size=n_samples)
    Y = beta_true * A + gamma_true * pneumonia_label + noise

    # 4.3. Package into DataFrame
    df = pd.DataFrame({
        'Y': Y,
        'A': A,
        'pneumonia': pneumonia_label
    })

    # 4.4. Naive OLS (unadjusted) and 4.5. Oracle OLS (adjusts for the true label)
    _run_ols('Naive', df, ['A'])
    _run_ols('Oracle', df, ['A', 'pneumonia'])

    # 4.6. S-Learners on the image features
    for method, make_learner in s_learner_factories.items():
        _run_guarded(method, run + 1, _fit_s_learner, make_learner, all_features, A, Y)

    # 4.7. DoubleML on the image features; outcome and treatment are
    # attached to the feature frame and referenced by column name.
    X_dml_df = features_df.assign(Y=Y, A=A)
    data_dml = DoubleMLData(X_dml_df, "Y", "A")
    for method, make_learners in dml_learner_factories.items():
        _run_guarded(method, run + 1, _fit_dml_plr, make_learners, data_dml)

In [None]:
# 5. Store the estimates and intervals under
#    <results_dir>/<model_name>/<experiment_name>/<timestamp>;
#    the second-resolution timestamp gives each execution its own folder.
experiment_name = "exp_results"
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
experiment_dir = os.path.join(
    results_dir,
    model_name,
    experiment_name,
    timestamp,
)
save_results(experiment_dir, estimates_dict, cis_dict)

### Plotting Results

In [4]:
# Read the stored estimates and confidence intervals back from the
# experiment folder written by the save step.
loaded_results = load_results(experiment_dir)
estimates_dict, cis_dict = loaded_results

In [None]:
# Plot ATE estimates with confidence intervals.
# The true effect and the number of runs are taken from the simulation
# parameters (beta_true, n_runs) rather than repeated as literals, so the
# plot cannot silently disagree with the simulation after a parameter change.
plot_ate_estimates(
    estimates_dict=estimates_dict,
    cis_dict=cis_dict,
    plot_name="ate_estimates_label_conf_pneu",
    save_dir=experiment_dir,
    ate_true=beta_true,
    n_runs=n_runs,
    figsize=(16, 8),
    verbose=True
)