# Causal Simulations: inverse probability of treatment weights
This notebook uses simulated data sets to demonstrate the unbiasedness of the implemented IPTW estimator under several different data-generating mechanisms. For each mechanism, 1000 samples of 2000 individuals are drawn from a simulated target population, and IPTW is used to estimate the average causal effect of a time-fixed exposure on an outcome measured at a single time point. All parametric models are correctly specified. The main features of each data-generating mechanism are briefly described below.

Data-generating mechanism 1:
- Binary outcome

Data-generating mechanism 2:
- Normally distributed outcome

Data-generating mechanism 3:
- Binary outcome with interaction terms

Data-generating mechanism 4:
- Continuous outcome with informative censoring

Data-generating mechanism 5:
- Binary outcome with missing treatment data
- Demonstrates IPTW combined with inverse probability of missingness weights (IPMW) to account for the missing treatment data

*Note*: the confidence intervals are based on the robust variance estimator and will therefore be overly conservative. Narrower confidence intervals could be obtained with a bootstrap procedure instead.

In [8]:
import numpy as np
import pandas as pd
from scipy.stats import logistic

from zepid.causal.ipw import IPTW, IPMW

# Fix the global NumPy seed so repeated runs of the notebook draw the same data
np.random.seed(20191203)

# Simulation settings: individuals per sample, and number of samples per mechanism
sample_size, sim_size = 2000, 1000

In [2]:
def dgm(version, n=10000000):
    """Simulate a target population under one of five data-generating mechanisms.

    Every mechanism draws baseline covariates, a binary treatment ``A`` whose
    probability depends on those covariates, both potential outcomes (``Y1``
    under treatment, ``Y0`` under no treatment) and the observed outcome ``Y``,
    which equals ``Y1`` where ``A == 1`` and ``Y0`` otherwise.

    Version 1: binary outcome (covariates W, L)
    Version 2: continuous outcome (covariates Q, Z)
    Version 3: binary outcome with interaction terms (covariates X, B, C)
    Version 4: continuous outcome with censoring (covariates R, S, T);
               ``My`` flags censored rows and ``Y`` is NaN where ``My == 1``
    Version 5: binary outcome with missing treatment information
               (covariates G, H, K); ``Ma`` flags rows with missing treatment
               and ``A`` is NaN where ``Ma == 1``

    Parameters
    ----------
    version : int
        Which data-generating mechanism to simulate (1 through 5).
    n : int, optional
        Number of individuals to simulate. Defaults to 10,000,000.

    Returns
    -------
    pandas.DataFrame
        One row per simulated individual, with the columns described above.

    Raises
    ------
    ValueError
        If ``version`` is not one of the five supported mechanisms.

    Notes
    -----
    Draws come from NumPy's global random state, so results depend on the
    current seed and on the order of the draws made here.
    """
    df = pd.DataFrame()
    if version == 1:
        # Creating confounders
        df['W'] = np.random.normal(10, 3, size=n)
        df['L'] = np.random.binomial(n=1, p=0.4, size=n)
        # Treatment model
        df['A'] = np.random.binomial(n=1, p=logistic.cdf(-0.5*df['W'] + 0.02*df['W']*df['W'] + 5*df['L']), size=n)
        # Outcome models
        df['Y1'] = np.random.binomial(n=1, p=logistic.cdf(-3 + 0.2*df['W'] - 3*df['L']), size=n)
        df['Y0'] = np.random.binomial(n=1, p=logistic.cdf(0.2*df['W'] - 3*df['L']), size=n)
        df['Y'] = np.where(df['A'] == 1, df['Y1'], df['Y0'])
        return df

    if version == 2:
        # Creating confounders
        df['Q'] = np.random.normal(size=n)
        df['Z'] = np.random.binomial(n=1, p=0.8, size=n)
        # Treatment model
        df['A'] = np.random.binomial(n=1, p=logistic.cdf(0.75 + 1.5*df['Q'] - 4*df['Z']), size=n)
        # Outcome models
        df['Y1'] = 129 + 0.2*df['Q'] - 0.01*df['Q']*df['Q'] + 5*df['Z'] + np.random.normal(0, 2, size=n)
        df['Y0'] = 122 + 0.2*df['Q'] - 0.01*df['Q']*df['Q'] - 4*df['Z'] + np.random.normal(0, 2, size=n)
        df['Y'] = np.where(df['A'] == 1, df['Y1'], df['Y0'])
        return df

    if version == 3:
        # Creating confounders
        df['X'] = np.random.normal(size=n)
        df['B'] = np.random.binomial(n=1, p=0.6, size=n)
        df['C'] = np.random.binomial(n=1, p=0.3, size=n)
        # Treatment model
        df['A'] = np.random.binomial(n=1, p=logistic.cdf(-1.75 - 1.5*df['X'] + 3*df['B'] + 2*df['C']
                                                         - 5*df['B']*df['C']), size=n)
        # Outcome models
        df['Y1'] = np.random.binomial(n=1, p=logistic.cdf(-2 - 0.2*df['X'] + 3*df['B'] + 1.5*df['C']
                                                          + 0.1*df['X']*df['C']), size=n)
        df['Y0'] = np.random.binomial(n=1, p=logistic.cdf(-5 - 0.2*df['X'] + 3*df['B'] + 1.5*df['C']
                                                          + 0.1*df['X']*df['C']), size=n)
        df['Y'] = np.where(df['A'] == 1, df['Y1'], df['Y0'])
        return df

    if version == 4:
        # Creating confounders
        df['R'] = np.random.normal(size=n)
        df['S'] = np.random.normal(size=n)
        df['T'] = np.random.binomial(n=1, p=0.4, size=n)
        # Treatment model
        df['A'] = np.random.binomial(n=1, p=logistic.cdf(-1.195 - 1.5*df['S'] + 2*df['T']
                                                         + 0.3*df['S']*df['T']), size=n)
        # Outcome models
        # NOTE: Y1 and Y0 share the same mean equation and differ only by their
        # independent noise draws.
        df['Y1'] = 27 + df['R'] + df['S'] - 0.2*df['R']*df['S'] - 3*df['T'] + np.random.normal(size=n)
        df['Y0'] = 27 + df['R'] + df['S'] - 0.2*df['R']*df['S'] - 3*df['T'] + np.random.normal(size=n)
        df['Y'] = np.where(df['A'] == 1, df['Y1'], df['Y0'])
        # Censoring model: the observed outcome is blanked out for censored rows
        df['My'] = np.random.binomial(n=1, p=logistic.cdf(-1.975 + 0.8*df['A'] + 0.1*df['R']), size=n)
        df['Y'] = np.where(df['My'] == 1, np.nan, df['Y'])
        return df

    if version == 5:
        # Creating confounders
        df['G'] = np.random.normal(5, 1, size=n)
        df['H'] = np.random.binomial(n=1, p=0.4, size=n)
        df['K'] = np.random.binomial(n=1, p=0.7, size=n)
        # Treatment model
        df['A'] = np.random.binomial(n=1, p=logistic.cdf(0.5*df['G'] + 3*df['H'] - 5*df['K']), size=n)
        # Outcome models
        df['Y1'] = np.random.binomial(n=1, p=logistic.cdf(-2 + 0.25*df['G'] - 4*df['H']), size=n)
        df['Y0'] = np.random.binomial(n=1, p=logistic.cdf(0.5 + 0.25*df['G'] - 5*df['H']), size=n)
        # Y is derived from the true treatment before any treatment values are blanked out
        df['Y'] = np.where(df['A'] == 1, df['Y1'], df['Y0'])
        # Missing model: treatment is blanked out for rows flagged as missing
        df['Ma'] = np.random.binomial(n=1, p=logistic.cdf(-2.5 + 3*df['K'] - 1*df['H']), size=n)
        df['A'] = np.where(df['Ma'] == 1, np.nan, df['A'])
        return df

    # Fail loudly instead of silently returning None for an unsupported mechanism
    raise ValueError("version must be one of 1, 2, 3, 4, or 5; got " + repr(version))


## Data-generating mechanism 1

In [3]:
# Target population for mechanism 1. W_sq supplies the squared W term used in
# the treatment model formula below.
df = dgm(version=1)
df['W_sq'] = df['W']**2

# True average causal effect: mean difference of the potential outcomes
truth = np.mean(df['Y1'] - df['Y0'])
bias_naive, bias_iptw, ci_iptw = [], [], []

for i in range(sim_size):
    dfs = df.sample(n=sample_size)

    # Naive estimate: unadjusted difference in mean outcome between treatment groups
    treated_y = dfs.loc[dfs['A'] == 1, 'Y']
    untreated_y = dfs.loc[dfs['A'] == 0, 'Y']
    bias_naive.append(np.mean(treated_y) - np.mean(untreated_y) - truth)

    # IPTW estimate of the risk difference
    ipw = IPTW(dfs, treatment='A', outcome='Y')
    ipw.treatment_model('W + W_sq + L', print_results=False)
    ipw.marginal_structural_model('A')
    ipw.fit()
    rd = ipw.risk_difference
    bias_iptw.append(rd['RD'][1] - truth)
    # 1 when the 95% confidence interval strictly contains the truth, else 0
    ci_iptw.append(1 if rd['95%LCL'][1] < truth < rd['95%UCL'][1] else 0)

# Summarize bias and confidence interval coverage across all simulated samples
results = pd.DataFrame({
    'bias_naive': bias_naive,
    'bias_iptw': bias_iptw,
    'ci_iptw': ci_iptw,
})
results.describe()

Unnamed: 0,bias_naive,bias_iptw,ci_iptw
count,1000.0,1000.0,1000.0
mean,-0.33296,-0.001986,0.967
std,0.013402,0.040564,0.178726
min,-0.373152,-0.154378,0.0
25%,-0.342588,-0.028714,1.0
50%,-0.332688,-0.000565,1.0
75%,-0.324095,0.025742,1.0
max,-0.294562,0.1253,1.0


## Data-generating mechanism 2

In [4]:
# Target population for mechanism 2 (continuous outcome)
df = dgm(version=2)

# True average causal effect: mean difference of the potential outcomes
truth = np.mean(df['Y1'] - df['Y0'])
bias_naive, bias_iptw, ci_iptw = [], [], []

for i in range(sim_size):
    dfs = df.sample(n=sample_size)

    # Naive estimate: unadjusted difference in mean outcome between treatment groups
    treated_y = dfs.loc[dfs['A'] == 1, 'Y']
    untreated_y = dfs.loc[dfs['A'] == 0, 'Y']
    bias_naive.append(np.mean(treated_y) - np.mean(untreated_y) - truth)

    # IPTW estimate of the average treatment effect
    ipw = IPTW(dfs, treatment='A', outcome='Y')
    ipw.treatment_model('Q + Z', print_results=False)
    ipw.marginal_structural_model('A')
    ipw.fit()
    ate = ipw.average_treatment_effect
    bias_iptw.append(ate['ATE'][1] - truth)
    # 1 when the 95% confidence interval strictly contains the truth, else 0
    ci_iptw.append(1 if ate['95%LCL'][1] < truth < ate['95%UCL'][1] else 0)

# Summarize bias and confidence interval coverage across all simulated samples
results = pd.DataFrame({
    'bias_naive': bias_naive,
    'bias_iptw': bias_iptw,
    'ci_iptw': ci_iptw,
})
results.describe()

Unnamed: 0,bias_naive,bias_iptw,ci_iptw
count,1000.0,1000.0,1000.0
mean,-1.717746,-0.005187,0.943
std,0.169573,0.368315,0.231959
min,-2.303674,-1.697024,0.0
25%,-1.831535,-0.233242,1.0
50%,-1.712394,-0.027897,1.0
75%,-1.608215,0.211661,1.0
max,-1.177244,2.761513,1.0


## Data-generating mechanism 3

In [5]:
# Target population for mechanism 3 (binary outcome)
df = dgm(version=3)

# True average causal effect: mean difference of the potential outcomes
truth = np.mean(df['Y1'] - df['Y0'])
bias_naive, bias_iptw, ci_iptw = [], [], []

for i in range(sim_size):
    dfs = df.sample(n=sample_size)

    # Naive estimate: unadjusted difference in mean outcome between treatment groups
    treated_y = dfs.loc[dfs['A'] == 1, 'Y']
    untreated_y = dfs.loc[dfs['A'] == 0, 'Y']
    bias_naive.append(np.mean(treated_y) - np.mean(untreated_y) - truth)

    # IPTW estimate of the risk difference; the formula includes the B-by-C interaction
    ipw = IPTW(dfs, treatment='A', outcome='Y')
    ipw.treatment_model('X + B + C + B:C', print_results=False)
    ipw.marginal_structural_model('A')
    ipw.fit()
    rd = ipw.risk_difference
    bias_iptw.append(rd['RD'][1] - truth)
    # 1 when the 95% confidence interval strictly contains the truth, else 0
    ci_iptw.append(1 if rd['95%LCL'][1] < truth < rd['95%UCL'][1] else 0)

# Summarize bias and confidence interval coverage across all simulated samples
results = pd.DataFrame({
    'bias_naive': bias_naive,
    'bias_iptw': bias_iptw,
    'ci_iptw': ci_iptw,
})
results.describe()

Unnamed: 0,bias_naive,bias_iptw,ci_iptw
count,1000.0,1000.0,1000.0
mean,0.073078,0.000964,0.961
std,0.018384,0.035258,0.193692
min,0.015226,-0.144828,0.0
25%,0.060339,-0.02189,1.0
50%,0.072919,0.000943,1.0
75%,0.086188,0.024105,1.0
max,0.129573,0.125303,1.0


## Data-generating mechanism 4

In [6]:
# Target population for mechanism 4 (continuous outcome, Y missing for censored rows)
df = dgm(version=4)

# True average causal effect: mean difference of the potential outcomes
truth = np.mean(df['Y1'] - df['Y0'])
bias_naive, bias_iptw, ci_iptw = [], [], []

for i in range(sim_size):
    dfs = df.sample(n=sample_size)

    # Naive estimate: unadjusted difference in mean outcome between treatment groups
    treated_y = dfs.loc[dfs['A'] == 1, 'Y']
    untreated_y = dfs.loc[dfs['A'] == 0, 'Y']
    bias_naive.append(np.mean(treated_y) - np.mean(untreated_y) - truth)

    # IPTW estimate of the average treatment effect, with a model for the missing outcomes
    ipw = IPTW(dfs, treatment='A', outcome='Y')
    ipw.treatment_model('S + T + S:T', print_results=False)
    ipw.missing_model('A + R', print_results=False)
    ipw.marginal_structural_model('A')
    ipw.fit()
    ate = ipw.average_treatment_effect
    bias_iptw.append(ate['ATE'][1] - truth)
    # 1 when the 95% confidence interval strictly contains the truth, else 0
    ci_iptw.append(1 if ate['95%LCL'][1] < truth < ate['95%UCL'][1] else 0)

# Summarize bias and confidence interval coverage across all simulated samples
results = pd.DataFrame({
    'bias_naive': bias_naive,
    'bias_iptw': bias_iptw,
    'ci_iptw': ci_iptw,
})
results.describe()

Unnamed: 0,bias_naive,bias_iptw,ci_iptw
count,1000.0,1000.0,1000.0
mean,-1.950571,-0.015132,0.961
std,0.103504,0.206335,0.193692
min,-2.296665,-0.519726,0.0
25%,-2.020835,-0.1406,1.0
50%,-1.945731,-0.035363,1.0
75%,-1.881781,0.092287,1.0
max,-1.656158,1.316798,1.0


## Data-generating mechanism 5

In [9]:
# Target population for mechanism 5 (binary outcome, treatment A missing for some rows)
df = dgm(version=5)

# True average causal effect: mean difference of the potential outcomes
truth = np.mean(df['Y1'] - df['Y0'])
bias_naive = []
bias_iptw1, ci_iptw1 = [], []
bias_iptw2, ci_iptw2 = [], []

for i in range(sim_size):
    dfs = df.sample(n=sample_size)

    # Naive estimate: unadjusted difference in mean outcome between treatment groups
    treated_y = dfs.loc[dfs['A'] == 1, 'Y']
    untreated_y = dfs.loc[dfs['A'] == 0, 'Y']
    bias_naive.append(np.mean(treated_y) - np.mean(untreated_y) - truth)

    # IPTW alone, fit after dropping every row that contains a missing value
    ipw = IPTW(dfs.dropna(), treatment='A', outcome='Y')
    ipw.treatment_model('G + H', print_results=False)
    ipw.marginal_structural_model('A')
    ipw.fit()
    rd = ipw.risk_difference
    bias_iptw1.append(rd['RD'][1] - truth)
    # 1 when the 95% confidence interval strictly contains the truth, else 0
    ci_iptw1.append(1 if rd['95%LCL'][1] < truth < rd['95%UCL'][1] else 0)

    # Missingness weights for A, stored on the sample so IPTW can use them
    ipmw = IPMW(dfs, missing_variable='A')
    ipmw.regression_models('K + H', print_results=False)
    ipmw.fit()
    dfs['ipmw'] = ipmw.Weight

    # IPTW again, this time passing the missingness weights via `weights`
    ipw = IPTW(dfs.dropna(), treatment='A', outcome='Y', weights='ipmw')
    ipw.treatment_model('G + H', print_results=False)
    ipw.marginal_structural_model('A')
    ipw.fit()
    rd = ipw.risk_difference
    bias_iptw2.append(rd['RD'][1] - truth)
    ci_iptw2.append(1 if rd['95%LCL'][1] < truth < rd['95%UCL'][1] else 0)


# Summarize bias and confidence interval coverage for both estimators
results = pd.DataFrame({
    'bias_naive': bias_naive,
    'bias_iptw1': bias_iptw1,
    'ci_iptw1': ci_iptw1,
    'bias_iptw2': bias_iptw2,
    'ci_iptw2': ci_iptw2,
})
results.describe()

Unnamed: 0,bias_naive,bias_iptw1,ci_iptw1,bias_iptw2,ci_iptw2
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-0.132735,0.028993,0.919,-0.005754,0.987
std,0.026546,0.020535,0.272972,0.022858,0.113331
min,-0.232965,-0.042301,0.0,-0.086423,0.0
25%,-0.150776,0.01534,1.0,-0.021486,1.0
50%,-0.132979,0.028566,1.0,-0.006924,1.0
75%,-0.114121,0.043676,1.0,0.009946,1.0
max,-0.048084,0.099398,1.0,0.07043,1.0
