# Case study 2: spam email detection from ACIC (logistic)
This notebook includes experiments from Case Study 2 from the paper Multi-Source Causal Inference Using Control Variates. Specifically, this notebook contains experiments using the logistic model with interaction between $X$ and $Z$ to estimate the ATE and odds ratios.

This experiment uses $n_2 = 10,000$ samples for the dataset without selection bias.

In [1]:
import numpy as np
import pandas as pd

import data_sampler
import bootstrap

In [2]:
# Load the original sample (the dataset without selection bias).
# Comma is already the default delimiter for read_csv.
df_orig = pd.read_csv('spam_binMod11.csv')

In [4]:
# Per-column summary statistics of the original sample (outcome, treatment, covariates).
df_orig.describe()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.3111,0.2077,0.313405,0.095635,0.110604,0.103395,0.08878,0.241847,0.058427,0.533474,...,0.142008,0.1867,1.656505,0.085842,0.802396,0.118049,0.100828,0.089238,0.532534,0.95427
std,0.462967,0.405681,0.694358,0.273618,0.387355,0.379405,0.2681,0.586613,0.197865,0.866654,...,0.455435,0.533876,1.75984,0.557759,1.181341,1.017302,0.332306,0.404872,1.633468,0.743763
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.464991
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,...,0.0,0.0,1.31,0.0,0.23,0.0,0.0,0.0,0.0,0.822859
75%,1.0,0.0,0.38,0.0,0.0,0.0,0.0,0.17,0.0,0.77,...,0.0,0.0,2.6325,0.0,1.27,0.0,0.0,0.0,0.0,1.312109
max,1.0,1.0,10.0,5.88,7.27,11.11,5.26,11.11,2.61,9.67,...,7.14,9.09,18.75,18.18,11.11,17.1,5.45,12.5,20.83,7.005336


In [5]:
# Column roles passed to every data_sampler model in this notebook.
Y_COLUMN = 'Y'  # binary outcome column
Z_COLUMN = 'A'  # column playing the role of Z in the outcome model
# Covariate columns V1 ... V22 (22 columns in total).
X_COLUMNS = ['V{}'.format(index) for index in range(1, 23)]

# Load selection-biased data

In [7]:
# Load the larger pool of rows from which the selection-biased sample is drawn.
# Comma is already the default delimiter for read_csv.
df_large = pd.read_csv('spam_binMod1_large1.csv')

In [13]:
# Build the selection-biased dataset: filter the large pool through the
# selection mechanism, then subsample it without replacement.
SELECTION_BIAS_SIZE_FACTOR = 3  # biased sample has 3x as many rows as df_orig
SELECTION_BIAS_SEED = 0  # fixed seed so a fresh kernel draws the same subsample

data_sampler_selection_bias = data_sampler.DataSampler(Z_COLUMN, X_COLUMNS, Y_COLUMN)
# NOTE(review): selection_bias_filter may itself be stochastic; if so it needs
# its own seeding for this cell to be fully reproducible -- confirm in data_sampler.
df_selection_bias_pool = data_sampler_selection_bias.selection_bias_filter(df_large)
# Keep the filtered pool under its own name so re-running this cell is idempotent.
df_selection_bias = df_selection_bias_pool.sample(
    n=SELECTION_BIAS_SIZE_FACTOR * len(df_orig),
    replace=False,
    random_state=SELECTION_BIAS_SEED,
)

In [14]:
# Per-column summary statistics of the selection-biased sample, for comparison
# with the same summary of df_orig.
df_selection_bias.describe()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,0.803933,0.185333,0.179978,0.050607,0.032971,0.053826,0.047243,0.193109,0.030144,0.60043,...,0.078161,0.112938,1.183693,0.029163,0.444141,0.04917,0.03307,0.032991,1.336312,0.737938
std,0.397026,0.388574,0.506862,0.180836,0.199362,0.258288,0.192052,0.597873,0.148274,0.996212,...,0.29538,0.40882,1.606306,0.301086,0.88626,0.649688,0.194318,0.25104,2.456238,0.549394
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393224
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.49,0.0,0.0,0.0,0.0,0.0,0.0,0.687129
75%,1.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.86,...,0.0,0.0,1.88,0.0,0.62,0.0,0.0,0.0,1.84,0.994362
max,1.0,1.0,10.0,5.88,5.4,11.11,5.26,18.18,2.61,9.67,...,7.14,9.09,18.75,18.18,11.11,17.1,5.45,12.5,20.83,6.929027


# Logistic regression model with interaction terms

In this section, we assume that the data generating outcome model is

$$P(Y=1 | Z = z, X = x) = \frac{e^{\beta_0 + \beta_1 z + \beta_2 ^T x + \beta_3 ^T xz}}{1 + e^{\beta_0 + \beta_1 z + \beta_2^T x + \beta_3 ^T xz}}$$

This allows for linear heterogeneous effects in $x$.

# Compute ATE estimates with and without control variate

In [42]:
def ATE_estimator_fn_interaction(df_input):
    """Return the ATE estimate from an interaction-logistic model fitted on ``df_input``.

    A fresh ``DataSamplerInteractionLogistic`` is created on every call; its
    outcome model is fitted on ``df_input`` and the ATE estimate is then
    computed on that same dataframe.
    """
    outcome_model = data_sampler.DataSamplerInteractionLogistic(
        Z_COLUMN, X_COLUMNS, Y_COLUMN
    )
    outcome_model.fit_outcome(df_input)
    ate_hat = outcome_model.get_ATE_estimate(df_input)
    return ate_hat

def CV_estimator_fn_interaction(df_input_obs, df_input_bias):
    """Return the control variate: mean conditional OR (obs fit) minus mean conditional OR (bias fit).

    The same model object is fitted first on ``df_input_obs`` and then on
    ``df_input_bias``. After each fit, the conditional odds-ratio estimates are
    evaluated at the covariate rows of ``df_input_obs`` and averaged, so both
    averages are taken over the same set of x values.
    """
    model = data_sampler.DataSamplerInteractionLogistic(Z_COLUMN, X_COLUMNS, Y_COLUMN)
    # Covariate rows of the observational dataset; both ORs are averaged over these.
    eval_covariates = df_input_obs[X_COLUMNS]

    def mean_conditional_OR(df_fit):
        # Refit the shared model on df_fit, then average its conditional ORs.
        model.fit_outcome(df_fit)
        return np.mean(model.get_conditional_OR_estimates(eval_covariates))

    mean_OR_obs = mean_conditional_OR(df_input_obs)    # observational dataset
    mean_OR_bias = mean_conditional_OR(df_input_bias)  # selection bias dataset
    return mean_OR_obs - mean_OR_bias

# First bootstrap pass (300 replicates): collect paired samples of the ATE
# estimate and the control variate. No optimal_CV_coeff is passed here, and the
# third return value is discarded.
CV_samples, ATE_hat_samples, _ = bootstrap.run_bootstrap_df(df_obs=df_orig, 
              df_bias=df_selection_bias, 
              n_replicates=300, 
              ATE_estimator_fn=ATE_estimator_fn_interaction,
              CV_estimator_fn=CV_estimator_fn_interaction,
             )

Starting replicate 299

In [45]:
# 2x2 sample covariance matrix of the bootstrap replicates.
# Variable order is (ATE_hat, CV): index 0 is the ATE estimate, index 1 the control variate.
sample_cov = np.cov(ATE_hat_samples, CV_samples, ddof=1)

# Optimal control variate coefficient: Cov(ATE_hat, CV) / Var(CV).
cov_ATE_CV, var_CV = sample_cov[0, 1], sample_cov[1, 1]
optimal_CV_coeff = cov_ATE_CV / var_CV
print("optimal CV coefficient:", optimal_CV_coeff)

optimal CV coefficient: 0.004709341919725353


In [46]:
# Get variance/bias of ATE estimators with and without CV.
# Second bootstrap pass: same inputs and replicate count as the first call, but
# optimal_CV_coeff is now passed and the third return value is kept as
# ATE_hat_CV_samples.
# NOTE(review): this rebinds CV_samples and ATE_hat_samples from the first pass.
CV_samples, ATE_hat_samples, ATE_hat_CV_samples = bootstrap.run_bootstrap_df(
    df_obs=df_orig, 
    df_bias=df_selection_bias, 
    n_replicates=300, # Try increasing this
    ATE_estimator_fn=ATE_estimator_fn_interaction,
    CV_estimator_fn=CV_estimator_fn_interaction,
    optimal_CV_coeff=optimal_CV_coeff)

Starting replicate 299

In [47]:
# Reference ATE against which bias is measured.
# NOTE(review): the value is hard-coded; its provenance is not shown in this notebook.
ATE_true = 0.106286795757474

# Bootstrap replicates as arrays, converted once and reused below.
ATE_hat_arr = np.asarray(ATE_hat_samples)        # estimator without control variate
ATE_hat_CV_arr = np.asarray(ATE_hat_CV_samples)  # estimator with control variate

# Plain ATE estimator: unbiased sample variance (ddof=1) and bias vs. ATE_true.
ATE_var = ATE_hat_arr.var(ddof=1)
print(">>> Variance of ATE estimator:", ATE_var)

ATE_bias = ATE_hat_arr.mean() - ATE_true
print(">>> Bias of ATE estimator:", ATE_bias)

# Control-variate-adjusted ATE estimator: same two summaries.
ATE_CV_var = ATE_hat_CV_arr.var(ddof=1)
print(">>> Variance of ATE estimator with CV:", ATE_CV_var)

ATE_CV_bias = ATE_hat_CV_arr.mean() - ATE_true
print(">>> Bias of ATE estimator with CV:", ATE_CV_bias)

>>> Variance of ATE estimator: 0.00014430022002689374
>>> Bias of ATE estimator: 0.007230298726495221
>>> Variance of ATE estimator with CV: 0.00014303901427683996
>>> Bias of ATE estimator with CV: 0.0030850513480855796
