# Case study 2: spam email detection from ACIC (neural network)
This notebook contains the experiments for Case Study 2 of the paper *Multi-Source Causal Inference Using Control Variates*. Specifically, it uses the neural network model to estimate the average treatment effect (ATE) and odds ratios.

This experiment uses $n_2 = 3{,}000$ samples for the dataset without selection bias.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor

import data_sampler
import bootstrap

In [2]:
# Load the spam dataset from a comma-separated CSV file (path is relative to
# the notebook's working directory).
df_orig = pd.read_csv('spam_binMod11.csv', sep=",")

In [3]:
# Keep only the first 3000 rows; this is the dataset without selection bias
# used for the rest of the notebook.
df_orig = df_orig.head(3000)

In [5]:
# Summary statistics of the truncated dataset (rendered as the cell output).
df_orig.describe()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,0.294667,0.225,0.319917,0.098823,0.11819,0.10566,0.083893,0.24681,0.06201,0.529727,...,0.15698,0.186447,1.69994,0.1011,0.82435,0.116147,0.107973,0.088857,0.528137,0.972863
std,0.455969,0.417652,0.72337,0.286872,0.426914,0.3797,0.264619,0.591946,0.206386,0.828893,...,0.48656,0.506001,1.778479,0.720557,1.189208,1.044481,0.345529,0.349962,1.744793,0.7622
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.479799
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09,...,0.0,0.0,1.38,0.0,0.27,0.0,0.0,0.0,0.0,0.832909
75%,1.0,0.0,0.38,0.0,0.0,0.0,0.0,0.2,0.0,0.77,...,0.0,0.0,2.6325,0.0,1.29,0.0,0.0,0.0,0.0,1.332102
max,1.0,1.0,10.0,5.88,7.27,6.06,3.33,11.11,2.61,6.45,...,5.12,9.09,14.28,18.18,10.71,17.1,3.62,9.09,20.83,7.005336


In [6]:
# Column names handed to the data samplers below.
Y_COLUMN = "Y"  # Y in the model P(Y = 1 | Z, X)
Z_COLUMN = "A"  # Z in the model; stored as column 'A' in the CSV files
# Covariate columns X: 'V1' through 'V22' inclusive.
X_COLUMNS = ["V{}".format(idx) for idx in range(1, 23)]

# Load selection biased data

In [8]:
# Load the larger comma-separated dataset; the selection-biased sample is
# drawn from it in the next cell.
df_large = pd.read_csv('spam_binMod1_large1.csv', sep=",")

In [9]:
# Build the selection-biased dataset: pass the large dataset through the
# sampler's selection-bias filter, then subsample 10x as many rows as df_orig.
data_sampler_selection_bias = data_sampler.DataSampler(Z_COLUMN, X_COLUMNS, Y_COLUMN)
df_selection_bias = data_sampler_selection_bias.selection_bias_filter(df_large)
# A fixed random_state makes the subsample (and every result computed from it)
# reproducible under Restart & Run All. Sampling is without replacement, so the
# filtered frame must contain at least 10 * len(df_orig) rows or pandas raises
# a ValueError.
df_selection_bias = df_selection_bias.sample(
    n=10 * len(df_orig),
    replace=False,
    random_state=0,
)

In [10]:
# Summary statistics of the selection-biased sample (rendered as the cell output).
df_selection_bias.describe()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,0.8034,0.184533,0.182096,0.051569,0.034601,0.05622,0.048137,0.193815,0.029849,0.602707,...,0.076511,0.114768,1.191797,0.029085,0.449583,0.041853,0.034491,0.032461,1.328376,0.740786
std,0.397434,0.387925,0.513661,0.18161,0.214859,0.280935,0.193812,0.6151,0.145768,1.000305,...,0.287824,0.421061,1.610843,0.308781,0.901698,0.584171,0.200275,0.237237,2.479185,0.556426
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393393
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.51,0.0,0.0,0.0,0.0,0.0,0.0,0.686626
75%,1.0,0.0,0.09,0.0,0.0,0.0,0.0,0.0,0.0,0.86,...,0.0,0.0,1.92,0.0,0.63,0.0,0.0,0.0,1.83,1.000632
max,1.0,1.0,10.0,3.57,7.27,11.11,5.26,18.18,2.61,9.67,...,7.14,9.09,18.75,18.18,11.11,17.1,5.45,12.5,20.83,7.005336


# Varying coefficient model with neural network function approximation

In this section, we assume the data is generated from 

$$P(Y=1 | Z = z, X = x) = \frac{e^{\beta_0^x + \beta_1^x z}}{ 1 + e^{\beta_0^x + \beta_1^xz}}$$

where $\beta_0^x, \beta_1^x$ are functions of $x$:
$$\beta_0^x = f_0(x; \theta_0), \quad \beta_1^x = f_1(x; \theta_1)$$

If we were to assume $\beta_0^x, \beta_1^x$ were linear in $x$, this would be equivalent to a logistic regression model with explicit interaction terms between $z$ and $x$.

To model more complicated interactions, we allow $f_0(x; \theta_0), f_1(x; \theta_1)$ to be nonlinear in $x$, and set these to be generated from a two layer neural network.

## Choose hyperparameters using 5-fold cross validation

In [11]:
# Neural-network sampler whose outcome model is fitted and scored in the
# cross-validation search below.
data_sampler_nn = data_sampler.DataSamplerNN(Z_COLUMN, X_COLUMNS, Y_COLUMN)

In [80]:
# Grid search over network depth and width. Each setting is scored by its mean
# held-out accuracy across 5 shuffled folds; the best-scoring setting is kept.
num_hidden_layers_list = [4, 8]
hidden_dim_list = [4, 8, 16, 32]

kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Training settings shared by every fit in the search.
fit_settings = dict(
    batch_size=3000,
    epochs=1000,
    step_size=0.001,
    verbose=0,
    print_metrics=False,
)

best_accuracy = 0.0
best_hidden_dim = None
best_num_hidden_layers = None
for num_hidden_layers in num_hidden_layers_list:
    for hidden_dim in hidden_dim_list:
        print("Optimizing with num_hidden_layers=%d, hidden_dim=%d" % (num_hidden_layers, hidden_dim))
        CV_accuracies = []
        for train_rows, test_rows in kf.split(df_orig):
            # NOTE(review): the same DataSamplerNN instance is refit for every
            # fold and setting -- confirm fit_outcome re-initialises the network
            # on each call.
            data_sampler_nn.fit_outcome(df_orig.iloc[train_rows],
                                        num_hidden_layers=num_hidden_layers,
                                        hidden_dim=hidden_dim,
                                        **fit_settings)
            # print_metrics reports on the held-out fold; its return value is
            # used as that fold's accuracy.
            CV_accuracies.append(data_sampler_nn.print_metrics(df_orig.iloc[test_rows]))
        avg_CV_accuracy = np.mean(CV_accuracies)
        print("average CV accuracy:", avg_CV_accuracy)
        # Strict '>' keeps the earliest setting when two settings tie.
        if avg_CV_accuracy > best_accuracy:
            best_accuracy, best_num_hidden_layers, best_hidden_dim = (
                avg_CV_accuracy, num_hidden_layers, hidden_dim)

Optimizing with num_hidden_layers=4, hidden_dim=4
Accuracy for outcome model: 0.845000
AUC for outcome model: 0.902785
Accuracy for outcome model: 0.858333
AUC for outcome model: 0.871247
Accuracy for outcome model: 0.848333
AUC for outcome model: 0.897701
Accuracy for outcome model: 0.858333
AUC for outcome model: 0.895902
Accuracy for outcome model: 0.861667
AUC for outcome model: 0.936662
average CV accuracy: 0.8543333333333333
Optimizing with num_hidden_layers=4, hidden_dim=8
Accuracy for outcome model: 0.830000
AUC for outcome model: 0.888189
Accuracy for outcome model: 0.855000
AUC for outcome model: 0.930459
Accuracy for outcome model: 0.838333
AUC for outcome model: 0.866574
Accuracy for outcome model: 0.848333
AUC for outcome model: 0.848111
Accuracy for outcome model: 0.866667
AUC for outcome model: 0.924754
average CV accuracy: 0.8476666666666667
Optimizing with num_hidden_layers=4, hidden_dim=16
Accuracy for outcome model: 0.836667
AUC for outcome model: 0.829224
Accuracy f

In [81]:
# Report the winning setting from the grid search.
for label, value in (
    ("best cross validation accuracy:", best_accuracy),
    ("best hidden_dim:", best_hidden_dim),
    ("best num_hidden_layers:", best_num_hidden_layers),
):
    print(label, value)

best cross validation accuracy: 0.861
best hidden_dim: 8
best num_hidden_layers: 8


# Compute ATE estimates with and without control variate

In [None]:
def ATE_estimator_fn_interaction(df_input):
    """Fit a new outcome network on ``df_input`` and return its ATE estimate.

    Args:
        df_input: DataFrame used both to fit the outcome model and to
            evaluate the ATE estimate.

    Returns:
        The value of ``DataSamplerNN.get_ATE_estimate(df_input)`` after fitting.
    """
    # 8 hidden layers of width 8 is the setting the cross-validation cell
    # reported as best; the remaining values match that search's training setup.
    fit_settings = {
        "num_hidden_layers": 8,
        "hidden_dim": 8,
        "batch_size": 3000,
        "epochs": 1000,
        "step_size": 0.001,
        "verbose": 0,
        "print_metrics": False,
    }
    sampler = data_sampler.DataSamplerNN(Z_COLUMN, X_COLUMNS, Y_COLUMN)
    sampler.fit_outcome(df_input, **fit_settings)
    return sampler.get_ATE_estimate(df_input)

def CV_estimator_fn_interaction(df_input_obs, df_input_bias):
    """Control variate: difference between two mean odds-ratio estimates.

    An outcome network is fitted first on the observational data and then on
    the selection-biased data. After each fit, the conditional odds-ratio
    estimates are averaged over the covariates of the observational data.

    Args:
        df_input_obs: Observational DataFrame; also supplies the covariate
            rows at which both odds-ratio estimates are evaluated.
        df_input_bias: Selection-biased DataFrame.

    Returns:
        Mean odds ratio from the observational fit minus the mean odds ratio
        from the selection-biased fit.
    """
    fit_settings = {
        "num_hidden_layers": 8,
        "hidden_dim": 8,
        "batch_size": 3000,
        "epochs": 1000,
        "step_size": 0.001,
        "verbose": 0,
        "print_metrics": False,
    }
    sampler = data_sampler.DataSamplerNN(Z_COLUMN, X_COLUMNS, Y_COLUMN)
    # Both estimates are averaged over the same x values (the observational ones).
    OR_xs = df_input_obs[X_COLUMNS]

    # NOTE(review): one sampler instance is fitted twice in a row -- confirm
    # fit_outcome starts from a freshly initialised network each time.
    mean_ORs = []
    for df_fit in (df_input_obs, df_input_bias):
        sampler.fit_outcome(df_fit, **fit_settings)
        mean_ORs.append(np.mean(sampler.get_conditional_OR_estimates(OR_xs)))

    OR_obs, OR_bias = mean_ORs
    return OR_obs - OR_bias

# First bootstrap pass (300 replicates): collect paired samples of the control
# variate and the ATE estimate. No CV coefficient is passed here, and the third
# return value is discarded.
CV_samples, ATE_hat_samples, _ = bootstrap.run_bootstrap_df(df_obs=df_orig, 
              df_bias=df_selection_bias, 
              n_replicates=300, 
              ATE_estimator_fn=ATE_estimator_fn_interaction,
              CV_estimator_fn=CV_estimator_fn_interaction,
             )

In [83]:
# 2x2 sample covariance matrix: row/column 0 is the ATE estimate, row/column 1
# is the control variate, one observation per bootstrap replicate.
sample_cov = np.cov(np.vstack([ATE_hat_samples, CV_samples]), ddof=1)

# Optimal control-variate coefficient = Cov(ATE, CV) / Var(CV).
cov_ATE_CV = sample_cov[0, 1]
var_CV = sample_cov[1, 1]
optimal_CV_coeff = cov_ATE_CV / var_CV

print("cov_ATE_CV:", cov_ATE_CV)
print("var_CV:", var_CV)
print("optimal CV coefficient:", optimal_CV_coeff)

cov_ATE_CV: 0.010320750224568166
var_CV: 97.07312314304782
optimal CV coefficient: 0.00010631933835444252


In [86]:
# Get variance/bias of ATE estimators with and without CV.
CV_samples, ATE_hat_samples, ATE_hat_CV_samples = bootstrap.run_bootstrap_df(
    df_obs=df_orig, 
    df_bias=df_selection_bias, 
    n_replicates=300, # Try increasing this
    ATE_estimator_fn=ATE_estimator_fn_interaction,
    CV_estimator_fn=CV_estimator_fn_interaction,
    optimal_CV_coeff=optimal_CV_coeff)

Starting replicate 299

In [85]:
# Reference ATE that both estimators are compared against.
# NOTE(review): how this value was obtained is not shown in this notebook.
ATE_true = 0.106286795757474

# Estimator without the control variate.
plain_estimates = np.array(ATE_hat_samples)
ATE_var = np.var(plain_estimates, ddof=1)
print(">>> Variance of ATE estimator:", ATE_var)
ATE_bias = np.mean(plain_estimates) - ATE_true
print(">>> Bias of ATE estimator:", ATE_bias)

# Estimator with the control variate.
cv_estimates = np.array(ATE_hat_CV_samples)
ATE_CV_var = np.var(cv_estimates, ddof=1)
print(">>> Variance of ATE estimator with CV:", ATE_CV_var)
ATE_CV_bias = np.mean(cv_estimates) - ATE_true
print(">>> Bias of ATE estimator with CV:", ATE_CV_bias)

>>> Variance of ATE estimator: 0.0010743029
>>> Bias of ATE estimator: -0.010947475157441197
>>> Variance of ATE estimator with CV: 0.0010674468592391608
>>> Bias of ATE estimator with CV: -0.011286787382429164
