# Case study 2: spam email detection from ACIC (kernel)
This notebook contains the experiments for Case Study 2 of the paper *Multi-Source Causal Inference Using Control Variates*. Specifically, it contains the experiments that use kernel smoothing to estimate the odds ratios.

This notebook includes experiments with $n_2 = 3{,}000$ samples for the dataset without selection bias.

In [1]:
import numpy as np
import pandas as pd

import data_sampler
import bootstrap

In [2]:
# Dataset without selection bias, passed as `df_obs` to the bootstrap below.
# A comma is already pandas' default field separator.
df_orig = pd.read_csv('spam_binMod11.csv')

In [8]:
# Keep only the first 3,000 rows of the dataset.
df_orig = df_orig.head(3000)

In [10]:
# Column names handed to the data_sampler classes.
# NOTE(review): roles are inferred from the variable names — confirm against data_sampler.
Y_COLUMN = 'Y'  # presumably the outcome column
Z_COLUMN = 'A'  # presumably the treatment column
X_COLUMNS = ['V{}'.format(idx) for idx in range(1, 23)]  # 'V1' ... 'V22'

# Load selection-biased data

In [12]:
# Larger pool of rows from which the selection-biased sample is drawn.
# A comma is already pandas' default field separator.
df_large = pd.read_csv('spam_binMod1_large1.csv')

In [13]:
# Pass the large dataset through the sampler's selection-bias filter, then draw
# (without replacement) three times as many rows as df_orig contains.
# NOTE(review): no random_state is passed to .sample(), so the draw is not
# reproducible unless NumPy's global seed is set elsewhere.
data_sampler_selection_bias = data_sampler.DataSampler(Z_COLUMN, X_COLUMNS, Y_COLUMN)
df_selection_bias = (
    data_sampler_selection_bias
    .selection_bias_filter(df_large)
    .sample(n=3 * len(df_orig), replace=False)
)

In [14]:
# Display summary statistics of the selection-biased sample as a sanity check
# on its row count and column ranges.
df_selection_bias.describe()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22
count,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,...,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0
mean,0.803556,0.184333,0.182077,0.050508,0.034222,0.054563,0.047512,0.202491,0.033011,0.618948,...,0.081628,0.121258,1.175406,0.032417,0.442911,0.040812,0.033944,0.029393,1.336863,0.740187
std,0.397331,0.387777,0.501766,0.178726,0.22098,0.262489,0.197918,0.613983,0.154427,1.025605,...,0.312594,0.426679,1.575767,0.273282,0.904204,0.606628,0.209252,0.198492,2.488343,0.547425
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393393
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.49,0.0,0.0,0.0,0.0,0.0,0.0,0.687129
75%,1.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.88,...,0.0,0.0,1.88,0.0,0.62,0.0,0.0,0.0,1.83,1.004668
max,1.0,1.0,10.0,3.57,7.27,11.11,3.23,11.11,2.61,9.67,...,7.14,6.66,12.5,5.33,11.11,17.1,5.45,6.66,20.83,6.50279


# Compute ATE estimates with and without control variate

In [None]:
def ATE_estimator_fn_interaction(df_input):
    """Return the ATE estimate of an interaction-logistic sampler fitted on ``df_input``.

    A new ``DataSamplerInteractionLogistic`` is constructed on every call, its
    outcome model is fitted on ``df_input``, and the ATE estimate is then
    computed on that same frame.

    Args:
        df_input: DataFrame passed to both ``fit_outcome`` and ``get_ATE_estimate``.

    Returns:
        Whatever ``get_ATE_estimate`` returns for ``df_input``.
    """
    sampler = data_sampler.DataSamplerInteractionLogistic(Z_COLUMN, X_COLUMNS, Y_COLUMN)
    sampler.fit_outcome(df_input)
    ate_hat = sampler.get_ATE_estimate(df_input)
    return ate_hat

# Sample a few strata: 50 rows of df_orig (drawn without replacement), restricted
# to the covariate columns. These are the x_inputs used for the odds-ratio estimates.
OR_xs = df_orig.sample(n=50, replace=False).loc[:, X_COLUMNS]

def CV_estimator_kernel(df_input_obs, df_input_bias, bandwidth=10, n_OR_samples=20):
    """Control variate: mean kernel odds-ratio estimate on ``df_input_obs`` minus that on ``df_input_bias``.

    Both means are taken over the conditional odds-ratio estimates evaluated at
    the notebook-level ``OR_xs`` rows, using the same sampler instance and the
    same ``bandwidth``.

    Args:
        df_input_obs: DataFrame used for the first (observational) odds-ratio estimate.
        df_input_bias: DataFrame used for the second (selection-bias) odds-ratio estimate.
        bandwidth: Forwarded to ``get_conditional_OR_estimates_kernel``.
        n_OR_samples: Accepted but not used by this function; the evaluation
            points come from the global ``OR_xs`` instead.

    Returns:
        The difference of the two mean odds-ratio estimates (obs minus bias).
    """
    sampler = data_sampler.DataSamplerInteractionLogistic(Z_COLUMN, X_COLUMNS, Y_COLUMN)

    def mean_kernel_OR(frame):
        # Average the conditional OR estimates over all rows of OR_xs.
        estimates = sampler.get_conditional_OR_estimates_kernel(
            input_df=frame,
            x_inputs=OR_xs,
            bandwidth=bandwidth,
        )
        return np.mean(estimates)

    # Observational estimate is evaluated first, then the selection-bias one.
    return mean_kernel_OR(df_input_obs) - mean_kernel_OR(df_input_bias)

# First bootstrap pass: collect the control-variate and ATE replicates.
# No CV coefficient is supplied here, and the third return value is discarded.
CV_samples, ATE_hat_samples, _ = bootstrap.run_bootstrap_df(
    df_obs=df_orig,
    df_bias=df_selection_bias,
    ATE_estimator_fn=ATE_estimator_fn_interaction,
    CV_estimator_fn=CV_estimator_kernel,
    n_replicates=300,
)

In [44]:
# 2x2 sample covariance matrix of the bootstrap replicates:
# index 0 is the ATE estimate, index 1 is the control variate.
sample_cov = np.cov(np.array([ATE_hat_samples, CV_samples]), ddof=1)

# Get optimal control variates coefficient: Cov(ATE_hat, CV) / Var(CV).
cov_ATE_CV = sample_cov[0, 1]
var_CV = sample_cov[1, 1]
optimal_CV_coeff = cov_ATE_CV / var_CV
print("optimal CV coefficient:", optimal_CV_coeff)

optimal CV coefficient: 0.11430391166474199


In [None]:
# Get variance/bias of ATE estimators with and without CV.
# Second bootstrap pass: the optimal coefficient is now supplied, and the third
# return value is kept as ATE_hat_CV_samples.
CV_samples, ATE_hat_samples, ATE_hat_CV_samples = bootstrap.run_bootstrap_df(
    df_obs=df_orig,
    df_bias=df_selection_bias,
    ATE_estimator_fn=ATE_estimator_fn_interaction,
    CV_estimator_fn=CV_estimator_kernel,
    optimal_CV_coeff=optimal_CV_coeff,
    n_replicates=300,  # Try increasing this
)

In [49]:
# Reference ATE value against which the bootstrap estimates are compared.
ATE_true = 0.106286795757474

# Sample variance (ddof=1) of the plain ATE replicates.
ATE_var = np.var(np.array(ATE_hat_samples), ddof=1)
print(">>> Variance of ATE estimator:", ATE_var)

# Bias is measured against ATE_true. The name `ATE_estimate` used here before
# is never defined in this notebook, so the cell failed on a fresh kernel.
ATE_bias = np.mean(np.array(ATE_hat_samples)) - ATE_true
print(">>> Bias of ATE estimator:", ATE_bias)

# Same two statistics for the control-variate-adjusted replicates.
ATE_CV_var = np.var(np.array(ATE_hat_CV_samples), ddof=1)
print(">>> Variance of ATE estimator with CV:", ATE_CV_var)

ATE_CV_bias = np.mean(np.array(ATE_hat_CV_samples)) - ATE_true
print(">>> Bias of ATE estimator with CV:", ATE_CV_bias)

>>> Variance of ATE estimator: 0.0004309190652441266
>>> Bias of ATE estimator: -0.00424969694808594
>>> Variance of ATE estimator with CV: 0.0003394285826792128
>>> Bias of ATE estimator with CV: -0.006390191942491713
