<table border="0">
    <tr>
        <td>
            <img src="https://ictd2016.files.wordpress.com/2016/04/microsoft-research-logo-copy.jpg" style="width 30px;" />
             </td>
        <td>
            <img src="https://www.microsoft.com/en-us/research/wp-content/uploads/2016/12/MSR-ALICE-HeaderGraphic-1920x720_1-800x550.jpg" style="width 100px;"/></td>
        </tr>
</table>

# Model Selection for Causal Effect Models with the RScorer

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import econml

In [None]:
## Ignore warnings
import warnings
warnings.filterwarnings('ignore') 

In [None]:
# Main imports
from econml.dml import DML, LinearDML, SparseLinearDML, NonParamDML
from econml.metalearners import XLearner, TLearner, SLearner, DomainAdaptationLearner
from econml.drlearner import DRLearner

import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

## 2. Example Usage with Single Binary Treatment Synthetic Data

### 2.1. DGP 
We use the following DGP:

\begin{align}
T \sim & \text{Bernoulli}\left(f(W)\right), &\; f(W)=\sigma(\langle W, \beta\rangle + \eta), \;\eta \sim \text{Uniform}(-1, 1)\\
Y = & T\cdot \theta(X) + \langle W, \gamma\rangle + \epsilon, & \; \epsilon \sim \text{Uniform}(-1, 1)\\
W \sim & \text{Normal}(0,\, I_{n_w}) & \\
X \sim & \text{Uniform}(0,\, 1)^{n_x}
\end{align}

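where $W$ is a matrix of high-dimensional confounders, $\beta, \gamma$ have high sparsity and $\sigma$ is the sigmoid function. Note that the code below does not draw a separate $W$: a sparse subset of the columns of $X$ plays the role of the confounders, entering both the treatment and the outcome equations.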

For this DGP, the treatment effect is a step function of the first feature:
\begin{align}
\theta(x) = \mathbb{1}\{x_1 > 0.5\}.
\end{align}

In [None]:
# Treatment effect function
def exp_te(x):
    """Return the true treatment effect for every row of ``x``.

    Despite its name, the effect used by this notebook is a step function of
    the first feature: 1.0 where ``x[:, 0] > 0.5`` and 0.0 elsewhere. The
    function used to be defined twice in this cell, with the step version
    silently shadowing an exponential one; only the version that actually ran
    is kept, so results are unchanged.

    Alternative effect surfaces that were tried here:
        np.exp(2 * x[:, 0])
        4 * x[:, 0]
        (1 + x[:, 0]**2) * (x[:, 0] > .5)

    Parameters
    ----------
    x : 2-D numpy array; only column 0 is read.

    Returns
    -------
    1-D float array with one entry per row of ``x``.
    """
    # Cast to float so downstream arithmetic and plotting never operate on a
    # boolean array (bool - bool subtraction, for example, is a numpy error).
    return (x[:, 0] > 0.5).astype(float)

# DGP constants
np.random.seed(123)
n = 5000
support_size = 5
n_x = 10

# Outcome model: sparse support over the columns of X, its coefficients and noise
support_Y = np.random.choice(range(n_x), size=support_size, replace=False)
coefs_Y = np.random.uniform(0, 1, size=support_size)


def epsilon_sample(n):
    """Draw ``n`` outcome-noise values from Uniform(-1, 1)."""
    return np.random.uniform(-1, 1, size=n)


# Treatment model: reuses the outcome support, with its own coefficients and noise
support_T = support_Y
coefs_T = np.random.uniform(0, 1, size=support_size)


def eta_sample(n):
    """Draw ``n`` treatment-noise values from Uniform(-1, 1)."""
    return np.random.uniform(-1, 1, size=n)


# Covariates and their true heterogeneous treatment effect
X = np.random.uniform(0, 1, size=(n, n_x))
TE = exp_te(X)

# Binary treatment: one Bernoulli draw per row, with a sigmoid propensity
log_odds = X[:, support_T].dot(coefs_T) + eta_sample(n)
T_sigmoid = 1 / (1 + np.exp(-log_odds))
T = np.array([np.random.binomial(1, prob) for prob in T_sigmoid])

# Outcome: treatment effect on the treated + confounding term + noise
Y = TE * T + X[:, support_Y].dot(coefs_Y) + epsilon_sample(n)

# Test grid: random covariates, with the first feature swept evenly over [0, 1]
X_test = np.random.uniform(0, 1, size=(n, n_x))
X_test[:, 0] = np.linspace(0, 1, n)
expected_te_test = exp_te(X_test)

### 2.2. Train Estimator

In [None]:
def reg():
    """Return a fresh, unfitted random-forest regressor for use as a nuisance/final model.

    ``random_state`` is fixed so repeated runs give the same fits. The global
    ``np.random.seed`` call cannot guarantee that once estimators are fitted
    inside joblib worker processes. The value matches the factories used in
    the semi-synthetic section of this notebook.
    """
    return RandomForestRegressor(min_samples_leaf=10, random_state=123)


def clf():
    """Return a fresh, unfitted random-forest classifier (used for treatment/propensity models).

    Seeded for the same reproducibility reason as ``reg``.
    """
    return RandomForestClassifier(min_samples_leaf=10, random_state=123)

In [None]:
# Hold out 40% of the samples for scoring. The split is seeded so that
# re-running this cell (or running it out of order) always yields the same
# train/validation partition, as the semi-synthetic section already does.
X_train, X_val, T_train, T_val, Y_train, Y_val = train_test_split(
    X, T, Y, test_size=.4, random_state=123)

In [None]:
# Candidate CATE estimators as (name, estimator) pairs. Every nuisance/final
# model is a fresh instance produced by the reg()/clf() factories.
models = [
    (
        'ldml',
        LinearDML(
            model_y=reg(),
            model_t=clf(),
            discrete_treatment=True,
            linear_first_stages=False,
            n_splits=3,
        ),
    ),
#   ('sldml', SparseLinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True,
#                             featurizer=PolynomialFeatures(degree=2, include_bias=False),
#                             linear_first_stages=False, n_splits=3)),
    (
        'xlearner',
        XLearner(models=reg(), cate_models=reg(), propensity_model=clf()),
    ),
    (
        'dalearner',
        DomainAdaptationLearner(models=reg(), final_models=reg(), propensity_model=clf()),
    ),
    (
        'slearner',
        SLearner(overall_model=reg()),
    ),
    (
        'tlearner',
        TLearner(models=reg()),
    ),
    (
        'drlearner',
        DRLearner(
            model_propensity=clf(),
            model_regression=reg(),
            model_final=reg(),
            n_splits=3,
        ),
    ),
    (
        'rlearner',
        NonParamDML(
            model_y=reg(),
            model_t=clf(),
            model_final=reg(),
            discrete_treatment=True,
            n_splits=3,
        ),
    ),
    (
        'dml3dlasso',
        DML(
            model_y=reg(),
            model_t=clf(),
            model_final=LassoCV(),
            discrete_treatment=True,
            featurizer=PolynomialFeatures(degree=3),
            linear_first_stages=False,
            n_splits=3,
        ),
    ),
]

In [None]:
from joblib import Parallel, delayed

def fit_model(name, model):
    """Fit ``model`` on the training split and return it paired with ``name``.

    Reads the notebook-level ``Y_train``, ``T_train`` and ``X_train``. The
    second element of the returned tuple is whatever ``model.fit`` returns.
    """
    fitted = model.fit(Y_train, T_train, X=X_train)
    return name, fitted


# Dispatch one fit per candidate through joblib and rebind ``models`` to the
# resulting list of (name, fitted estimator) pairs.
models = Parallel(n_jobs=-1, verbose=1)(
    delayed(fit_model)(label, estimator) for label, estimator in models
)

In [None]:
from econml.score import RScorer

# Configure the scorer with its own nuisance models, then fit it on the
# held-out validation split (never on the data the candidates were trained on).
scorer = RScorer(
    model_y=reg(),
    model_t=clf(),
    discrete_treatment=True,
    n_splits=3,
    mc_iters=2,
)
scorer.fit(Y_val, T_val, X=X_val)

In [None]:
# One score per fitted candidate, in the same order as ``models``
rscore = [scorer.score(fitted) for _, fitted in models]

In [None]:
# True treatment effect on the validation covariates; serves as the ground
# truth when computing Root-PEHE for each model below.
expected_te_val = exp_te(X_val)

In [None]:
# Root-PEHE per candidate: RMSE between the true and the estimated effect on
# the validation covariates, in the same order as ``models``.
rootpehe = [
    np.sqrt(np.mean((expected_te_val.flatten() - fitted.effect(X_val).flatten()) ** 2))
    for _, fitted in models
]

In [None]:
# One point per candidate model: Root-PEHE on the x-axis against its score
plt.scatter(x=rootpehe, y=rscore)
plt.ylabel('rscore')
plt.xlabel('rpehe')
plt.show()

### 2.3. Performance Visualization

In [None]:
# One panel per candidate, three panels per row: estimated effect along the
# first feature of the test grid, overlaid with the true effect.
plt.figure(figsize=(16, 16))
rows = int(np.ceil(len(models) / 3))
for it, (name, mdl) in enumerate(models):
    plt.subplot(rows, 3, it + 1)
    plt.title('{}. RScore: {:.3f}, Root-PEHE: {:.3f}'.format(name, rscore[it], rootpehe[it]))
    # Label the curve with the model's name rather than its list index so the
    # legend is readable on its own.
    plt.plot(X_test[:, 0], mdl.effect(X_test), label=name)
    plt.plot(X_test[:, 0], expected_te_test, 'b--', label='True effect')
    # Axis labels and legend are set inside the loop so every panel gets them;
    # pyplot calls only affect the current (most recently created) subplot.
    plt.ylabel('Treatment Effect')
    plt.xlabel('x')
    plt.legend()
plt.show()

# Getting the Best Model

In [None]:
# Ask the scorer to pick among the fitted candidates; ``mdl`` and ``score``
# are rebound to the selection and its score.
mdl, score = scorer.best_model([est for _, est in models])

# Root-PEHE of the selected model on the validation split
rootpehe_best = np.sqrt(
    np.mean((expected_te_val.flatten() - mdl.effect(X_val).flatten()) ** 2)
)

# Estimated vs. true effect along the first feature of the test grid
plt.figure()
plt.plot(X_test[:, 0], mdl.effect(X_test), label='best')
plt.plot(X_test[:, 0], expected_te_test, 'b--', label='True effect')
plt.title('RScore: {:.3f}, Root-PEHE: {:.3f}'.format(score, rootpehe_best))
plt.xlabel('x')
plt.ylabel('Treatment Effect')
plt.legend()
plt.show()

# Getting an Ensemble based on Scores

In [None]:
# Build an ensemble of the fitted candidates based on their scores; ``mdl``
# and ``score`` are rebound to the ensemble and its score.
mdl, score  = scorer.ensemble([mdl for _, mdl in models])

# Root-PEHE of the ensemble on the validation split
rootpehe_ensemble = np.sqrt(np.mean((expected_te_val.flatten() - mdl.effect(X_val).flatten())**2))

plt.figure()
plt.title('RScore: {:.3f}, Root-PEHE: {:.3f}'.format(score, rootpehe_ensemble))
# The curve is labelled explicitly. The previous label was built from ``it``,
# a loop variable leaked from an earlier plotting cell, which produced a
# meaningless legend entry (or a NameError if that cell had not been run).
plt.plot(X_test[:, 0], mdl.effect(X_test), label='ensemble')
plt.plot(X_test[:, 0], expected_te_test, 'b--', label='True effect')
plt.ylabel('Treatment Effect')
plt.xlabel('x')
plt.legend()
plt.show()

# Semi-Synthetic Data

In [None]:
def reg():
    """Return a fresh, unfitted random-forest regressor with a fixed seed."""
    return RandomForestRegressor(min_samples_leaf=10, random_state=123)


def clf():
    """Return a fresh, unfitted random-forest classifier with a fixed seed."""
    return RandomForestClassifier(min_samples_leaf=10, random_state=123)

In [None]:
from econml.data.dgps import ihdp_surface_B, ihdp_surface_A
# Generate the semi-synthetic data with a fixed seed. The four return values
# are unpacked as Y, T, X and expected_te; expected_te is used below as the
# ground-truth effect (presumably one value per sample — confirm against
# econml.data.dgps).
Y, T, X, expected_te = ihdp_surface_B(random_state=123)

In [None]:
# Seeded 70/30 split; the true effects are split alongside the data so the
# validation ground truth stays aligned with X_val.
(X_train, X_val,
 T_train, T_val,
 Y_train, Y_val,
 expected_te_train, expected_te_val) = train_test_split(
    X, T, Y, expected_te, test_size=.3, random_state=123)

In [None]:
# Candidate CATE estimators as (name, estimator) pairs. Every nuisance/final
# model is a fresh instance produced by the reg()/clf() factories.
models = [
    (
        'ldml',
        LinearDML(
            model_y=reg(),
            model_t=clf(),
            discrete_treatment=True,
            linear_first_stages=False,
            n_splits=3,
        ),
    ),
#   ('sldml', SparseLinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True,
#                             featurizer=PolynomialFeatures(degree=2, include_bias=False),
#                             linear_first_stages=False, n_splits=3)),
    (
        'xlearner',
        XLearner(models=reg(), cate_models=reg(), propensity_model=clf()),
    ),
    (
        'dalearner',
        DomainAdaptationLearner(models=reg(), final_models=reg(), propensity_model=clf()),
    ),
    (
        'slearner',
        SLearner(overall_model=reg()),
    ),
    (
        'tlearner',
        TLearner(models=reg()),
    ),
    (
        'drlearner',
        DRLearner(
            model_propensity=clf(),
            model_regression=reg(),
            model_final=reg(),
            n_splits=3,
        ),
    ),
    (
        'rlearner',
        NonParamDML(
            model_y=reg(),
            model_t=clf(),
            model_final=reg(),
            discrete_treatment=True,
            n_splits=3,
        ),
    ),
    (
        'dml3dlasso',
        DML(
            model_y=reg(),
            model_t=clf(),
            model_final=LassoCV(),
            discrete_treatment=True,
            featurizer=PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
            linear_first_stages=False,
            n_splits=3,
        ),
    ),
]

In [None]:
from joblib import Parallel, delayed

def fit_model(name, model):
    """Fit ``model`` on the training split and return it paired with ``name``.

    Reads the notebook-level ``Y_train``, ``T_train`` and ``X_train``. The
    second element of the returned tuple is whatever ``model.fit`` returns.
    """
    fitted = model.fit(Y_train, T_train, X=X_train)
    return name, fitted


# Dispatch one fit per candidate through joblib and rebind ``models`` to the
# resulting list of (name, fitted estimator) pairs.
models = Parallel(n_jobs=-1, verbose=1)(
    delayed(fit_model)(label, estimator) for label, estimator in models
)

In [None]:
from econml.score import RScorer

# Configure the scorer with its own nuisance models, then fit it on the
# held-out validation split.
scorer = RScorer(
    model_y=reg(),
    model_t=clf(),
    discrete_treatment=True,
    n_splits=3,
)
scorer.fit(Y_val, T_val, X=X_val)

In [None]:
# One score per fitted candidate, in the same order as ``models``
rscore = [scorer.score(fitted) for _, fitted in models]

In [None]:
# Root-PEHE per candidate: RMSE between the true and the estimated effect.
# NOTE(review): this is evaluated on the full X / expected_te, whereas the
# best-model and ensemble cells below use X_val / expected_te_val — confirm
# the difference is intentional.
rootpehe = [
    np.sqrt(np.mean((expected_te.flatten() - fitted.effect(X).flatten()) ** 2))
    for _, fitted in models
]

In [None]:
# One point per candidate model: Root-PEHE on the x-axis against its score
plt.scatter(x=rootpehe, y=rscore)
plt.ylabel('rscore')
plt.xlabel('rpehe')
plt.show()

In [None]:
# Ask the scorer to pick among the fitted candidates, then report the
# selection's Root-PEHE on the validation split as the cell output.
mdl, score = scorer.best_model([est for _, est in models])
rootpehe_best = np.sqrt(
    np.mean((expected_te_val.flatten() - mdl.effect(X_val).flatten()) ** 2)
)
rootpehe_best

In [None]:
# Build the score-based ensemble of the fitted candidates, then report its
# Root-PEHE on the validation split as the cell output.
mdl, score = scorer.ensemble([est for _, est in models])
rootpehe_ensemble = np.sqrt(
    np.mean((expected_te_val.flatten() - mdl.effect(X_val).flatten()) ** 2)
)
rootpehe_ensemble

In [None]:
# Visualization of bias distribution: one violin of absolute errors
# |estimated effect - true effect| per candidate, plus a final violin for the
# model currently bound to ``mdl``.
plt.figure(figsize=(15, 5))
plt.violinplot([np.abs(est.effect(X).flatten() - expected_te) for _, est in models] +
               [np.abs(mdl.effect(X).flatten() - expected_te)], showmeans=True)
plt.ylabel("Bias distribution")
# In notebook order ``mdl`` was last assigned by ``scorer.ensemble(...)``, so
# the extra violin is the ensemble. It was previously mislabelled 'best'.
plt.xticks(np.arange(1, len(models) + 2), [name for name, _ in models] + ['ensemble'])
plt.show()