In [1]:
# experiment on synthetic data 

In [2]:
# Scenarios to cover:
# - no treatment effect
# - size of treatment << size of control
# - size of treatment ≈ size of control
# - different distributions for treatment / control
# - same distribution for treatment / control

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from scipy.stats import norm, sem
from scipy.interpolate import UnivariateSpline
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.stats import pearsonr
from numpy.random import default_rng
import random
pd.set_option('display.max_columns', 100)
from resample import * 
from metalearner import * 

#### Extension: `get_data_with_same_distribution` always generates a one-dimensional feature x ∈ (0, 1). For w = 0, y = x + 1; for w = 1, y = x + 2, so the ITE and the ATE are both 1. In this first experiment (ratio = 0.5), the number of treated samples equals the number of control samples.

In [4]:
# Balanced scenario: one million synthetic rows, generated with ratio=0.5.
syn_data_class = resample_from_synthetic_data(n_sample=1_000_000)
d = syn_data_class.get_data_with_same_distribution(ratio=0.5)

In [5]:
# Fit an S-learner (LinearRegression base learner) on the sample `d`.
# The single feature column is turned into an (n, 1) matrix first.
X_2d = np.array(d['X']).reshape(-1, 1)
s_learner = Slearner(baselearner=LinearRegression(), is_regressor=True)
s_learner.fit(
    X=X_2d,
    treatment=np.array(d['W']),
    y=np.array(d['Y']),
)

In [6]:
# Query the fitted S-learner on the same rows it was trained on.
X_2d = np.array(d['X']).reshape(-1, 1)
ite, yhat_ts, yhat_cs, rmse = s_learner.get_ite(
    X=X_2d,
    treatment=d['W'],
    y=d['Y'],
)

In [7]:
# The ATE estimate is the mean of the values returned in `ite`.
ate_s_learner = np.mean(ite)
print('ATE for S learner: ', ate_s_learner)

ATE for S learner:  0.9999999999999966


In [8]:
# Fit a T-learner built from two LinearRegression instances on the sample `d`.
X_2d = np.array(d['X']).reshape(-1, 1)
t_learner = Tlearner(LinearRegression(), LinearRegression(), is_regressor=True)
t_learner.fit(
    X=X_2d,
    treatment=np.array(d['W']),
    y=np.array(d['Y']),
)

In [9]:
# Query the fitted T-learner on the same rows it was trained on.
X_2d = np.array(d['X']).reshape(-1, 1)
ite, yhat_ts, yhat_cs, rmse = t_learner.get_ite(
    X=X_2d,
    treatment=d['W'],
    y=d['Y'],
)

In [10]:
# The ATE estimate is the mean of the values returned in `ite`.
ate_t_learner = np.mean(ite)
print('ATE for T learner: ', ate_t_learner)

ATE for T learner:  1.0000000000000002


In [11]:
# Fit an X-learner: LinearRegression as the positional learner argument and
# LogisticRegression as the propensity model.
X_2d = np.array(d['X']).reshape(-1, 1)
x_learner = Xlearner(
    LinearRegression(),
    propensity_model=LogisticRegression(),
    is_regressor=True,
)
x_learner.fit(
    X=X_2d,
    treatment=np.array(d['W']),
    y=np.array(d['Y']),
)

In [12]:
# Query the fitted X-learner on the same rows it was trained on.
X_2d = np.array(d['X']).reshape(-1, 1)
ite, yhat_ts, yhat_cs, rmse = x_learner.get_ite(
    X=X_2d,
    treatment=d['W'],
    y=d['Y'],
)

In [13]:
# The ATE estimate is the mean of the values returned in `ite`.
ate_x_learner = np.mean(ite)
print('ATE for X learner: ', ate_x_learner)

ATE for X learner:  1.0


From the results above we can conclude that when the control and treatment groups come from the same simple distribution
and the two groups are the same size, the three metalearners perform equally well.

#### Paper claims:  X-learner performs particularly well when the treatment group sizes are very unbalanced

In [35]:
# Unbalanced scenario: one million synthetic rows, generated with ratio=0.01.
syn_data_class = resample_from_synthetic_data(n_sample=1_000_000)
d = syn_data_class.get_data_with_same_distribution(ratio=0.01)

In [36]:
# Second draw with the same settings, used as the evaluation set.
d_test = syn_data_class.get_data_with_same_distribution(ratio=0.01)

In [37]:
# Display only the treated rows (W == 1) of the training draw.
is_treated = d['W'].eq(1)
d.loc[is_treated]

Unnamed: 0,W,Y0,Y1,X,Y
990000,1,1.139203,2.139203,0.139203,2.139203
990001,1,1.540330,2.540330,0.540330,2.540330
990002,1,1.360063,2.360063,0.360063,2.360063
990003,1,1.323327,2.323327,0.323327,2.323327
990004,1,1.831549,2.831549,0.831549,2.831549
...,...,...,...,...,...
999995,1,1.812929,2.812929,0.812929,2.812929
999996,1,1.491826,2.491826,0.491826,2.491826
999997,1,1.701604,2.701604,0.701604,2.701604
999998,1,1.589628,2.589628,0.589628,2.589628


In [38]:
# S-learner on the unbalanced data (ratio=0.01): fit on the training draw `d`,
# then estimate treatment effects on the second draw `d_test`.
s_learner = Slearner(baselearner=LinearRegression(), is_regressor=True)
s_learner.fit(X=np.array(d['X']).reshape(-1, 1),
              treatment=np.array(d['W']),
              y=np.array(d['Y']))

# Features, treatment flags and outcomes must all come from the same frame.
# The outcome is taken from d_test (not the training frame d), matching the
# T- and X-learner cells.
# NOTE(review): `y` presumably only feeds the returned `rmse` - confirm in metalearner.
ite, yhat_ts, yhat_cs, rmse = s_learner.get_ite(X=np.array(d_test['X']).reshape(-1, 1),
                                                treatment=d_test['W'],
                                                y=d_test['Y'])
print('ATE for S learner: ', np.mean(ite))

ATE for S learner:  1.0


In [39]:
# T-learner on the unbalanced data: fit on `d`, evaluate on `d_test`.
X_train_2d = np.array(d['X']).reshape(len(d['X']), 1)
X_test_2d = np.array(d_test['X']).reshape(len(d_test['X']), 1)

t_learner = Tlearner(LinearRegression(), LinearRegression(), is_regressor=True)
t_learner.fit(X=X_train_2d, treatment=np.array(d['W']), y=np.array(d['Y']))

ite, yhat_ts, yhat_cs, rmse = t_learner.get_ite(
    X=X_test_2d,
    treatment=d_test['W'],
    y=d_test['Y'],
)

ate_t_learner = np.mean(ite)
print('ATE for T learner: ', ate_t_learner)

ATE for T learner:  1.0


In [40]:
# X-learner on the unbalanced data: fit on `d`, evaluate on `d_test`.
X_train_2d = np.array(d['X']).reshape(len(d['X']), 1)
X_test_2d = np.array(d_test['X']).reshape(len(d_test['X']), 1)

x_learner = Xlearner(
    LinearRegression(),
    propensity_model=LogisticRegression(),
    is_regressor=True,
)
x_learner.fit(X=X_train_2d, treatment=np.array(d['W']), y=np.array(d['Y']))

ite, yhat_ts, yhat_cs, rmse = x_learner.get_ite(
    X=X_test_2d,
    treatment=d_test['W'],
    y=d_test['Y'],
)

ate_x_learner = np.mean(ite)
print('ATE for X learner: ', ate_x_learner)

ATE for X learner:  1.0000000000000002


All three metalearners perform well 

#### Paper claims: X-learner performs well in two extreme cases: (1) the CATE function is very complex, and (2) the treatment effect is 0

#####  complex function w = 1 

In [20]:
# Complex CATE scenario (the w = 1 outcome function is the complicated one):
# 1,000 rows per draw, generated with ratio=0.1.
syn_data_class = resample_from_synthetic_data(n_sample=1_000)

# Two consecutive draws, in this order: training frame first, then test frame.
d, d_test = (
    syn_data_class.get_data_with_diff_distribution(ratio=0.1)
    for _ in range(2)
)

In [21]:
# Ground truth for this scenario.
# For every x, true ite is 3 + np.abs(x*x*x - x) - (x+1) = 2 - x + np.abs(x*x*x - x)
# The formula is evaluated on the whole test column at once (vectorised NumPy
# arithmetic instead of a Python-level loop); the operation order is unchanged.
x_test = np.asarray(d_test['X'])
true_ite = 2 - x_test + np.abs(x_test * x_test * x_test - x_test)

# The true ATE is the mean of the true ITEs over the test rows.
true_ate = np.mean(true_ite)
print('True ate for this data is: ', true_ate)

True ate for this data is:  1.7322758574140962


In [22]:
# S-learner on the complex-CATE data: fit on the training draw `d`,
# then estimate treatment effects on the second draw `d_test`.
s_learner = Slearner(baselearner=LinearRegression(), is_regressor=True)
s_learner.fit(X=np.array(d['X']).reshape(-1, 1),
              treatment=np.array(d['W']),
              y=np.array(d['Y']))

# Features, treatment flags and outcomes must all come from the same frame.
# The outcome is taken from d_test (not the training frame d), matching the
# T- and X-learner cells.
# NOTE(review): `y` presumably only feeds the returned `rmse` - confirm in metalearner.
ite, yhat_ts, yhat_cs, rmse = s_learner.get_ite(X=np.array(d_test['X']).reshape(-1, 1),
                                                treatment=d_test['W'],
                                                y=d_test['Y'])
print('ATE for S learner: ', np.mean(ite))

ATE for S learner:  1.7353553289558623


In [23]:
# T-learner on the complex-CATE data: fit on `d`, evaluate on `d_test`.
X_train_2d = np.array(d['X']).reshape(len(d['X']), 1)
X_test_2d = np.array(d_test['X']).reshape(len(d_test['X']), 1)

t_learner = Tlearner(LinearRegression(), LinearRegression(), is_regressor=True)
t_learner.fit(X=X_train_2d, treatment=np.array(d['W']), y=np.array(d['Y']))

ite, yhat_ts, yhat_cs, rmse = t_learner.get_ite(
    X=X_test_2d,
    treatment=d_test['W'],
    y=d_test['Y'],
)

ate_t_learner = np.mean(ite)
print('ATE for T learner: ', ate_t_learner)

ATE for T learner:  1.715321689046918


In [24]:
# X-learner on the complex-CATE data: fit on `d`, evaluate on `d_test`.
X_train_2d = np.array(d['X']).reshape(len(d['X']), 1)
X_test_2d = np.array(d_test['X']).reshape(len(d_test['X']), 1)

x_learner = Xlearner(
    LinearRegression(),
    propensity_model=LogisticRegression(),
    is_regressor=True,
)
x_learner.fit(X=X_train_2d, treatment=np.array(d['W']), y=np.array(d['Y']))

ite, yhat_ts, yhat_cs, rmse = x_learner.get_ite(
    X=X_test_2d,
    treatment=d_test['W'],
    y=d_test['Y'],
)

ate_x_learner = np.mean(ite)
print('ATE for X learner: ', ate_x_learner)

ATE for X learner:  1.7153216890469185


##### treatment effect = 0 

In [30]:
# Zero-treatment-effect scenario: 1,000 rows per draw, generated with ratio=0.5.
syn_data_class = resample_from_synthetic_data(n_sample=1_000)

# Two consecutive draws, in this order: training frame first, then test frame.
d, d_test = (
    syn_data_class.get_data_with_zero_treatment_effect(ratio=0.5)
    for _ in range(2)
)

In [31]:
# S-learner on the zero-treatment-effect data: fit on the training draw `d`,
# then estimate treatment effects on the second draw `d_test`.
s_learner = Slearner(baselearner=LinearRegression(), is_regressor=True)
s_learner.fit(X=np.array(d['X']).reshape(-1, 1),
              treatment=np.array(d['W']),
              y=np.array(d['Y']))

# Features, treatment flags and outcomes must all come from the same frame.
# The outcome is taken from d_test (not the training frame d), matching the
# T- and X-learner cells.
# NOTE(review): `y` presumably only feeds the returned `rmse` - confirm in metalearner.
ite, yhat_ts, yhat_cs, rmse = s_learner.get_ite(X=np.array(d_test['X']).reshape(-1, 1),
                                                treatment=d_test['W'],
                                                y=d_test['Y'])
print('ATE for S learner: ', np.mean(ite))

ATE for S learner:  -8.881784197001253e-19


In [32]:
# T-learner on the zero-treatment-effect data: fit on `d`, evaluate on `d_test`.
X_train_2d = np.array(d['X']).reshape(len(d['X']), 1)
X_test_2d = np.array(d_test['X']).reshape(len(d_test['X']), 1)

t_learner = Tlearner(LinearRegression(), LinearRegression(), is_regressor=True)
t_learner.fit(X=X_train_2d, treatment=np.array(d['W']), y=np.array(d['Y']))

ite, yhat_ts, yhat_cs, rmse = t_learner.get_ite(
    X=X_test_2d,
    treatment=d_test['W'],
    y=d_test['Y'],
)

ate_t_learner = np.mean(ite)
print('ATE for T learner: ', ate_t_learner)

ATE for T learner:  -8.526512829121202e-17


In [33]:
# X-learner on the zero-treatment-effect data: fit on `d`, evaluate on `d_test`.
X_train_2d = np.array(d['X']).reshape(len(d['X']), 1)
X_test_2d = np.array(d_test['X']).reshape(len(d_test['X']), 1)

x_learner = Xlearner(
    LinearRegression(),
    propensity_model=LogisticRegression(),
    is_regressor=True,
)
x_learner.fit(X=X_train_2d, treatment=np.array(d['W']), y=np.array(d['Y']))

ite, yhat_ts, yhat_cs, rmse = x_learner.get_ite(
    X=X_test_2d,
    treatment=d_test['W'],
    y=d_test['Y'],
)

ate_x_learner = np.mean(ite)
print('ATE for X learner: ', ate_x_learner)

ATE for X learner:  -4.234797938131524e-17


Conclusion: X-learner performs well on both tasks, but T and S learners perform very well too. 