## FLAML for hyperparameter optimisation and model selection
We use FLAML twice: first to find the best component model for each estimator, and then to optimise the estimators themselves and choose the best one. Here we show how it's done.

In [1]:
%load_ext autoreload
%autoreload 2
import os, sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore') # suppress sklearn deprecation warnings for now.. 

# Resolve the directory two levels above the current working directory; the
# "auto-causality" folder (code and data) is looked up inside it.
root_path = os.path.realpath('../..')
data_dir = os.path.realpath(os.path.join(root_path, "auto-causality/data"))
# exist_ok=True makes the cell safe to re-run, and makedirs also creates any
# missing parent directory instead of failing like os.mkdir would.
os.makedirs(data_dir, exist_ok=True)

# Make the local auto_causality package importable, without piling up
# duplicate sys.path entries every time the cell is re-executed.
package_dir = os.path.join(root_path, "auto-causality")
if package_dir not in sys.path:
    sys.path.append(package_dir)
# sys.path.append(os.path.join(root_path, "dowhy"))

In [2]:
from auto_causality.utils import featurize
from auto_causality import AutoCausality

In [3]:
# set all the control parameters here
train_size = 0.5  # forwarded as train_size to train_test_split
test_size = None  # when not None, the test split is subsampled to this many rows
time_budget = 300  # used in the file names of the saved train/test CSVs
# Leave one core free for the rest of the system. os.cpu_count() may return
# None, and on a single-core machine the subtraction would yield 0, so the
# result is clamped to at least 1.
num_cores = max(1, (os.cpu_count() or 1) - 1)
conf_intervals = False  # NOTE(review): not referenced in the cells visible here


In [4]:
# load raw data
# IHDP CSV hosted in the CEVAE GitHub repository; it is read with header=None,
# so the column names are assigned explicitly below.
ihdp_url = "https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv"
data = pd.read_csv(ihdp_url, header=None)

# five named leading columns followed by the covariates x1 .. x25
col = ["treatment", "y_factual", "y_cfactual", "mu0", "mu1"]
col += [f"x{i}" for i in range(1, 26)]
data.columns = col

# drop the columns we don't care about
# (matching is by substring, so "mu" removes both mu0 and mu1)
ignore_patterns = ["y_cfactual", "mu"]
ignore_cols = [
    name
    for name in data.columns
    if any(pattern in name for pattern in ignore_patterns)
]
data = data.drop(columns=ignore_cols)


# prepare the data

# Fixed seed so that the dummy column, the train/test split and the optional
# test subsample are identical on every "Restart & Run All".
random_seed = 42

treatment = "treatment"
targets = ["y_factual"]  # it's good to allow multiple ones
# every column that is neither the treatment nor a target
features = [c for c in data.columns if c not in [treatment] + targets]

data[treatment] = data[treatment].astype(int)
# this is a trick to bypass some DoWhy/EconML bugs:
# add a column of random 0/1 values, which ends up as the only entry of
# features_W below
data["random"] = np.random.RandomState(random_seed).randint(0, 2, size=len(data))

used_df = featurize(
    data, features=features, exclude_cols=[treatment] + targets, drop_first=False,
)
used_features = [
    c for c in used_df.columns if c not in ignore_cols + [treatment] + targets
]


# Let's treat all features as effect modifiers
features_X = [f for f in used_features if f != "random"]
# whatever is not an effect modifier, i.e. only the "random" dummy column
features_W = [f for f in used_features if f == "random"]


train_df, test_df = train_test_split(
    used_df, train_size=train_size, random_state=random_seed
)
if test_size is not None:
    test_df = test_df.sample(test_size, random_state=random_seed)

# persist both splits; the file names are tagged with the configured time_budget
test_df.to_csv(os.path.join(data_dir, f"test_{time_budget}.csv"))
train_df.to_csv(os.path.join(data_dir, f"train_{time_budget}.csv"))


### Model fitting & scoring
Here we fit a (selection of) model(s) to the data and score them with the ERUPT metric on held-out data.

In [None]:

# Estimator name patterns to search over. In the recorded output below they
# expand to LinearDML, SparseLinearDML, CausalForestDML and ForestDRLearner.
estimator_list = ["dml", "ForestDR"]
outcome = targets[0]

# NOTE(review): both budgets are hard-coded to 10 (presumably seconds) and do
# not use the `time_budget` value from the configuration cell — confirm that
# this is intended.
auto_causality = AutoCausality(
    time_budget=10,
    components_time_budget=10,
    estimator_list=estimator_list,
    metric="erupt",
)

myresults = auto_causality.fit(
    train_df,
    test_df,
    treatment,
    outcome,
    features_W,
    features_X,
)

print(f"Best estimator: {auto_causality.best_estimator}")


[flaml.tune.tune: 03-08 16:42:02] {447} INFO - trial 1 config: {'fit_cate_intercept': 1, 'mc_iters': 0}


fitting estimators: ['backdoor.econml.dml.LinearDML', 'backdoor.econml.dml.SparseLinearDML', 'backdoor.econml.dml.CausalForestDML', 'backdoor.econml.dr.ForestDRLearner']


[flaml.tune.tune: 03-08 16:42:22] {447} INFO - trial 1 config: {'fit_cate_intercept': 1, 'mc_iters': 0, 'n_alphas': 87, 'n_alphas_cov': 5, 'tol': 3.81e-05, 'max_iter': 18500}


... Estimator: backdoor.econml.dml.LinearDML
 erupt: 6.319208
 qini: -0.017620
 auc: 0.528207
 r_score: 0.016680
 ATE: 3.909819


[flaml.tune.tune: 03-08 16:42:43] {447} INFO - trial 1 config: {'mc_iters': 5, 'drate': 0, 'n_estimators': 67, 'criterion': 'het', 'max_depth': 2, 'min_samples_split': 18, 'min_samples_leaf': 7, 'min_weight_fraction_leaf': 0.377743449018298, 'min_var_fraction_leaf': 0.26639242043080236, 'max_features': 'sqrt', 'min_impurity_decrease': 7.5621068260561595, 'max_samples': 0.013611566647889872, 'min_balancedness_tol': 0.3836583130489407, 'honest': 0, 'inference': 0, 'fit_intercept': 1}


... Estimator: backdoor.econml.dml.SparseLinearDML
 erupt: 6.319208
 qini: 0.019942
 auc: 0.538326
 r_score: 0.005456
 ATE: 4.299669
