In [2]:
import os
import sys
import dcor
import pandas as pd
from sklearn.model_selection import train_test_split

from dowhy import CausalModel
from dowhy.causal_estimator import CausalEstimate

# Absolute path two levels above the current working directory; used below to
# locate an "auto-causality" directory when the package is not importable.
root_path = os.path.realpath('../..')
try:
    import auto_causality  # noqa F401
except ModuleNotFoundError:
    # Fall back to the local directory, without stacking duplicate entries on
    # sys.path when this cell is executed more than once.
    local_package_dir = os.path.join(root_path, "auto-causality")
    if local_package_dir not in sys.path:
        sys.path.append(local_package_dir)

from auto_causality import AutoCausality # noqa F401
from auto_causality.data_utils import preprocess_dataset # noqa F401
from auto_causality.scoring import Scorer # noqa F401
from auto_causality.datasets import iv_dgp_econml # noqa F401

In [7]:
# Needed since ac.model.estimator doesn't include additional params -
# treatment, outcome etc. - needed from CausalEstimate instance
def energy_scorer_patch(
    estimate: "CausalEstimate",
    df: pd.DataFrame,
    treatment: str,
    outcome: str,
    instrument: str,
    effect_modifiers: list,
):
    """Energy distance between the instrument == 1 and instrument == 0 rows.

    The estimated effect is subtracted from the outcome of every row whose
    treatment value is non-zero, giving a column ``yhat``. The score is
    ``dcor.energy_distance`` computed between the two instrument groups over
    the effect modifiers plus ``yhat``.

    The input frame is never modified: all helper columns are written to a
    private copy, so the function can be called repeatedly on the same data.

    Args:
        estimate: object exposing ``estimator.effect(X)``, which is called
            with the effect-modifier columns of ``df``.
        df: data to score; must contain the treatment, outcome, instrument
            and effect-modifier columns.
        treatment: name of the treatment column (rows equal to 0 get no
            effect subtracted).
        outcome: name of the outcome column.
        instrument: name of the instrument column; rows are grouped by the
            values 1 and 0.
        effect_modifiers: names of the effect-modifier columns.

    Returns:
        The value returned by ``dcor.energy_distance`` for the two groups.
    """
    # Normalise to a list so that any sequence of names can be concatenated
    # with the extra "yhat" column below.
    modifier_cols = list(effect_modifiers)

    # Work on a copy: the caller's frame must not gain "dy"/"yhat" columns.
    scored = df.copy()
    scored["dy"] = estimate.estimator.effect(scored[modifier_cols])
    # Rows with treatment == 0 have no effect to remove.
    scored.loc[scored[treatment] == 0, "dy"] = 0
    scored["yhat"] = scored[outcome] - scored["dy"]

    select_cols = modifier_cols + ["yhat"]
    X1 = scored.loc[scored[instrument] == 1, select_cols]
    X0 = scored.loc[scored[instrument] == 0, select_cols]

    return dcor.energy_distance(X1, X0)

In [3]:
# Dataset object returned by iv_dgp_econml, together with its column roles.
data = iv_dgp_econml()

treatment = data.treatment
targets = data.outcomes
instruments = data.instruments
# features_X is used as the effect modifiers further down; features_W is
# presumably the common causes - TODO confirm against preprocess_dataset.
data_df, features_X, features_W = preprocess_dataset(
    data.data, treatment, targets, instruments
)

# Only the first outcome column is modelled in this notebook.
outcome = targets[0]

# Pin the shuffle so the train/test partition is the same on every run.
# NOTE(review): iv_dgp_econml may itself draw random data; this seed only
# fixes the split.
SPLIT_SEED = 42
train_df, test_df = train_test_split(
    data_df, test_size=0.2, random_state=SPLIT_SEED
)
train_df.head()

Unnamed: 0,treatment,y,Z,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,random
4707,1.0,10.885826,1.0,-0.957813,-0.633441,0.117917,-0.433192,-0.361684,-0.135773,-0.052122,0.380327,1.329565,-0.321435,1.0
3039,1.0,18.750271,1.0,-0.539135,1.191224,-0.196336,-1.248182,-0.984221,0.556852,1.936504,-0.913703,0.890823,0.481807,0.0
4052,1.0,19.557085,1.0,1.174392,0.096396,1.555359,-1.7187,1.015463,0.395455,1.236376,0.0416,0.687383,-0.027819,0.0
2001,0.0,9.353517,0.0,0.743674,0.982287,0.845747,1.123714,1.357107,0.90809,-1.98152,-1.037397,0.846038,-0.385248,1.0
662,0.0,8.886736,0.0,-0.591889,0.102331,-1.327556,1.787519,-0.449173,-0.020433,-0.833148,0.881001,-1.593588,0.3844,0.0


In [4]:
# Search settings for AutoCausality, gathered in one mapping so they can be
# read (and tuned) at a glance.
search_settings = {
    "time_budget": 240,
    "verbose": 3,
    "components_verbose": 2,
    "components_time_budget": 60,
    "propensity_model": "auto",
}
ac = AutoCausality(**search_settings)

# Positional arguments for fit, kept in their original order.
fit_args = (train_df, treatment, outcome, features_W, features_X, instruments)
ac.fit(*fit_args)

Component model time budget is 60. Recommended value is at least 300 for smallish datasets, 1800 for datasets with> 100K rows
[32m[I 2022-07-12 17:53:15,158][0m A new study created in memory with name: optuna[0m
[flaml.tune.tune: 07-12 17:53:15] {456} INFO - trial 1 config: {'estimator': {'estimator_name': 'iv.econml.iv.dml.OrthoIV', 'mc_agg': 'mean'}}


Initial configs: [{'estimator': {'estimator_name': 'iv.econml.iv.dml.OrthoIV', 'mc_agg': 'mean'}}, {'estimator': {'estimator_name': 'iv.econml.iv.dml.DMLIV', 'mc_agg': 'mean'}}]
{'estimator_name': 'iv.econml.iv.dml.OrthoIV', 'mc_agg': 'mean'}


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
[flaml.tune.tune: 07-12 17:57:25] {110} INFO - result: {'energy_distance': 0.023866990213011974, 'estimator_name': 'iv.econml.iv.dml.OrthoIV', 'scores': {'train': {'energy_distance': 0.014418654175849532}, 'validation': {'energy_distance': 0.023866990213011974}}, 'config': {'estimator': {'estimator_name': 'iv.econml.iv.dml.OrthoIV', 'mc_agg': 'mean'}}, 'training_iteration': 0, 'config/estimator': {'estimator_name': 'iv.econml.iv.dml.OrthoIV', 'mc_agg': 'mean'}, 'experiment_tag': 'exp', 'time_total_s': 250.59895300865173}


In [5]:
# Summary of the search: winning estimator, its configuration and its score,
# printed one per line.
search_summary = [
    f"Best estimator: {ac.best_estimator}",
    f"best config: {ac.best_config}",
    f"best score: {ac.best_score}",
]
print("\n".join(search_summary))

Best estimator: iv.econml.iv.dml.OrthoIV
best config: {'estimator': {'estimator_name': 'iv.econml.iv.dml.OrthoIV', 'mc_agg': 'mean'}}
best score: 0.023866990213011974


In [8]:
# Comparing best model searched to base IV model configuration
model = CausalModel(
    data=train_df,
    treatment=treatment,
    # `outcome` is already a single column name (targets[0]); indexing it
    # again would pick the first character of that name.
    outcome=outcome,
    effect_modifiers=features_X,
    common_causes=["random"],
    instruments=instruments,
)
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
# Baseline: DMLIV with empty init/fit parameter dicts, no significance test.
estimate = model.estimate_effect(
    identified_estimand,
    method_name="iv.econml.iv.dml.DMLIV",
    method_params={
        "init_params": {},
        "fit_params": {},
    },
    test_significance=False,
)



In [10]:
# Effect-modifier columns of the held-out split, shared by both estimators.
Xtest = test_df[features_X]
print()
print(
    "(Baseline Estimator) Treatment Effect: ",
    estimate.estimator.effect(Xtest).mean(),
)
print(
    "(AutoCausality Estimator) Treatment Effect: ",
    ac.model.estimator.estimator.effect(Xtest).mean(),
)

print("Energy distance scores")
# Hand each scorer its own copy of the test split, so that any helper columns
# a scorer writes into its input never leak into test_df and this cell gives
# the same result when it is re-run.
base_estimator_edist = Scorer.energy_distance_score(estimate, test_df.copy())
ac_estimator_edist = energy_scorer_patch(
    ac.model.estimator,
    test_df.copy(),
    treatment,
    outcome,
    instruments[0],
    features_X,
)
print("(Baseline Estimator) Energy distance score: ", base_estimator_edist)
print("(AutoCausality Estimator) Energy distance score: ", ac_estimator_edist)


(Baseline Estimator) Treatment Effect:  10.189894905013425
(AutoCausality Estimator) Treatment Effect:  9.82992883200494
Energy distance scores
(Baseline Estimator) Energy distance score:  0.09726814255196636
(AutoCausality Estimator) Energy distance score:  0.05911449386666323
