# Caml API Usage

In [1]:
import os
import sys

# Make both the PySpark workers and the PySpark driver use the interpreter
# that is running this kernel.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

# Names of the synthetic datasets this notebook can generate.
datasets = [
    "partially_linear_simple",
    "fully_heterogenous",
    "partially_linear_constant",
    "dowhy_linear",
]
# Names of the dataframe backends this notebook can run against.
backends = [
    "pandas",
    "pyspark",
    "polars",
]

# Active selections for this run; change the indices to switch.
df_backend = backends[0]
dataset = datasets[3]

## Create Synthetic Data

In [2]:
from caml.extensions.synthetic_data import (make_partially_linear_dataset_simple,
                                             make_fully_heterogeneous_dataset,
                                             make_partially_linear_dataset_constant,
                                             make_dowhy_linear_dataset)

# Build the synthetic dataset selected by `dataset`. Every generator call below
# is unpacked into the observed data (`df`), the true CATEs (`true_cates`) and
# the true ATE (`true_ate`); the true CATEs are then stored on `df` as columns.
if dataset == "partially_linear_simple":
    df, true_cates, true_ate = make_partially_linear_dataset_simple(
        n_obs=10000,
        n_confounders=5,
        dim_heterogeneity=2,
        binary_treatment=True,
        seed=None,
    )
    df["true_cates"] = true_cates
elif dataset == "fully_heterogenous":
    df, true_cates, true_ate = make_fully_heterogeneous_dataset(
        n_obs=10000,
        n_confounders=10,
        theta=4.0,
        seed=None,
    )
    df["true_cates"] = true_cates
elif dataset == "partially_linear_constant":
    df, true_cates, true_ate = make_partially_linear_dataset_constant(
        n_obs=10000,
        ate=4.0,
        n_confounders=5,
        dgp="make_plr_CCDDHNR2018",  # alternative: make_plr_turrell2018
        seed=None,
    )
    df["true_cates"] = true_cates
elif dataset == "dowhy_linear":
    df, true_cates, true_ate = make_dowhy_linear_dataset(
        beta=2.0,
        n_obs=10000,
        n_confounders=10,
        n_discrete_confounders=3,
        n_effect_modifiers=6,
        n_discrete_effect_modifiers=2,
        n_treatments=1,
        binary_treatment=False,
        categorical_treatment=True,
        binary_outcome=False,
        seed=0,
    )

    # `true_cates` is indexed by the keys "d1", "d2", ... A list value is
    # written out as one column per entry (suffix _1, _2, ...); any other
    # value becomes a single column.
    for i in range(1, len(true_cates) + 1):
        cates = true_cates[f"d{i}"]
        if isinstance(cates, list):
            for level, cate in enumerate(cates, start=1):
                df[f"true_cate_d{i}_{level}"] = cate
        else:
            df[f"true_cate_d{i}"] = cates
else:
    # Fail loudly: without this branch an unrecognised name would reach the
    # `df["uuid"]` assignment below with `df` undefined, or with a stale `df`
    # left over in the kernel from a previous run.
    raise ValueError(f"Unknown dataset {dataset!r}.")


# Copy the row index into an explicit identifier column.
df["uuid"] = df.index

In [3]:
import polars as pl

# PySpark is optional: only the "pyspark" backend needs it, so a missing
# install is tolerated here and reported only if that backend is selected.
try:
    from pyspark.sql import SparkSession
except ImportError:
    SparkSession = None

# `spark` stays None unless the pyspark backend creates a session below.
spark = None

if df_backend == "polars":
    df = pl.from_pandas(df)
elif df_backend == "pandas":
    pass  # df is used as generated; no conversion needed
elif df_backend == "pyspark":
    if SparkSession is None:
        # Without this check the builder call below fails with a bare
        # NameError that does not point at the missing package.
        raise ImportError(
            'df_backend is "pyspark" but pyspark could not be imported; '
            "install pyspark or select another backend."
        )
    # Single-core local session with one shuffle partition.
    spark = (
        SparkSession.builder.master("local[1]")
        .appName("local-tests")
        .config("spark.executor.cores", "1")
        .config("spark.executor.instances", "1")
        .config("spark.sql.shuffle.partitions", "1")
        .getOrCreate()
    )
    df = spark.createDataFrame(df)

In [4]:
# Display the dataframe in whichever backend representation was selected above.
df

Unnamed: 0,X0,X1,X2,X3,X4,X5,W0,W1,W2,W3,...,W5,W6,W7,W8,W9,d1,y,true_cate_d1_1,true_cate_d1_2,uuid
0,-1.676664,0.590863,0.847605,1.086753,1,0,0.241671,1.884652,0.966564,0.211441,...,0.625463,1.369253,1,2,0,1.0,12.770049,3.575139,7.150279,0
1,-1.242579,1.638140,1.262156,-2.362890,3,1,-2.455363,1.083997,1.069963,-0.652399,...,-1.162577,-0.079067,1,3,3,1.0,12.660524,5.933978,11.867955,1
2,-1.201035,-1.449302,0.061269,-0.868967,2,1,0.252574,0.808541,-0.682259,-1.891030,...,0.448137,1.105465,3,1,1,1.0,10.833756,3.830137,7.660273,2
3,-1.044843,0.007703,-1.111472,-0.767243,0,0,-0.950926,-0.989639,-1.500743,2.040542,...,-0.146286,-1.377621,3,0,1,0.0,1.621129,0.340239,0.680477,3
4,0.988896,-0.506787,0.342865,-0.324748,3,0,-0.797840,0.817281,-0.305278,-1.090866,...,0.720120,-0.058308,2,1,1,1.0,8.212666,4.433707,8.867415,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-1.921174,0.121931,1.901082,0.139763,3,3,2.618457,0.305495,0.223595,-0.908856,...,-0.273319,-0.580064,2,3,1,2.0,28.389859,9.601107,19.202214,9995
9996,-1.177932,-0.180092,0.311408,-1.736921,2,2,-0.448078,2.127144,-0.302659,0.093387,...,0.543350,-0.463007,3,1,2,1.0,12.339251,5.461511,10.923022,9996
9997,0.238597,-2.092318,1.618289,-0.472539,3,1,-0.270160,-0.137979,-0.515721,0.288046,...,1.206545,0.318931,1,0,1,1.0,9.294169,6.455389,12.910778,9997
9998,-1.299908,-0.315052,0.820170,-2.162909,3,1,0.526198,0.265380,1.070489,-0.557896,...,-0.045927,-2.153374,3,0,0,2.0,13.160182,5.082198,10.164396,9998


## Core API

### CamlCATE

#### Class Instantiation

In [5]:
from caml import CamlCATE

# Set up the CATE workflow on the synthetic data: outcome column "y",
# treatment column "d1", and "uuid" as the row identifier.
caml = CamlCATE(
    df=df,
    Y="y",
    T="d1",
    # X and W are selected by substring match on the column names.
    X=[c for c in df.columns if "X" in c],
    W=[c for c in df.columns if "W" in c],
    uuid="uuid",
    discrete_treatment=True,
    discrete_outcome=False,
    seed=0,
)

#### Nuisance Function AutoML

In [6]:
# Run the AutoML search for the nuisance functions. Both the Y and T searches
# are given a FLAML `time_budget` of 10; Ray and Spark execution are disabled.
caml.auto_nuisance_functions(
    flaml_Y_kwargs={"time_budget": 10},
    flaml_T_kwargs={"time_budget": 10},
    use_ray=False,
    use_spark=False,
)

[flaml.automl.logger: 08-25 22:50:42] {1680} INFO - task = regression
[flaml.automl.logger: 08-25 22:50:42] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 08-25 22:50:42] {1789} INFO - Minimizing error metric: mse
[flaml.automl.logger: 08-25 22:50:42] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 08-25 22:50:42] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 08-25 22:50:42] {2345} INFO - Estimated sufficient time budget=4644s. Estimated necessary time budget=33s.
[flaml.automl.logger: 08-25 22:50:42] {2392} INFO -  at 0.5s,	estimator lgbm's best error=26.5323,	best estimator lgbm's best error=26.5323
[flaml.automl.logger: 08-25 22:50:42] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-25 22:50:43] {2392} INFO -  at 1.4s,	estimator lgbm's best error=25.5686,	best estimator lgbm's best error=25.5686
[flaml.automl.logger: 08-25 22:50:43] {2219} INFO

#### Fit and ensemble CATE models

In [7]:
# Fit the CATE models named in `subset_cate_models`. No extra scorer options
# or Ray remote options are passed, and Ray is disabled.
caml.fit_validator(
    subset_cate_models=[
        "LinearDML",
        "NonParamDML",
        "DML-Lasso3d",
        "CausalForestDML",
        "XLearner",
        "DomainAdaptationLearner",
        "SLearner",
        "TLearner",
        "DRLearner",
    ],
    rscorer_kwargs={},
    use_ray=False,
    ray_remote_func_options_kwargs={},
)

[2;36m[08/25/24 22:51:15][0m[2;36m [0m[32mINFO    [0m Logging has been set up.              ]8;id=355622;file:///home/jakep/projects/caml/caml/logging.py\[2mlogging.py[0m]8;;\[2m:[0m]8;id=53056;file:///home/jakep/projects/caml/caml/logging.py#42\[2m42[0m]8;;\
[2;36m[08/25/24 22:51:15][0m[2;36m [0m[32mINFO    [0m Logging has been set up.              ]8;id=73242;file:///home/jakep/projects/caml/caml/logging.py\[2mlogging.py[0m]8;;\[2m:[0m]8;id=323479;file:///home/jakep/projects/caml/caml/logging.py#42\[2m42[0m]8;;\
[2;36m[08/25/24 22:51:15][0m[2;36m [0m[32mINFO    [0m Logging has been set up.              ]8;id=484213;file:///home/jakep/projects/caml/caml/logging.py\[2mlogging.py[0m]8;;\[2m:[0m]8;id=545418;file:///home/jakep/projects/caml/caml/logging.py#42\[2m42[0m]8;;\
[2;36m[08/25/24 22:51:15][0m[2;36m [0m[32mINFO    [0m Logging has been set up.              ]8;id=935657;file:///home/jakep/projects/caml/caml/logging.p

The final model has a nonzero intercept for at least one outcome; it will be subtracted, but consider fitting a model without an intercept if possible.


In [8]:
# Inspect the validation estimator (the recorded output shows an econml
# EnsembleCateEstimator).
caml.validation_estimator

<econml.score.ensemble_cate.EnsembleCateEstimator at 0x7f39ca235840>

#### CATE Validation

In [10]:
# Validate the CATE estimates and print the full report.
# NOTE(review): the recorded output of this cell is
# "ValueError: Validation for continuous treatments is not supported yet.",
# and no later cell has been executed. Confirm validate() supports the
# configured treatment before relying on Restart & Run All.
validation_results = caml.validate(estimator=None, print_full_report=True)

ValueError: Validation for continuous treatments is not supported yet.

#### Refit best estimator on full dataset

In [None]:
# Per this section's heading: refit the best estimator on the full dataset.
caml.fit_final()

In [None]:
# Inspect the final estimator held by the CamlCATE object.
caml.final_estimator

#### Predict CATEs

In [None]:
## "Out of sample" predictions

# Score the same frame as if it were new data, keyed on its "uuid" column,
# and ask for the predictions to be joined onto it.
df_predictions = caml.predict(
    out_of_sample_df=df,
    out_of_sample_uuid="uuid",
    return_predictions=False,
    join_predictions=True,
)

# Spark frames are rendered with .show(); every other backend is printed.
if df_backend != "pyspark":
    print(df_predictions)
else:
    df_predictions.show()

In [None]:
## Append to internal dataframe

# No out-of-sample frame is supplied here; the predictions are joined rather
# than returned, and the object's dataframe is displayed afterwards.
caml.predict(
    out_of_sample_df=None,
    out_of_sample_uuid=None,
    join_predictions=True,
    return_predictions=False,
)

caml.dataframe

#### CATE Rank Ordering

In [None]:
## "Out of sample" predictions

# Rank-order the frame of predictions produced above and join the ranking
# onto it.
# NOTE(review): treatment_category=1 presumably selects which treatment
# level's CATEs are ranked; confirm against the CamlCATE docs.
df_rank_ordered = caml.rank_order(
    out_of_sample_df=df_predictions, return_rank_order=False, join_rank_order=True,
    treatment_category=1,
)

df_rank_ordered

In [None]:
## Append to internal dataframe

# Same rank ordering with no out-of-sample frame supplied; the object's
# dataframe is displayed afterwards.
caml.rank_order(out_of_sample_df=None, return_rank_order=False, join_rank_order=True,
                treatment_category=1)

caml.dataframe

#### CATE Visualization/Summary

In [None]:
# Summarize the CATEs for treatment category 1 on the rank-ordered
# out-of-sample frame.
cate_summary = caml.summarize(out_of_sample_df=df_rank_ordered,treatment_category=1)

cate_summary

In [None]:
# Same summary with no out-of-sample frame supplied. This rebinds
# `cate_summary`, replacing the out-of-sample summary.
cate_summary = caml.summarize(out_of_sample_df=None,treatment_category=1)

cate_summary

In [None]:
# True ATE returned by the synthetic data generator, shown for comparison
# with the summaries above.
true_ate

#### Access the dataframe and estimator objects

In [None]:
# Display the dataframe held by the CamlCATE object.
caml.dataframe

In [None]:
from econml.score import EnsembleCateEstimator

# The final estimator is the object to pickle for optimized inference.
final_estimator = caml.final_estimator

# An ensemble wraps several CATE models; a plain estimator is treated as a
# one-element list so both cases share the same reporting loop.
models = (
    final_estimator._cate_models
    if isinstance(final_estimator, EnsembleCateEstimator)
    else [final_estimator]
)

# Print each model followed by its input names.
for model in models:
    print(model)
    print(model._input_names)