"support valid error early stopping" (uber#648)
* support valid error early stopping

---------

Co-authored-by: Jeong-Yoon Lee <jeongyoon.lee1@gmail.com>
2 people authored and rolandrmgservices committed Aug 22, 2023
1 parent fb10bb2 commit 2a27027
Showing 2 changed files with 114 additions and 29 deletions.
103 changes: 83 additions & 20 deletions causalml/inference/tree/uplift.pyx
@@ -220,6 +220,10 @@ class UpliftTreeClassifier:
n_reg: int, optional (default=100)
The regularization parameter defined in Rzepakowski et al. 2012, the weight (in terms of sample size) of the
parent node influence on the child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.
early_stopping_eval_diff_scale: float, optional (default=1)
Stop splitting a node if the absolute difference between the training and validation
uplift scores exceeds min(train_uplift_score, valid_uplift_score) / early_stopping_eval_diff_scale.
control_name: string
The name of the control group (other experiment groups will be regarded as treatment groups).
@@ -240,12 +244,13 @@ class UpliftTreeClassifier:
"""
def __init__(self, control_name, max_features=None, max_depth=3, min_samples_leaf=100,
min_samples_treatment=10, n_reg=100, evaluationFunction='KL',
min_samples_treatment=10, n_reg=100, early_stopping_eval_diff_scale=1, evaluationFunction='KL',
normalization=True, honesty=False, estimation_sample_size=0.5, random_state=None):
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
self.min_samples_treatment = min_samples_treatment
self.n_reg = n_reg
self.early_stopping_eval_diff_scale = early_stopping_eval_diff_scale
self.max_features = max_features

assert evaluationFunction in ['KL', 'ED', 'Chi', 'CTS', 'DDP', 'IT', 'CIT', 'IDDP'], \
@@ -282,7 +287,7 @@ class UpliftTreeClassifier:
self.honesty = True


def fit(self, X, treatment, y):
def fit(self, X, treatment, y, X_val=None, treatment_val=None, y_val=None):
""" Fit the uplift model.
Args
@@ -306,14 +311,23 @@ class UpliftTreeClassifier:
X, y = check_X_y(X, y)
treatment = np.asarray(treatment)
assert len(y) == len(treatment), 'Data length must be equal for X, treatment, and y.'

if X_val is not None:
X_val, y_val = check_X_y(X_val, y_val)
treatment_val = np.asarray(treatment_val)
assert len(y_val) == len(treatment_val), 'Data length must be equal for X_val, treatment_val, and y_val.'

# Get treatment group keys. self.classes_[0] is reserved for the control group.
treatment_groups = sorted([x for x in list(set(treatment)) if x != self.control_name])
self.classes_ = [self.control_name]
treatment_idx = np.zeros_like(treatment, dtype=int)
treatment_val_idx = None
if treatment_val is not None:
treatment_val_idx = np.zeros_like(treatment_val, dtype=int)
for i, tr in enumerate(treatment_groups, 1):
self.classes_.append(tr)
treatment_idx[treatment == tr] = i
if treatment_val_idx is not None:
treatment_val_idx[treatment_val == tr] = i
self.n_class = len(self.classes_)

self.feature_imp_dict = defaultdict(float)
@@ -333,8 +347,9 @@ class UpliftTreeClassifier:
random_state=self.random_state)

self.fitted_uplift_tree = self.growDecisionTreeFrom(
X, treatment_idx, y,
max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf,
X, treatment_idx, y, X_val, treatment_val_idx, y_val,
max_depth=self.max_depth, early_stopping_eval_diff_scale=self.early_stopping_eval_diff_scale,
min_samples_leaf=self.min_samples_leaf,
depth=1, min_samples_treatment=self.min_samples_treatment,
n_reg=self.n_reg, parentNodeSummary=None
)
@@ -1118,7 +1133,8 @@ class UpliftTreeClassifier:
res.append(p)
return res

def growDecisionTreeFrom(self, X, treatment_idx, y, max_depth=10,
def growDecisionTreeFrom(self, X, treatment_idx, y, X_val, treatment_val_idx, y_val,
early_stopping_eval_diff_scale=1, max_depth=10,
min_samples_leaf=100, depth=1,
min_samples_treatment=10, n_reg=100,
parentNodeSummary=None):
@@ -1133,6 +1149,12 @@ class UpliftTreeClassifier:
An array containing the treatment group idx for each unit.
y : array-like, shape = [num_samples]
An array containing the outcome of interest for each unit.
X_val : ndarray, shape = [num_samples, num_features]
An ndarray of the covariates used to validate the uplift model.
treatment_val_idx : array-like, shape = [num_samples]
An array containing the validation treatment group idx for each unit.
y_val : array-like, shape = [num_samples]
An array containing the validation outcome of interest for each unit.
max_depth: int, optional (default=10)
The maximum depth of the tree.
min_samples_leaf: int, optional (default=100)
@@ -1194,7 +1216,6 @@ class UpliftTreeClassifier:
else:
p_t = currentNodeSummary[suboptTreatment][0]
n_t = currentNodeSummary[suboptTreatment][1]

p_value = (1. - stats.norm.cdf(abs(p_c - p_t) / np.sqrt(p_t * (1 - p_t) / n_t + p_c * (1 - p_c) / n_c))) * 2
upliftScore = [maxDiff, p_value]

@@ -1223,6 +1244,7 @@ class UpliftTreeClassifier:

for value in lsUnique:
X_l, X_r, w_l, w_r, y_l, y_r = self.divideSet(X, treatment_idx, y, col, value)

# check the split validity on min_samples_leaf
if (len(X_l) < min_samples_leaf or len(X_r) < min_samples_leaf):
continue
@@ -1233,15 +1255,28 @@ class UpliftTreeClassifier:
min_samples_treatment=min_samples_treatment,
n_reg=n_reg,
parentNodeSummary=currentNodeSummary)

rightNodeSummary = self.tree_node_summary(w_r, y_r,
min_samples_treatment=min_samples_treatment,
min_samples_treatment=min_samples_treatment,
n_reg=n_reg,
parentNodeSummary=currentNodeSummary)

# check the split validity on min_samples_treatment
assert len(leftNodeSummary) == len(rightNodeSummary)

if X_val is not None:
X_val_l, X_val_r, w_val_l, w_val_r, y_val_l, y_val_r = self.divideSet(X_val, treatment_val_idx, y_val, col, value)
leftNodeSummary_val = self.tree_node_summary(w_val_l, y_val_l,
parentNodeSummary=currentNodeSummary)
rightNodeSummary_val = self.tree_node_summary(w_val_r, y_val_r,
parentNodeSummary=currentNodeSummary)
early_stopping_flag = False
for k in range(len(leftNodeSummary_val)):
if (abs(leftNodeSummary_val[k][0]-leftNodeSummary[k][0]) > min(leftNodeSummary_val[k][0],leftNodeSummary[k][0])/early_stopping_eval_diff_scale or
abs(rightNodeSummary_val[k][0]-rightNodeSummary[k][0]) > min(rightNodeSummary_val[k][0],rightNodeSummary[k][0])/early_stopping_eval_diff_scale):
early_stopping_flag = True
break
if early_stopping_flag:
continue

# check the split validity on min_samples_treatment
node_mst = min([stat[1] for stat in leftNodeSummary + rightNodeSummary])
if node_mst < min_samples_treatment:
continue
@@ -1293,13 +1328,16 @@ class UpliftTreeClassifier:
norm_factor = self.normI(n_c, n_c_left, n_t, n_t_left, alpha=0.9)
else:
norm_factor = 1
gain = gain / norm_factor
gain = gain / norm_factor
if (gain > bestGain and len(X_l) > min_samples_leaf and len(X_r) > min_samples_leaf):
bestGain = gain
bestGainImp = gain_for_imp
bestAttribute = (col, value)
best_set_left = [X_l, w_l, y_l]
best_set_right = [X_r, w_r, y_r]
best_set_left = [X_l, w_l, y_l, None, None, None]
best_set_right = [X_r, w_r, y_r, None, None, None]
if X_val is not None:
best_set_left = [X_l, w_l, y_l, X_val_l, w_val_l, y_val_l]
best_set_right = [X_r, w_r, y_r, X_val_r, w_val_r, y_val_r]

dcY = {'impurity': '%.3f' % currentScore, 'samples': '%d' % len(X)}
# Add treatment size
@@ -1312,12 +1350,12 @@ class UpliftTreeClassifier:
if bestGain > 0 and depth < max_depth:
self.feature_imp_dict[bestAttribute[0]] += bestGainImp
trueBranch = self.growDecisionTreeFrom(
*best_set_left, max_depth, min_samples_leaf,
*best_set_left, self.early_stopping_eval_diff_scale, max_depth, min_samples_leaf,
depth + 1, min_samples_treatment=min_samples_treatment,
n_reg=n_reg, parentNodeSummary=currentNodeSummary
)
falseBranch = self.growDecisionTreeFrom(
*best_set_right, max_depth, min_samples_leaf,
*best_set_right, self.early_stopping_eval_diff_scale, max_depth, min_samples_leaf,
depth + 1, min_samples_treatment=min_samples_treatment,
n_reg=n_reg, parentNodeSummary=currentNodeSummary
)
@@ -1484,6 +1522,10 @@ class UpliftRandomForestClassifier:
weight (in terms of sample size) of the parent node influence on the
child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.
early_stopping_eval_diff_scale: float, optional (default=1)
Stop splitting a node if the absolute difference between the training and validation
uplift scores exceeds min(train_uplift_score, valid_uplift_score) / early_stopping_eval_diff_scale.
control_name: string
The name of the control group (other experiment groups will be regarded as treatment groups)
@@ -1521,6 +1563,7 @@ class UpliftRandomForestClassifier:
min_samples_leaf=100,
min_samples_treatment=10,
n_reg=10,
early_stopping_eval_diff_scale=1,
evaluationFunction='KL',
normalization=True,
honesty=False,
@@ -1538,6 +1581,7 @@ class UpliftRandomForestClassifier:
self.min_samples_leaf = min_samples_leaf
self.min_samples_treatment = min_samples_treatment
self.n_reg = n_reg
self.early_stopping_eval_diff_scale = early_stopping_eval_diff_scale
self.evaluationFunction = evaluationFunction
self.control_name = control_name
self.normalization = normalization
@@ -1554,7 +1598,7 @@ class UpliftRandomForestClassifier:
if self.n_jobs == -1:
self.n_jobs = mp.cpu_count()

def fit(self, X, treatment, y):
def fit(self, X, treatment, y, X_val=None, treatment_val=None, y_val=None):
"""
Fit the UpliftRandomForestClassifier.
@@ -1568,6 +1612,15 @@ class UpliftRandomForestClassifier:
y : array-like, shape = [num_samples]
An array containing the outcome of interest for each unit.
X_val : ndarray, shape = [num_samples, num_features]
An ndarray of the covariates used to validate the uplift model.
treatment_val : array-like, shape = [num_samples]
An array containing the validation treatment group for each unit.
y_val : array-like, shape = [num_samples]
An array containing the validation outcome of interest for each unit.
"""
random_state = check_random_state(self.random_state)

@@ -1578,6 +1631,7 @@ class UpliftRandomForestClassifier:
min_samples_leaf=self.min_samples_leaf,
min_samples_treatment=self.min_samples_treatment,
n_reg=self.n_reg,
early_stopping_eval_diff_scale=self.early_stopping_eval_diff_scale,
evaluationFunction=self.evaluationFunction,
control_name=self.control_name,
normalization=self.normalization,
@@ -1595,21 +1649,30 @@ class UpliftRandomForestClassifier:

self.uplift_forest = (
Parallel(n_jobs=self.n_jobs, prefer=self.joblib_prefer)
(delayed(self.bootstrap)(X, treatment, y, tree) for tree in self.uplift_forest)
(delayed(self.bootstrap)(X, treatment, y, X_val, treatment_val, y_val, tree) for tree in self.uplift_forest)
)

all_importances = [tree.feature_importances_ for tree in self.uplift_forest]
self.feature_importances_ = np.mean(all_importances, axis=0)
self.feature_importances_ /= self.feature_importances_.sum() # normalize to add to 1

@staticmethod
def bootstrap(X, treatment, y, tree):
def bootstrap(X, treatment, y, X_val, treatment_val, y_val, tree):
random_state = check_random_state(tree.random_state)
bt_index = random_state.choice(len(X), len(X))
x_train_bt = X[bt_index]
y_train_bt = y[bt_index]
treatment_train_bt = treatment[bt_index]
tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)

if X_val is None:
tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)
else:
bt_val_index = random_state.choice(len(X_val), len(X_val))
x_val_bt = X_val[bt_val_index]
y_val_bt = y_val[bt_val_index]
treatment_val_bt = treatment_val[bt_val_index]

tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt, X_val=x_val_bt, treatment_val=treatment_val_bt, y_val=y_val_bt)
return tree

@ignore_warnings(category=FutureWarning)
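The heart of the change is the validation check in growDecisionTreeFrom: for every candidate split, node summaries (a [p, n] pair of outcome probability and sample count per treatment group) are computed on both the training and validation partitions of each child node, and the split is rejected when any group's train/validation probabilities diverge by more than min(train_p, valid_p) / early_stopping_eval_diff_scale. A minimal standalone sketch of that criterion for a single child node (the function name and the example numbers are illustrative, not part of the commit):

def diverges_on_validation(train_summary, val_summary, early_stopping_eval_diff_scale=1.0):
    # Each summary is a list of [p, n] pairs, one per treatment group,
    # mirroring the output format of UpliftTreeClassifier.tree_node_summary.
    for (train_p, _), (val_p, _) in zip(train_summary, val_summary):
        tolerance = min(train_p, val_p) / early_stopping_eval_diff_scale
        if abs(val_p - train_p) > tolerance:
            return True  # train and validation disagree too much; skip this split
    return False

# Example: the control group agrees, but the treatment group diverges:
# |0.05 - 0.18| = 0.13 > min(0.18, 0.05) / 1.0 = 0.05, so the split is rejected.
assert diverges_on_validation([[0.10, 500], [0.18, 480]], [[0.11, 120], [0.05, 110]])

In the commit this check runs on both the left and right child summaries, and a larger early_stopping_eval_diff_scale shrinks the allowed gap, making the stopping rule stricter.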
40 changes: 31 additions & 9 deletions tests/test_uplift_trees.py
@@ -19,11 +19,24 @@ def test_make_uplift_classification(generate_classification_data):

@pytest.mark.parametrize("backend", ["loky", "threading", "multiprocessing"])
@pytest.mark.parametrize("joblib_prefer", ["threads", "processes"])
@pytest.mark.parametrize("early_stopping", ["true", "false"])
def test_UpliftRandomForestClassifier(
generate_classification_data, backend, joblib_prefer
generate_classification_data, backend, joblib_prefer, early_stopping
):
df, x_names = generate_classification_data()
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_train, df_test, df_val = None, None, None

if early_stopping == "true":
df_train, df_test_val = train_test_split(
df, test_size=0.2, random_state=RANDOM_SEED
)
df_test, df_val = train_test_split(
df_test_val, test_size=0.5, random_state=RANDOM_SEED
)
else:
df_train, df_test = train_test_split(
df, test_size=0.2, random_state=RANDOM_SEED
)

with parallel_backend(backend):
# Train the UpLift Random Forest classifier
@@ -32,14 +45,23 @@ def test_UpliftRandomForestClassifier(
control_name=TREATMENT_NAMES[0],
random_state=RANDOM_SEED,
joblib_prefer=joblib_prefer,
early_stopping_eval_diff_scale=1,
)

uplift_model.fit(
df_train[x_names].values,
treatment=df_train["treatment_group_key"].values,
y=df_train[CONVERSION].values,
)

if early_stopping == "true":
uplift_model.fit(
df_train[x_names].values,
treatment=df_train["treatment_group_key"].values,
y=df_train[CONVERSION].values,
X_val=df_val[x_names].values,
treatment_val=df_val["treatment_group_key"].values,
y_val=df_val[CONVERSION].values,
)
else:
uplift_model.fit(
df_train[x_names].values,
treatment=df_train["treatment_group_key"].values,
y=df_train[CONVERSION].values,
)
predictions = {}
predictions["single"] = uplift_model.predict(df_test[x_names].values)
with parallel_backend("loky", n_jobs=2):
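As the updated test shows, enabling the feature only requires passing validation arrays to fit; bootstrap then resamples the validation set alongside the training set for each tree. A minimal end-to-end sketch, assuming the synthetic-data helper and default column names ('treatment_group_key', 'conversion', control group 'control') used elsewhere in causalml's tests:

from sklearn.model_selection import train_test_split
from causalml.dataset import make_uplift_classification
from causalml.inference.tree import UpliftRandomForestClassifier

# Synthetic uplift data: df holds treatment/outcome columns, x_names the features.
df, x_names = make_uplift_classification()
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

clf = UpliftRandomForestClassifier(
    control_name="control",
    early_stopping_eval_diff_scale=1,  # larger values tighten the train/valid tolerance
)
clf.fit(
    df_train[x_names].values,
    treatment=df_train["treatment_group_key"].values,
    y=df_train["conversion"].values,
    X_val=df_val[x_names].values,                         # validation covariates
    treatment_val=df_val["treatment_group_key"].values,   # validation treatment labels
    y_val=df_val["conversion"].values,                    # validation outcomes
)
uplift = clf.predict(df_val[x_names].values)  # per-treatment uplift estimates

Omitting the X_val/treatment_val/y_val arguments reproduces the previous behavior with no early stopping.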
