"support valid error early stopping" (uber#648)
* support valid error early stopping

---------

Co-authored-by: Jeong-Yoon Lee <jeongyoon.lee1@gmail.com>
2 people authored and rolandrmgservices committed Aug 22, 2023
1 parent fb10bb2 commit 2a27027
Showing 2 changed files with 114 additions and 29 deletions.
103 changes: 83 additions & 20 deletions causalml/inference/tree/uplift.pyx
@@ -220,6 +220,10 @@ class UpliftTreeClassifier:
n_reg: int, optional (default=100)
The regularization parameter defined in Rzepakowski et al. 2012, the weight (in terms of sample size) of the
parent node influence on the child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.
early_stopping_eval_diff_scale: float, optional (default=1)
Stop splitting a node if the absolute difference between the training and validation
uplift scores exceeds min(train_uplift_score, valid_uplift_score) / early_stopping_eval_diff_scale.
control_name: string
The name of the control group (other experiment groups will be regarded as treatment groups).
@@ -240,12 +244,13 @@ class UpliftTreeClassifier:
"""
def __init__(self, control_name, max_features=None, max_depth=3, min_samples_leaf=100,
min_samples_treatment=10, n_reg=100, evaluationFunction='KL',
min_samples_treatment=10, n_reg=100, early_stopping_eval_diff_scale=1, evaluationFunction='KL',
normalization=True, honesty=False, estimation_sample_size=0.5, random_state=None):
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
self.min_samples_treatment = min_samples_treatment
self.n_reg = n_reg
self.early_stopping_eval_diff_scale = early_stopping_eval_diff_scale
self.max_features = max_features

assert evaluationFunction in ['KL', 'ED', 'Chi', 'CTS', 'DDP', 'IT', 'CIT', 'IDDP'], \
@@ -282,7 +287,7 @@ class UpliftTreeClassifier:
self.honesty = True


def fit(self, X, treatment, y):
def fit(self, X, treatment, y, X_val=None, treatment_val=None, y_val=None):
""" Fit the uplift model.
Args
@@ -306,14 +311,23 @@ class UpliftTreeClassifier:
X, y = check_X_y(X, y)
treatment = np.asarray(treatment)
assert len(y) == len(treatment), 'Data length must be equal for X, treatment, and y.'

if X_val is not None:
X_val, y_val = check_X_y(X_val, y_val)
treatment_val = np.asarray(treatment_val)
assert len(y_val) == len(treatment_val), 'Data length must be equal for X_val, treatment_val, and y_val.'

# Get treatment group keys. self.classes_[0] is reserved for the control group.
treatment_groups = sorted([x for x in list(set(treatment)) if x != self.control_name])
self.classes_ = [self.control_name]
treatment_idx = np.zeros_like(treatment, dtype=int)
treatment_val_idx = None
if treatment_val is not None:
treatment_val_idx = np.zeros_like(treatment_val, dtype=int)
for i, tr in enumerate(treatment_groups, 1):
self.classes_.append(tr)
treatment_idx[treatment == tr] = i
if treatment_val_idx is not None:
treatment_val_idx[treatment_val == tr] = i
self.n_class = len(self.classes_)

self.feature_imp_dict = defaultdict(float)
@@ -333,8 +347,9 @@ class UpliftTreeClassifier:
random_state=self.random_state)

self.fitted_uplift_tree = self.growDecisionTreeFrom(
X, treatment_idx, y,
max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf,
X, treatment_idx, y, X_val, treatment_val_idx, y_val,
max_depth=self.max_depth, early_stopping_eval_diff_scale=self.early_stopping_eval_diff_scale,
min_samples_leaf=self.min_samples_leaf,
depth=1, min_samples_treatment=self.min_samples_treatment,
n_reg=self.n_reg, parentNodeSummary=None
)
@@ -1118,7 +1133,8 @@ class UpliftTreeClassifier:
res.append(p)
return res

def growDecisionTreeFrom(self, X, treatment_idx, y, max_depth=10,
def growDecisionTreeFrom(self, X, treatment_idx, y, X_val, treatment_val_idx, y_val,
early_stopping_eval_diff_scale=1, max_depth=10,
min_samples_leaf=100, depth=1,
min_samples_treatment=10, n_reg=100,
parentNodeSummary=None):
@@ -1133,6 +1149,12 @@ class UpliftTreeClassifier:
An array containing the treatment group idx for each unit.
y : array-like, shape = [num_samples]
An array containing the outcome of interest for each unit.
X_val : ndarray, shape = [num_samples, num_features]
An ndarray of the covariates used to validate the uplift model.
treatment_val_idx : array-like, shape = [num_samples]
An array containing the validation treatment group idx for each unit.
y_val : array-like, shape = [num_samples]
An array containing the validation outcome of interest for each unit.
max_depth: int, optional (default=10)
The maximum depth of the tree.
min_samples_leaf: int, optional (default=100)
@@ -1194,7 +1216,6 @@ class UpliftTreeClassifier:
else:
p_t = currentNodeSummary[suboptTreatment][0]
n_t = currentNodeSummary[suboptTreatment][1]

p_value = (1. - stats.norm.cdf(abs(p_c - p_t) / np.sqrt(p_t * (1 - p_t) / n_t + p_c * (1 - p_c) / n_c))) * 2
upliftScore = [maxDiff, p_value]

@@ -1223,6 +1244,7 @@ class UpliftTreeClassifier:

for value in lsUnique:
X_l, X_r, w_l, w_r, y_l, y_r = self.divideSet(X, treatment_idx, y, col, value)

# check the split validity on min_samples_leaf
if (len(X_l) < min_samples_leaf or len(X_r) < min_samples_leaf):
continue
@@ -1233,15 +1255,28 @@ class UpliftTreeClassifier:
min_samples_treatment=min_samples_treatment,
n_reg=n_reg,
parentNodeSummary=currentNodeSummary)

rightNodeSummary = self.tree_node_summary(w_r, y_r,
min_samples_treatment=min_samples_treatment,
min_samples_treatment=min_samples_treatment,
n_reg=n_reg,
parentNodeSummary=currentNodeSummary)

# check the split validity on min_samples_treatment
assert len(leftNodeSummary) == len(rightNodeSummary)

if X_val is not None:
X_val_l, X_val_r, w_val_l, w_val_r, y_val_l, y_val_r = self.divideSet(X_val, treatment_val_idx, y_val, col, value)
leftNodeSummary_val = self.tree_node_summary(w_val_l, y_val_l,
parentNodeSummary=currentNodeSummary)
rightNodeSummary_val = self.tree_node_summary(w_val_r, y_val_r,
parentNodeSummary=currentNodeSummary)
early_stopping_flag = False
for k in range(len(leftNodeSummary_val)):
if (abs(leftNodeSummary_val[k][0]-leftNodeSummary[k][0]) > min(leftNodeSummary_val[k][0],leftNodeSummary[k][0])/early_stopping_eval_diff_scale or
abs(rightNodeSummary_val[k][0]-rightNodeSummary[k][0]) > min(rightNodeSummary_val[k][0],rightNodeSummary[k][0])/early_stopping_eval_diff_scale):
early_stopping_flag = True
break
if early_stopping_flag:
continue

# check the split validity on min_samples_treatment
node_mst = min([stat[1] for stat in leftNodeSummary + rightNodeSummary])
if node_mst < min_samples_treatment:
continue
@@ -1293,13 +1328,16 @@ class UpliftTreeClassifier:
norm_factor = self.normI(n_c, n_c_left, n_t, n_t_left, alpha=0.9)
else:
norm_factor = 1
gain = gain / norm_factor
gain = gain / norm_factor
if (gain > bestGain and len(X_l) > min_samples_leaf and len(X_r) > min_samples_leaf):
bestGain = gain
bestGainImp = gain_for_imp
bestAttribute = (col, value)
best_set_left = [X_l, w_l, y_l]
best_set_right = [X_r, w_r, y_r]
best_set_left = [X_l, w_l, y_l, None, None, None]
best_set_right = [X_r, w_r, y_r, None, None, None]
if X_val is not None:
best_set_left = [X_l, w_l, y_l, X_val_l, w_val_l, y_val_l]
best_set_right = [X_r, w_r, y_r, X_val_r, w_val_r, y_val_r]

dcY = {'impurity': '%.3f' % currentScore, 'samples': '%d' % len(X)}
# Add treatment size
@@ -1312,12 +1350,12 @@ class UpliftTreeClassifier:
if bestGain > 0 and depth < max_depth:
self.feature_imp_dict[bestAttribute[0]] += bestGainImp
trueBranch = self.growDecisionTreeFrom(
*best_set_left, max_depth, min_samples_leaf,
*best_set_left, self.early_stopping_eval_diff_scale, max_depth, min_samples_leaf,
depth + 1, min_samples_treatment=min_samples_treatment,
n_reg=n_reg, parentNodeSummary=currentNodeSummary
)
falseBranch = self.growDecisionTreeFrom(
*best_set_right, max_depth, min_samples_leaf,
*best_set_right, self.early_stopping_eval_diff_scale, max_depth, min_samples_leaf,
depth + 1, min_samples_treatment=min_samples_treatment,
n_reg=n_reg, parentNodeSummary=currentNodeSummary
)
@@ -1484,6 +1522,10 @@ class UpliftRandomForestClassifier:
weight (in terms of sample size) of the parent node influence on the
child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.
early_stopping_eval_diff_scale: float, optional (default=1)
Stop splitting a node if the absolute difference between the training and validation
uplift scores exceeds min(train_uplift_score, valid_uplift_score) / early_stopping_eval_diff_scale.
control_name: string
The name of the control group (other experiment groups will be regarded as treatment groups)
@@ -1521,6 +1563,7 @@ class UpliftRandomForestClassifier:
min_samples_leaf=100,
min_samples_treatment=10,
n_reg=10,
early_stopping_eval_diff_scale=1,
evaluationFunction='KL',
normalization=True,
honesty=False,
@@ -1538,6 +1581,7 @@ class UpliftRandomForestClassifier:
self.min_samples_leaf = min_samples_leaf
self.min_samples_treatment = min_samples_treatment
self.n_reg = n_reg
self.early_stopping_eval_diff_scale = early_stopping_eval_diff_scale
self.evaluationFunction = evaluationFunction
self.control_name = control_name
self.normalization = normalization
@@ -1554,7 +1598,7 @@ class UpliftRandomForestClassifier:
if self.n_jobs == -1:
self.n_jobs = mp.cpu_count()

def fit(self, X, treatment, y):
def fit(self, X, treatment, y, X_val=None, treatment_val=None, y_val=None):
"""
Fit the UpliftRandomForestClassifier.
@@ -1568,6 +1612,15 @@ class UpliftRandomForestClassifier:
y : array-like, shape = [num_samples]
An array containing the outcome of interest for each unit.
X_val : ndarray, shape = [num_samples, num_features]
An ndarray of the covariates used to validate the uplift model.
treatment_val : array-like, shape = [num_samples]
An array containing the validation treatment group for each unit.
y_val : array-like, shape = [num_samples]
An array containing the validation outcome of interest for each unit.
"""
random_state = check_random_state(self.random_state)

@@ -1578,6 +1631,7 @@ class UpliftRandomForestClassifier:
min_samples_leaf=self.min_samples_leaf,
min_samples_treatment=self.min_samples_treatment,
n_reg=self.n_reg,
early_stopping_eval_diff_scale=self.early_stopping_eval_diff_scale,
evaluationFunction=self.evaluationFunction,
control_name=self.control_name,
normalization=self.normalization,
@@ -1595,21 +1649,30 @@ class UpliftRandomForestClassifier:

self.uplift_forest = (
Parallel(n_jobs=self.n_jobs, prefer=self.joblib_prefer)
(delayed(self.bootstrap)(X, treatment, y, tree) for tree in self.uplift_forest)
(delayed(self.bootstrap)(X, treatment, y, X_val, treatment_val, y_val, tree) for tree in self.uplift_forest)
)

all_importances = [tree.feature_importances_ for tree in self.uplift_forest]
self.feature_importances_ = np.mean(all_importances, axis=0)
self.feature_importances_ /= self.feature_importances_.sum() # normalize to add to 1

@staticmethod
def bootstrap(X, treatment, y, tree):
def bootstrap(X, treatment, y, X_val, treatment_val, y_val, tree):
random_state = check_random_state(tree.random_state)
bt_index = random_state.choice(len(X), len(X))
x_train_bt = X[bt_index]
y_train_bt = y[bt_index]
treatment_train_bt = treatment[bt_index]
tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)

if X_val is None:
tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)
else:
bt_val_index = random_state.choice(len(X_val), len(X_val))
x_val_bt = X_val[bt_val_index]
y_val_bt = y_val[bt_val_index]
treatment_val_bt = treatment_val[bt_val_index]

tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt, X_val=x_val_bt, treatment_val=treatment_val_bt, y_val=y_val_bt)
return tree

@ignore_warnings(category=FutureWarning)
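The heart of the change is the validation check in growDecisionTreeFrom: for every candidate split, node summaries (a [p, n] pair of outcome probability and sample count per treatment group) are computed on both the training and validation partitions of each child node, and the split is rejected when any group's train/validation probabilities diverge by more than min(train_p, valid_p) / early_stopping_eval_diff_scale. A minimal standalone sketch of that criterion for a single child node (the function name and the example numbers are illustrative, not part of the commit):

def diverges_on_validation(train_summary, val_summary, early_stopping_eval_diff_scale=1.0):
    # Each summary is a list of [p, n] pairs, one per treatment group,
    # mirroring the output format of UpliftTreeClassifier.tree_node_summary.
    for (train_p, _), (val_p, _) in zip(train_summary, val_summary):
        tolerance = min(train_p, val_p) / early_stopping_eval_diff_scale
        if abs(val_p - train_p) > tolerance:
            return True  # train and validation disagree too much; skip this split
    return False

# Example: the control group agrees, but the treatment group diverges:
# |0.05 - 0.18| = 0.13 > min(0.18, 0.05) / 1.0 = 0.05, so the split is rejected.
assert diverges_on_validation([[0.10, 500], [0.18, 480]], [[0.11, 120], [0.05, 110]])

In the commit this check runs on both the left and right child summaries, and a larger early_stopping_eval_diff_scale shrinks the allowed gap, making the stopping rule stricter.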
40 changes: 31 additions & 9 deletions tests/test_uplift_trees.py
@@ -19,11 +19,24 @@ def test_make_uplift_classification(generate_classification_data):

@pytest.mark.parametrize("backend", ["loky", "threading", "multiprocessing"])
@pytest.mark.parametrize("joblib_prefer", ["threads", "processes"])
@pytest.mark.parametrize("early_stopping", ["true", "false"])
def test_UpliftRandomForestClassifier(
generate_classification_data, backend, joblib_prefer
generate_classification_data, backend, joblib_prefer, early_stopping
):
df, x_names = generate_classification_data()
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_train, df_test, df_val = None, None, None

if early_stopping == "true":
df_train, df_test_val = train_test_split(
df, test_size=0.2, random_state=RANDOM_SEED
)
df_test, df_val = train_test_split(
df_test_val, test_size=0.5, random_state=RANDOM_SEED
)
else:
df_train, df_test = train_test_split(
df, test_size=0.2, random_state=RANDOM_SEED
)

with parallel_backend(backend):
# Train the UpLift Random Forest classifier
@@ -32,14 +45,23 @@ def test_UpliftRandomForestClassifier(
control_name=TREATMENT_NAMES[0],
random_state=RANDOM_SEED,
joblib_prefer=joblib_prefer,
early_stopping_eval_diff_scale=1,
)

uplift_model.fit(
df_train[x_names].values,
treatment=df_train["treatment_group_key"].values,
y=df_train[CONVERSION].values,
)

if early_stopping == "true":
uplift_model.fit(
df_train[x_names].values,
treatment=df_train["treatment_group_key"].values,
y=df_train[CONVERSION].values,
X_val=df_val[x_names].values,
treatment_val=df_val["treatment_group_key"].values,
y_val=df_val[CONVERSION].values,
)
else:
uplift_model.fit(
df_train[x_names].values,
treatment=df_train["treatment_group_key"].values,
y=df_train[CONVERSION].values,
)
predictions = {}
predictions["single"] = uplift_model.predict(df_test[x_names].values)
with parallel_backend("loky", n_jobs=2):
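As the updated test shows, enabling the feature only requires passing validation arrays to fit; bootstrap then resamples the validation set alongside the training set for each tree. A minimal end-to-end sketch, assuming the synthetic-data helper and default column names ('treatment_group_key', 'conversion', control group 'control') used elsewhere in causalml's tests:

from sklearn.model_selection import train_test_split
from causalml.dataset import make_uplift_classification
from causalml.inference.tree import UpliftRandomForestClassifier

# Synthetic uplift data: df holds treatment/outcome columns, x_names the features.
df, x_names = make_uplift_classification()
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

clf = UpliftRandomForestClassifier(
    control_name="control",
    early_stopping_eval_diff_scale=1,  # larger values tighten the train/valid tolerance
)
clf.fit(
    df_train[x_names].values,
    treatment=df_train["treatment_group_key"].values,
    y=df_train["conversion"].values,
    X_val=df_val[x_names].values,                         # validation covariates
    treatment_val=df_val["treatment_group_key"].values,   # validation treatment labels
    y_val=df_val["conversion"].values,                    # validation outcomes
)
uplift = clf.predict(df_val[x_names].values)  # per-treatment uplift estimates

Omitting the X_val/treatment_val/y_val arguments reproduces the previous behavior with no early stopping.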
