diff --git a/causalml/inference/tree/uplift.pyx b/causalml/inference/tree/uplift.pyx
index 6508718e..e5bba528 100644
--- a/causalml/inference/tree/uplift.pyx
+++ b/causalml/inference/tree/uplift.pyx
@@ -220,6 +220,10 @@ class UpliftTreeClassifier:
     n_reg: int, optional (default=100)
         The regularization parameter defined in Rzepakowski et al. 2012,
         the weight (in terms of sample size) of the parent node influence
         on the child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.
+
+    early_stopping_eval_diff_scale: float, optional (default=1)
+        Stop splitting a node when the difference between the train and validation
+        uplift scores exceeds min(train_uplift_score, valid_uplift_score)/early_stopping_eval_diff_scale.
     control_name: string
         The name of the control group (other experiment groups will be regarded as treatment groups).
@@ -240,12 +244,13 @@ class UpliftTreeClassifier:
     """
     def __init__(self, control_name, max_features=None, max_depth=3, min_samples_leaf=100,
-                 min_samples_treatment=10, n_reg=100, evaluationFunction='KL',
+                 min_samples_treatment=10, n_reg=100, early_stopping_eval_diff_scale=1, evaluationFunction='KL',
                  normalization=True, honesty=False, estimation_sample_size=0.5, random_state=None):
         self.max_depth = max_depth
         self.min_samples_leaf = min_samples_leaf
         self.min_samples_treatment = min_samples_treatment
         self.n_reg = n_reg
+        self.early_stopping_eval_diff_scale = early_stopping_eval_diff_scale
         self.max_features = max_features
 
         assert evaluationFunction in ['KL', 'ED', 'Chi', 'CTS', 'DDP', 'IT', 'CIT', 'IDDP'], \
@@ -282,7 +287,7 @@ class UpliftTreeClassifier:
             self.honesty = True
 
-    def fit(self, X, treatment, y):
+    def fit(self, X, treatment, y, X_val=None, treatment_val=None, y_val=None):
         """ Fit the uplift model.
 
         Args
@@ -306,14 +311,23 @@ class UpliftTreeClassifier:
         X, y = check_X_y(X, y)
         treatment = np.asarray(treatment)
         assert len(y) == len(treatment), 'Data length must be equal for X, treatment, and y.'
-
+        if X_val is not None:
+            X_val, y_val = check_X_y(X_val, y_val)
+            treatment_val = np.asarray(treatment_val)
+            assert len(y_val) == len(treatment_val), 'Data length must be equal for X_val, treatment_val, and y_val.'
+
         # Get treatment group keys. self.classes_[0] is reserved for the control group.
         treatment_groups = sorted([x for x in list(set(treatment)) if x != self.control_name])
         self.classes_ = [self.control_name]
         treatment_idx = np.zeros_like(treatment, dtype=int)
+        treatment_val_idx = None
+        if treatment_val is not None:
+            treatment_val_idx = np.zeros_like(treatment_val, dtype=int)
         for i, tr in enumerate(treatment_groups, 1):
             self.classes_.append(tr)
             treatment_idx[treatment == tr] = i
+            if treatment_val_idx is not None:
+                treatment_val_idx[treatment_val == tr] = i
         self.n_class = len(self.classes_)
 
         self.feature_imp_dict = defaultdict(float)
@@ -333,8 +347,9 @@ class UpliftTreeClassifier:
                                                 random_state=self.random_state)
 
         self.fitted_uplift_tree = self.growDecisionTreeFrom(
-            X, treatment_idx, y,
-            max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf,
+            X, treatment_idx, y, X_val, treatment_val_idx, y_val,
+            max_depth=self.max_depth, early_stopping_eval_diff_scale=self.early_stopping_eval_diff_scale,
+            min_samples_leaf=self.min_samples_leaf,
             depth=1, min_samples_treatment=self.min_samples_treatment,
             n_reg=self.n_reg, parentNodeSummary=None
         )
@@ -1118,7 +1133,8 @@ class UpliftTreeClassifier:
             res.append(p)
         return res
 
-    def growDecisionTreeFrom(self, X, treatment_idx, y, max_depth=10,
+    def growDecisionTreeFrom(self, X, treatment_idx, y, X_val, treatment_val_idx, y_val,
+                             early_stopping_eval_diff_scale=1, max_depth=10,
                              min_samples_leaf=100, depth=1,
                              min_samples_treatment=10, n_reg=100,
                              parentNodeSummary=None):
@@ -1133,6 +1149,12 @@ class UpliftTreeClassifier:
             An array containing the treatment group idx for each unit.
         y : array-like, shape = [num_samples]
             An array containing the outcome of interest for each unit.
+        X_val : ndarray, shape = [num_samples, num_features]
+            An ndarray of the covariates used to validate the uplift model.
+        treatment_val_idx : array-like, shape = [num_samples]
+            An array containing the validation treatment group idx for each unit.
+        y_val : array-like, shape = [num_samples]
+            An array containing the validation outcome of interest for each unit.
         max_depth: int, optional (default=10)
             The maximum depth of the tree.
         min_samples_leaf: int, optional (default=100)
@@ -1194,7 +1216,6 @@ class UpliftTreeClassifier:
             else:
                 p_t = currentNodeSummary[suboptTreatment][0]
                 n_t = currentNodeSummary[suboptTreatment][1]
-
             p_value = (1.
                        - stats.norm.cdf(abs(p_c - p_t) / np.sqrt(p_t * (1 - p_t) / n_t + p_c * (1 - p_c) / n_c))) * 2
             upliftScore = [maxDiff, p_value]
@@ -1223,6 +1244,7 @@ class UpliftTreeClassifier:
 
             for value in lsUnique:
                 X_l, X_r, w_l, w_r, y_l, y_r = self.divideSet(X, treatment_idx, y, col, value)
+                # check the split validity on min_samples_leaf
                 if (len(X_l) < min_samples_leaf or
                         len(X_r) < min_samples_leaf):
                     continue
@@ -1233,15 +1255,28 @@ class UpliftTreeClassifier:
                 leftNodeSummary = self.tree_node_summary(w_l, y_l,
                                                          min_samples_treatment=min_samples_treatment,
                                                          n_reg=n_reg,
                                                          parentNodeSummary=currentNodeSummary)
-
                 rightNodeSummary = self.tree_node_summary(w_r, y_r,
-                                                          min_samples_treatment=min_samples_treatment,
+                                                          min_samples_treatment=min_samples_treatment,
                                                           n_reg=n_reg,
                                                           parentNodeSummary=currentNodeSummary)
-
-                # check the split validity on min_samples_treatment
                 assert len(leftNodeSummary) == len(rightNodeSummary)
+                if X_val is not None:
+                    X_val_l, X_val_r, w_val_l, w_val_r, y_val_l, y_val_r = self.divideSet(X_val, treatment_val_idx, y_val, col, value)
+                    leftNodeSummary_val = self.tree_node_summary(w_val_l, y_val_l,
+                                                                 parentNodeSummary=currentNodeSummary)
+                    rightNodeSummary_val = self.tree_node_summary(w_val_r, y_val_r,
+                                                                  parentNodeSummary=currentNodeSummary)
+                    early_stopping_flag = False
+                    for k in range(len(leftNodeSummary_val)):
+                        if (abs(leftNodeSummary_val[k][0]-leftNodeSummary[k][0]) > min(leftNodeSummary_val[k][0],leftNodeSummary[k][0])/early_stopping_eval_diff_scale or
+                            abs(rightNodeSummary_val[k][0]-rightNodeSummary[k][0]) > min(rightNodeSummary_val[k][0],rightNodeSummary[k][0])/early_stopping_eval_diff_scale):
+                            early_stopping_flag = True
+                            break
+                    if early_stopping_flag:
+                        continue
+
+                # check the split validity on min_samples_treatment
                 node_mst = min([stat[1] for stat in leftNodeSummary + rightNodeSummary])
                 if node_mst < min_samples_treatment:
                     continue
@@ -1293,13 +1328,16 @@ class UpliftTreeClassifier:
                         norm_factor = self.normI(n_c, n_c_left, n_t, n_t_left, alpha=0.9)
                     else:
                         norm_factor = 1
-                    gain = gain / norm_factor
+                    gain = gain / norm_factor
                 if (gain > bestGain and len(X_l) > min_samples_leaf and len(X_r) > min_samples_leaf):
                     bestGain = gain
                     bestGainImp = gain_for_imp
                     bestAttribute = (col, value)
-                    best_set_left = [X_l, w_l, y_l]
-                    best_set_right = [X_r, w_r, y_r]
+                    best_set_left = [X_l, w_l, y_l, None, None, None]
+                    best_set_right = [X_r, w_r, y_r, None, None, None]
+                    if X_val is not None:
+                        best_set_left = [X_l, w_l, y_l, X_val_l, w_val_l, y_val_l]
+                        best_set_right = [X_r, w_r, y_r, X_val_r, w_val_r, y_val_r]
 
         dcY = {'impurity': '%.3f' % currentScore, 'samples': '%d' % len(X)}
         # Add treatment size
@@ -1312,12 +1350,12 @@ class UpliftTreeClassifier:
         if bestGain > 0 and depth < max_depth:
             self.feature_imp_dict[bestAttribute[0]] += bestGainImp
             trueBranch = self.growDecisionTreeFrom(
-                *best_set_left, max_depth, min_samples_leaf,
+                *best_set_left, self.early_stopping_eval_diff_scale, max_depth, min_samples_leaf,
                 depth + 1, min_samples_treatment=min_samples_treatment,
                 n_reg=n_reg, parentNodeSummary=currentNodeSummary
             )
             falseBranch = self.growDecisionTreeFrom(
-                *best_set_right, max_depth, min_samples_leaf,
+                *best_set_right, self.early_stopping_eval_diff_scale, max_depth, min_samples_leaf,
                 depth + 1, min_samples_treatment=min_samples_treatment,
                 n_reg=n_reg, parentNodeSummary=currentNodeSummary
             )
@@ -1484,6 +1522,10 @@ class UpliftRandomForestClassifier:
         weight (in terms of sample size) of the parent node influence on the
         child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.
+    early_stopping_eval_diff_scale: float, optional (default=1)
+        Stop splitting a node when the difference between the train and validation
+        uplift scores exceeds min(train_uplift_score, valid_uplift_score)/early_stopping_eval_diff_scale.
+
     control_name: string
         The name of the control group (other experiment groups will be
         regarded as treatment groups)
@@ -1521,6 +1563,7 @@ class UpliftRandomForestClassifier:
                  min_samples_leaf=100,
                  min_samples_treatment=10,
                  n_reg=10,
+                 early_stopping_eval_diff_scale=1,
                  evaluationFunction='KL',
                  normalization=True,
                  honesty=False,
@@ -1538,6 +1581,7 @@ class UpliftRandomForestClassifier:
         self.min_samples_leaf = min_samples_leaf
         self.min_samples_treatment = min_samples_treatment
         self.n_reg = n_reg
+        self.early_stopping_eval_diff_scale = early_stopping_eval_diff_scale
         self.evaluationFunction = evaluationFunction
         self.control_name = control_name
         self.normalization = normalization
@@ -1554,7 +1598,7 @@ class UpliftRandomForestClassifier:
         if self.n_jobs == -1:
             self.n_jobs = mp.cpu_count()
 
-    def fit(self, X, treatment, y):
+    def fit(self, X, treatment, y, X_val=None, treatment_val=None, y_val=None):
         """
         Fit the UpliftRandomForestClassifier.
@@ -1568,6 +1612,15 @@ class UpliftRandomForestClassifier:
 
         y : array-like, shape = [num_samples]
             An array containing the outcome of interest for each unit.
+
+        X_val : ndarray, shape = [num_samples, num_features]
+            An ndarray of the covariates used to validate the uplift model.
+
+        treatment_val : array-like, shape = [num_samples]
+            An array containing the validation treatment group for each unit.
+
+        y_val : array-like, shape = [num_samples]
+            An array containing the validation outcome of interest for each unit.
         """
         random_state = check_random_state(self.random_state)
@@ -1578,6 +1631,7 @@ class UpliftRandomForestClassifier:
             min_samples_leaf=self.min_samples_leaf,
             min_samples_treatment=self.min_samples_treatment,
             n_reg=self.n_reg,
+            early_stopping_eval_diff_scale=self.early_stopping_eval_diff_scale,
             evaluationFunction=self.evaluationFunction,
             control_name=self.control_name,
             normalization=self.normalization,
@@ -1595,7 +1649,7 @@ class UpliftRandomForestClassifier:
 
         self.uplift_forest = (
             Parallel(n_jobs=self.n_jobs, prefer=self.joblib_prefer)
-            (delayed(self.bootstrap)(X, treatment, y, tree) for tree in self.uplift_forest)
+            (delayed(self.bootstrap)(X, treatment, y, X_val, treatment_val, y_val, tree) for tree in self.uplift_forest)
         )
 
         all_importances = [tree.feature_importances_ for tree in self.uplift_forest]
@@ -1603,13 +1657,22 @@ class UpliftRandomForestClassifier:
         self.feature_importances_ /= self.feature_importances_.sum()  # normalize to add to 1
 
     @staticmethod
-    def bootstrap(X, treatment, y, tree):
+    def bootstrap(X, treatment, y, X_val, treatment_val, y_val, tree):
         random_state = check_random_state(tree.random_state)
         bt_index = random_state.choice(len(X), len(X))
         x_train_bt = X[bt_index]
         y_train_bt = y[bt_index]
         treatment_train_bt = treatment[bt_index]
-        tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)
+
+        if X_val is None:
+            tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)
+        else:
+            bt_val_index = random_state.choice(len(X_val), len(X_val))
+            x_val_bt = X_val[bt_val_index]
+            y_val_bt = y_val[bt_val_index]
+            treatment_val_bt = treatment_val[bt_val_index]
+
+            tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt, X_val=x_val_bt, treatment_val=treatment_val_bt, y_val=y_val_bt)
         return tree
 
     @ignore_warnings(category=FutureWarning)
diff --git a/tests/test_uplift_trees.py b/tests/test_uplift_trees.py
index b3bb2346..a18f6e5f 100644
--- a/tests/test_uplift_trees.py
+++ b/tests/test_uplift_trees.py
@@ -19,11 +19,24 @@ def test_make_uplift_classification(generate_classification_data):
 
 @pytest.mark.parametrize("backend", ["loky", "threading", "multiprocessing"])
 @pytest.mark.parametrize("joblib_prefer", ["threads", "processes"])
+@pytest.mark.parametrize("early_stopping", ["true", "false"])
 def test_UpliftRandomForestClassifier(
-    generate_classification_data, backend, joblib_prefer
+    generate_classification_data, backend, joblib_prefer, early_stopping
 ):
     df, x_names = generate_classification_data()
-    df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
+    df_train, df_test, df_val = None, None, None
+
+    if early_stopping == "true":
+        df_train, df_test_val = train_test_split(
+            df, test_size=0.2, random_state=RANDOM_SEED
+        )
+        df_test, df_val = train_test_split(
+            df_test_val, test_size=0.5, random_state=RANDOM_SEED
+        )
+    else:
+        df_train, df_test = train_test_split(
+            df, test_size=0.2, random_state=RANDOM_SEED
+        )
 
     with parallel_backend(backend):
         # Train the UpLift Random Forest classifier
@@ -32,14 +45,23 @@ def test_UpliftRandomForestClassifier(
             control_name=TREATMENT_NAMES[0],
             random_state=RANDOM_SEED,
             joblib_prefer=joblib_prefer,
+            early_stopping_eval_diff_scale=1,
         )
-
-        uplift_model.fit(
-            df_train[x_names].values,
-            treatment=df_train["treatment_group_key"].values,
-            y=df_train[CONVERSION].values,
-        )
-
+        if early_stopping == "true":
+            uplift_model.fit(
+                df_train[x_names].values,
+                treatment=df_train["treatment_group_key"].values,
+                y=df_train[CONVERSION].values,
+                X_val=df_val[x_names].values,
+                treatment_val=df_val["treatment_group_key"].values,
+                y_val=df_val[CONVERSION].values,
+            )
+        else:
+            uplift_model.fit(
+                df_train[x_names].values,
+                treatment=df_train["treatment_group_key"].values,
+                y=df_train[CONVERSION].values,
+            )
         predictions = {}
         predictions["single"] = uplift_model.predict(df_test[x_names].values)
 
         with parallel_backend("loky", n_jobs=2):
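Review note: the split-level check added in `growDecisionTreeFrom` is easier to read outside Cython. Below is a minimal restatement of the criterion, assuming node summaries are lists of `[p, n]` pairs (response rate and sample size per treatment group), as returned by `tree_node_summary`; the helper name is hypothetical and not part of the patch:

```python
def early_stopping_triggered(train_summary, val_summary, scale=1.0):
    """Sketch of the patch's per-node check: flag a candidate child node
    when, for any treatment group, the train/validation gap in response
    rate exceeds min(train, val) / scale.

    Each summary is a list of [p, n] pairs (response rate, sample size)
    per treatment group, mirroring tree_node_summary's output.
    """
    for (p_train, _), (p_val, _) in zip(train_summary, val_summary):
        if abs(p_val - p_train) > min(p_val, p_train) / scale:
            return True
    return False
```

In the patch this check runs on both the left and right child summaries; if either trips, the loop `continue`s past the candidate split. Note the direction of the knob: a larger `early_stopping_eval_diff_scale` shrinks the tolerated train/validation gap, so splits are rejected more aggressively.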
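For completeness, a usage sketch of the new validation arguments, mirroring the updated test above. `make_uplift_classification` and the `treatment_group_key`/`conversion` column names come from causalml's data-generation helper and test fixtures; treat the exact hyperparameter values as illustrative:

```python
from sklearn.model_selection import train_test_split

from causalml.dataset import make_uplift_classification
from causalml.inference.tree import UpliftRandomForestClassifier

# Synthetic data with a 'treatment_group_key' column for group labels
# and a 'conversion' column for the binary outcome.
df, x_names = make_uplift_classification(
    n_samples=1000, treatment_name=["control", "treatment1"]
)
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

clf = UpliftRandomForestClassifier(
    control_name="control",
    early_stopping_eval_diff_scale=1,
    random_state=42,
)
# Supplying X_val/treatment_val/y_val enables the per-split early
# stopping added by this patch; omitting them keeps the old behavior.
clf.fit(
    df_train[x_names].values,
    treatment=df_train["treatment_group_key"].values,
    y=df_train["conversion"].values,
    X_val=df_val[x_names].values,
    treatment_val=df_val["treatment_group_key"].values,
    y_val=df_val["conversion"].values,
)
uplift = clf.predict(df_val[x_names].values)
```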