[WIP] Fixing data leak with warm starting in GBDT #15032

Open · wants to merge 14 commits into base: main
2 changes: 1 addition & 1 deletion build_tools/azure/test_script.sh

@@ -21,7 +21,7 @@ except ImportError:
 python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())"
 pip list
 
-TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML"
+TEST_CMD="python -m pytest -k test_gradient_boosting_early_stopping -s --showlocals --durations=20 --junitxml=$JUNITXML"
 
 if [[ "$COVERAGE" == "true" ]]; then
     export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc"
9 changes: 8 additions & 1 deletion sklearn/ensemble/_gb.py

@@ -1435,6 +1435,14 @@ def fit(self, X, y, sample_weight=None, monitor=None):
         if not self.warm_start:
             self._clear_state()
 
+        rng = check_random_state(self.random_state)
+
+        # When warm starting, we want to re-use the same seed that was used
+        # the first time fit was called (e.g. for subsampling or for the
+        # train/val split).
+        if not (self.warm_start and self._is_initialized()):
+            self._random_seed = rng.randint(np.iinfo(np.uint32).max,
+                                            dtype='u8')
         # Check input
         # Since check_array converts both X and y to the same dtype, but the
         # trees use different types for X and y, checking them separately.
@@ -1458,7 +1466,7 @@
                stratify = y if is_classifier(self) else None
                X, X_val, y, y_val, sample_weight, sample_weight_val = (
                    train_test_split(X, y, sample_weight,
-                                    random_state=self.random_state,
+                                    random_state=self._random_seed,
                                     test_size=self.validation_fraction,
                                     stratify=stratify))
                if is_classifier(self):
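
For reference, the seeding scheme above can be exercised in isolation. The snippet below is a minimal, self-contained sketch (the WarmStartSeedDemo class is hypothetical, not part of scikit-learn): a seed is drawn once from the user-supplied random_state and re-used verbatim on warm-started refits, which is what keeps the train/val split fixed.

    import numpy as np
    from sklearn.utils import check_random_state

    class WarmStartSeedDemo:
        # Hypothetical stand-in for the estimator logic patched above.
        def __init__(self, random_state=None, warm_start=False):
            self.random_state = random_state
            self.warm_start = warm_start
            self._fitted = False

        def fit(self):
            rng = check_random_state(self.random_state)
            # Draw a fresh seed only on a cold fit; warm-started refits
            # keep the seed drawn the first time fit was called.
            if not (self.warm_start and self._fitted):
                self._random_seed = rng.randint(np.iinfo(np.uint32).max,
                                                dtype='u8')
            self._fitted = True
            return self._random_seed

    demo = WarmStartSeedDemo(random_state=None, warm_start=True)
    assert demo.fit() == demo.fit()  # warm start: same seed both times

Passing that integer seed (rather than self.random_state itself) to train_test_split is what makes the validation set identical across warm-started calls.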
57 changes: 53 additions & 4 deletions sklearn/ensemble/tests/test_gradient_boosting.py

@@ -689,7 +689,7 @@ def test_oob_improvement():
     assert clf.oob_improvement_.shape[0] == 100
     # hard-coded regression test - change if modification in OOB computation
     assert_array_almost_equal(clf.oob_improvement_[:5],
-                              np.array([0.19, 0.15, 0.12, -0.12, -0.11]),
+                              np.array([0.19, 0.16, 0.13, -0.12, -0.12]),
                               decimal=2)
@@ -1225,9 +1225,9 @@ def test_gradient_boosting_early_stopping():
     X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                         random_state=42)
     # Check if early_stopping works as expected
-    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 28), (gbr, 1e-1, 13),
-                                              (gbc, 1e-3, 70),
-                                              (gbr, 1e-3, 28)):
+    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 28), (gbr, 1e-1, 14),
+                                              (gbc, 1e-3, 65),
+                                              (gbr, 1e-3, 49)):
         est.set_params(tol=tol)
         est.fit(X_train, y_train)
         assert est.n_estimators_ == early_stop_n_estimators
@@ -1403,3 +1403,52 @@ def test_presort_deprecated(Cls, presort):
     with pytest.warns(DeprecationWarning,
                       match="The parameter 'presort' is deprecated "):
         gb.fit(X, y)
+
+
+@pytest.mark.parametrize('GradientBoosting, X, y', [
+    (GradientBoostingClassifier, X, y),
+    (GradientBoostingRegressor, X, y)
+])
+@pytest.mark.parametrize('rng_type', ('none', 'int', 'instance'))
+def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
+    # Make sure the seeds for the train/val split and for subsampling are
+    # correctly set in a warm start context.
+    def _get_rng(rng_type):
+        # Helper to avoid consuming rngs
+        if rng_type == 'none':
+            return None
+        elif rng_type == 'int':
+            return 42
+        else:
+            return np.random.RandomState(0)
+
+    random_state = _get_rng(rng_type)
+    gb_1 = GradientBoosting(n_estimators=3, random_state=random_state)
+    gb_1.fit(X, y)
+    random_seed_1_1 = gb_1._random_seed
+
+    gb_1.fit(X, y)
+    random_seed_1_2 = gb_1._random_seed  # clears the old state: new seed
+
+    random_state = _get_rng(rng_type)
+    gb_2 = GradientBoosting(n_estimators=3, random_state=random_state,
+                            warm_start=True)
+    gb_2.fit(X, y)  # inits state
+    random_seed_2_1 = gb_2._random_seed
+    gb_2.fit(X, y)  # state kept: the first seed is re-used
+    random_seed_2_2 = gb_2._random_seed
+
+    # Without warm starting, the seeds should be
+    # * all different if random state is None
+    # * all equal if random state is an integer
+    # * different when refitting and equal with a new estimator (because
+    #   the random state is mutated)
+    if rng_type == 'none':
+        assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1
+    elif rng_type == 'int':
+        assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
+    else:
+        assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2
+
+    # With warm starting, the seeds must be equal
+    assert random_seed_2_1 == random_seed_2_2
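
To see the leak this PR closes, consider the pre-fix behavior in isolation: when random_state is a RandomState instance (or None), each fit call consumes the generator, so train_test_split produces a different split on every warm-started call and rows that were in the validation set earlier end up in the training set. A minimal sketch, standalone and not using the estimator itself:

    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.arange(200).reshape(100, 2)
    y = np.arange(100)

    # Pre-fix: the RandomState instance is consumed, so a second "fit"
    # gets a different train/val split -- the old validation rows leak.
    rng = np.random.RandomState(0)
    _, X_val_1, _, _ = train_test_split(X, y, test_size=0.1, random_state=rng)
    _, X_val_2, _, _ = train_test_split(X, y, test_size=0.1, random_state=rng)
    assert not np.array_equal(X_val_1, X_val_2)  # split changed between calls

    # Post-fix: an integer seed drawn once and re-used pins the split.
    seed = np.random.RandomState(0).randint(np.iinfo(np.uint32).max, dtype='u8')
    _, X_val_3, _, _ = train_test_split(X, y, test_size=0.1, random_state=seed)
    _, X_val_4, _, _ = train_test_split(X, y, test_size=0.1, random_state=seed)
    assert np.array_equal(X_val_3, X_val_4)  # validation set is stable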