-
-
Notifications
You must be signed in to change notification settings - Fork 25.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG+1] Learning curve: Add an option to randomly choose indices for different training sizes #7506
Changes from 13 commits
43ebd81
660db34
9351553
5839808
1263e5a
2cca140
fa40520
abdcc3f
cc9b42d
8cae11e
20209e9
d0ac1ed
9cc8ad8
210a7bb
60b1531
b2d28c3
a6a63e2
7ed713d
1ea3a6e
b6af4d1
c11e29e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -642,7 +642,8 @@ def _shuffle(y, groups, random_state): | |
def learning_curve(estimator, X, y, groups=None, | ||
train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None, | ||
exploit_incremental_learning=False, n_jobs=1, | ||
pre_dispatch="all", verbose=0): | ||
pre_dispatch="all", verbose=0, shuffle=False, | ||
random_state=None): | ||
"""Learning curve. | ||
|
||
Determines cross-validated training and test scores for different training | ||
|
@@ -718,7 +719,14 @@ def learning_curve(estimator, X, y, groups=None, | |
verbose : integer, optional | ||
Controls the verbosity: the higher, the more messages. | ||
|
||
Returns | ||
shuffle : boolean, optional | ||
Whether to shuffle training data before taking prefixes of it based on | ||
``train_sizes``. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you want double-backticks |
||
|
||
random_state : None, int or RandomState | ||
When shuffle=True, pseudo-random number generator state used for | ||
shuffling. If None, use default numpy RNG for shuffling. | ||
|
||
------- | ||
train_sizes_abs : array, shape = (n_unique_ticks,), dtype int | ||
Numbers of training examples that have been used to generate the | ||
|
@@ -759,17 +767,26 @@ def learning_curve(estimator, X, y, groups=None, | |
|
||
parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, | ||
verbose=verbose) | ||
|
||
rng = check_random_state(random_state) if shuffle else None | ||
|
||
if exploit_incremental_learning: | ||
classes = np.unique(y) if is_classifier(estimator) else None | ||
out = parallel(delayed(_incremental_fit_estimator)( | ||
clone(estimator), X, y, classes, train, test, train_sizes_abs, | ||
scorer, verbose) for train, test in cv.split(X, y, groups)) | ||
clone(estimator), X, y, classes, _get_train_indices(train, rng), | ||
test, train_sizes_abs, scorer, verbose) | ||
for train, test in cv_iter) | ||
else: | ||
train_test_proportions = [] | ||
for train, test in cv_iter: | ||
train = _get_train_indices(train, rng) | ||
for n_train_samples in train_sizes_abs: | ||
train_test_proportions.append((train[:n_train_samples], test)) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could have implemented it the following way:
but in this case line: |
||
out = parallel(delayed(_fit_and_score)( | ||
clone(estimator), X, y, scorer, train[:n_train_samples], test, | ||
clone(estimator), X, y, scorer, train, test, | ||
verbose, parameters=None, fit_params=None, return_train_score=True) | ||
for train, test in cv_iter | ||
for n_train_samples in train_sizes_abs) | ||
for train, test in train_test_proportions) | ||
out = np.array(out) | ||
n_cv_folds = out.shape[0] // n_unique_ticks | ||
out = out.reshape(n_cv_folds, n_unique_ticks, 2) | ||
|
@@ -779,6 +796,13 @@ def learning_curve(estimator, X, y, groups=None, | |
return train_sizes_abs, out[0], out[1] | ||
|
||
|
||
def _get_train_indices(train, rng): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually I think the refactoring that I want looks more like: if shuffle:
rng = check_random_state(random_state)
cv_iter = ((rng.permutation(train), test) for train, test in cv_iter) |
||
"""Shuffle training indices if random number generator is specified.""" | ||
if rng is not None: | ||
train = rng.permutation(train) | ||
return train | ||
|
||
|
||
def _translate_train_sizes(train_sizes, n_max_training_samples): | ||
"""Determine absolute sizes of training subsets and validate 'train_sizes'. | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -670,6 +670,31 @@ def test_learning_curve_batch_and_incremental_learning_are_equal(): | |
test_scores_batch.mean(axis=1)) | ||
|
||
|
||
def test_learning_curve_batch_and_incremental_shuffle(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since this test does nothing but check that |
||
X, y = make_classification(n_samples=30, n_features=1, n_informative=1, | ||
n_redundant=0, n_classes=2, | ||
n_clusters_per_class=1, random_state=0) | ||
estimator = MockIncrementalImprovingEstimator(20) | ||
train_sizes, train_scores, test_scores = learning_curve( | ||
estimator, X, y, cv=3, exploit_incremental_learning=True, | ||
train_sizes=np.linspace(0.1, 1.0, 10), | ||
random_state=1, shuffle=True) | ||
assert_array_equal(train_sizes, np.linspace(2, 20, 10)) | ||
assert_array_almost_equal(train_scores.mean(axis=1), | ||
np.linspace(1.9, 1.0, 10)) | ||
assert_array_almost_equal(test_scores.mean(axis=1), | ||
np.linspace(0.1, 1.0, 10)) | ||
|
||
estimator = MockImprovingEstimator(20) | ||
train_sizes, train_scores, test_scores = learning_curve( | ||
estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10), | ||
random_state=1, shuffle=True) | ||
assert_array_almost_equal(train_scores.mean(axis=1), | ||
np.linspace(1.9, 1.0, 10)) | ||
assert_array_almost_equal(test_scores.mean(axis=1), | ||
np.linspace(0.1, 1.0, 10)) | ||
|
||
|
||
def test_learning_curve_n_sample_range_out_of_bounds(): | ||
X, y = make_classification(n_samples=30, n_features=1, n_informative=1, | ||
n_redundant=0, n_classes=2, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"using it based on" could be "taking prefixes of it based on"