[MRG] extending BaseSearchCV with a custom search strategy #9599
Changes from 30 commits
```diff
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -406,7 +406,8 @@ def __repr__(self):

 class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
                                       MetaEstimatorMixin)):
-    """Base class for hyper parameter search with cross-validation."""
+    """Abstract base class for hyper parameter search with cross-validation.
+    """

     @abstractmethod
     def __init__(self, estimator, scoring=None,
```
```diff
@@ -577,6 +578,30 @@ def classes_(self):
         self._check_is_fitted("classes_")
         return self.best_estimator_.classes_

+    @abstractmethod
+    def _run_search(self, evaluate_candidates):
+        """Repeatedly calls `evaluate_candidates` to conduct a search.
+
+        Parameters
+        ----------
+        evaluate_candidates : callable
+            This callback accepts a list of candidates, where each candidate is
+            a dict of parameter settings. It returns a dict of all results so
+            far, formatted like ``cv_results_``.
```
**Review comment:** Could we refactor this function? I think a better implementation would be

```python
def _generate_candidates(self):
    params = results = None
    while True:
        params = self._candidates(params, results)
        if params is None:
            break
        results = yield params
```

where …

**Review comment:** This would allow the documentation of …
```diff
+
+        Examples
+        --------
+
+        ::
+
+            def _run_search(self):
```

**Review comment:** typo:

```python
def _run_search(self, evaluate_candidates):
    ...
```
```diff
+                'Try C=0.1 only if C=1 is better than C=10'
+                all_results = evaluate_candidates([{'C': 1}, {'C': 10}])
+                score = all_results['mean_test_score']
+                if score[0] < score[1]:
+                    evaluate_candidates([{'C': 0.1}])
+        """
+
     def fit(self, X, y=None, groups=None, **fit_params):
         """Run fit with all sets of parameters.
```
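The docstring example above can be turned into a working subclass with very little code. A hedged sketch (the class name and the refinement strategy are invented, and `BaseSearchCV` lives in the private module `sklearn.model_selection._search`, so this is for illustration only):

```python
from sklearn.model_selection._search import BaseSearchCV  # private module


class AdaptiveCSearch(BaseSearchCV):
    """Illustrative search: refine around the better of two coarse C values."""

    def __init__(self, estimator, scoring=None, cv=None):
        super(AdaptiveCSearch, self).__init__(
            estimator=estimator, scoring=scoring, cv=cv)

    def _run_search(self, evaluate_candidates):
        # Round 1: coarse candidates.
        results = evaluate_candidates([{'C': 1}, {'C': 10}])
        means = results['mean_test_score']
        # Round 2: halve/double whichever value scored best so far.
        best_c = results['params'][means.argmax()]['C']
        evaluate_candidates([{'C': best_c / 2.0}, {'C': best_c * 2.0}])
```

Used like any other search, e.g. `AdaptiveCSearch(SVC()).fit(X, y)`, after which `cv_results_` holds all four evaluated candidates.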
```diff
@@ -636,29 +661,86 @@ def fit(self, X, y=None, groups=None, **fit_params):

         X, y, groups = indexable(X, y, groups)
         n_splits = cv.get_n_splits(X, y, groups)
-        # Regenerate parameter iterable for each fit
-        candidate_params = list(self._get_param_iterator())
-        n_candidates = len(candidate_params)
-        if self.verbose > 0:
-            print("Fitting {0} folds for each of {1} candidates, totalling"
-                  " {2} fits".format(n_splits, n_candidates,
-                                     n_candidates * n_splits))

         base_estimator = clone(self.estimator)
-        pre_dispatch = self.pre_dispatch

-        out = Parallel(
-            n_jobs=self.n_jobs, verbose=self.verbose,
-            pre_dispatch=pre_dispatch
-        )(delayed(_fit_and_score)(clone(base_estimator), X, y, scorers, train,
-                                  test, self.verbose, parameters,
-                                  fit_params=fit_params,
-                                  return_train_score=self.return_train_score,
-                                  return_n_test_samples=True,
-                                  return_times=True, return_parameters=False,
-                                  error_score=self.error_score)
-          for parameters, (train, test) in product(candidate_params,
-                                                   cv.split(X, y, groups)))
+        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
+                            pre_dispatch=self.pre_dispatch)
+
+        fit_and_score_kwargs = dict(scorer=scorers,
+                                    fit_params=fit_params,
+                                    return_train_score=self.return_train_score,
+                                    return_n_test_samples=True,
+                                    return_times=True,
+                                    return_parameters=False,
+                                    error_score=self.error_score,
+                                    verbose=self.verbose)
+        results_container = [{}]
+        with parallel:
+            all_candidate_params = []
+            all_out = []
+
+            def evaluate_candidates(candidate_params):
+                candidate_params = list(candidate_params)
+                n_candidates = len(candidate_params)
+
+                if self.verbose > 0:
+                    print("Fitting {0} folds for each of {1} candidates,"
+                          " totalling {2} fits".format(
+                              n_splits, n_candidates, n_candidates * n_splits))
+
+                out = parallel(delayed(_fit_and_score)(clone(base_estimator),
+                                                       X, y,
+                                                       train=train, test=test,
+                                                       parameters=parameters,
+                                                       **fit_and_score_kwargs)
+                               for parameters, (train, test)
+                               in product(candidate_params,
+                                          cv.split(X, y, groups)))
+
+                all_candidate_params.extend(candidate_params)
+                all_out.extend(out)
+
+                # XXX: When we drop Python 2 support, we can use nonlocal
+                # instead of results_container
+                results_container[0] = self._format_results(
+                    all_candidate_params, scorers, n_splits, all_out)
+                return results_container[0]
+
+            self._run_search(evaluate_candidates)
+
+        results = results_container[0]
+
+        # For multi-metric evaluation, store the best_index_, best_params_ and
+        # best_score_ iff refit is one of the scorer names
+        # In single metric evaluation, refit_metric is "score"
+        if self.refit or not self.multimetric_:
+            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
+            self.best_params_ = results["params"][self.best_index_]
+            self.best_score_ = results["mean_test_%s" % refit_metric][
+                self.best_index_]
+
+        if self.refit:
+            self.best_estimator_ = clone(base_estimator).set_params(
+                **self.best_params_)
+            refit_start_time = time.time()
+            if y is not None:
+                self.best_estimator_.fit(X, y, **fit_params)
+            else:
+                self.best_estimator_.fit(X, **fit_params)
+            refit_end_time = time.time()
+            self.refit_time_ = refit_end_time - refit_start_time
+
+        # Store the only scorer not as a dict for single metric evaluation
+        self.scorer_ = scorers if self.multimetric_ else scorers['score']
+
+        self.cv_results_ = results
+        self.n_splits_ = n_splits
+
+        return self
+
+    def _format_results(self, candidate_params, scorers, n_splits, out):
+        n_candidates = len(candidate_params)

         # if one choose to see train score, "out" will contain train score info
         if self.return_train_score:
```
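The `results_container = [{}]` one-element list is only there because Python 2 has no `nonlocal` statement, as the XXX comment notes. A minimal sketch of the Python 3 equivalent (illustrative, not part of the diff):

```python
def fit_like():
    results = {}

    def evaluate_candidates(candidates):
        # Python 3 only: rebind ``results`` in the enclosing scope directly,
        # with no single-element-list workaround.
        nonlocal results
        results = {'params': list(candidates)}
        return results

    evaluate_candidates([{'C': 1}])
    return results


print(fit_like())  # {'params': [{'C': 1}]}
```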
```diff
@@ -744,7 +826,6 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
                 prev_keys = set(results.keys())
                 _store('train_%s' % scorer_name, train_scores[scorer_name],
                        splits=True)
-
                 if self.return_train_score == 'warn':
                     for key in set(results.keys()) - prev_keys:
                         message = (
```
```diff
@@ -755,33 +836,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
                     # warn on key access
                     results.add_warning(key, message, FutureWarning)

-        # For multi-metric evaluation, store the best_index_, best_params_ and
-        # best_score_ iff refit is one of the scorer names
-        # In single metric evaluation, refit_metric is "score"
-        if self.refit or not self.multimetric_:
-            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
-            self.best_params_ = candidate_params[self.best_index_]
-            self.best_score_ = results["mean_test_%s" % refit_metric][
-                self.best_index_]
-
-        if self.refit:
-            self.best_estimator_ = clone(base_estimator).set_params(
-                **self.best_params_)
-            refit_start_time = time.time()
-            if y is not None:
-                self.best_estimator_.fit(X, y, **fit_params)
-            else:
-                self.best_estimator_.fit(X, **fit_params)
-            refit_end_time = time.time()
-            self.refit_time_ = refit_end_time - refit_start_time
-
-        # Store the only scorer not as a dict for single metric evaluation
-        self.scorer_ = scorers if self.multimetric_ else scorers['score']
-
-        self.cv_results_ = results
-        self.n_splits_ = n_splits
-
-        return self
+        return results
```
In `GridSearchCV`:

```diff
@@ -1100,9 +1155,9 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
         self.param_grid = param_grid
         _check_param_grid(param_grid)

-    def _get_param_iterator(self):
-        """Return ParameterGrid instance for the given param_grid"""
-        return ParameterGrid(self.param_grid)
+    def _run_search(self, evaluate_candidates):
+        """Search all candidates in param_grid"""
+        evaluate_candidates(ParameterGrid(self.param_grid))
```
In `RandomizedSearchCV`:

```diff
@@ -1414,8 +1469,8 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
             pre_dispatch=pre_dispatch, error_score=error_score,
             return_train_score=return_train_score)

-    def _get_param_iterator(self):
-        """Return ParameterSampler instance for the given distributions"""
-        return ParameterSampler(
+    def _run_search(self, evaluate_candidates):
+        """Search n_iter candidates from param_distributions"""
+        evaluate_candidates(ParameterSampler(
             self.param_distributions, self.n_iter,
-            random_state=self.random_state)
+            random_state=self.random_state))
```
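Since both built-in classes now route through `evaluate_candidates` (one call with the full grid, one call with the `n_iter` sampled settings), their public behaviour should be unchanged. A quick smoke test, with assumed toy data:

```python
from scipy.stats import expon
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# Grid search: _run_search makes a single evaluate_candidates call.
grid = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=3).fit(X, y)

# Randomized search: likewise, one call with n_iter sampled settings.
rand = RandomizedSearchCV(SVC(), {'C': expon(scale=10)}, n_iter=5,
                          cv=3, random_state=0).fit(X, y)

print(grid.best_params_, rand.best_params_)
```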
**Review comment:** Please add some motivation for the intent of this abstract method. For instance:

> This method, implemented in sub-classes, makes it possible to customize the scheduling of evaluations: `GridSearchCV` and `RandomizedSearchCV` schedule evaluations for their whole parameter search space at once, but other, more sequential approaches are also possible: for instance, it is possible to iteratively schedule evaluations for new regions of the parameter search space based on previously collected evaluation results. This makes it possible to implement Bayesian optimization, or more generally sequential model-based optimization, by deriving from the `BaseSearchCV` abstract base class.
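To make the suggested motivation concrete, here is a hedged sketch of the kind of sequential search that text describes. The class and its greedy strategy are invented for illustration, and it again relies on the private `BaseSearchCV` base class:

```python
import numpy as np

from sklearn.model_selection._search import BaseSearchCV  # private module


class HillClimbSearchCV(BaseSearchCV):
    """Illustrative sequential search: greedily halves/doubles ``C``."""

    def __init__(self, estimator, n_rounds=5, scoring=None, cv=None):
        super(HillClimbSearchCV, self).__init__(
            estimator=estimator, scoring=scoring, cv=cv)
        self.n_rounds = n_rounds

    def _run_search(self, evaluate_candidates):
        c, best = 1.0, -np.inf
        for _ in range(self.n_rounds):
            # Schedule only the neighbours of the current point; the dict
            # returned is cumulative, formatted like ``cv_results_``.
            results = evaluate_candidates([{'C': c / 2.0}, {'C': c * 2.0}])
            scores = results['mean_test_score']
            if scores.max() <= best:
                break  # no candidate improved on the incumbent: stop early
            best = scores.max()
            c = results['params'][scores.argmax()]['C']
```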
**Review comment:** Very nice text!