Commit

Multiprocessing over features rather than CV folds in Sequential Feature Selection (addressing #191) (#193)
whalebot-helmsman authored and rasbt committed May 18, 2017
1 parent 89a2a0e commit 1b0decf
Showing 4 changed files with 66 additions and 42 deletions.
3 changes: 3 additions & 0 deletions docs/sources/CHANGELOG.md
@@ -9,6 +9,7 @@ The CHANGELOG for the current development version is available at
### Version 0.6.1 (TBD)



##### Downloads

- [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.6.1.zip)
@@ -24,6 +25,8 @@ The CHANGELOG for the current development version is available at
- `plot_decision_regions` now supports plotting decision regions for more than 2 training features. (via [James Bourbeau](https://github.com/jrbourbeau)).


- Parallel execution in `mlxtend.feature_selection.SequentialFeatureSelector` and `mlxtend.feature_selection.ExhaustiveFeatureSelector` is now performed over different feature subsets instead of over the cross-validation folds, to better utilize machines with multiple processors when the number of features is large ([#193](https://github.com/rasbt/mlxtend/pull/193), via [@whalebot-helmsman](https://github.com/whalebot-helmsman)).

##### Bug Fixes

- `SequentialFeatureSelector` now correctly accepts a `None` argument for the `scoring` parameter to infer the default scoring metric from scikit-learn classifiers and regressors.
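For context on the CHANGELOG entry above: after this commit, `n_jobs` fans the work out over candidate feature subsets rather than over the inner cross-validation folds. A minimal usage sketch follows; the estimator, dataset, and parameter values are illustrative and not part of the commit.

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X, y = load_iris(return_X_y=True)

# n_jobs=-1 now parallelizes the scoring of candidate feature subsets,
# not the cross-validation folds inside each evaluation.
sfs = SFS(KNeighborsClassifier(n_neighbors=3),
          k_features=3,
          forward=True,
          floating=False,
          scoring='accuracy',
          cv=5,
          n_jobs=-1)
sfs = sfs.fit(X, y)
print(sfs.k_feature_idx_, sfs.k_score_)
```

Each candidate subset is still scored with `cross_val_score` internally, but with `n_jobs=1`, so the two levels of parallelism do not nest (see the diffs below).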
41 changes: 24 additions & 17 deletions mlxtend/feature_selection/exhaustive_feature_selector.py
@@ -20,6 +20,21 @@
from sklearn.base import MetaEstimatorMixin
from ..externals.name_estimators import _name_estimators
from sklearn.model_selection import cross_val_score
from sklearn.externals.joblib import Parallel, delayed


def _calc_score(selector, X, y, indices):
if selector.cv:
scores = cross_val_score(selector.est_,
X[:, indices], y,
cv=selector.cv,
scoring=selector.scorer,
n_jobs=1,
pre_dispatch=selector.pre_dispatch)
else:
selector.est_.fit(X[:, indices], y)
scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
return indices, scores


class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
@@ -51,10 +66,11 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
otherwise.
No cross-validation if cv is None, False, or 0.
n_jobs : int (default: 1)
The number of CPUs to use for cross validation. -1 means 'all CPUs'.
The number of CPUs to use for evaluating different feature subsets
in parallel. -1 means 'all CPUs'.
pre_dispatch : int, or string (default: '2*n_jobs')
Controls the number of jobs that get dispatched
during parallel execution in cross_val_score.
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
Reducing this number can be useful to avoid an explosion of
memory consumption when more jobs get dispatched than CPUs can process.
This parameter can be:
@@ -147,8 +163,12 @@ def fit(self, X, y):

self.subsets_ = {}
all_comb = len(candidates)
for iteration, c in enumerate(candidates):
cv_scores = self._calc_score(X=X, y=y, indices=c)
n_jobs = min(self.n_jobs, all_comb)
parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
work = enumerate(parallel(delayed(_calc_score)(self, X, y, c)
for c in candidates))

for iteration, (c, cv_scores) in work:

self.subsets_[iteration] = {'feature_idx': c,
'cv_scores': cv_scores,
@@ -173,19 +193,6 @@ def fit(self, X, y):
self.fitted = True
return self

def _calc_score(self, X, y, indices):
if self.cv:
scores = cross_val_score(self.est_,
X[:, indices], y,
cv=self.cv,
scoring=self.scorer,
n_jobs=self.n_jobs,
pre_dispatch=self.pre_dispatch)
else:
self.est_.fit(X[:, indices], y)
scores = np.array([self.scorer(self.est_, X[:, indices], y)])
return scores

def transform(self, X):
"""Return the best selected features from X.
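Two details of the diff above are worth spelling out: `_calc_score` moves from a bound method to a module-level function that takes the selector as an explicit argument, which keeps it easy to ship to joblib worker processes, and it now returns the feature indices together with the scores so that results coming back from the pool can be matched to their subsets. Below is a simplified, self-contained sketch of the same dispatch pattern; the estimator, dataset, and helper name are illustrative, not taken from the commit.

```python
from itertools import combinations

from sklearn.datasets import load_iris
from sklearn.externals.joblib import Parallel, delayed  # plain `joblib` in newer scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


def score_subset(estimator, X, y, indices):
    # Mirrors the module-level _calc_score: score one candidate subset and
    # return the indices alongside the CV scores.
    scores = cross_val_score(estimator, X[:, indices], y, cv=5, n_jobs=1)
    return indices, scores


X, y = load_iris(return_X_y=True)
candidates = list(combinations(range(X.shape[1]), 2))

# One joblib task per candidate feature subset, as in the new fit() loop.
results = Parallel(n_jobs=-1)(
    delayed(score_subset)(LogisticRegression(), X, y, c) for c in candidates)

best_idx, best_scores = max(results, key=lambda r: r[1].mean())
print(best_idx, best_scores.mean())
```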
62 changes: 39 additions & 23 deletions mlxtend/feature_selection/sequential_feature_selector.py
@@ -21,6 +21,21 @@
from sklearn.base import MetaEstimatorMixin
from ..externals.name_estimators import _name_estimators
from sklearn.model_selection import cross_val_score
from sklearn.externals.joblib import Parallel, delayed


def _calc_score(selector, X, y, indices):
if selector.cv:
scores = cross_val_score(selector.est_,
X[:, indices], y,
cv=selector.cv,
scoring=selector.scorer,
n_jobs=1,
pre_dispatch=selector.pre_dispatch)
else:
selector.est_.fit(X[:, indices], y)
scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
return indices, scores


class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
@@ -69,10 +84,11 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
exclusion/inclusion if floating=True and
algorithm gets stuck in cycles.
n_jobs : int (default: 1)
The number of CPUs to use for cross validation. -1 means 'all CPUs'.
The number of CPUs to use for evaluating different feature subsets
in parallel. -1 means 'all CPUs'.
pre_dispatch : int, or string (default: '2*n_jobs')
Controls the number of jobs that get dispatched
during parallel execution in cross_val_score.
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
Reducing this number can be useful to avoid an explosion of
memory consumption when more jobs get dispatched than CPUs can process.
This parameter can be:
@@ -222,7 +238,7 @@ def fit(self, X, y):
k_to_select = self.k_features[0]
k_idx = tuple(range(X.shape[1]))
k = len(k_idx)
k_score = self._calc_score(X, y, k_idx)
k_idx, k_score = _calc_score(self, X, y, k_idx)
self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': k_score,
@@ -325,32 +341,26 @@ def _is_stuck(self, sdq):
stuck = True
return stuck

def _calc_score(self, X, y, indices):
if self.cv:
scores = cross_val_score(self.est_,
X[:, indices], y,
cv=self.cv,
scoring=self.scorer,
n_jobs=self.n_jobs,
pre_dispatch=self.pre_dispatch)
else:
self.est_.fit(X[:, indices], y)
scores = np.array([self.scorer(self.est_, X[:, indices], y)])
return scores

def _inclusion(self, orig_set, subset, X, y):
all_avg_scores = []
all_cv_scores = []
all_subsets = []
res = (None, None, None)
remaining = orig_set - subset
if remaining:
for feature in remaining:
new_subset = tuple(subset | {feature})
cv_scores = self._calc_score(X, y, new_subset)
features = len(remaining)
n_jobs = min(self.n_jobs, features)
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
pre_dispatch=self.pre_dispatch)
work = parallel(delayed(_calc_score)
(self, X, y, tuple(subset | {feature}))
for feature in remaining)

for new_subset, cv_scores in work:
all_avg_scores.append(cv_scores.mean())
all_cv_scores.append(cv_scores)
all_subsets.append(new_subset)

best = np.argmax(all_avg_scores)
res = (all_subsets[best],
all_avg_scores[best],
@@ -364,13 +374,19 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None):
all_avg_scores = []
all_cv_scores = []
all_subsets = []
for p in combinations(feature_set, r=n - 1):
if fixed_feature and fixed_feature not in set(p):
continue
cv_scores = self._calc_score(X, y, p)
features = n
n_jobs = min(self.n_jobs, features)
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
pre_dispatch=self.pre_dispatch)
work = parallel(delayed(_calc_score)(self, X, y, p)
for p in combinations(feature_set, r=n - 1)
if not fixed_feature or fixed_feature in set(p))

for p, cv_scores in work:
all_avg_scores.append(cv_scores.mean())
all_cv_scores.append(cv_scores)
all_subsets.append(p)

best = np.argmax(all_avg_scores)
res = (all_subsets[best],
all_avg_scores[best],
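In `_inclusion` and `_exclusion` above, the serial loop over candidate subsets becomes a single `Parallel(...)` call that dispatches one `_calc_score` task per candidate. Note that `min(self.n_jobs, features)` caps the worker count at the number of candidates, while `-1` ("all CPUs") passes through unchanged because it is smaller than any positive count. The sketch below illustrates the forward step only; it is not the library code, and `score_fn` stands in for the module-level `_calc_score`.

```python
from sklearn.externals.joblib import Parallel, delayed  # plain `joblib` in newer scikit-learn


def forward_step(score_fn, X, y, current_subset, remaining,
                 n_jobs=1, pre_dispatch='2*n_jobs'):
    """Score `current_subset | {feature}` for every remaining feature in parallel.

    `score_fn(X, y, indices)` is assumed to return (indices, cv_scores),
    like the module-level _calc_score in the diff above. `current_subset`
    is a set of column indices, `remaining` an iterable of candidate columns.
    """
    # Cap the worker count at the number of candidates; -1 ("all CPUs")
    # is unaffected because it is smaller than any positive count.
    parallel = Parallel(n_jobs=min(n_jobs, len(remaining)),
                        pre_dispatch=pre_dispatch)
    results = parallel(delayed(score_fn)(X, y, tuple(current_subset | {f}))
                       for f in remaining)
    # Keep the candidate with the highest mean cross-validation score.
    return max(results, key=lambda r: r[1].mean())
```

The exclusion step follows the same pattern, generating `combinations(feature_set, r=n - 1)` and filtering on `fixed_feature` inside the generator expression instead of with a `continue`.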
2 changes: 0 additions & 2 deletions requirements.txt
@@ -3,5 +3,3 @@ numpy>=1.10.4
pandas>=0.17.1
scikit-learn>=0.18
matplotlib>=1.5.1

