
Commit

simplify recursive floating implementation
rasbt committed Oct 14, 2017
1 parent 4e3ca62 commit 021b930
Showing 3 changed files with 62 additions and 144 deletions.
2 changes: 1 addition & 1 deletion docs/sources/CHANGELOG.md
@@ -19,13 +19,13 @@ The CHANGELOG for the current development version is available at
- Added `'leverage'` and `'conviction'` as evaluation metrics to the `frequent_patterns.association_rules` function. [#246](https://github.com/rasbt/mlxtend/pull/246) & [#247](https://github.com/rasbt/mlxtend/pull/247)
- Added a `loadings_` attribute to `PrincipalComponentAnalysis` to compute the factor loadings of the features on the principal components. [#251](https://github.com/rasbt/mlxtend/pull/251)
- Allow grid search over classifiers/regressors in ensemble and stacking estimators [#259](https://github.com/rasbt/mlxtend/pull/259)
- Added a `recursive_floating` parameter to the `SequentialFeatureSelector` to enable the continuation of the floating inclusion/exclusion as described in Novovicova & Kittler (1994) [#262](https://github.com/rasbt/mlxtend/pull/262)

##### Changes

- The `'support'` column returned by `frequent_patterns.association_rules` was changed to compute the support of "antecedant union consequent", and new `'antecedant support'` and `'consequent support'` columns were added to avoid ambiguity. [#245](https://github.com/rasbt/mlxtend/pull/245)
- Allow the `OnehotTransactions` to be cloned via scikit-learn's `clone` function, which is required by e.g., scikit-learn's `FeatureUnion` or `GridSearchCV` (via [Iaroslav Shcherbatyi](https://github.com/iaroslav-ai)). [#249](https://github.com/rasbt/mlxtend/pull/249)
- All feature index tuples in `SequentialFeatureSelector` are now in sorted order [#262](https://github.com/rasbt/mlxtend/pull/262)
- The `SequentialFeatureSelector` now continues the conditional (floating) inclusion/exclusion as long as it improves performance, as described in Novovicova & Kittler (1994). This didn't cause any difference in performance in any of the test scenarios but could lead to better performance in certain edge cases. [#262](https://github.com/rasbt/mlxtend/pull/262)
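
  For illustration, a minimal usage sketch of the floating selection described above (it mirrors the `test_knn_option_sffs` test further down in this commit; data and parameter values are examples only):

  ```python
  from mlxtend.feature_selection import SequentialFeatureSelector as SFS
  from sklearn.datasets import load_iris
  from sklearn.neighbors import KNeighborsClassifier

  iris = load_iris()
  X, y = iris.data, iris.target

  knn = KNeighborsClassifier(n_neighbors=4)

  # SFFS: forward selection with a conditional exclusion step after each
  # inclusion; the conditional step is continued as long as it improves the
  # cross-validation score (no extra parameter needed).
  sffs = SFS(knn,
             k_features=3,
             forward=True,
             floating=True,
             cv=4)
  sffs = sffs.fit(X, y)

  print(sffs.k_feature_idx_)  # (1, 2, 3) on this data, per the test below
  print(sffs.k_score_)
  ```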

##### Bug Fixes

156 changes: 61 additions & 95 deletions mlxtend/feature_selection/sequential_feature_selector.py
@@ -103,17 +103,6 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
if False. Set to False if the estimator doesn't
implement scikit-learn's set_params and get_params methods.
In addition, it is required to set cv=0, and n_jobs=1.
recursive_floating : bool (default=False)
If `True`, uses the floating behavior described in
Novovicova & Kittler (1994) for the `floating=True` variants.
That is, the continuation of conditional exclusion in SFFS
(skipping inclusion if feature removal leads to better performance).
Similarly, the conditional inclusion is continued in SFBS
(and consequently the exclusion is skipped) as long as it leads
to improvements in performance.
Note: `recursive_floating` will be set to `True` in future versions
of mlxtend (mlxtend > 0.10).
Attributes
----------
@@ -136,8 +125,7 @@ def __init__(self, estimator, k_features=1,
verbose=0, scoring=None,
cv=5, n_jobs=1,
pre_dispatch='2*n_jobs',
clone_estimator=True,
recursive_floating=False):
clone_estimator=True):

self.estimator = estimator
self.k_features = k_features
Expand All @@ -154,7 +142,6 @@ def __init__(self, estimator, k_features=1,
self.named_est = {key: value for key, value in
_name_estimators([self.estimator])}
self.clone_estimator = clone_estimator
self.recursive_floating = recursive_floating

if self.clone_estimator:
self.est_ = clone(self.estimator)
@@ -249,6 +236,8 @@ def fit(self, X, y):

self.subsets_ = {}
orig_set = set(range(X.shape[1]))
n_features = X.shape[1]

if self.forward:
if select_in_range:
k_to_select = max_k
@@ -268,117 +257,92 @@
best_subset = None
k_score = 0

# to check for continuation of exclusion
# if self.forward is True
# (or equivalent for backward selection)
continuation = False

try:

while k != k_to_select:

prev_subset = set(k_idx)

if self.forward:
# allows continuation of exclusion
# if both recursive_floating and floating are true
# and k > 2
if (not self.floating or
(self.floating and not self.recursive_floating) or
(self.floating and self.recursive_floating
and not continuation)):

k_idx, k_score, cv_scores = self._inclusion(
orig_set=orig_set,
subset=prev_subset,
X=X,
y=y)
k_idx, k_score, cv_scores = self._inclusion(
orig_set=orig_set,
subset=prev_subset,
X=X,
y=y
)
else:

# allows continuation of inclusion
# if both recursive_floating and floating are true
# and more than 2 features have been removed
if (not self.floating or
(self.floating and not self.recursive_floating) or
(self.floating and self.recursive_floating
and not continuation)):

k_idx, k_score, cv_scores = self._exclusion(
feature_set=prev_subset,
X=X,
y=y)
k_idx, k_score, cv_scores = self._exclusion(
feature_set=prev_subset,
X=X,
y=y
)

if self.floating:
k_score_c = None

if not self.recursive_floating:
(new_feature,) = set(k_idx) ^ prev_subset
else:
new_feature = None

if self.forward:

if self.recursive_floating and len(k_idx) <= 2:
continuation = False
continue

k_idx_c, k_score_c, cv_scores_c = self._exclusion(
feature_set=k_idx,
fixed_feature=new_feature,
X=X,
y=y)

continuation_cond_1 = len(k_idx)
else:
continuation_cond_1 = n_features - len(k_idx)

if self.recursive_floating:
num_features_removed = len(orig_set) - len(k_idx)
if num_features_removed <= 2:
continuation = False
continue
continuation_cond_2 = True
ran_step_1 = True
new_feature = None

k_idx_c, k_score_c, cv_scores_c = self._inclusion(
orig_set=orig_set - {new_feature},
subset=set(k_idx),
X=X,
y=y)
while continuation_cond_1 >= 2 and continuation_cond_2:
k_score_c = None

if k_score_c is not None and k_score_c > k_score:
if ran_step_1:
(new_feature,) = set(k_idx) ^ prev_subset

if self.forward:
k_idx_c, k_score_c, cv_scores_c = self._exclusion(
feature_set=k_idx,
fixed_feature=new_feature,
X=X,
y=y
)

if len(k_idx_c) in self.subsets_:
cached_score = self.subsets_[len(
k_idx_c)]['avg_score']
else:
cached_score = None
k_idx_c, k_score_c, cv_scores_c = self._inclusion(
orig_set=orig_set - {new_feature},
subset=set(k_idx),
X=X,
y=y
)

if k_score_c is not None and k_score_c > k_score:

if len(k_idx_c) in self.subsets_:
cached_score = self.subsets_[len(
k_idx_c)]['avg_score']
else:
cached_score = None

if cached_score is None or \
k_score_c > cached_score:
prev_subset = set(k_idx)
k_idx, k_score, cv_scores = \
k_idx_c, k_score_c, cv_scores_c
continuation_cond_1 = len(k_idx)
ran_step_1 = False

else:
continuation_cond_2 = False

if cached_score is None or k_score_c > cached_score:
k_idx, k_score, cv_scores = \
k_idx_c, k_score_c, cv_scores_c
else:
continuation_cond_2 = False

k = len(k_idx)
# floating can lead to multiple same-sized subsets
if k not in self.subsets_ or (k_score >
self.subsets_[k]['avg_score']):

k_idx = tuple(sorted(k_idx))

self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': cv_scores,
'avg_score': k_score
}

# this will skip the non-conditional
# inclusion/exclusion
# if recursive_floating is True
if self.recursive_floating:
continuation = True
else:
# never set to true anyway
# if self.recursive_floating is False
# and only disables continuation for
# recursive_floating
continuation = False

if self.verbose == 1:
sys.stderr.write('\rFeatures: %d/%s' % (
len(k_idx),
@@ -419,14 +383,15 @@ def fit(self, X, y):
continue
if self.subsets_[k]['avg_score'] >= (
max_score - np.std(self.subsets_[k]['cv_scores']) /
float(self.subsets_[k]['cv_scores'].shape[0])):
self.subsets_[k]['cv_scores'].shape[0]):
max_score = self.subsets_[k]['avg_score']
best_subset = k
k_score = max_score
k_idx = self.subsets_[best_subset]['feature_idx']

self.k_feature_idx_ = k_idx
self.k_score_ = k_score
self.subsets_plus_ = dict()
self.fitted = True
return self
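# Illustrative only (not part of the library source): the per-size results
# collected in self.subsets_ above can be inspected after fitting, e.g.
#
#     sfs = SequentialFeatureSelector(estimator, k_features=3, floating=True)
#     sfs = sfs.fit(X, y)
#     for k, res in sfs.subsets_.items():
#         print(k, res['feature_idx'], res['avg_score'])
#
# where each entry holds the 'feature_idx', 'cv_scores', and 'avg_score'
# assigned above; `estimator`, `X`, and `y` are placeholders here.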

@@ -473,6 +438,7 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None):
if not fixed_feature or fixed_feature in set(p))

for p, cv_scores in work:

all_avg_scores.append(np.nanmean(cv_scores))
all_cv_scores.append(cv_scores)
all_subsets.append(p)
@@ -282,22 +282,6 @@ def test_knn_option_sffs():
assert sfs2.k_feature_idx_ == (1, 2, 3)


def test_knn_option_sffs_recursive_floating():
iris = load_iris()
X = iris.data
y = iris.target
knn = KNeighborsClassifier(n_neighbors=4)
sfs2 = SFS(knn,
k_features=3,
forward=True,
floating=True,
cv=4,
verbose=0,
recursive_floating=True)
sfs2 = sfs2.fit(X, y)
assert sfs2.k_feature_idx_ == (1, 2, 3)


def test_knn_option_sbs():
iris = load_iris()
X = iris.data
@@ -472,22 +456,6 @@ def test_regression_sffs():
assert sfs_r.k_feature_idx_ == (0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 12)


def test_regression_sffs_recursive_floating():
boston = load_boston()
X, y = boston.data, boston.target
lr = LinearRegression()
sfs_r = SFS(lr,
k_features=11,
forward=True,
floating=True,
scoring='neg_mean_squared_error',
cv=10,
recursive_floating=True,
verbose=0)
sfs_r = sfs_r.fit(X, y)
assert sfs_r.k_feature_idx_ == (0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 12)


def test_regression_sbfs():
boston = load_boston()
X, y = boston.data, boston.target
@@ -503,22 +471,6 @@ def test_regression_sbfs():
assert sfs_r.k_feature_idx_ == (7, 10, 12), sfs_r.k_feature_idx_


def test_regression_sbfs_recursive_floating():
boston = load_boston()
X, y = boston.data, boston.target
lr = LinearRegression()
sfs_r = SFS(lr,
k_features=3,
forward=False,
floating=True,
scoring='neg_mean_squared_error',
cv=10,
recursive_floating=True,
verbose=0)
sfs_r = sfs_r.fit(X, y)
assert sfs_r.k_feature_idx_ == (7, 10, 12), sfs_r.k_feature_idx_


def test_regression_in_range():
boston = load_boston()
X, y = boston.data, boston.target
