
add groups parameter to SFS and EFS fit(), for forwarding to sklearn cv #537

Merged: 3 commits, May 17, 2019
Changes from all commits
2 changes: 1 addition & 1 deletion docs/sources/CHANGELOG.md
@@ -16,7 +16,7 @@ The CHANGELOG for the current development version is available at

 ##### New Features

-- -
+- Add optional `groups` parameter to `SequentialFeatureSelector` and `ExhaustiveFeatureSelector` `fit()` methods for forwarding to sklearn CV ([#537](https://github.com/rasbt/mlxtend/pull/537) via [arc12](https://github.com/arc12))

 ##### Changes

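As an aside, a minimal usage sketch of the new parameter (assuming a mlxtend build that includes this PR; the group labels below are invented for illustration):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X, y = load_iris(return_X_y=True)
# Hypothetical grouping: pretend the 150 samples came from 6 sources.
groups = np.random.randint(0, 6, size=len(y))

sfs = SFS(KNeighborsClassifier(n_neighbors=4),
          k_features=3,
          cv=GroupKFold(n_splits=3))

# `groups` is forwarded through cross_val_score to GroupKFold.split(),
# so no group is ever split across a train/test boundary.
sfs = sfs.fit(X, y, groups=groups)
print(sfs.k_feature_idx_)

Non-group-aware splitters (e.g., a plain KFold) accept and ignore `groups`, so the parameter is safe to forward unconditionally.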
17 changes: 12 additions & 5 deletions mlxtend/feature_selection/exhaustive_feature_selector.py
@@ -25,10 +25,11 @@
 from sklearn.externals.joblib import Parallel, delayed


-def _calc_score(selector, X, y, indices, **fit_params):
+def _calc_score(selector, X, y, indices, groups=None, **fit_params):
     if selector.cv:
         scores = cross_val_score(selector.est_,
                                  X[:, indices], y,
+                                 groups=groups,
                                  cv=selector.cv,
                                  scoring=selector.scorer,
                                  n_jobs=1,
@@ -175,7 +176,7 @@ def __init__(self, estimator, min_features=1, max_features=1,
         # don't mess with this unless testing
         self._TESTING_INTERRUPT_MODE = False

-    def fit(self, X, y, custom_feature_names=None, **fit_params):
+    def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
         """Perform feature selection and learn model from training data.

         Parameters
@@ -191,6 +192,9 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
             Custom feature names for `self.k_feature_names` and
             `self.subsets_[i]['feature_names']`.
             (new in v 0.13.0)
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Passed to the fit method of the cross-validator.
         fit_params : dict of string -> object, optional
             Parameters to pass to the fit method of classifier.

@@ -268,7 +272,7 @@ def ncr(n, r):
         n_jobs = min(self.n_jobs, all_comb)
         parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
         work = enumerate(parallel(delayed(_calc_score)
-                                  (self, X_, y, c, **fit_params)
+                                  (self, X_, y, c, groups=groups, **fit_params)
                                   for c in candidates))

         try:
@@ -336,7 +340,7 @@ def transform(self, X):
             X_ = X
         return X_[:, self.best_idx_]

-    def fit_transform(self, X, y, **fit_params):
+    def fit_transform(self, X, y, groups=None, **fit_params):
         """Fit to training data and return the best selected features from X.

         Parameters
@@ -348,6 +352,9 @@ def fit_transform(self, X, y, **fit_params):
             argument for X.
         y : array-like, shape = [n_samples]
             Target values.
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Passed to the fit method of the cross-validator.
         fit_params : dict of string -> object, optional
             Parameters to pass to the fit method of classifier.

@@ -356,7 +363,7 @@ def fit_transform(self, X, y, **fit_params):
             Feature subset of X, shape={n_samples, k_features}

         """
-        self.fit(X, y, **fit_params)
+        self.fit(X, y, groups=groups, **fit_params)
         return self.transform(X)

     def get_metric_dict(self, confidence_interval=0.95):
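The change above is mechanical: `_calc_score` now hands `groups` to sklearn's `cross_val_score`, which forwards it to the CV splitter. A standalone sketch of the equivalent call for one candidate feature subset (the indices and group labels here are chosen arbitrarily for illustration):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, GroupKFold
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
groups = np.repeat(np.arange(5), 30)  # 5 artificial groups of 30 samples

indices = (0, 2, 3)  # one candidate feature subset
scores = cross_val_score(KNeighborsClassifier(),
                         X[:, indices], y,
                         groups=groups,        # consumed by the splitter
                         cv=GroupKFold(n_splits=3),
                         scoring='accuracy',
                         n_jobs=1)
print(scores.mean())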
38 changes: 26 additions & 12 deletions mlxtend/feature_selection/sequential_feature_selector.py
@@ -23,10 +23,11 @@
 from sklearn.externals.joblib import Parallel, delayed


-def _calc_score(selector, X, y, indices, **fit_params):
+def _calc_score(selector, X, y, indices, groups=None, **fit_params):
     if selector.cv:
         scores = cross_val_score(selector.est_,
                                  X[:, indices], y,
+                                 groups=groups,
                                  cv=selector.cv,
                                  scoring=selector.scorer,
                                  n_jobs=1,
@@ -242,7 +243,7 @@ def set_params(self, **params):
         self._set_params('estimator', 'named_estimators', **params)
         return self

-    def fit(self, X, y, custom_feature_names=None, **fit_params):
+    def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
         """Perform feature selection and learn model from training data.

         Parameters
@@ -260,6 +261,9 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
             Custom feature names for `self.k_feature_names` and
             `self.subsets_[i]['feature_names']`.
             (new in v 0.13.0)
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Passed to the fit method of the cross-validator.
         fit_params : dict of string -> object, optional
             Parameters to pass to the fit method of classifier.

@@ -291,8 +295,8 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
         if not isinstance(self.k_features, int) and\
            not isinstance(self.k_features, tuple)\
            and not isinstance(self.k_features, str):
-            raise AttributeError('k_features must be a positive integer'
-                                 ', tuple, or string')
+            raise AttributeError('k_features must be a positive integer'
+                                 ', tuple, or string')

         if (isinstance(self.k_features, int) and (
                 self.k_features < 1 or self.k_features > X_.shape[1])):
@@ -351,7 +355,8 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                 k_to_select = min_k
         k_idx = tuple(range(X_.shape[1]))
         k = len(k_idx)
-        k_idx, k_score = _calc_score(self, X_, y, k_idx, **fit_params)
+        k_idx, k_score = _calc_score(self, X_, y, k_idx,
+                                     groups=groups, **fit_params)
         self.subsets_[k] = {
             'feature_idx': k_idx,
             'cv_scores': k_score,
@@ -370,6 +375,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                     subset=prev_subset,
                     X=X_,
                     y=y,
+                    groups=groups,
                     **fit_params
                 )
             else:
@@ -378,6 +384,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                     feature_set=prev_subset,
                     X=X_,
                     y=y,
+                    groups=groups,
                     **fit_params
                 )

@@ -404,6 +411,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                         fixed_feature=new_feature,
                         X=X_,
                         y=y,
+                        groups=groups,
                         **fit_params
                     )

@@ -413,6 +421,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                         subset=set(k_idx),
                         X=X_,
                         y=y,
+                        groups=groups,
                         **fit_params
                     )

@@ -472,7 +481,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                                 X)
                     raise KeyboardInterrupt

-        except KeyboardInterrupt as e:
+        except KeyboardInterrupt:
             self.interrupted_ = True
             sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')

@@ -512,7 +521,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
         return self

     def _inclusion(self, orig_set, subset, X, y, ignore_feature=None,
-                   **fit_params):
+                   groups=None, **fit_params):
         all_avg_scores = []
         all_cv_scores = []
         all_subsets = []
@@ -526,7 +535,7 @@ def _inclusion(self, orig_set, subset, X, y, ignore_feature=None,
             work = parallel(delayed(_calc_score)
                             (self, X, y,
                              tuple(subset | {feature}),
-                             **fit_params)
+                             groups=groups, **fit_params)
                             for feature in remaining
                             if feature != ignore_feature)

@@ -541,7 +550,8 @@ def _inclusion(self, orig_set, subset, X, y, ignore_feature=None,
                                  all_cv_scores[best])
         return res

-    def _exclusion(self, feature_set, X, y, fixed_feature=None, **fit_params):
+    def _exclusion(self, feature_set, X, y, fixed_feature=None,
+                   groups=None, **fit_params):
         n = len(feature_set)
         res = (None, None, None)
         if n > 1:
@@ -552,7 +562,8 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None, **fit_params):
             n_jobs = min(self.n_jobs, features)
             parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                                 pre_dispatch=self.pre_dispatch)
-            work = parallel(delayed(_calc_score)(self, X, y, p, **fit_params)
+            work = parallel(delayed(_calc_score)(self, X, y, p,
+                                                 groups=groups, **fit_params)
                             for p in combinations(feature_set, r=n - 1)
                             if not fixed_feature or fixed_feature in set(p))

@@ -591,7 +602,7 @@ def transform(self, X):
             X_ = X
         return X_[:, self.k_feature_idx_]

-    def fit_transform(self, X, y, **fit_params):
+    def fit_transform(self, X, y, groups=None, **fit_params):
         """Fit to training data then reduce X to its most important features.

         Parameters
@@ -605,6 +616,9 @@ def fit_transform(self, X, y, **fit_params):
             Target values.
             New in v 0.13.0: a pandas Series is now also accepted as
             an argument for y.
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Passed to the fit method of the cross-validator.
         fit_params : dict of string -> object, optional
             Parameters to pass to the fit method of classifier.

@@ -613,7 +627,7 @@ def fit_transform(self, X, y, **fit_params):
         Reduced feature subset of X, shape={n_samples, k_features}

         """
-        self.fit(X, y, **fit_params)
+        self.fit(X, y, groups=groups, **fit_params)
         return self.transform(X)

     def get_metric_dict(self, confidence_interval=0.95):
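What forwarding `groups` buys in practice: group-aware splitters guarantee that samples sharing a label never straddle a fold boundary, so the selection scores are not inflated by within-group leakage. A tiny self-contained check of that invariant, on synthetic data:

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.zeros((9, 2))          # feature values are irrelevant to the splitter
y = np.zeros(9)
groups = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

for train_idx, test_idx in GroupKFold(n_splits=3).split(X, y, groups):
    # Every group lands entirely in train or entirely in test.
    assert set(groups[train_idx]).isdisjoint(set(groups[test_idx]))
    print('held out group(s):', sorted(set(groups[test_idx])))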
mlxtend/feature_selection/tests/test_exhaustive_feature_selector.py
@@ -16,6 +16,7 @@
 from sklearn.linear_model import LinearRegression
 from sklearn.datasets import load_boston
 from mlxtend.utils import assert_raises
+from sklearn.model_selection import GroupKFold


 def dict_compare_utility(d1, d2):
@@ -183,6 +184,40 @@ def test_knn_cv3():
     assert round(efs1.best_score_, 4) == 0.9728


+def test_knn_cv3_groups():
+    iris = load_iris()
+    X = iris.data
+    y = iris.target
+    knn = KNeighborsClassifier(n_neighbors=4)
+    efs1 = EFS(knn,
+               min_features=3,
+               max_features=3,
+               scoring='accuracy',
+               cv=GroupKFold(n_splits=3),
+               print_progress=False)
+    np.random.seed(1630672634)
+    groups = np.random.randint(0, 6, size=len(y))
+    efs1 = efs1.fit(X, y, groups=groups)
+    # print(efs1.subsets_)
+    expect = {0: {'cv_scores': np.array([0.97916667, 0.93877551, 0.9245283]),
+                  'feature_idx': (0, 1, 2),
+                  'avg_score': 0.9474901595858469,
+                  'feature_names': ('0', '1', '2')},
+              1: {'cv_scores': np.array([1., 0.93877551, 0.9245283]),
+                  'feature_idx': (0, 1, 3),
+                  'avg_score': 0.9544346040302915,
+                  'feature_names': ('0', '1', '3')},
+              2: {'cv_scores': np.array([0.97916667, 0.95918367, 0.9245283]),
+                  'feature_idx': (0, 2, 3),
+                  'avg_score': 0.9542928806742822,
+                  'feature_names': ('0', '2', '3')},
+              3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
+                  'feature_idx': (1, 2, 3),
+                  'avg_score': 0.9605821888503829,
+                  'feature_names': ('1', '2', '3')}}
+    dict_compare_utility(d1=expect, d2=efs1.subsets_)
+
+
 def test_fit_params():
     iris = load_iris()
     X = iris.data
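A note on the test design: the hard-coded expected scores only hold because the random group assignment is drawn under a fixed seed. A quick sanity check of that setup (same seed as the test above); note that GroupKFold(n_splits=3) also requires at least 3 distinct group labels:

import numpy as np

np.random.seed(1630672634)            # same seed as in the test above
groups = np.random.randint(0, 6, size=150)

# GroupKFold(n_splits=3) needs >= 3 distinct groups to split on.
assert len(np.unique(groups)) >= 3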
mlxtend/feature_selection/tests/test_sequential_feature_selection.py
@@ -219,6 +219,33 @@ def test_knn_cv3():
     dict_compare_utility(d1=expect, d2=sfs1.subsets_)


+def test_knn_cv3_groups():
+    iris = load_iris()
+    X = iris.data
+    y = iris.target
+    knn = KNeighborsClassifier(n_neighbors=4)
+    sfs1 = SFS(knn,
+               k_features=3,
+               forward=True,
+               floating=False,
+               cv=GroupKFold(n_splits=3),
+               verbose=0)
+    np.random.seed(1630672634)
+    groups = np.random.randint(0, 6, size=len(y))
+    sfs1 = sfs1.fit(X, y, groups=groups)
+    # print(sfs1.subsets_)
+    expect = {
+        1: {'cv_scores': np.array([0.97916667, 0.93877551, 0.96226415]),
+            'feature_idx': (3,),
+            'avg_score': 0.9600687759380482},
+        2: {'cv_scores': np.array([0.95833333, 0.93877551, 0.98113208]),
+            'feature_idx': (1, 3),
+            'avg_score': 0.9594136396697044},
+        3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
+            'feature_idx': (1, 2, 3),
+            'avg_score': 0.9605821888503829}}
+    dict_compare_utility(d1=expect, d2=sfs1.subsets_, decimal=3)
+
 def test_knn_rbf_groupkfold():
     nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
     rng = np.random.RandomState(123)