Merge 1e75f79 into e6e34da
a-n-ermakov committed Mar 16, 2019
2 parents e6e34da + 1e75f79 commit 06337d6
Showing 3 changed files with 103 additions and 17 deletions.
60 changes: 50 additions & 10 deletions stability_selection/stability_selection.py
@@ -63,10 +63,10 @@ def _bootstrap_generator(n_bootstrap_iterations, bootstrap_func, y,


def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
threshold=None):
threshold=None, max_features=None):
"""
Fits base_estimator on a bootstrap sample of the original data,
and returns a mas of the variables that are selected by the fitted model.
and returns a mask of the variables that are selected by the fitted model.
Parameters
----------
@@ -95,6 +95,11 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
or implicitly (e.g., Lasso), the threshold used is 1e-5.
Otherwise, "mean" is used by default.
max_features : int or None, optional
The maximum number of features to select among those scoring above ``threshold``.
To disable ``threshold`` and select based on ``max_features`` alone,
set ``threshold=-np.inf``.
Returns
-------
selected_variables : array-like, shape = [n_features]
@@ -108,6 +113,7 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
selector_model = _return_estimator_from_pipeline(base_estimator)
variable_selector = SelectFromModel(estimator=selector_model,
threshold=threshold,
max_features=max_features,
prefit=True)
return variable_selector.get_support()
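
For context, `_fit_bootstrap_sample` simply forwards `max_features` to scikit-learn's `SelectFromModel`. A minimal standalone sketch of the behaviour being relied on; the data and parameter values are illustrative, not part of this commit:

```python
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(100, 10)
y = (X[:, 0] + X[:, 1] > 0).astype(int)

model = LogisticRegression(penalty='l1', solver='liblinear', C=0.5).fit(X, y)

# threshold=-np.inf disables the coefficient cutoff, so at most
# max_features variables (those with the largest |coef_|) are kept.
selector = SelectFromModel(model, threshold=-np.inf, max_features=2, prefit=True)
print(selector.get_support().sum())  # <= 2
```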

@@ -187,6 +193,9 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
threshold : float.
Threshold defining the minimum cutoff value for the stability scores.
max_features : int or None, optional
The maximum number of features to select among those whose stability score exceeds ``threshold``.
bootstrap_func : str or callable (default=bootstrap_without_replacement)
The function used to subsample the data. This parameter can be:
- A string, which must be one of
@@ -208,6 +217,11 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
or implicitly (e.g., Lasso), the threshold used is 1e-5.
Otherwise, "mean" is used by default.
bootstrap_max_features : int or None, optional
The maximum number of features to select in each bootstrap fit among those scoring above ``bootstrap_threshold``.
To disable ``bootstrap_threshold`` and select based on ``bootstrap_max_features`` alone,
set ``bootstrap_threshold=-np.inf``.
verbose : integer.
Controls the verbosity: the higher, the more messages.
@@ -253,19 +267,24 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
of the Royal Statistical Society: Series B (Statistical Methodology),
75(1), pp.55-80.
"""
def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C',
lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100,
sample_fraction=0.5, threshold=0.6, bootstrap_func=bootstrap_without_replacement,
bootstrap_threshold=None, verbose=0, n_jobs=1, pre_dispatch='2*n_jobs',

def __init__(self, base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
lambda_name='C', lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100,
sample_fraction=0.5, threshold=0.6, max_features=None,
bootstrap_func=bootstrap_without_replacement,
bootstrap_threshold=None, bootstrap_max_features=None,
verbose=0, n_jobs=1, pre_dispatch='2*n_jobs',
random_state=None):
self.base_estimator = base_estimator
self.lambda_name = lambda_name
self.lambda_grid = lambda_grid
self.n_bootstrap_iterations = n_bootstrap_iterations
self.sample_fraction = sample_fraction
self.threshold = threshold
self.max_features = max_features
self.bootstrap_func = bootstrap_func
self.bootstrap_threshold = bootstrap_threshold
self.bootstrap_max_features = bootstrap_max_features
self.verbose = verbose
self.n_jobs = n_jobs
self.pre_dispatch = pre_dispatch
@@ -282,6 +301,9 @@ def _validate_input(self):
if not isinstance(self.threshold, float) or not (0.0 < self.threshold <= 1.0):
raise ValueError('threshold should be a float in (0, 1], got %s' % self.threshold)

if self.max_features is not None and (not isinstance(self.max_features, int) or self.max_features < 1):
raise ValueError('max_features should be a positive int, got %s' % self.max_features)

if self.lambda_name not in self.base_estimator.get_params().keys():
raise ValueError('lambda_name is set to %s, but base_estimator %s '
'does not have a parameter '
@@ -340,15 +362,16 @@ def fit(self, X, y):
y=y[subsample],
lambda_name=self.lambda_name,
lambda_value=lambda_value,
threshold=self.bootstrap_threshold)
threshold=self.bootstrap_threshold,
max_features=self.bootstrap_max_features)
for subsample in bootstrap_samples)

stability_scores[:, idx] = np.vstack(selected_variables).mean(axis=0)

self.stability_scores_ = stability_scores
return self

def get_support(self, indices=False, threshold=None):
def get_support(self, indices=False, threshold=None, max_features=None):
"""Get a mask, or integer index, of the features selected
Parameters
@@ -361,6 +384,9 @@ def get_support(self, indices=False, threshold=None):
Threshold defining the minimum cutoff value for the
stability scores.
max_features : int or None, optional
The maximum number of features to select among those scoring above ``threshold``.
Returns
-------
support : array
@@ -377,8 +403,22 @@ def get_support(self, indices=False, threshold=None):
raise ValueError('threshold should be a float in (0, 1], '
'got %s' % self.threshold)

cutoff = self.threshold if threshold is None else threshold
mask = (self.stability_scores_.max(axis=1) > cutoff)
n_features = self.stability_scores_.shape[0]
if max_features is not None and (not isinstance(max_features, int)
or not (1 <= max_features <= n_features)):
raise ValueError('max_features should be an int in [1, %s], '
'got %s' % (n_features, max_features))

threshold_cutoff = self.threshold if threshold is None else threshold
mask = (self.stability_scores_.max(axis=1) > threshold_cutoff)

max_features_cutoff = self.max_features if max_features is None else max_features
if max_features_cutoff is not None:
exceed_counts = (self.stability_scores_ > threshold_cutoff).sum(axis=1)
if max_features_cutoff < (exceed_counts > 0).sum():
feature_indices = (-exceed_counts).argsort()[:max_features_cutoff]
mask = np.zeros(n_features, dtype=bool)
mask[feature_indices] = True

return mask if not indices else np.where(mask)[0]
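
To make the ranking rule above concrete: features are ordered by how many lambda values push their stability score past the cutoff, and only the top `max_features_cutoff` of them survive. A small numpy sketch with hypothetical scores (not taken from the test suite):

```python
import numpy as np

# Hypothetical stability scores: 4 features x 3 lambda values.
stability_scores = np.array([[0.9, 0.8, 0.7],
                             [0.7, 0.2, 0.1],
                             [0.8, 0.9, 0.9],
                             [0.1, 0.1, 0.2]])
threshold_cutoff, max_features_cutoff = 0.6, 2

mask = stability_scores.max(axis=1) > threshold_cutoff             # [T, T, T, F]
exceed_counts = (stability_scores > threshold_cutoff).sum(axis=1)  # [3, 1, 3, 0]
if max_features_cutoff < (exceed_counts > 0).sum():
    keep = (-exceed_counts).argsort()[:max_features_cutoff]        # features 0 and 2
    mask = np.zeros(stability_scores.shape[0], dtype=bool)
    mask[keep] = True
print(np.where(mask)[0])  # [0 2]
```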

5 changes: 5 additions & 0 deletions stability_selection/tests/test_common.py
@@ -71,3 +71,8 @@ def test_sample_fraction():
@raises(ValueError)
def test_lambda_name():
StabilitySelection(lambda_name='n_estimators')._validate_input()


@raises(ValueError)
def test_max_features_zero():
StabilitySelection(max_features=0)._validate_input()
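
The new test exercises the constructor-side guard added in `_validate_input`; a quick interactive sketch of the same check, assuming the package's public import path:

```python
from stability_selection import StabilitySelection

try:
    StabilitySelection(max_features=0)._validate_input()
except ValueError as exc:
    print(exc)  # max_features should be a positive int, got 0
```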
55 changes: 48 additions & 7 deletions stability_selection/tests/test_stability_selection.py
@@ -44,7 +44,7 @@ def _generate_dummy_classification_data(p=1000, n=1000, k=5,
def test_stability_selection_classification():
n, p, k = 1000, 1000, 5

X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1)
selector.fit(X, y)

@@ -59,7 +59,7 @@ def test_stability_selection_regression():
def test_stability_selection_regression():
n, p, k = 500, 1000, 5

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
@@ -81,7 +81,7 @@ def test_with_complementary_pairs_bootstrap():
def test_with_complementary_pairs_bootstrap():
n, p, k = 500, 1000, 5

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
@@ -104,7 +104,7 @@ def test_with_stratified_bootstrap():
def test_with_stratified_bootstrap():
n, p, k = 1000, 1000, 5

X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1,
bootstrap_func='stratified')
selector.fit(X, y)
@@ -117,7 +117,7 @@ def test_different_shape():
def test_different_shape():
n, p, k = 100, 200, 5

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
@@ -136,7 +136,7 @@ def test_no_features():
def test_no_features():
n, p, k = 100, 200, 0

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
@@ -154,10 +154,51 @@ def test_no_features():
np.empty(0).reshape((X.shape[0], 0)))


def test_stability_selection_max_features():
n, p, k = 1000, 1000, 5
lambda_grid = np.logspace(-5, -1, 25)

X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=lambda_grid, max_features=1)
selector.fit(X, y)
X_r = selector.transform(X)
assert X_r.shape == (n, 1)

selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k)
selector.fit(X, y)
X_r = selector.transform(X)
assert X_r.shape == (n, k)

selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k+1)
selector.fit(X, y)
X_r = selector.transform(X)
assert X_r.shape == (n, k)


@raises(ValueError)
def test_get_support_max_features_low():
n, p, k = 500, 200, 5

X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25))
selector.fit(X, y)
selector.get_support(max_features=0)


@raises(ValueError)
def test_get_support_max_features_high():
n, p, k = 500, 200, 5

X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25))
selector.fit(X, y)
selector.get_support(max_features=p+1)


def test_stability_plot():
n, p, k = 500, 200, 5

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
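Putting the pieces together, the new parameters can be exercised end to end. A hedged usage sketch with synthetic data; the lambda grid and the two caps below are chosen purely for illustration:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from stability_selection import StabilitySelection

rng = np.random.RandomState(42)
X = rng.randn(200, 50)
y = (X[:, :3].sum(axis=1) + 0.1 * rng.randn(200) > 0).astype(int)

selector = StabilitySelection(
    base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
    lambda_name='C',
    lambda_grid=np.logspace(-3, 0, 10),
    max_features=3,             # cap on the final selection
    bootstrap_max_features=10,  # cap within each bootstrap fit
)
selector.fit(X, y)
print(selector.get_support(indices=True))  # at most 3 feature indices
```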
