Merge 1e75f79 into e6e34da
a-n-ermakov committed Mar 16, 2019
2 parents e6e34da + 1e75f79 commit 06337d6
Showing 3 changed files with 103 additions and 17 deletions.
60 changes: 50 additions & 10 deletions stability_selection/stability_selection.py
@@ -63,10 +63,10 @@ def _bootstrap_generator(n_bootstrap_iterations, bootstrap_func, y,


def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
threshold=None):
threshold=None, max_features=None):
"""
Fits base_estimator on a bootstrap sample of the original data,
and returns a mas of the variables that are selected by the fitted model.
and returns a mask of the variables that are selected by the fitted model.
Parameters
----------
@@ -95,6 +95,11 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
or implicitly (e.g., Lasso), the threshold used is 1e-5.
Otherwise, "mean" is used by default.
max_features : int or None, optional
The maximum number of features to select among those scoring above ``threshold``.
To disable ``threshold`` and select based on ``max_features`` alone,
set ``threshold=-np.inf``.
Returns
-------
selected_variables : array-like, shape = [n_features]
@@ -108,6 +113,7 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
selector_model = _return_estimator_from_pipeline(base_estimator)
variable_selector = SelectFromModel(estimator=selector_model,
threshold=threshold,
max_features=max_features,
prefit=True)
return variable_selector.get_support()
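
For context, `_fit_bootstrap_sample` simply forwards `max_features` to scikit-learn's `SelectFromModel`. A minimal standalone sketch of the behaviour being relied on; the data and parameter values are illustrative, not part of this commit:

```python
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(100, 10)
y = (X[:, 0] + X[:, 1] > 0).astype(int)

model = LogisticRegression(penalty='l1', solver='liblinear', C=0.5).fit(X, y)

# threshold=-np.inf disables the coefficient cutoff, so at most
# max_features variables (those with the largest |coef_|) are kept.
selector = SelectFromModel(model, threshold=-np.inf, max_features=2, prefit=True)
print(selector.get_support().sum())  # <= 2
```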

@@ -187,6 +193,9 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
threshold : float.
Threshold defining the minimum cutoff value for the stability scores.
max_features : int or None, optional
The maximum number of features to select among those whose stability score exceeds ``threshold``.
bootstrap_func : str or callable (default=bootstrap_without_replacement)
The function used to subsample the data. This parameter can be:
- A string, which must be one of
@@ -208,6 +217,11 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
or implicitly (e.g., Lasso), the threshold used is 1e-5.
Otherwise, "mean" is used by default.
bootstrap_max_features : int or None, optional
The maximum number of features to select in each bootstrap fit among those scoring above ``bootstrap_threshold``.
To disable ``bootstrap_threshold`` and select based on ``bootstrap_max_features`` alone,
set ``bootstrap_threshold=-np.inf``.
verbose : integer.
Controls the verbosity: the higher, the more messages.
@@ -253,19 +267,24 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
of the Royal Statistical Society: Series B (Statistical Methodology),
75(1), pp.55-80.
"""
def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C',
lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100,
sample_fraction=0.5, threshold=0.6, bootstrap_func=bootstrap_without_replacement,
bootstrap_threshold=None, verbose=0, n_jobs=1, pre_dispatch='2*n_jobs',

def __init__(self, base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
lambda_name='C', lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100,
sample_fraction=0.5, threshold=0.6, max_features=None,
bootstrap_func=bootstrap_without_replacement,
bootstrap_threshold=None, bootstrap_max_features=None,
verbose=0, n_jobs=1, pre_dispatch='2*n_jobs',
random_state=None):
self.base_estimator = base_estimator
self.lambda_name = lambda_name
self.lambda_grid = lambda_grid
self.n_bootstrap_iterations = n_bootstrap_iterations
self.sample_fraction = sample_fraction
self.threshold = threshold
self.max_features = max_features
self.bootstrap_func = bootstrap_func
self.bootstrap_threshold = bootstrap_threshold
self.bootstrap_max_features = bootstrap_max_features
self.verbose = verbose
self.n_jobs = n_jobs
self.pre_dispatch = pre_dispatch
@@ -282,6 +301,9 @@ def _validate_input(self):
if not isinstance(self.threshold, float) or not (0.0 < self.threshold <= 1.0):
raise ValueError('threshold should be a float in (0, 1], got %s' % self.threshold)

if self.max_features is not None and (not isinstance(self.max_features, int) or self.max_features < 1):
raise ValueError('max_features should be a positive int, got %s' % self.max_features)

if self.lambda_name not in self.base_estimator.get_params().keys():
raise ValueError('lambda_name is set to %s, but base_estimator %s '
'does not have a parameter '
@@ -340,15 +362,16 @@ def fit(self, X, y):
y=y[subsample],
lambda_name=self.lambda_name,
lambda_value=lambda_value,
threshold=self.bootstrap_threshold)
threshold=self.bootstrap_threshold,
max_features=self.bootstrap_max_features)
for subsample in bootstrap_samples)

stability_scores[:, idx] = np.vstack(selected_variables).mean(axis=0)

self.stability_scores_ = stability_scores
return self

def get_support(self, indices=False, threshold=None):
def get_support(self, indices=False, threshold=None, max_features=None):
"""Get a mask, or integer index, of the features selected
Parameters
@@ -361,6 +384,9 @@ def get_support(self, indices=False, threshold=None):
Threshold defining the minimum cutoff value for the
stability scores.
max_features : int or None, optional
The maximum number of features to select among those scoring above ``threshold``.
Returns
-------
support : array
@@ -377,8 +403,22 @@ def get_support(self, indices=False, threshold=None):
raise ValueError('threshold should be a float in (0, 1], '
'got %s' % self.threshold)

cutoff = self.threshold if threshold is None else threshold
mask = (self.stability_scores_.max(axis=1) > cutoff)
n_features = self.stability_scores_.shape[0]
if max_features is not None and (not isinstance(max_features, int)
or not (1 <= max_features <= n_features)):
raise ValueError('max_features should be an int in [1, %s], '
'got %s' % (n_features, max_features))

threshold_cutoff = self.threshold if threshold is None else threshold
mask = (self.stability_scores_.max(axis=1) > threshold_cutoff)

max_features_cutoff = self.max_features if max_features is None else max_features
if max_features_cutoff is not None:
exceed_counts = (self.stability_scores_ > threshold_cutoff).sum(axis=1)
if max_features_cutoff < (exceed_counts > 0).sum():
feature_indices = (-exceed_counts).argsort()[:max_features_cutoff]
mask = np.zeros(n_features, dtype=bool)
mask[feature_indices] = True

return mask if not indices else np.where(mask)[0]
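
To make the ranking rule above concrete: features are ordered by how many lambda values push their stability score past the cutoff, and only the top `max_features_cutoff` of them survive. A small numpy sketch with hypothetical scores (not taken from the test suite):

```python
import numpy as np

# Hypothetical stability scores: 4 features x 3 lambda values.
stability_scores = np.array([[0.9, 0.8, 0.7],
                             [0.7, 0.2, 0.1],
                             [0.8, 0.9, 0.9],
                             [0.1, 0.1, 0.2]])
threshold_cutoff, max_features_cutoff = 0.6, 2

mask = stability_scores.max(axis=1) > threshold_cutoff             # [T, T, T, F]
exceed_counts = (stability_scores > threshold_cutoff).sum(axis=1)  # [3, 1, 3, 0]
if max_features_cutoff < (exceed_counts > 0).sum():
    keep = (-exceed_counts).argsort()[:max_features_cutoff]        # features 0 and 2
    mask = np.zeros(stability_scores.shape[0], dtype=bool)
    mask[keep] = True
print(np.where(mask)[0])  # [0 2]
```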

5 changes: 5 additions & 0 deletions stability_selection/tests/test_common.py
@@ -71,3 +71,8 @@ def test_sample_fraction():
@raises(ValueError)
def test_lambda_name():
StabilitySelection(lambda_name='n_estimators')._validate_input()


@raises(ValueError)
def test_max_features_zero():
StabilitySelection(max_features=0)._validate_input()
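
The new test exercises the constructor-side guard added in `_validate_input`; a quick interactive sketch of the same check, assuming the package's public import path:

```python
from stability_selection import StabilitySelection

try:
    StabilitySelection(max_features=0)._validate_input()
except ValueError as exc:
    print(exc)  # max_features should be a positive int, got 0
```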
55 changes: 48 additions & 7 deletions stability_selection/tests/test_stability_selection.py
@@ -44,7 +44,7 @@ def _generate_dummy_classification_data(p=1000, n=1000, k=5,
def test_stability_selection_classification():
n, p, k = 1000, 1000, 5

X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1)
selector.fit(X, y)

@@ -59,7 +59,7 @@ def test_stability_selection_regression():
def test_stability_selection_regression():
n, p, k = 500, 1000, 5

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
@@ -81,7 +81,7 @@ def test_with_complementary_pairs_bootstrap():
def test_with_complementary_pairs_bootstrap():
n, p, k = 500, 1000, 5

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
@@ -104,7 +104,7 @@ def test_with_stratified_bootstrap():
def test_with_stratified_bootstrap():
n, p, k = 1000, 1000, 5

X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1,
bootstrap_func='stratified')
selector.fit(X, y)
@@ -117,7 +117,7 @@ def test_different_shape():
def test_different_shape():
n, p, k = 100, 200, 5

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
@@ -136,7 +136,7 @@ def test_no_features():
def test_no_features():
n, p, k = 100, 200, 0

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
@@ -154,10 +154,51 @@ def test_no_features():
np.empty(0).reshape((X.shape[0], 0)))


def test_stability_selection_max_features():
n, p, k = 1000, 1000, 5
lambda_grid = np.logspace(-5, -1, 25)

X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=lambda_grid, max_features=1)
selector.fit(X, y)
X_r = selector.transform(X)
assert X_r.shape == (n, 1)

selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k)
selector.fit(X, y)
X_r = selector.transform(X)
assert X_r.shape == (n, k)

selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k+1)
selector.fit(X, y)
X_r = selector.transform(X)
assert X_r.shape == (n, k)


@raises(ValueError)
def test_get_support_max_features_low():
n, p, k = 500, 200, 5

X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25))
selector.fit(X, y)
selector.get_support(max_features=0)


@raises(ValueError)
def test_get_support_max_features_high():
n, p, k = 500, 200, 5

X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25))
selector.fit(X, y)
selector.get_support(max_features=p+1)


def test_stability_plot():
n, p, k = 500, 200, 5

X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

base_estimator = Pipeline([
('scaler', StandardScaler()),
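Putting the pieces together, the new parameters can be exercised end to end. A hedged usage sketch with synthetic data; the lambda grid and the two caps below are chosen purely for illustration:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from stability_selection import StabilitySelection

rng = np.random.RandomState(42)
X = rng.randn(200, 50)
y = (X[:, :3].sum(axis=1) + 0.1 * rng.randn(200) > 0).astype(int)

selector = StabilitySelection(
    base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
    lambda_name='C',
    lambda_grid=np.logspace(-3, 0, 10),
    max_features=3,             # cap on the final selection
    bootstrap_max_features=10,  # cap within each bootstrap fit
)
selector.fit(X, y)
print(selector.get_support(indices=True))  # at most 3 feature indices
```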
