Commit c09c23b: Merge branch 'master' of https://github.com/rasbt/mlxtend
rasbt committed Sep 9, 2017
2 parents 1ca0b74 + ba38b79
Showing 3 changed files with 91 additions and 20 deletions.
@@ -1451,7 +1451,7 @@
"- `estimator` : scikit-learn classifier or regressor\n",
"\n",
"\n",
"- `k_features` : int or tuple (new in 0.4.2) (default: 1)\n",
"- `k_features` : int or tuple or str (default: 1)\n",
"\n",
" Number of features to select,\n",
" where k_features < the full feature set.\n",
@@ -1460,6 +1460,12 @@
" min and max that scored highest in cross-validtion. For example,\n",
" the tuple (1, 4) will return any combination from\n",
" 1 up to 4 features instead of a fixed number of features k.\n",
" New in 0.8.0: A string argument \"best\" or \"parsimonious\".\n",
" If \"best\" is provided, the feature selector will return the\n",
" feature subset with the best cross-validation performance.\n",
" If \"parsimonious\" is provided as an argument, the smallest\n",
" feature subset that is within one standard error of the\n",
" cross-validation performance will be selected.\n",
"\n",
"- `forward` : bool (default: True)\n",
"\n",
@@ -1670,6 +1676,15 @@
" s = f.read()\n",
"print(s)"
]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
}
],
"metadata": {
mlxtend/feature_selection/sequential_feature_selector.py (52 changes: 43 additions, 9 deletions)
@@ -43,14 +43,20 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
Parameters
----------
estimator : scikit-learn classifier or regressor
-    k_features : int or tuple (new in 0.4.2) (default: 1)
+    k_features : int or tuple or str (default: 1)
Number of features to select,
where k_features < the full feature set.
New in 0.4.2: A tuple containing a min and max value can be provided,
        and the SFS will return any feature combination between
        min and max that scored highest in cross-validation. For example,
the tuple (1, 4) will return any combination from
1 up to 4 features instead of a fixed number of features k.
+        New in 0.8.0: A string argument "best" or "parsimonious".
+        If "best" is provided, the feature selector will return the
+        feature subset with the best cross-validation performance.
+        If "parsimonious" is provided as an argument, the smallest
+        feature subset that is within one standard error of the
+        cross-validation performance will be selected.
forward : bool (default: True)
Forward selection if True,
backward selection otherwise
@@ -179,11 +185,11 @@ def fit(self, X, y):
self : object
"""

        if not isinstance(self.k_features, int) and\
-                not isinstance(self.k_features, tuple):
-            raise AttributeError('k_features must be a positive integer'
-                                 ' or tuple')
+                not isinstance(self.k_features, tuple)\
+                and not isinstance(self.k_features, str):
+            raise AttributeError('k_features must be a positive integer'
+                                 ', tuple, or string')

if isinstance(self.k_features, int) and (self.k_features < 1 or
self.k_features > X.shape[1]):
@@ -208,8 +214,22 @@ def fit(self, X, y):
raise AttributeError('The min k_features value must be smaller'
' than the max k_features value.')

-        if isinstance(self.k_features, tuple):
+        if isinstance(self.k_features, tuple) or\
+                isinstance(self.k_features, str):
+
             select_in_range = True
+
+            if isinstance(self.k_features, str):
+                if self.k_features not in {'best', 'parsimonious'}:
+                    raise AttributeError('If a string argument is provided, '
+                                         'it must be "best" or "parsimonious"')
+                else:
+                    min_k = 1
+                    max_k = X.shape[1]
+            else:
+                min_k = self.k_features[0]
+                max_k = self.k_features[1]
+
         else:
             select_in_range = False
             k_to_select = self.k_features
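In other words, the three accepted `k_features` types resolve to a `(min_k, max_k)` search range before the selection loop runs. A standalone sketch of that resolution logic (illustrative only; `resolve_k_features` and `n_features` are hypothetical names, with `n_features` standing in for `X.shape[1]`):

```python
def resolve_k_features(k_features, n_features):
    # Illustrative sketch of the bounds resolution above.
    if isinstance(k_features, str):
        if k_features not in {'best', 'parsimonious'}:
            raise AttributeError('If a string argument is provided, '
                                 'it must be "best" or "parsimonious"')
        return 1, n_features                   # strings search the full range
    if isinstance(k_features, tuple):
        return k_features[0], k_features[1]    # user-supplied (min, max)
    if isinstance(k_features, int):
        return k_features, k_features          # fixed size, no range search
    raise AttributeError('k_features must be a positive integer'
                         ', tuple, or string')

# e.g. resolve_k_features('best', 13) -> (1, 13)
```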
@@ -218,12 +238,12 @@ def fit(self, X, y):
orig_set = set(range(X.shape[1]))
if self.forward:
if select_in_range:
-                k_to_select = self.k_features[1]
+                k_to_select = max_k
k_idx = ()
k = 0
else:
if select_in_range:
-                k_to_select = self.k_features[0]
+                k_to_select = min_k
k_idx = tuple(range(X.shape[1]))
k = len(k_idx)
k_idx, k_score = _calc_score(self, X, y, k_idx)
@@ -318,16 +338,30 @@ def fit(self, X, y):
sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')

if select_in_range:
-            max_score = float('-inf')
+
+            max_score = float('-inf')
for k in self.subsets_:
-                if k < self.k_features[0] or k > self.k_features[1]:
+                if k < min_k or k > max_k:
continue
if self.subsets_[k]['avg_score'] > max_score:
max_score = self.subsets_[k]['avg_score']
best_subset = k
k_score = max_score
k_idx = self.subsets_[best_subset]['feature_idx']

+            if self.k_features == 'parsimonious':
+                for k in self.subsets_:
+                    if k >= best_subset:
+                        continue
+                    if self.subsets_[k]['avg_score'] >= (
+                            max_score - np.std(self.subsets_[k]['cv_scores']) /
+                            self.subsets_[k]['cv_scores'].shape[0]):
+                        max_score = self.subsets_[k]['avg_score']
+                        best_subset = k
+                k_score = max_score
+                k_idx = self.subsets_[best_subset]['feature_idx']

self.k_feature_idx_ = k_idx
self.k_score_ = k_score
self.subsets_plus_ = dict()
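The "parsimonious" branch above applies a one-standard-error rule to the scores recorded in `subsets_`. A minimal standalone sketch of that rule (illustrative; `parsimonious_k` is a hypothetical helper whose dictionary layout mirrors `subsets_`, and the standard error is taken here as std/sqrt(n)):

```python
import numpy as np

def parsimonious_k(subsets):
    # subsets maps k -> {'avg_score': float, 'cv_scores': np.ndarray, ...}
    best_k = max(subsets, key=lambda k: subsets[k]['avg_score'])
    best_score = subsets[best_k]['avg_score']
    for k in sorted(subsets):                      # smallest k first
        cv_scores = subsets[k]['cv_scores']
        std_err = np.std(cv_scores) / np.sqrt(cv_scores.shape[0])
        if subsets[k]['avg_score'] >= best_score - std_err:
            return k                               # within one SE of the best
    return best_k
```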
mlxtend/feature_selection/tests/test_sequential_feature_selector.py (42 changes: 32 additions, 10 deletions)
@@ -83,10 +83,10 @@ def test_kfeatures_type_2():
X = iris.data
y = iris.target
knn = KNeighborsClassifier()
-    expect = 'k_features must be a positive integer or tuple'
+    expect = 'k_features must be a positive integer, tuple, or string'
sfs = SFS(estimator=knn,
verbose=0,
-              k_features='abc')
+              k_features=set())
assert_raises(AttributeError,
expect,
sfs.fit,
@@ -458,10 +458,6 @@ def test_regression_in_range():


def test_clone_params_fail():
-    iris = load_iris()
-    X = iris.data
-    y = iris.target
-
if sys.version_info >= (3, 0):
objtype = 'class'
else:
@@ -504,7 +500,6 @@ def test_clone_params_pass():
def test_transform_not_fitted():
iris = load_iris()
X = iris.data
-    y = iris.target
knn = KNeighborsClassifier(n_neighbors=4)

sfs1 = SFS(knn,
@@ -525,9 +520,6 @@ def test_transform_not_fitted():


def test_get_metric_dict_not_fitted():
-    iris = load_iris()
-    X = iris.data
-    y = iris.target
knn = KNeighborsClassifier(n_neighbors=4)

sfs1 = SFS(knn,
@@ -642,3 +634,33 @@ def test_max_feature_subset_size_in_tuple_range():

sfs = sfs.fit(X, y)
assert len(sfs.k_feature_idx_) == 5


+def test_max_feature_subset_best():
+    boston = load_boston()
+    X, y = boston.data, boston.target
+    lr = LinearRegression()
+
+    sfs = SFS(lr,
+              k_features='best',
+              forward=True,
+              floating=False,
+              cv=10)
+
+    sfs = sfs.fit(X, y)
+    assert sfs.k_feature_idx_ == (1, 3, 5, 7, 8, 9, 10, 11, 12)
+
+
+def test_max_feature_subset_parsimonious():
+    boston = load_boston()
+    X, y = boston.data, boston.target
+    lr = LinearRegression()
+
+    sfs = SFS(lr,
+              k_features='parsimonious',
+              forward=True,
+              floating=False,
+              cv=10)
+
+    sfs = sfs.fit(X, y)
+    assert sfs.k_feature_idx_ == (10, 11, 12, 5)
