From 74a9756fa784d1f22873ad23c8b4948c6e290108 Mon Sep 17 00:00:00 2001 From: Antoine Wendlinger Date: Mon, 24 Oct 2016 20:01:49 +0200 Subject: [PATCH] [MRG+2] Norm inconsistency between RFE and SelectFromModel (was _LearntSelectorMixin) #2121 (#6181) * Norm inconsistency between RFE and SelectFromModel (was _LearntSelectorMixin) #2121 * safe_pwr utility * Norm fix * Removed safe_pwr * 1D arrays support for norm fix * Test case for 2d coef in SelectFromModel * Fix numpy version requirement for norm fix * Implement fixes suggested by @jnothman * Add numpy version requiring the fix. --- doc/whats_new.rst | 3 ++ sklearn/feature_selection/from_model.py | 16 +++++++--- .../tests/test_from_model.py | 26 ++++++++++++++++ sklearn/utils/fixes.py | 30 +++++++++++++++++++ sklearn/utils/tests/test_fixes.py | 25 ++++++++++++++++ 5 files changed, 96 insertions(+), 4 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index c7c0f4d1242d1..2a97ae7673b56 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -52,6 +52,9 @@ Enhancements (`#7506` _) by `Narine Kokhlikyan`_. + - Added ``norm_order`` parameter to :class:`feature_selection.SelectFromModel` + to enable selection of the norm order when ``coef_`` is more than 1D + Bug fixes ......... diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index c8a0679247f16..e8a18031f1dc8 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -10,9 +10,10 @@ from ..utils import safe_mask, check_array, deprecated from ..utils.validation import check_is_fitted from ..exceptions import NotFittedError +from ..utils.fixes import norm -def _get_feature_importances(estimator): +def _get_feature_importances(estimator, norm_order=1): """Retrieve or aggregate feature importances from estimator""" importances = getattr(estimator, "feature_importances_", None) @@ -21,7 +22,7 @@ def _get_feature_importances(estimator): importances = np.abs(estimator.coef_) else: - importances = np.sum(np.abs(estimator.coef_), axis=0) + importances = norm(estimator.coef_, axis=0, ord=norm_order) elif importances is None: raise ValueError( @@ -172,6 +173,11 @@ class SelectFromModel(BaseEstimator, SelectorMixin): Otherwise train the model using ``fit`` and then ``transform`` to do feature selection. + norm_order : non-zero int, inf, -inf, default 1 + Order of the norm used to filter the vectors of coefficients below + ``threshold`` in the case where the ``coef_`` attribute of the + estimator is of dimension 2. + Attributes ---------- `estimator_`: an estimator @@ -182,10 +188,12 @@ class SelectFromModel(BaseEstimator, SelectorMixin): `threshold_`: float The threshold value used for feature selection. """ - def __init__(self, estimator, threshold=None, prefit=False): + + def __init__(self, estimator, threshold=None, prefit=False, norm_order=1): self.estimator = estimator self.threshold = threshold self.prefit = prefit + self.norm_order = norm_order def _get_support_mask(self): # SelectFromModel can directly call on transform. @@ -197,7 +205,7 @@ def _get_support_mask(self): raise ValueError( 'Either fit the model before transform or set "prefit=True"' ' while passing the fitted estimator to the constructor.') - scores = _get_feature_importances(estimator) + scores = _get_feature_importances(estimator, self.norm_order) self.threshold_ = _calculate_threshold(estimator, scores, self.threshold) return scores >= self.threshold_ diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index fe170f09ad80c..a4789de0976bb 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -17,6 +17,7 @@ from sklearn.feature_selection import SelectFromModel from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import PassiveAggressiveClassifier +from sklearn.utils.fixes import norm iris = datasets.load_iris() data, y = iris.data, iris.target @@ -102,6 +103,31 @@ def test_feature_importances(): assert_array_equal(X_new, X[:, mask]) +@skip_if_32bit +def test_feature_importances_2d_coef(): + X, y = datasets.make_classification( + n_samples=1000, n_features=10, n_informative=3, n_redundant=0, + n_repeated=0, shuffle=False, random_state=0, n_classes=4) + + est = LogisticRegression() + for threshold, func in zip(["mean", "median"], [np.mean, np.median]): + for order in [1, 2, np.inf]: + # Fit SelectFromModel a multi-class problem + transformer = SelectFromModel(estimator=LogisticRegression(), + threshold=threshold, + norm_order=order) + transformer.fit(X, y) + assert_true(hasattr(transformer.estimator_, 'coef_')) + X_new = transformer.transform(X) + assert_less(X_new.shape[1], X.shape[1]) + + # Manually check that the norm is correctly performed + est.fit(X, y) + importances = norm(est.coef_, axis=0, ord=order) + feature_mask = importances > func(importances) + assert_array_equal(X_new, X[:, feature_mask]) + + def test_partial_fit(): est = PassiveAggressiveClassifier(random_state=0, shuffle=False) transformer = SelectFromModel(estimator=est) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 682ab7733c77a..c7bc8f3078d6b 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -419,3 +419,33 @@ def __getstate__(self): self._fill_value) else: from numpy.ma import MaskedArray # noqa + +if 'axis' not in signature(np.linalg.norm).parameters: + + def norm(X, ord=None, axis=None): + """ + Handles the axis parameter for the norm function + in old versions of numpy (useless for numpy >= 1.8). + """ + + if axis is None or X.ndim == 1: + result = np.linalg.norm(X, ord=ord) + return result + + if axis not in (0, 1): + raise NotImplementedError(""" + The fix that adds axis parameter to the old numpy + norm only works for 1D or 2D arrays. + """) + + if axis == 0: + X = X.T + + result = np.zeros(X.shape[0]) + for i in range(len(result)): + result[i] = np.linalg.norm(X[i], ord=ord) + + return result + +else: + norm = np.linalg.norm diff --git a/sklearn/utils/tests/test_fixes.py b/sklearn/utils/tests/test_fixes.py index ef1110bfc4eed..b7e2dd7180776 100644 --- a/sklearn/utils/tests/test_fixes.py +++ b/sklearn/utils/tests/test_fixes.py @@ -5,6 +5,7 @@ import pickle import numpy as np +import math from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_false @@ -16,6 +17,7 @@ from sklearn.utils.fixes import divide, expit from sklearn.utils.fixes import astype from sklearn.utils.fixes import MaskedArray +from sklearn.utils.fixes import norm def test_expit(): @@ -66,3 +68,26 @@ def test_masked_array_obj_dtype_pickleable(): marr_pickled = pickle.loads(pickle.dumps(marr)) assert_array_equal(marr.data, marr_pickled.data) assert_array_equal(marr.mask, marr_pickled.mask) + + +def test_norm(): + X = np.array([[-2, 4, 5], + [1, 3, -4], + [0, 0, 8], + [0, 0, 0]]).astype(float) + + # Test various axis and order + assert_equal(math.sqrt(135), norm(X)) + assert_array_equal( + np.array([math.sqrt(5), math.sqrt(25), math.sqrt(105)]), + norm(X, axis=0) + ) + assert_array_equal(np.array([3, 7, 17]), norm(X, axis=0, ord=1)) + assert_array_equal(np.array([2, 4, 8]), norm(X, axis=0, ord=np.inf)) + assert_array_equal(np.array([0, 0, 0]), norm(X, axis=0, ord=-np.inf)) + assert_array_equal(np.array([11, 8, 8, 0]), norm(X, axis=1, ord=1)) + + # Test shapes + assert_equal((), norm(X).shape) + assert_equal((3,), norm(X, axis=0).shape) + assert_equal((4,), norm(X, axis=1).shape)