/
from_model.py
229 lines (185 loc) · 8.42 KB
/
from_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# Authors: Gilles Louppe, Mathieu Blondel, Maheshakya Wijewardena
# License: BSD 3 clause
import numpy as np
import numbers
from .base import SelectorMixin
from ..base import BaseEstimator, clone, MetaEstimatorMixin
from ..exceptions import NotFittedError
from ..utils.metaestimators import if_delegate_has_method
def _get_feature_importances(estimator, norm_order=1):
"""Retrieve or aggregate feature importances from estimator"""
importances = getattr(estimator, "feature_importances_", None)
coef_ = getattr(estimator, "coef_", None)
if importances is None and coef_ is not None:
if estimator.coef_.ndim == 1:
importances = np.abs(coef_)
else:
importances = np.linalg.norm(coef_, axis=0,
ord=norm_order)
elif importances is None:
raise ValueError(
"The underlying estimator %s has no `coef_` or "
"`feature_importances_` attribute. Either pass a fitted estimator"
" to SelectFromModel or call fit before calling transform."
% estimator.__class__.__name__)
return importances
def _calculate_threshold(estimator, importances, threshold):
"""Interpret the threshold value"""
if threshold is None:
# determine default from estimator
est_name = estimator.__class__.__name__
if ((hasattr(estimator, "penalty") and estimator.penalty == "l1") or
"Lasso" in est_name):
# the natural default threshold is 0 when l1 penalty was used
threshold = 1e-5
else:
threshold = "mean"
if isinstance(threshold, str):
if "*" in threshold:
scale, reference = threshold.split("*")
scale = float(scale.strip())
reference = reference.strip()
if reference == "median":
reference = np.median(importances)
elif reference == "mean":
reference = np.mean(importances)
else:
raise ValueError("Unknown reference: " + reference)
threshold = scale * reference
elif threshold == "median":
threshold = np.median(importances)
elif threshold == "mean":
threshold = np.mean(importances)
else:
raise ValueError("Expected threshold='mean' or threshold='median' "
"got %s" % threshold)
else:
threshold = float(threshold)
return threshold
class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin):
"""Meta-transformer for selecting features based on importance weights.
.. versionadded:: 0.17
Parameters
----------
estimator : object
The base estimator from which the transformer is built.
This can be both a fitted (if ``prefit`` is set to True)
or a non-fitted estimator. The estimator must have either a
``feature_importances_`` or ``coef_`` attribute after fitting.
threshold : string, float, optional default None
The threshold value to use for feature selection. Features whose
importance is greater or equal are kept while the others are
discarded. If "median" (resp. "mean"), then the ``threshold`` value is
the median (resp. the mean) of the feature importances. A scaling
factor (e.g., "1.25*mean") may also be used. If None and if the
estimator has a parameter penalty set to l1, either explicitly
or implicitly (e.g, Lasso), the threshold used is 1e-5.
Otherwise, "mean" is used by default.
prefit : bool, default False
Whether a prefit model is expected to be passed into the constructor
directly or not. If True, ``transform`` must be called directly
and SelectFromModel cannot be used with ``cross_val_score``,
``GridSearchCV`` and similar utilities that clone the estimator.
Otherwise train the model using ``fit`` and then ``transform`` to do
feature selection.
norm_order : non-zero int, inf, -inf, default 1
Order of the norm used to filter the vectors of coefficients below
``threshold`` in the case where the ``coef_`` attribute of the
estimator is of dimension 2.
max_features : int or None, optional
The maximum number of features selected scoring above ``threshold``.
To disable ``threshold`` and only select based on ``max_features``,
set ``threshold=-np.inf``.
.. versionadded:: 0.20
Attributes
----------
estimator_ : an estimator
The base estimator from which the transformer is built.
This is stored only when a non-fitted estimator is passed to the
``SelectFromModel``, i.e when prefit is False.
threshold_ : float
The threshold value used for feature selection.
"""
def __init__(self, estimator, threshold=None, prefit=False,
norm_order=1, max_features=None):
self.estimator = estimator
self.threshold = threshold
self.prefit = prefit
self.norm_order = norm_order
self.max_features = max_features
def _get_support_mask(self):
# SelectFromModel can directly call on transform.
if self.prefit:
estimator = self.estimator
elif hasattr(self, 'estimator_'):
estimator = self.estimator_
else:
raise ValueError('Either fit the model before transform or set'
' "prefit=True" while passing the fitted'
' estimator to the constructor.')
scores = _get_feature_importances(estimator, self.norm_order)
threshold = _calculate_threshold(estimator, scores, self.threshold)
if self.max_features is not None:
mask = np.zeros_like(scores, dtype=bool)
candidate_indices = \
np.argsort(-scores, kind='mergesort')[:self.max_features]
mask[candidate_indices] = True
else:
mask = np.ones_like(scores, dtype=bool)
mask[scores < threshold] = False
return mask
def fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values (integers that correspond to classes in
classification, real numbers in regression).
**fit_params : Other estimator specific parameters
Returns
-------
self : object
"""
if self.max_features is not None:
if not isinstance(self.max_features, numbers.Integral):
raise TypeError("'max_features' should be an integer between"
" 0 and {} features. Got {!r} instead."
.format(X.shape[1], self.max_features))
elif self.max_features < 0 or self.max_features > X.shape[1]:
raise ValueError("'max_features' should be 0 and {} features."
"Got {} instead."
.format(X.shape[1], self.max_features))
if self.prefit:
raise NotFittedError(
"Since 'prefit=True', call transform directly")
self.estimator_ = clone(self.estimator)
self.estimator_.fit(X, y, **fit_params)
return self
@property
def threshold_(self):
scores = _get_feature_importances(self.estimator_, self.norm_order)
return _calculate_threshold(self.estimator, scores, self.threshold)
@if_delegate_has_method('estimator')
def partial_fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer only once.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values (integers that correspond to classes in
classification, real numbers in regression).
**fit_params : Other estimator specific parameters
Returns
-------
self : object
"""
if self.prefit:
raise NotFittedError(
"Since 'prefit=True', call transform directly")
if not hasattr(self, "estimator_"):
self.estimator_ = clone(self.estimator)
self.estimator_.partial_fit(X, y, **fit_params)
return self