-
Notifications
You must be signed in to change notification settings - Fork 852
/
stacking_cv_classification.py
303 lines (264 loc) · 12.5 KB
/
stacking_cv_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
# Stacking CV classifier
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
#
# An ensemble-learning meta-classifier for stacking
# Authors: Reiichiro Nakano <github.com/reiinakano>
# Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause
from ..externals.name_estimators import _name_estimators
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
from sklearn.utils.validation import check_is_fitted
from sklearn.externals import six
import numpy as np
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
if Version(sklearn_version) < '0.18':
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold
else:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
class StackingCVClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
    """A 'Stacking Cross-Validation' classifier for scikit-learn estimators.

    New in mlxtend v0.4.3

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
        A list of classifiers.
        Invoking the `fit` method on the `StackingCVClassifier` will fit
        clones of these original classifiers that will
        be stored in the class attribute
        `self.clfs_`.
    meta_classifier : object
        The meta-classifier to be fitted on the ensemble of
        classifiers
    use_probas : bool (default: False)
        If True, trains meta-classifier based on predicted probabilities
        instead of class labels.
    n_folds : int (default=2)
        The number of folds used while creating training data for the
        meta-classifier during fitting.
    use_features_in_secondary : bool (default: False)
        If True, the meta-classifier will be trained both on the predictions
        of the original classifiers and the original dataset.
        If False, the meta-classifier will be trained only on the predictions
        of the original classifiers.
    stratify : bool (default: True)
        If True, the cross-validation technique used for fitting the
        classifier will be Stratified K-Fold.
        If False, the cross-validation technique used will be Regular K-Fold.
        It is highly recommended to use Stratified K-Fold.
    shuffle : bool (default: True)
        If True, when fitting, the training data will be shuffled prior to
        cross-validation.
    random_state : None, int, or RandomState
        When shuffle=True, pseudo-random number generator state used for
        shuffling. If None, use default numpy RNG for shuffling.
        Ignored when shuffle=False.
    verbose : int, optional (default=0)
        Controls the verbosity of the building process.
        - `verbose=0` (default): Prints nothing
        - `verbose=1`: Prints the number & name of the classifier being
                       fitted and which fold is currently being used for
                       fitting
        - `verbose=2`: Prints info about the parameters of the
                       classifier being fitted
        - `verbose>2`: Changes `verbose` param of the underlying classifier
                       to self.verbose - 2

    Attributes
    ----------
    clfs_ : list, shape=[n_classifiers]
        Fitted classifiers (clones of the original classifiers)
    meta_clf_ : estimator
        Fitted meta-classifier (clone of the original meta-estimator)

    """

    def __init__(self, classifiers, meta_classifier,
                 use_probas=False, n_folds=2,
                 use_features_in_secondary=False,
                 stratify=True, random_state=None,
                 shuffle=True, verbose=0):
        self.classifiers = classifiers
        self.meta_classifier = meta_classifier
        # Name -> estimator lookups consumed by get_params(deep=True).
        self.named_classifiers = dict(_name_estimators(classifiers))
        self.named_meta_classifier = {
            'meta-%s' % key: value
            for key, value in _name_estimators([meta_classifier])}
        self.use_probas = use_probas
        self.verbose = verbose
        self.n_folds = n_folds
        self.use_features_in_secondary = use_features_in_secondary
        self.stratify = stratify
        self.shuffle = shuffle
        self.random_state = random_state

    def _make_folds(self, X, y):
        """Return the CV splits as a list of (train, test) index pairs."""
        # A seed is only meaningful when shuffling. Recent scikit-learn
        # releases raise a ValueError if random_state is set while
        # shuffle=False; older releases silently ignore it.
        seed = self.random_state if self.shuffle else None
        legacy_api = Version(sklearn_version) < '0.18'

        if self.stratify:
            if legacy_api:
                splits = StratifiedKFold(y, n_folds=self.n_folds,
                                         shuffle=self.shuffle,
                                         random_state=seed)
            else:
                splits = StratifiedKFold(n_splits=self.n_folds,
                                         shuffle=self.shuffle,
                                         random_state=seed).split(X, y)
        else:
            if legacy_api:
                splits = KFold(len(y), n_folds=self.n_folds,
                               shuffle=self.shuffle,
                               random_state=seed)
            else:
                splits = KFold(n_splits=self.n_folds,
                               shuffle=self.shuffle,
                               random_state=seed).split(X)

        # Materialize once: the same folds are walked once per base
        # classifier and once more to reorder the labels/features.
        return list(splits)

    @staticmethod
    def _stack_columns(blocks, n_rows):
        """Concatenate 2D prediction blocks column-wise.

        Returns an empty (n_rows, 0) array if there are no blocks.
        """
        if not blocks:
            return np.empty((n_rows, 0))
        return np.hstack(blocks)

    def _base_predictions(self, model, X):
        """Return the 2D meta-feature block produced by one base model."""
        if self.use_probas:
            return model.predict_proba(X)
        labels = model.predict(X)
        return labels.reshape(labels.shape[0], 1)

    def fit(self, X, y):
        """ Fit ensemble classifiers and the meta-classifier.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features. Rows are selected with
            integer index arrays, so X must support that kind of indexing.
        y : numpy array, shape = [n_samples]
            Target values.

        Returns
        -------
        self : object

        """
        self.clfs_ = [clone(clf) for clf in self.classifiers]
        self.meta_clf_ = clone(self.meta_classifier)
        if self.verbose > 0:
            print("Fitting %d classifiers..." % (len(self.classifiers)))

        folds = self._make_folds(X, y)
        n_classes = len(set(y)) if self.use_probas else None
        n_models = len(self.clfs_)

        per_model_predictions = []
        for i, model in enumerate(self.clfs_, 1):

            if self.verbose > 0:
                print("Fitting classifier%d: %s (%d/%d)" %
                      (i, _name_estimators((model,))[0][0],
                       i, n_models))

            if self.verbose > 2 and hasattr(model, 'verbose'):
                model.set_params(verbose=self.verbose - 2)

            if self.verbose > 1:
                print(_name_estimators((model,))[0][1])

            fold_predictions = []
            for num, (train_index, test_index) in enumerate(folds):

                if self.verbose > 0:
                    print("Training and fitting fold %d of %d..." %
                          ((num + 1), self.n_folds))

                model.fit(X[train_index], y[train_index])
                prediction = self._base_predictions(model, X[test_index])

                # Every fold must yield one probability column per class,
                # otherwise the folds cannot be stacked into one matrix.
                if self.use_probas and prediction.shape[1] != n_classes:
                    raise ValueError(
                        'predict_proba returned %d columns in fold %d, '
                        'but y contains %d classes'
                        % (prediction.shape[1], num + 1, n_classes))

                fold_predictions.append(prediction)

            per_model_predictions.append(np.vstack(fold_predictions))

        all_model_predictions = self._stack_columns(per_model_predictions,
                                                    len(y))

        # The out-of-fold predictions are ordered fold by fold, not in the
        # original sample order, so the labels have to be put into that
        # same order before the meta-classifier sees them.
        reordered_labels = np.concatenate(
            [y[test_index] for _, test_index in folds])

        # Fit the base models correctly this time using ALL the training set
        for model in self.clfs_:
            model.fit(X, y)

        # Fit the secondary model
        if self.use_features_in_secondary:
            # Only build the reordered copy of X when it is actually used.
            reordered_features = np.concatenate(
                [X[test_index] for _, test_index in folds])
            meta_input = np.hstack((reordered_features,
                                    all_model_predictions))
        else:
            meta_input = all_model_predictions
        self.meta_clf_.fit(meta_input, reordered_labels)

        return self

    def get_params(self, deep=True):
        """Return estimator parameter names for GridSearch support.

        With ``deep=False`` only this estimator's own constructor parameters
        are returned. With ``deep=True`` the result additionally contains
        the named base classifiers and meta-classifier together with their
        ``<name>__<param>`` entries.
        """
        own_params = super(StackingCVClassifier, self).get_params(deep=False)
        if not deep:
            return own_params

        out = self.named_classifiers.copy()
        for name, step in self.named_classifiers.items():
            for key, value in step.get_params(deep=True).items():
                out['%s__%s' % (name, key)] = value

        out.update(self.named_meta_classifier.copy())
        for name, step in self.named_meta_classifier.items():
            for key, value in step.get_params(deep=True).items():
                out['%s__%s' % (name, key)] = value

        # Include the constructor parameters as well; without them
        # set_params() / GridSearchCV reject e.g. `use_probas` as invalid.
        out.update(own_params)
        return out

    def _predict_meta_features(self, X):
        """Build the meta-classifier input for X from the fitted base models.

        Shared by `predict` and `predict_proba` so both feed the
        meta-classifier exactly the same representation.
        """
        check_is_fitted(self, 'clfs_')
        blocks = [self._base_predictions(model, X) for model in self.clfs_]
        all_model_predictions = self._stack_columns(blocks, len(X))
        if self.use_features_in_secondary:
            return np.hstack((X, all_model_predictions))
        return all_model_predictions

    def predict(self, X):
        """ Predict target values for X.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Samples to classify, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        labels : array-like, shape = [n_samples]
            Predicted class labels.

        """
        return self.meta_clf_.predict(self._predict_meta_features(X))

    def predict_proba(self, X):
        """ Predict class probabilities for X.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Samples to classify, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        proba : array-like, shape = [n_samples, n_classes]
            Probability for each class per sample.

        """
        return self.meta_clf_.predict_proba(self._predict_meta_features(X))