"""Bagging probabilistic regressors."""
__author__ = ["fkiraly"]
__all__ = ["BaggingRegressor"]
from math import ceil
import numpy as np
import pandas as pd
from skpro.distributions.mixture import Mixture
from skpro.regression.base import BaseProbaRegressor
class BaggingRegressor(BaseProbaRegressor):
    """Bagging ensemble of probabilistic regressors.

    Fits ``n_estimators`` clones of an skpro regressor on
    datasets which are instance sub-samples and/or variable sub-samples.

    On ``predict_proba``, the mixture of the probabilistic predictions is returned.

    The estimator allows choosing the sample sizes for instances and variables,
    and whether sampling is with or without replacement.

    Direct generalization of ``sklearn``'s ``BaggingClassifier``
    to the probabilistic regression task.

    Parameters
    ----------
    estimator : skpro regressor, descendant of BaseProbaRegressor
        regressor to use in the bagging estimator
    n_estimators : int, default=10
        number of estimators in the sample for bagging
    n_samples : int or float, default=1.0
        The number of instances drawn from ``X`` in ``fit`` to train each clone.
        If int, indicates the number of instances precisely.
        If float, interpreted as a fraction, and rounded up by ``ceil``.
        See the subsampling variant at the end of the Examples section.
    n_features : int or float, default=1.0
        The number of features/variables drawn from ``X`` in ``fit`` to train
        each clone.
        If int, indicates the number of features precisely.
        If float, interpreted as a fraction, and rounded up by ``ceil``.
    bootstrap : boolean, default=True
        whether samples/instances are drawn with replacement (True) or not (False)
    bootstrap_features : boolean, default=False
        whether features/variables are drawn with replacement (True) or not (False)
    random_state : int, RandomState instance or None, optional (default=None)
        If int, ``random_state`` is the seed used by the random number generator;
        If ``RandomState`` instance, ``random_state`` is the random number generator;
        If None, the random number generator is the ``RandomState`` instance used
        by ``np.random``.

    Attributes
    ----------
    estimators_ : list of skpro regressors
        clones of the regressor in ``estimator``, fitted in the ensemble

    Examples
    --------
    >>> from skpro.regression.ensemble import BaggingRegressor
    >>> from skpro.regression.residual import ResidualDouble
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.datasets import load_diabetes
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
    >>>
    >>> reg_mean = LinearRegression()
    >>> reg_proba = ResidualDouble(reg_mean)
    >>>
    >>> ens = BaggingRegressor(reg_proba, n_estimators=10)
    >>> ens.fit(X_train, y_train)
    BaggingRegressor(...)
    >>> y_pred = ens.predict_proba(X_test)
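    >>>
    >>> # illustrative variant with row and column subsampling:
    >>> # each clone is fitted on roughly half the rows and half the columns
    >>> ens_sub = BaggingRegressor(reg_proba, n_samples=0.5, n_features=0.5)
    >>> ens_sub.fit(X_train, y_train)
    BaggingRegressor(...)
    >>> y_pred_sub = ens_sub.predict_proba(X_test)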
"""

    _tags = {"capability:missing": True}

    def __init__(
        self,
        estimator,
        n_estimators=10,
        n_samples=1.0,
        n_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        random_state=None,
    ):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.n_samples = n_samples
        self.n_features = n_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.random_state = random_state

        super().__init__()
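        # the ensemble can handle missing data exactly when the wrapped estimator
        # can, so the corresponding capability tag is copied from the estimator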
        tags_to_clone = ["capability:missing"]
        self.clone_tags(estimator, tags_to_clone)

    def _fit(self, X, y):
        """Fit regressor to training data.

        Writes to self:
            Sets fitted model attributes ending in "_".

        Parameters
        ----------
        X : pandas DataFrame
            feature instances to fit regressor to
        y : pandas DataFrame, must be same length as X
            labels to fit regressor to

        Returns
        -------
        self : reference to self
        """
        estimator = self.estimator
        n_estimators = self.n_estimators
        n_samples = self.n_samples
        n_features = self.n_features
        bootstrap = self.bootstrap
        bootstrap_ft = self.bootstrap_features
        random_state = self.random_state

        np.random.seed(random_state)

        inst_ix = X.index
        col_ix = X.columns
        n = len(inst_ix)
        m = len(col_ix)
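        # resolve subsample sizes: floats are read as fractions of the training
        # data size and rounded up, e.g. n_samples=0.5 with n=13 rows gives
        # ceil(0.5 * 13) = 7 rows per subsample; ints are used as exact counts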
        if isinstance(n_samples, float):
            n_samples_ = ceil(n_samples * n)
        else:
            n_samples_ = n_samples

        if isinstance(n_features, float):
            n_features_ = ceil(n_features * m)
        else:
            n_features_ = n_features
        self.estimators_ = []
        self.cols_ = []
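        # for each ensemble member, draw a row subsample and a column subsample,
        # fit a clone of the wrapped regressor on it, and remember the column
        # subset so that _predict_proba can subset X in the same way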
        for _i in range(n_estimators):
            esti = estimator.clone()
            row_iloc = pd.RangeIndex(n)
            row_ss = _random_ss_ix(row_iloc, size=n_samples_, replace=bootstrap)
            inst_ix_i = inst_ix[row_ss]
            col_ix_i = _random_ss_ix(col_ix, size=n_features_, replace=bootstrap_ft)

            # store column subset for use in predict
            self.cols_ += [col_ix_i]

            Xi = _subs_cols(X.loc[inst_ix_i], col_ix_i, reset_cols=bootstrap_ft)
            Xi = Xi.reset_index(drop=True)
            yi = y.loc[inst_ix_i].reset_index(drop=True)

            self.estimators_ += [esti.fit(Xi, yi)]

        return self

    def _predict_proba(self, X):
        """Predict distribution over labels for data from features.

        State required:
            Requires state to be "fitted".

        Accesses in self:
            Fitted model attributes ending in "_"

        Parameters
        ----------
        X : pandas DataFrame, must have same columns as X in ``fit``
            data to predict labels for

        Returns
        -------
        y : skpro BaseDistribution, same length as ``X``
            labels predicted for ``X``
        """
        reset_cols = self.bootstrap_features
        Xis = [_subs_cols(X, col_ix_i, reset_cols) for col_ix_i in self.cols_]

        y_probas = [est.predict_proba(Xi) for est, Xi in zip(self.estimators_, Xis)]
        y_proba = Mixture(y_probas)
        return y_proba

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return the
            ``"default"`` set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class.
            Each dict contains parameters to construct an "interesting" test
            instance, i.e., ``MyClass(**params)`` or ``MyClass(**params[i])``
            creates a valid test instance.
            ``create_test_instance`` uses the first (or only) dictionary in
            ``params``.
        """
        from sklearn.linear_model import LinearRegression

        from skpro.regression.residual import ResidualDouble

        regressor = ResidualDouble(LinearRegression())

        params1 = {"estimator": regressor}
        params2 = {
            "estimator": regressor,
            "n_samples": 0.5,
            "n_features": 0.5,
        }
        params3 = {
            "estimator": regressor,
            "n_samples": 7,
            "n_features": 2,
            "bootstrap": False,
            "bootstrap_features": True,
        }

        return [params1, params2, params3]


def _random_ss_ix(ix, size, replace=True):
    """Randomly uniformly sample indices from a list of indices."""
    a = range(len(ix))
    ixs = ix[np.random.choice(a, size=size, replace=replace)]
    return ixs


def _subs_cols(df, col_ix, reset_cols=False):
    """Subset columns of a DataFrame, with potential resetting of column index."""
    df_subset = df.loc[:, col_ix]
    if reset_cols:
        df_subset.columns = pd.RangeIndex(len(df_subset.columns))
    return df_subset