/
base.py
295 lines (201 loc) · 7.39 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
"""Basic classes for validation iterators."""
from copy import copy
from typing import Any
from typing import Generator
from typing import Iterable
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import TypeVar
from typing import cast
from lightautoml.dataset.base import LAMLDataset
from lightautoml.pipelines.features.base import FeaturesPipeline
# from ..pipelines.selection.base import SelectionPipeline
# TODO: SOLVE CYCLIC IMPORT PROBLEM!!! add Selectors typing
Dataset = TypeVar("Dataset", bound=LAMLDataset)
CustomIdxs = Iterable[Tuple[Sequence, Sequence]]
# add checks here
# check for same columns in dataset
class TrainValidIterator:
    """Abstract base for train/validation iteration schemes.

    Concrete iterators must provide ``__iter__`` and ``__len__`` so that
    an ml_pipeline can consume them fold by fold.

    Args:
        train: Train dataset.
        **kwargs: Key-word parameters.

    """

    def __init__(self, train: Dataset, **kwargs: Any):
        self.train = train
        # Every extra keyword argument becomes an attribute of the iterator.
        self.__dict__.update(kwargs)

    @property
    def features(self):
        """Dataset features names.

        Returns:
            List of features names.

        """
        return self.train.features

    def __iter__(self) -> Iterable:
        """Abstract method. Creates iterator."""
        raise NotImplementedError

    def __len__(self) -> Optional[int]:
        """Abstract method. Get length of dataset."""
        raise NotImplementedError

    def get_validation_data(self) -> LAMLDataset:
        """Abstract method. Get validation sample."""
        raise NotImplementedError

    def apply_feature_pipeline(self, features_pipeline: FeaturesPipeline) -> "TrainValidIterator":
        """Apply features pipeline on train data.

        Args:
            features_pipeline: Composite transformation of features.

        Returns:
            Copy of object with transformed features.

        """
        # Shallow copy: only the train reference is replaced on the clone.
        clone = copy(self)
        clone.train = features_pipeline.fit_transform(clone.train)
        return clone

    # TODO: add typing
    def apply_selector(self, selector) -> "TrainValidIterator":
        """Select features on train data.

        Fits the selector first when it is not yet fitted, then applies
        the selection to a shallow copy of this iterator.

        Args:
            selector: Uses for feature selection.

        Returns:
            Dataset with selected features.

        """
        if not selector.is_fitted:
            selector.fit(self)
        clone = copy(self)
        clone.train = selector.select(clone.train)
        return clone

    def convert_to_holdout_iterator(self) -> "HoldoutIterator":
        """Abstract method. Convert iterator to HoldoutIterator."""
        raise NotImplementedError
class DummyIterator(TrainValidIterator):
    """Simple Iterator which use train data as validation."""

    def __init__(self, train: Dataset):
        """Create iterator. WARNING: validation on train.

        Args:
            train: Train dataset.

        """
        self.train = train

    def __len__(self) -> Optional[int]:
        """Single pseudo-split, so the length is fixed.

        Returns:
            '1'.

        """
        return 1

    def __iter__(self) -> List[Tuple[None, Dataset, Dataset]]:
        """Yield one split where validation is the train set itself.

        Returns:
            Iterable object for dataset, where for validation also uses train.

        """
        # No fold indexes exist, hence the leading None in the triple.
        single_split = (None, self.train, self.train)
        return iter([single_split])

    def get_validation_data(self) -> Dataset:
        """Return the validation sample (the train set here).

        Returns:
            Whole train dataset.

        """
        return self.train

    def convert_to_holdout_iterator(self) -> "HoldoutIterator":
        """Build the equivalent hold-out iterator.

        Returns:
            iterator: Holdout iterator with ``'train == valid'``.

        """
        return HoldoutIterator(self.train, self.train)
class HoldoutIterator(TrainValidIterator):
    """Iterator for classic holdout - just predefined train and valid samples."""

    def __init__(self, train: LAMLDataset, valid: LAMLDataset):
        """Create iterator.

        Args:
            train: Dataset of train data.
            valid: Dataset of valid data.

        """
        self.train = train
        self.valid = valid

    def __len__(self) -> Optional[int]:
        """Exactly one predefined split.

        Returns:
            1

        """
        return 1

    def __iter__(self) -> Iterable[Tuple[None, LAMLDataset, LAMLDataset]]:
        """Yield the single predefined train/valid split.

        Returns:
            Iterable object for train validation dataset.

        """
        # No fold indexes are available for a holdout split, hence None.
        return iter([(None, self.train, self.valid)])

    def get_validation_data(self) -> LAMLDataset:
        """Return the predefined validation sample.

        Returns:
            Whole validation dataset.

        """
        return self.valid

    def apply_feature_pipeline(self, features_pipeline: FeaturesPipeline) -> "HoldoutIterator":
        """Apply features pipeline to both train and valid parts.

        Args:
            features_pipeline: Features pipeline to apply.

        Returns:
            New iterator.

        """
        # Base class fits/transforms train; valid only gets the transform.
        new_iter = super().apply_feature_pipeline(features_pipeline)
        new_iter = cast("HoldoutIterator", new_iter)
        new_iter.valid = features_pipeline.transform(new_iter.valid)
        return new_iter

    def apply_selector(self, selector) -> "HoldoutIterator":
        """Same as for basic class, but also apply to validation.

        Args:
            selector: Uses for feature selection.

        Returns:
            New iterator.

        """
        new_iter = super().apply_selector(selector)
        new_iter = cast("HoldoutIterator", new_iter)
        new_iter.valid = selector.select(new_iter.valid)
        return new_iter

    def convert_to_holdout_iterator(self) -> "HoldoutIterator":
        """Already a holdout iterator - nothing to convert.

        Returns:
            self.

        """
        return self
class CustomIterator(TrainValidIterator):
    """Iterator that uses function to create folds indexes.

    Usefull for example - classic timeseries splits.
    """

    def __init__(self, train: LAMLDataset, iterator: CustomIdxs):
        """Create iterator.

        Args:
            train: Dataset of train data.
            iterator: Iterable of ``(train_idx, valid_idx)`` pairs.

        """
        self.train = train
        self.iterator = iterator

    def __len__(self) -> Optional[int]:
        """Get the number of folds.

        Returns:
            Number of splits if the underlying iterator is sized,
            ``None`` when its length is unknown (e.g. a plain generator).

        """
        # CustomIdxs is only an Iterable, so len() may not be supported.
        try:
            return len(self.iterator)
        except TypeError:
            return None

    def __iter__(self) -> Generator:
        """Create generator of train/valid datasets.

        Returns:
            Data generator.

        """
        generator = (
            (val_idx, self.train[tr_idx], self.train[val_idx])
            for (tr_idx, val_idx) in self.iterator
        )
        return generator

    def get_validation_data(self) -> LAMLDataset:
        """Simple return train dataset.

        Returns:
            Dataset of train data.

        """
        return self.train

    def convert_to_holdout_iterator(self) -> "HoldoutIterator":
        """Convert iterator to hold-out-iterator.

        Use first train/valid split for :class:`~lightautoml.validation.base.HoldoutIterator` creation.

        Returns:
            New hold out iterator.

        Raises:
            ValueError: If the custom iterator produces no splits.

        """
        for (tr_idx, val_idx) in self.iterator:
            return HoldoutIterator(self.train[tr_idx], self.train[val_idx])
        # Previously an empty iterator silently returned None, which crashed
        # callers far from the real cause - fail loudly instead.
        raise ValueError("Cannot convert to HoldoutIterator: iterator produced no splits.")