-
Notifications
You must be signed in to change notification settings - Fork 1
/
base.py
422 lines (350 loc) · 16.8 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
# Copyright (c) 2022. RISC Software GmbH.
# All rights reserved.
import importlib
from pathlib import Path
from typing import Callable, Dict, List, Optional
import numpy as np
import pandas as pd
class TransformationExplainer:
    """
    Base class for explaining data transformations.

    A transformation explainer wraps a fitted transformer and knows how to
    back-propagate feature importance scores from the transformer's output
    space to its input space, via methods `backward()` and `backward_global()`.
    Concrete subclasses are constructed through registered factories (see
    `register_factory()` and `make()`).
    """

    # Registry of (name, factory) pairs. New factories are inserted at the
    # front, so later registrations take precedence in `make()`.
    _factories = []

    @staticmethod
    def register_factory(name: str, func, errors: str = 'raise'):
        """
        Register a factory for constructing transformation explainers.

        Parameters
        ----------
        name: str
            Unique name of the factory.
        func: Callable
            The factory. Must accept `(obj, params=...)` and return either a
            `TransformationExplainer` or `obj` itself if it cannot handle it.
        errors: str, default='raise'
            What to do if a factory with the given name already exists:
            * "raise": raise a ValueError.
            * "update": overwrite the existing entry in place.
            * "replace": remove the existing entry and re-insert at the front.

        Raises
        ------
        ValueError
            If `name` is already registered and `errors` is "raise", or if
            `errors` is not one of the recognized values.
        """
        matches = [j for j, (n, _) in enumerate(TransformationExplainer._factories) if name == n]
        if matches:
            if errors == 'raise':
                raise ValueError(f'Transformation explainer factory with name "{name}" already exists.')
            elif errors == 'update':
                TransformationExplainer._factories[matches[0]] = (name, func)
            elif errors == 'replace':
                del TransformationExplainer._factories[matches[0]]
                TransformationExplainer._factories.insert(0, (name, func))
            else:
                # Fix: an unrecognized `errors` value used to be silently
                # ignored, turning a caller's typo into a no-op.
                raise ValueError(f'`errors` must be one of "raise", "update" or "replace", got "{errors}".')
        else:
            TransformationExplainer._factories.insert(0, (name, func))

    @staticmethod
    def make(obj, params=None) -> Optional['TransformationExplainer']:
        """
        Convert an object into a transformation explainer.

        If `obj` already provides the full explainer interface it is returned
        unchanged; otherwise each registered factory is tried in order.

        Parameters
        ----------
        obj:
            The object to convert, typically a fitted transformer.
        params: optional
            Params obtained from a previous explainer instance, passed through
            to the factories.

        Returns
        -------
        TransformationExplainer
            An explainer for `obj` (possibly `obj` itself).

        Raises
        ------
        RuntimeError
            If no registered factory can handle `obj`.
        """
        if all(hasattr(obj, attr)
               for attr in ('fit', 'transform', 'fit_forward', 'forward', 'backward', 'backward_global')):
            # `obj` duck-types the explainer interface already.
            return obj
        for _, func in TransformationExplainer._factories:
            out = func(obj, params=params)
            if out is not obj:
                return out
        raise RuntimeError(f'Object of type {type(obj)} cannot be converted into a transformation explainer.')

    def __init__(self, transformer=None, params=None):
        if params is not None:
            # `params` must originate from an explainer of the same class.
            assert params.get('class_name', self.__class__.__name__) == self.__class__.__name__
        self._transformer = transformer

    @property
    def transformer(self):
        # The wrapped (fitted) transformer, or None.
        return self._transformer

    @property
    def params_(self) -> dict:
        """
        Get all params obtained from fitting the explainer to data in method `fit_forward()`, and which can be passed
        to `__init__()`.

        Returns
        -------
        dict
            Dictionary of parameters.
        """
        return dict(class_name=self.__class__.__name__)

    def fit(self, x, y=None):
        # only to implement the standard sklearn API, which makes it possible to combine individual explainers in
        # pipelines and similar compound transformations
        raise RuntimeError(f'Method fit() of class {self.__class__.__name__} cannot be called.')

    def transform(self, x):
        # Delegate to the wrapped transformer without recording anything.
        return self._transformer.transform(x)

    def fit_forward(self, x, y):
        """
        Fit this explainer to training data, and transform the data by applying the underlying transformation.
        `forward()` is implicitly called on `x` as well, meaning that invoking `backward()` immediately afterwards is
        possible and refers to the given samples `x`.

        Parameters
        ----------
        x:
            Features, array-like of shape `(n_samples, n_features_in)`.
        y:
            Labels, array-like of shape `(n_samples, n_labels)` or `(n_samples,)`.

        Returns
        -------
        The transformed features, array-like of shape `(n_samples, n_features_out)`.
        """
        raise NotImplementedError()

    def forward(self, x):
        """
        Transform `x` by applying the underlying transformation, and record all intermediate values needed for
        back-propagating explanations generated by downstream explanation methods.

        Parameters
        ----------
        x:
            Data to transform (and later explain), array-like of shape `(n_samples, n_features_in)`. `n_features_in`
            must be the same as in the data this explainer instance was fitted on.

        Returns
        -------
        Transformed data, array-like of shape `(n_samples, n_features_out)`.
        """
        raise NotImplementedError()

    def backward(self, s: np.ndarray) -> np.ndarray:
        """
        Back-propagate local explanations from output to input.

        Parameters
        ----------
        s: ndarray
            Explanations (feature importance scores) generated downstream for the last `x` method `forward()` was
            applied to. Array of shape `(*dims, n_samples, n_features_out)`, where `n_samples` must be as in the last
            invocation of `forward()`.

        Returns
        -------
        ndarray
            Explanations, array of shape `(*dims, n_samples, n_features_in)`.

        Notes
        -----
        In contrast to method `forward()`, this method expects plain Numpy arrays as input and returns plain
        Numpy arrays.
        """
        raise NotImplementedError()

    def backward_global(self, s: np.ndarray) -> np.ndarray:
        """
        Back-propagate global explanations from output to input.

        Parameters
        ----------
        s: ndarray
            Global explanations (feature importance scores) generated by downstream explanation methods. Array of shape
            `(*dims, n_features_out)`.

        Returns
        -------
        ndarray
            Explanations, array of shape `(*dims, n_features_in)`.

        Notes
        -----
        In contrast to method `forward()`, this method expects plain Numpy arrays as input and returns plain
        Numpy arrays.
        """
        raise NotImplementedError()
class IdentityTransformationExplainer(TransformationExplainer):
    """
    Explainer for transformations that leave feature semantics unchanged.

    Explanations are passed through untouched in both directions. If the
    wrapped transformer exposes a `transform()` method, data is still run
    through it; otherwise the data is returned as-is.
    """

    def __init__(self, transformer=None, params=None):
        super().__init__(transformer=transformer, params=params)
        # Cache the wrapped transformer's `transform` method, if it has one.
        self._wrapped_transform = getattr(self._transformer, 'transform', None)

    def transform(self, x):
        if self._wrapped_transform is None:
            return x
        return self._wrapped_transform(x)

    def fit_forward(self, x, y):
        # Nothing to fit; behaves exactly like `forward()`.
        return self.forward(x)

    def forward(self, x):
        # No intermediate values need recording for an identity mapping.
        return self.transform(x)

    def backward(self, s):
        # Identity: importance scores map 1:1 from output to input features.
        return s

    def backward_global(self, s):
        return s
class EnsembleExplainer:
    """
    Class for explaining a given ensemble, or constituents of it.

    Parameters
    ----------
    ensemble: FittedEnsemble
        The ensemble to explain, an instance of FittedEnsemble.
    config: dict, optional
        Config dictionary.
    feature_names: list, optional
        List of feature names. `None` defaults to `range(n_features)`, where `n_features` is determined from
        `x`.
    target_names: list, optional
        List of target names, optional. In case of regression this is the list of target variables, in case of binary
        classification this is the singleton list with the sole target variable, and in multiclass- and multilabel
        classification this is the list of classes. None defaults to `range(n_targets)`, where `n_targets` is determined
        from `y`.
    x: DataFrame, optional
        Training data, which is required by some explanation methods (e.g., SHAP).
    y: DataFrame, optional
        Labels of `x`.
    params: optional
        Params obtained from a previous instantiation of an ensemble explainer of this type on `ensemble`. If given,
        neither `feature_names`, `target_names`, `x` nor `y` may be provided.

    Examples
    --------
    >>> # Paradigm for explaining a pipeline `model` of a FittedEnsemble:
    >>>
    >>> # Setup:
    >>> preprocessing_explainer = TransformationExplainer.make(model.preprocessing)
    >>> x_train = preprocessing_explainer.fit_forward(x_train, y_train)
    >>>
    >>> # Local explanations for `x_test`:
    >>> x_test_pp = preprocessing_explainer.forward(x_test)
    >>> explanation = func(x_test_pp)
    >>> explanation = preprocessing_explainer.backward(explanation)
    >>>
    >>> # Global explanations:
    >>> explanation = func_global()
    >>> explanation = preprocessing_explainer.backward_global(explanation)

    >>> # Paradigm for explaining data `(x, y)` after applying some preprocessing steps `preprocessing`:
    >>>
    >>> preprocessing_explainer = TransformationExplainer.make(preprocessing)
    >>> x_pp = preprocessing_explainer.fit_forward(x, y)
    >>> explanation = func(x_pp, y)
    >>> explanation = preprocessing_explainer.backward(explanation) # or `backward_global(explanation)`
    """

    # Registry of named explainer factories (name -> factory callable).
    __registered = {}

    @staticmethod
    def register(name: str, factory: Callable[..., 'EnsembleExplainer']):
        """
        Register a new ensemble explainer factory.

        Parameters
        ----------
        name: str
            The name of the ensemble explainer.
        factory: Callable
            The factory, a function mapping argument-dicts to instances of class `EnsembleExplainer` (or subclasses
            thereof).
        """
        EnsembleExplainer.__registered[name] = factory

    @staticmethod
    def get(name: str, **kwargs) -> Optional['EnsembleExplainer']:
        """
        Instantiate the explainer registered under `name` with `kwargs`,
        or return None if no such explainer is registered.
        """
        factory = EnsembleExplainer.__registered.get(name)
        return factory if factory is None else factory(**kwargs)

    @staticmethod
    def list_explainers() -> List[str]:
        """Return the names of all registered ensemble explainers."""
        return list(EnsembleExplainer.__registered.keys())

    def __init__(self, ensemble: 'FittedEnsemble' = None, config: Optional[dict] = None,  # noqa F821
                 feature_names: Optional[list] = None, target_names: Optional[list] = None,
                 x: Optional[pd.DataFrame] = None, y: Optional[pd.DataFrame] = None, params=None):
        if not (params is None or (feature_names is None and target_names is None and x is None and y is None)):
            raise ValueError('If params is given, feature_names, target_names, x and y must be None.')
        self.config: dict = config or {}

    @property
    def name(self) -> str:
        """Name under which this explainer (backend) is registered."""
        raise NotImplementedError()

    @property
    def behavior(self) -> dict:
        """
        Description of the behavior of methods `explain()` and `explain_global()`, especially w.r.t. parameters `x`
        and `y`.

        Returns
        -------
        dict
            Dictionary with keys

            * ``"supports_local"``: True if the backend supports local explanations, i.e., method `explain()` can be
              called. If False, calling `explain()` raises an exception.
            * ``"requires_y"``: True if `y` must be passed to `explain()` and `explain_global()`.
            * ``"global_accepts_x"``: True if `x` can be passed to method `explain_global()`.
            * ``"global_requires_x"``: True if `x` must be passed to method `explain_global()`. If False but
              ``"global_accepts_x"`` is True, the global behavior differs depending on whether `x` is provided.
              ``"global_requires_x"`` can only be True if "global_accepts_x" is True as well.
            * ``"global_is_mean_of_local"``: True if global explanations are the mean of the individual local
              explanations, if `x` is provided. If True, it might be better to call method `explain()` instead of
              `explain_global()`, since the computational effort is identical. Can only be True if "supports_local" is
              True as well.
        """
        raise NotImplementedError()

    @property
    def params_(self) -> dict:
        """
        Get all params necessary for instantiating this EnsembleExplainer via parameter `params`.
        """
        raise NotImplementedError()

    def explain(self, x: pd.DataFrame, y: Optional[pd.DataFrame] = None, jobs: int = 1,
                batch_size: Optional[int] = None, model_id=None, mapping: Optional[Dict[str, List[str]]] = None,
                show_progress: bool = False) -> dict:
        """
        Explain the ensemble, or some of its constituent models (pipelines), on a set of samples.

        Parameters
        ----------
        x: DataFrame
            The samples, a DataFrame with the same feature columns as the ensemble was trained on.
        y: DataFrame, optional
            The labels. If given, a DataFrame with the same number of rows and row index as `x` and the same
            target columns as the ensemble was trained on. Check property `behavior` to see whether this argument is
            required (depends on the backend).
        jobs: int, default=1
            The number of jobs to use.
        batch_size: int, optional
            The batch size to use.
        model_id: optional
            The ID(s) of the model(s) to explain, or None to explain all models in the ensemble.
        mapping: dict, optional
            Mapping specifying which features to combine: target column names are mapped to lists of source
            column names in `x`.
        show_progress: bool, default=False
            Whether to display a progress bar.

        Returns
        -------
        dict
            Dictionary with 1-2 levels of nesting. The keys in the outer dict are model-IDs (possibly including
            `"__ensemble__"`), and the keys in the inner dicts (if any) are arbitrary and usually depend on the
            prediction task and the explanation backend. Ultimately, the values are DataFrames with the same row index
            as `x` and columns corresponding to `feature_names`, containing feature importance scores. Note that the
            result consists entirely of floating point values, even if `x` has categorical or other columns.
        """
        raise NotImplementedError()

    def explain_global(self, x: Optional[pd.DataFrame] = None, y: Optional[pd.DataFrame] = None,
                       sample_weight: Optional[np.ndarray] = None, jobs: int = 1, batch_size: Optional[int] = None,
                       model_id=None, mapping: Optional[Dict[str, List[str]]] = None,
                       show_progress: bool = False) -> dict:
        """
        Explain the ensemble, or some of its constituent models (pipelines), globally.

        Parameters
        ----------
        x: DataFrame, optional
            Samples, optional, a DataFrame with the same columns as the ensemble was trained on. Check property
            `behavior` to see whether this argument is accepted or required (depends on the backend).
        y: DataFrame, optional
            The labels, optional. If given, a DataFrame with the same number of rows and row index as `x` and the same
            target columns as the ensemble was trained on. Check property `behavior` to see whether this argument is
            required (depends on the backend).
        sample_weight: ndarray, optional
            Sample weight. Ignored if `x` is None.
        jobs: int, default=1
            The number of jobs to use.
        batch_size: int, optional
            The batch size to use.
        model_id: optional
            The ID(s) of the model(s) to explain, or None to explain all models in the ensemble.
        mapping: dict, optional
            Mapping specifying which features to combine: target column names are mapped to lists of source column names
            in `x`.
        show_progress: bool, default=False
            Whether to display a progress bar.

        Returns
        -------
        dict
            Dictionary whose keys are model-IDs (possibly including "__ensemble__"), and whose values are Series or
            DataFrames with feature importance scores. In either case, the row index equals `feature_names`, and the
            columns of DataFrames can be arbitrary and usually depend on the prediction task and the explanation
            backend.
        """
        raise NotImplementedError()

    def aggregate_features(self, features: pd.DataFrame, mapping: Dict[str, List[str]]) -> pd.DataFrame:
        """
        Combine features for obtaining aggregated values corresponding to aggregated local explanations returned by
        method `aggregate_explanations()`.

        Parameters
        ----------
        features: DataFrame
            DataFrame to aggregate, from which the corresponding local explanations were calculated.
        mapping: dict
            Mapping specifying which features to combine: target column names are mapped to lists of source column names
            in `features`.

        Returns
        -------
        DataFrame
            DataFrame with aggregated features.
        """
        # This is only a default implementation, which may be overridden by subclasses.
        features = features.copy()
        for target_col, source_cols in mapping.items():
            try:
                # only add columns if mean can be computed
                features[target_col] = features[source_cols].mean(axis=1)
            except Exception:  # fix: bare `except` also swallowed KeyboardInterrupt/SystemExit
                pass
            # NOTE(review): source columns are dropped even if the mean could
            # not be computed — presumably intentional best-effort behavior.
            features.drop(source_cols, axis=1, inplace=True)
        return features

    def get_versions(self) -> dict:
        """
        Get the versions of all key packages and libraries this explanation backend depends upon.

        Returns
        -------
        dict
            Dictionary whose keys are package names and whose values are version strings.
        """
        raise NotImplementedError()
# Auto-discover explanation backends: import every subpackage (i.e. every
# directory containing an __init__.py) located next to this module, so that
# backends can register themselves on import.
for _backend_dir in Path(__file__).parent.iterdir():
    if not _backend_dir.is_dir():
        continue
    if (_backend_dir / '__init__.py').exists():
        importlib.import_module('.' + _backend_dir.stem, package=__package__)