-
Notifications
You must be signed in to change notification settings - Fork 1
/
base.py
422 lines (350 loc) · 16.8 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
# Copyright (c) 2022. RISC Software GmbH.
# All rights reserved.
import importlib
from pathlib import Path
from typing import Callable, Dict, List, Optional
import numpy as np
import pandas as pd
class TransformationExplainer:
    """
    Base class for explaining data transformations.

    A transformation explainer wraps a fitted transformer and knows how to
    back-propagate feature importance scores from the transformer's output
    space to its input space, via methods `backward()` and `backward_global()`.
    Concrete subclasses are constructed through registered factories (see
    `register_factory()` and `make()`).
    """

    # Registry of (name, factory) pairs. New factories are inserted at the
    # front, so later registrations take precedence in `make()`.
    _factories = []

    @staticmethod
    def register_factory(name: str, func, errors: str = 'raise'):
        """
        Register a factory for constructing transformation explainers.

        Parameters
        ----------
        name: str
            Unique name of the factory.
        func: Callable
            The factory. Must accept `(obj, params=...)` and return either a
            `TransformationExplainer` or `obj` itself if it cannot handle it.
        errors: str, default='raise'
            What to do if a factory with the given name already exists:
            * "raise": raise a ValueError.
            * "update": overwrite the existing entry in place.
            * "replace": remove the existing entry and re-insert at the front.

        Raises
        ------
        ValueError
            If `name` is already registered and `errors` is "raise", or if
            `errors` is not one of the recognized values.
        """
        matches = [j for j, (n, _) in enumerate(TransformationExplainer._factories) if name == n]
        if matches:
            if errors == 'raise':
                raise ValueError(f'Transformation explainer factory with name "{name}" already exists.')
            elif errors == 'update':
                TransformationExplainer._factories[matches[0]] = (name, func)
            elif errors == 'replace':
                del TransformationExplainer._factories[matches[0]]
                TransformationExplainer._factories.insert(0, (name, func))
            else:
                # Fix: an unrecognized `errors` value used to be silently
                # ignored, turning a caller's typo into a no-op.
                raise ValueError(f'`errors` must be one of "raise", "update" or "replace", got "{errors}".')
        else:
            TransformationExplainer._factories.insert(0, (name, func))

    @staticmethod
    def make(obj, params=None) -> Optional['TransformationExplainer']:
        """
        Convert an object into a transformation explainer.

        If `obj` already provides the full explainer interface it is returned
        unchanged; otherwise each registered factory is tried in order.

        Parameters
        ----------
        obj:
            The object to convert, typically a fitted transformer.
        params: optional
            Params obtained from a previous explainer instance, passed through
            to the factories.

        Returns
        -------
        TransformationExplainer
            An explainer for `obj` (possibly `obj` itself).

        Raises
        ------
        RuntimeError
            If no registered factory can handle `obj`.
        """
        if all(hasattr(obj, attr)
               for attr in ('fit', 'transform', 'fit_forward', 'forward', 'backward', 'backward_global')):
            # `obj` duck-types the explainer interface already.
            return obj
        for _, func in TransformationExplainer._factories:
            out = func(obj, params=params)
            if out is not obj:
                return out
        raise RuntimeError(f'Object of type {type(obj)} cannot be converted into a transformation explainer.')

    def __init__(self, transformer=None, params=None):
        if params is not None:
            # `params` must originate from an explainer of the same class.
            assert params.get('class_name', self.__class__.__name__) == self.__class__.__name__
        self._transformer = transformer

    @property
    def transformer(self):
        # The wrapped (fitted) transformer, or None.
        return self._transformer

    @property
    def params_(self) -> dict:
        """
        Get all params obtained from fitting the explainer to data in method `fit_forward()`, and which can be passed
        to `__init__()`.

        Returns
        -------
        dict
            Dictionary of parameters.
        """
        return dict(class_name=self.__class__.__name__)

    def fit(self, x, y=None):
        # only to implement the standard sklearn API, which makes it possible to combine individual explainers in
        # pipelines and similar compound transformations
        raise RuntimeError(f'Method fit() of class {self.__class__.__name__} cannot be called.')

    def transform(self, x):
        # Delegate to the wrapped transformer without recording anything.
        return self._transformer.transform(x)

    def fit_forward(self, x, y):
        """
        Fit this explainer to training data, and transform the data by applying the underlying transformation.
        `forward()` is implicitly called on `x` as well, meaning that invoking `backward()` immediately afterwards is
        possible and refers to the given samples `x`.

        Parameters
        ----------
        x:
            Features, array-like of shape `(n_samples, n_features_in)`.
        y:
            Labels, array-like of shape `(n_samples, n_labels)` or `(n_samples,)`.

        Returns
        -------
        The transformed features, array-like of shape `(n_samples, n_features_out)`.
        """
        raise NotImplementedError()

    def forward(self, x):
        """
        Transform `x` by applying the underlying transformation, and record all intermediate values needed for
        back-propagating explanations generated by downstream explanation methods.

        Parameters
        ----------
        x:
            Data to transform (and later explain), array-like of shape `(n_samples, n_features_in)`. `n_features_in`
            must be the same as in the data this explainer instance was fitted on.

        Returns
        -------
        Transformed data, array-like of shape `(n_samples, n_features_out)`.
        """
        raise NotImplementedError()

    def backward(self, s: np.ndarray) -> np.ndarray:
        """
        Back-propagate local explanations from output to input.

        Parameters
        ----------
        s: ndarray
            Explanations (feature importance scores) generated downstream for the last `x` method `forward()` was
            applied to. Array of shape `(*dims, n_samples, n_features_out)`, where `n_samples` must be as in the last
            invocation of `forward()`.

        Returns
        -------
        ndarray
            Explanations, array of shape `(*dims, n_samples, n_features_in)`.

        Notes
        -----
        In contrast to method `forward()`, this method expects plain Numpy arrays as input and returns plain
        Numpy arrays.
        """
        raise NotImplementedError()

    def backward_global(self, s: np.ndarray) -> np.ndarray:
        """
        Back-propagate global explanations from output to input.

        Parameters
        ----------
        s: ndarray
            Global explanations (feature importance scores) generated by downstream explanation methods. Array of shape
            `(*dims, n_features_out)`.

        Returns
        -------
        ndarray
            Explanations, array of shape `(*dims, n_features_in)`.

        Notes
        -----
        In contrast to method `forward()`, this method expects plain Numpy arrays as input and returns plain
        Numpy arrays.
        """
        raise NotImplementedError()
class IdentityTransformationExplainer(TransformationExplainer):
    """
    Explainer for transformations that leave feature semantics unchanged.

    Explanations are passed through untouched in both directions. If the
    wrapped transformer exposes a `transform()` method, data is still run
    through it; otherwise the data is returned as-is.
    """

    def __init__(self, transformer=None, params=None):
        super().__init__(transformer=transformer, params=params)
        # Cache the wrapped transformer's `transform` method, if it has one.
        self._wrapped_transform = getattr(self._transformer, 'transform', None)

    def transform(self, x):
        if self._wrapped_transform is None:
            return x
        return self._wrapped_transform(x)

    def fit_forward(self, x, y):
        # Nothing to fit; behaves exactly like `forward()`.
        return self.forward(x)

    def forward(self, x):
        # No intermediate values need recording for an identity mapping.
        return self.transform(x)

    def backward(self, s):
        # Identity: importance scores map 1:1 from output to input features.
        return s

    def backward_global(self, s):
        return s
class EnsembleExplainer:
    """
    Class for explaining a given ensemble, or constituents of it.

    Parameters
    ----------
    ensemble: FittedEnsemble
        The ensemble to explain, an instance of FittedEnsemble.
    config: dict, optional
        Config dictionary.
    feature_names: list, optional
        List of feature names. `None` defaults to `range(n_features)`, where `n_features` is determined from
        `x`.
    target_names: list, optional
        List of target names, optional. In case of regression this is the list of target variables, in case of binary
        classification this is the singleton list with the sole target variable, and in multiclass- and multilabel
        classification this is the list of classes. None defaults to `range(n_targets)`, where `n_targets` is determined
        from `y`.
    x: DataFrame, optional
        Training data, which is required by some explanation methods (e.g., SHAP).
    y: DataFrame, optional
        Labels of `x`.
    params: optional
        Params obtained from a previous instantiation of an ensemble explainer of this type on `ensemble`. If given,
        neither `feature_names`, `target_names`, `x` nor `y` may be provided.

    Examples
    --------
    >>> # Paradigm for explaining a pipeline `model` of a FittedEnsemble:
    >>>
    >>> # Setup:
    >>> preprocessing_explainer = TransformationExplainer.make(model.preprocessing)
    >>> x_train = preprocessing_explainer.fit_forward(x_train, y_train)
    >>>
    >>> # Local explanations for `x_test`:
    >>> x_test_pp = preprocessing_explainer.forward(x_test)
    >>> explanation = func(x_test_pp)
    >>> explanation = preprocessing_explainer.backward(explanation)
    >>>
    >>> # Global explanations:
    >>> explanation = func_global()
    >>> explanation = preprocessing_explainer.backward_global(explanation)

    >>> # Paradigm for explaining data `(x, y)` after applying some preprocessing steps `preprocessing`:
    >>>
    >>> preprocessing_explainer = TransformationExplainer.make(preprocessing)
    >>> x_pp = preprocessing_explainer.fit_forward(x, y)
    >>> explanation = func(x_pp, y)
    >>> explanation = preprocessing_explainer.backward(explanation) # or `backward_global(explanation)`
    """

    # Registry of named explainer factories (name -> factory callable).
    __registered = {}

    @staticmethod
    def register(name: str, factory: Callable[..., 'EnsembleExplainer']):
        """
        Register a new ensemble explainer factory.

        Parameters
        ----------
        name: str
            The name of the ensemble explainer.
        factory: Callable
            The factory, a function mapping argument-dicts to instances of class `EnsembleExplainer` (or subclasses
            thereof).
        """
        EnsembleExplainer.__registered[name] = factory

    @staticmethod
    def get(name: str, **kwargs) -> Optional['EnsembleExplainer']:
        """
        Instantiate the explainer registered under `name` with `kwargs`,
        or return None if no such explainer is registered.
        """
        factory = EnsembleExplainer.__registered.get(name)
        return factory if factory is None else factory(**kwargs)

    @staticmethod
    def list_explainers() -> List[str]:
        """Return the names of all registered ensemble explainers."""
        return list(EnsembleExplainer.__registered.keys())

    def __init__(self, ensemble: 'FittedEnsemble' = None, config: Optional[dict] = None,  # noqa F821
                 feature_names: Optional[list] = None, target_names: Optional[list] = None,
                 x: Optional[pd.DataFrame] = None, y: Optional[pd.DataFrame] = None, params=None):
        if not (params is None or (feature_names is None and target_names is None and x is None and y is None)):
            raise ValueError('If params is given, feature_names, target_names, x and y must be None.')
        self.config: dict = config or {}

    @property
    def name(self) -> str:
        """Name under which this explainer (backend) is registered."""
        raise NotImplementedError()

    @property
    def behavior(self) -> dict:
        """
        Description of the behavior of methods `explain()` and `explain_global()`, especially w.r.t. parameters `x`
        and `y`.

        Returns
        -------
        dict
            Dictionary with keys

            * ``"supports_local"``: True if the backend supports local explanations, i.e., method `explain()` can be
              called. If False, calling `explain()` raises an exception.
            * ``"requires_y"``: True if `y` must be passed to `explain()` and `explain_global()`.
            * ``"global_accepts_x"``: True if `x` can be passed to method `explain_global()`.
            * ``"global_requires_x"``: True if `x` must be passed to method `explain_global()`. If False but
              ``"global_accepts_x"`` is True, the global behavior differs depending on whether `x` is provided.
              ``"global_requires_x"`` can only be True if "global_accepts_x" is True as well.
            * ``"global_is_mean_of_local"``: True if global explanations are the mean of the individual local
              explanations, if `x` is provided. If True, it might be better to call method `explain()` instead of
              `explain_global()`, since the computational effort is identical. Can only be True if "supports_local" is
              True as well.
        """
        raise NotImplementedError()

    @property
    def params_(self) -> dict:
        """
        Get all params necessary for instantiating this EnsembleExplainer via parameter `params`.
        """
        raise NotImplementedError()

    def explain(self, x: pd.DataFrame, y: Optional[pd.DataFrame] = None, jobs: int = 1,
                batch_size: Optional[int] = None, model_id=None, mapping: Optional[Dict[str, List[str]]] = None,
                show_progress: bool = False) -> dict:
        """
        Explain the ensemble, or some of its constituent models (pipelines), on a set of samples.

        Parameters
        ----------
        x: DataFrame
            The samples, a DataFrame with the same feature columns as the ensemble was trained on.
        y: DataFrame, optional
            The labels. If given, a DataFrame with the same number of rows and row index as `x` and the same
            target columns as the ensemble was trained on. Check property `behavior` to see whether this argument is
            required (depends on the backend).
        jobs: int, default=1
            The number of jobs to use.
        batch_size: int, optional
            The batch size to use.
        model_id: optional
            The ID(s) of the model(s) to explain, or None to explain all models in the ensemble.
        mapping: dict, optional
            Mapping specifying which features to combine: target column names are mapped to lists of source
            column names in `x`.
        show_progress: bool, default=False
            Whether to display a progress bar.

        Returns
        -------
        dict
            Dictionary with 1-2 levels of nesting. The keys in the outer dict are model-IDs (possibly including
            `"__ensemble__"`), and the keys in the inner dicts (if any) are arbitrary and usually depend on the
            prediction task and the explanation backend. Ultimately, the values are DataFrames with the same row index
            as `x` and columns corresponding to `feature_names`, containing feature importance scores. Note that the
            result consists entirely of floating point values, even if `x` has categorical or other columns.
        """
        raise NotImplementedError()

    def explain_global(self, x: Optional[pd.DataFrame] = None, y: Optional[pd.DataFrame] = None,
                       sample_weight: Optional[np.ndarray] = None, jobs: int = 1, batch_size: Optional[int] = None,
                       model_id=None, mapping: Optional[Dict[str, List[str]]] = None,
                       show_progress: bool = False) -> dict:
        """
        Explain the ensemble, or some of its constituent models (pipelines), globally.

        Parameters
        ----------
        x: DataFrame, optional
            Samples, optional, a DataFrame with the same columns as the ensemble was trained on. Check property
            `behavior` to see whether this argument is accepted or required (depends on the backend).
        y: DataFrame, optional
            The labels, optional. If given, a DataFrame with the same number of rows and row index as `x` and the same
            target columns as the ensemble was trained on. Check property `behavior` to see whether this argument is
            required (depends on the backend).
        sample_weight: ndarray, optional
            Sample weight. Ignored if `x` is None.
        jobs: int, default=1
            The number of jobs to use.
        batch_size: int, optional
            The batch size to use.
        model_id: optional
            The ID(s) of the model(s) to explain, or None to explain all models in the ensemble.
        mapping: dict, optional
            Mapping specifying which features to combine: target column names are mapped to lists of source column names
            in `x`.
        show_progress: bool, default=False
            Whether to display a progress bar.

        Returns
        -------
        dict
            Dictionary whose keys are model-IDs (possibly including "__ensemble__"), and whose values are Series or
            DataFrames with feature importance scores. In either case, the row index equals `feature_names`, and the
            columns of DataFrames can be arbitrary and usually depend on the prediction task and the explanation
            backend.
        """
        raise NotImplementedError()

    def aggregate_features(self, features: pd.DataFrame, mapping: Dict[str, List[str]]) -> pd.DataFrame:
        """
        Combine features for obtaining aggregated values corresponding to aggregated local explanations returned by
        method `aggregate_explanations()`.

        Parameters
        ----------
        features: DataFrame
            DataFrame to aggregate, from which the corresponding local explanations were calculated.
        mapping: dict
            Mapping specifying which features to combine: target column names are mapped to lists of source column names
            in `features`.

        Returns
        -------
        DataFrame
            DataFrame with aggregated features.
        """
        # This is only a default implementation, which may be overridden by subclasses.
        features = features.copy()
        for target_col, source_cols in mapping.items():
            try:
                # only add columns if mean can be computed
                features[target_col] = features[source_cols].mean(axis=1)
            except Exception:  # fix: bare `except` also swallowed KeyboardInterrupt/SystemExit
                pass
            # NOTE(review): source columns are dropped even if the mean could
            # not be computed — presumably intentional best-effort behavior.
            features.drop(source_cols, axis=1, inplace=True)
        return features

    def get_versions(self) -> dict:
        """
        Get the versions of all key packages and libraries this explanation backend depends upon.

        Returns
        -------
        dict
            Dictionary whose keys are package names and whose values are version strings.
        """
        raise NotImplementedError()
# Auto-discover explanation backends: import every subpackage (i.e. every
# directory containing an __init__.py) located next to this module, so that
# backends can register themselves on import.
for _backend_dir in Path(__file__).parent.iterdir():
    if not _backend_dir.is_dir():
        continue
    if (_backend_dir / '__init__.py').exists():
        importlib.import_module('.' + _backend_dir.stem, package=__package__)