scikit-learn · amueller · Jul 17, 2019 · Feb 8, 2019 · Feb 12, 2019 · Feb 12, 2019
diff --git a/examples/inspect/README.txt b/examples/inspect/README.txt
@@ -0,0 +1,7 @@
+.. _model_inspection_examples:
+
+Model Inspection
+----------------
+
+Examples concerning the :mod:`sklearn.inspect` module.
+
diff --git a/examples/inspect/plot_permutation_importance.py b/examples/inspect/plot_permutation_importance.py
@@ -0,0 +1,76 @@
+"""
+==================================================
+Permutation Importance vs Random Forest Importance
+==================================================
+
+The random forest ``feature_importances_`` are computed from training set
+statistics and are subject to bias with the cardinality of the feature. The
+permutation importance of a feature is calculated by measuring how much the
+model performance decreases when the feature is permuted.
+
+In this example, we add a column of random numbers to the diabetes dataset.
+Then we fit a :class:`sklearn.ensemble.RandomForestRegressor` to this modified
+dataset. The feature importance from the random forest is plotted. In this
+case, the ``RAND`` feature is considered more important than the ``age`` or
+``sex`` features.
+
+Next, we use :func:`sklearn.inspect.permutation_importance` to calculate the
+permutation importance for each feature.
+:func:`sklearn.inspect.permutation_importance` returns a numpy array where
+the values in each row are the cross-validated scores for one feature. The
+mean permutation importance for the random forest is plotted. In this case,
+the ``RAND`` feature is less important than ``sex`` and ``age``.
+"""
+print(__doc__)
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import load_diabetes
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.inspect import permutation_importance
+
+
+def plot_importances(importances, features, highlight=None, ax=None):
+    """Draw a horizontal bar chart of importances, sorted in ascending order.
+
+    Parameters
+    ----------
+    importances : ndarray, shape (n_features,)
+        Importance value of each feature.
+
+    features : ndarray of str, shape (n_features,)
+        Feature names, in the same order as ``importances``.
+
+    highlight : str or None, optional (default=None)
+        Name of a feature whose bar is drawn in red instead of blue.
+
+    ax : matplotlib Axes or None, optional (default=None)
+        Axes to draw on. A new figure is created when ``None``.
+    """
+    n_features = features.shape[0]
+
+    if ax is None:
+        _, ax = plt.subplots()
+    y_ticks = range(1, n_features + 1)
+    arg_sorted = np.argsort(importances)
+    labels = features[arg_sorted]
+
+    colors = ["red" if highlight is not None and label == highlight
+              else "blue" for label in labels]
+
+    ax.barh(y_ticks, importances[arg_sorted], color=colors)
+    ax.set_yticks(y_ticks)
+    # A permutation importance is "baseline score - permuted score" and can
+    # therefore be negative; extend the lower limit so such bars stay visible.
+    ax.set_xlim(min(0, np.min(importances)) * 1.05,
+                max(0, np.max(importances)) * 1.05)
+    ax.set_ylim(0, n_features + 1)
+    ax.set_yticklabels(labels)
+
+
+# Load the diabetes data and append one column of Gaussian noise, labelled
+# "RAND", that is generated independently of the target.
+ds = load_diabetes()
+X, y = ds.data, ds.target
+features = np.array(ds.feature_names + ["RAND"])
+rng = np.random.RandomState(42)
+X = np.hstack([X, rng.normal(scale=1, size=(X.shape[0], 1))])
+
+# Fit the forest on the full modified dataset; ``feature_importances_`` is
+# read from this fitted model. The same ``rng`` is reused below, so the order
+# of the statements determines the random draws.
+rf = RandomForestRegressor(n_estimators=50, random_state=rng)
+rf.fit(X, y)
+
+# Left panel: importances reported by the fitted forest.
+fig, (ax1, ax2) = plt.subplots(1, 2)
+plot_importances(rf.feature_importances_, features, highlight="RAND", ax=ax1)
+ax1.set_title("Feature importance from random forest")
+
+# Right panel: permutation importances, averaged over axis 1 of the returned
+# array (one entry per cross-validation fold).
+perm_importances = permutation_importance(rf, X, y, random_state=rng,
+                                          scoring="explained_variance")
+perm_importances_mean = perm_importances.mean(axis=1)
+plot_importances(perm_importances_mean, features, highlight="RAND", ax=ax2)
+ax2.set_title("Permutation importance")
+fig.tight_layout()
+plt.show()
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
@@ -75,6 +75,7 @@
                'naive_bayes', 'neighbors', 'neural_network', 'pipeline',
                'preprocessing', 'random_projection', 'semi_supervised',
                'svm', 'tree', 'discriminant_analysis', 'impute', 'compose',
+               'inspect',
                # Non-modules:
                'clone', 'get_config', 'set_config', 'config_context',
                'show_versions']

diff --git a/sklearn/inspect/__init__.py b/sklearn/inspect/__init__.py
@@ -0,0 +1,3 @@
+from .permutation_importance import permutation_importance
+
+__all__ = ['permutation_importance']
diff --git a/sklearn/inspect/permutation_importance.py b/sklearn/inspect/permutation_importance.py
@@ -0,0 +1,182 @@
+"""Permutation importance for estimators"""
+from contextlib import contextmanager
+
+import numpy as np
+
+from ..base import is_classifier, clone
+from ..utils import check_random_state
+from ..utils._joblib import Parallel, delayed
+from ..model_selection import check_cv
+from ..metrics import check_scoring
+from ..utils.metaestimators import _safe_split
+
+
+@contextmanager
+def _permute_column(X, column, random_state):
+    """Context manager that temporarily permutes one column of ``X`` in place.
+
+    Parameters
+    ----------
+    X : ndarray, shape (n_samples, n_features)
+        Array modified in place; must support ``X[:, column]`` indexing.
+
+    column : int
+        Index of the column to shuffle.
+
+    random_state : RandomState instance
+        Generator providing ``permutation``.
+
+    Yields
+    ------
+    X : ndarray
+        The same object that was passed in, with ``column`` shuffled.
+    """
+    original_feature = X[:, column].copy()
+    X[:, column] = random_state.permutation(original_feature)
+    try:
+        yield X
+    finally:
+        # Restore the column even when the ``with`` body raises, so a failing
+        # scorer cannot leave the caller's array permanently shuffled.
+        X[:, column] = original_feature
+
+
+def _fit_and_calcuate_permutation_importance(estimator, X, y, train_indices,
+                                             test_indices, columns, scoring,
+                                             random_state):
+    """Fit on a train split and score permuted columns on the test split.
+
+    ``estimator`` is fitted on the rows selected by ``train_indices``. The
+    rows selected by ``test_indices`` are scored once unmodified, and then
+    once per entry of ``columns`` with that single column shuffled.
+
+    Parameters
+    ----------
+    estimator : object
+        Estimator exposing ``fit``; it is fitted in place.
+
+    X : array-like, shape = (n_samples, n_features)
+        Data from which both splits are taken.
+
+    y : array-like, shape = (n_samples, ...)
+        Target relative to ``X``.
+
+    train_indices : array of int
+        Indices of the rows used for fitting.
+
+    test_indices : array of int
+        Indices of the rows used for scoring.
+
+    columns : iterable of int
+        Column indices to permute, one at a time.
+
+    scoring : callable
+        Scorer invoked as ``scoring(estimator, X, y)``.
+
+    random_state : RandomState instance
+        Generator used to shuffle each column.
+
+    Returns
+    -------
+    permutation_importance_scores : list
+        One value per entry of ``columns``: the unpermuted test score minus
+        the test score obtained with that column permuted.
+    """
+    X_fit, y_fit = _safe_split(estimator, X, y, train_indices)
+    X_val, y_val = _safe_split(estimator, X, y, test_indices, train_indices)
+
+    estimator.fit(X_fit, y_fit)
+    reference_score = scoring(estimator, X_val, y_val)
+
+    def _score_drop(col):
+        # The column is restored when the ``with`` block is left.
+        with _permute_column(X_val, col, random_state) as X_shuffled:
+            return reference_score - scoring(estimator, X_shuffled, y_val)
+
+    return [_score_drop(col) for col in columns]
+
+
+def permutation_importance(estimator, X, y, columns=None, scoring=None, cv=5,
+                           n_jobs=None, pre_dispatch='2*n_jobs',
+                           random_state=None):
+    """Permutation importance for feature evaluation.
+
+    The permutation importance of a feature is calculated as follows. First,
+    the estimator is trained on a training set. Then a baseline metric, defined
+    by ``scoring``, is evaluated on a validation set. Next, a feature column
+    from the validation set is permuted and the metric is evaluated again.
+    The permutation importance is defined to be the difference between the
+    baseline metric and the metric obtained after permuting the feature
+    column.
+
+    Parameters
+    ----------
+    estimator : object
+        A supervised learning estimator with a `fit` method that is compatible
+        with ``scoring``. It is cloned for every cross-validation fold, so the
+        object passed in is left untouched.
+
+    X : array-like, shape = (n_samples, n_features)
+        Training data.
+
+    y : array-like, shape = (n_samples, ...)
+        Target relative to ``X``.
+
+    columns : list of integers, optional (default=None)
+        A list of columns to calculate the permutation importance. If `None`,
+        all columns will be used.
+
+    scoring : string, callable or None, optional (default=None)
+        A string (see model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+
+    cv : int, cross-validation generator or an iterable, optional (default=5)
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - integer, to specify the number of folds.
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For integer/None inputs, if the estimator is a classifier and ``y``
+        is either binary or multiclass, :class:`StratifiedKFold` is used. In
+        all other cases, :class:`KFold` is used.
+
+        Refer :ref:`User Guide <cross_validation>` for the various
+        cross-validation strategies that can be used here.
+
+    n_jobs : int or None, optional (default=None)
+        Number of CPUs to use during the cross validation.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    pre_dispatch : int, or string, optional (default='2*n_jobs')
+        Controls the number of jobs that get dispatched during parallel
+        execution. Reducing this number can be useful to avoid an
+        explosion of memory consumption when more jobs get dispatched
+        than CPUs can process. This parameter can be:
+
+            - None, in which case all the jobs are immediately
+              created and spawned. Use this for lightweight and
+              fast-running jobs, to avoid delays due to on-demand
+              spawning of the jobs
+
+            - An int, giving the exact number of total jobs that are
+              spawned
+
+            - A string, giving an expression as a function of n_jobs,
+              as in '2*n_jobs'
+
+    random_state : int, RandomState instance or None, optional, default None
+        Controls the shuffling of the feature columns. One independent
+        generator per cross-validation fold is seeded from it. If int,
+        random_state is the seed used by the random number generator; If
+        RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance
+        used by `np.random`.
+
+    Returns
+    -------
+    permutation_importance_scores : array, shape (n_columns, n_cv)
+        Permutation importance scores where the rows are ordered corresponding
+        to the ``columns`` argument.
+    """
+    cv = check_cv(cv, y, classifier=is_classifier(estimator))
+    random_state = check_random_state(random_state)
+    scoring = check_scoring(estimator, scoring=scoring)
+
+    if columns is None:
+        columns = range(X.shape[1])
+
+    # Draw one seed per fold up front. Handing the same RandomState object to
+    # every job would make the permutations depend on whether the joblib
+    # backend shares that object (threads, sequential) or pickles a copy of
+    # it for each task (processes).
+    splits = list(cv.split(X, y))
+    seeds = random_state.randint(np.iinfo(np.int32).max, size=len(splits))
+
+    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch)
+    with parallel:
+        permutation_importance_scores = parallel(
+            delayed(_fit_and_calcuate_permutation_importance)(
+                clone(estimator), X, y, train_indices, test_indices, columns,
+                scoring, np.random.RandomState(seed)
+            ) for (train_indices, test_indices), seed in zip(splits, seeds))
+
+    # Jobs return one list per fold; transpose to (n_columns, n_cv).
+    return np.array(permutation_importance_scores).T
diff --git a/sklearn/inspect/tests/__init__.py b/sklearn/inspect/tests/__init__.py
diff --git a/sklearn/inspect/tests/test_permutation_importance.py b/sklearn/inspect/tests/test_permutation_importance.py
@@ -0,0 +1,35 @@
+import pytest
+
+import numpy as np
+
+from sklearn.datasets import load_boston
+from sklearn.inspect import permutation_importance
+from sklearn.ensemble import RandomForestRegressor
+
+
+@pytest.mark.parametrize("columns", [
+    None, [0, 2, 4, 6, 8, 10, 12, 13], [1, 3, 5, 7, 9, 11, 13]
+])
+@pytest.mark.parametrize("scoring", [
+    None, "neg_mean_absolute_error"
+])
+def test_permutation_importance_correlated_feature_is_important(
+        columns, scoring):
+    # A near-copy of the target is appended as the last column (index 13);
+    # every ``columns`` selection ends with it, so its row must come out with
+    # the largest mean importance.
+    n_folds = 4
+    X, y = load_boston(return_X_y=True)
+    rng = np.random.RandomState(42)
+
+    noisy_target = y + rng.normal(scale=0.001, size=y.shape[0])
+    X = np.column_stack([X, noisy_target])
+
+    forest = RandomForestRegressor(n_estimators=50, random_state=42)
+    scores = permutation_importance(forest, X, y, columns=columns,
+                                    cv=n_folds, random_state=42,
+                                    scoring=scoring)
+
+    n_rows = X.shape[1] if columns is None else len(columns)
+    assert scores.shape == (n_rows, n_folds)
+
+    mean_scores = scores.mean(axis=-1)
+    assert np.all(mean_scores[-1] > mean_scores[:-1])