Merge pull request #358 from rasbt/feature-importance
Add a new feature_importance_permutation function
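For quick orientation, a minimal usage sketch of the new function follows. The dataset, estimator, and split are illustrative placeholders (they mirror the unit tests further down) and are not part of this PR:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from mlxtend.evaluate import feature_importance_permutation

# Illustrative data and model; any estimator exposing a predict method should work.
X, y = make_classification(n_samples=1000, n_features=6, n_informative=3,
                           n_redundant=0, n_classes=2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

clf = SVC(C=1.0, kernel='rbf', random_state=0).fit(X_train, y_train)

# Permutation importance on the held-out set.
imp_vals, imp_all = feature_importance_permutation(
    predict_method=clf.predict,
    X=X_test,
    y=y_test,
    metric='accuracy',
    num_rounds=10,
    seed=1)

print(imp_vals)        # mean importance per feature, shape (n_features,)
print(imp_all.shape)   # per-round importances, shape (n_features, num_rounds)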
Showing 12 changed files with 894 additions and 11 deletions.
639 changes: 639 additions & 0 deletions
docs/sources/user_guide/evaluate/feature_importance_permutation.ipynb (large diff not rendered)
Binary file added BIN +5.9 KB ...te/feature_importance_permutation_files/feature_importance_permutation_17_0.png
Binary file added BIN +6.63 KB ...te/feature_importance_permutation_files/feature_importance_permutation_23_0.png
Binary file added BIN +8.04 KB ...te/feature_importance_permutation_files/feature_importance_permutation_27_0.png
Binary file added BIN +7.14 KB ...te/feature_importance_permutation_files/feature_importance_permutation_32_0.png
Binary file added BIN +3.26 KB ...te/feature_importance_permutation_files/feature_importance_permutation_35_0.png
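The images above belong to the new user-guide notebook and presumably show bar charts of the permutation importances. A sketch of producing such a plot, assuming matplotlib and using the two arrays returned by the new function (variable names illustrative):

import matplotlib.pyplot as plt
import numpy as np

# imp_vals, imp_all as returned by feature_importance_permutation.
indices = np.argsort(imp_vals)[::-1]   # sort features by mean importance
std = imp_all.std(axis=1)              # spread across permutation rounds

plt.bar(range(len(imp_vals)), imp_vals[indices], yerr=std[indices])
plt.xticks(range(len(imp_vals)), indices)
plt.xlabel('Feature index')
plt.ylabel('Permutation importance')
plt.tight_layout()
plt.show()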
@@ -0,0 +1,96 @@
# Sebastian Raschka 2014-2018
# mlxtend Machine Learning Library Extensions
#
# Feature Importance Estimation Through Permutation
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np


def feature_importance_permutation(X, y, predict_method,
                                   metric, num_rounds=1, seed=None):
    """Feature importance estimation via permutation importance

    Parameters
    ----------
    X : NumPy array, shape = [n_samples, n_features]
        Dataset, where n_samples is the number of samples and
        n_features is the number of features.
    y : NumPy array, shape = [n_samples]
        Target values.
    predict_method : prediction function
        A callable function that predicts the target values
        from X.
    metric : str, callable
        The metric for evaluating the feature importance through
        permutation. By default, the string 'accuracy' is
        recommended for classifiers and the string 'r2' is
        recommended for regressors. Optionally, a custom
        scoring function (e.g., `metric=scoring_func`) can be
        used that accepts two arguments, y_true and y_pred, which
        have a similar shape to the `y` array.
    num_rounds : int (default=1)
        Number of rounds the feature columns are permuted to
        compute the permutation importance.
    seed : int or None (default=None)
        Random seed for permuting the feature columns.

    Returns
    -------
    mean_importance_vals, all_importance_vals : NumPy arrays.
        The first array, mean_importance_vals, has shape [n_features, ] and
        contains the importance values for all features.
        The shape of the second array is [n_features, num_rounds] and contains
        the feature importance for each repetition. If num_rounds=1,
        it contains the same values as the first array, mean_importance_vals.

    """
    if not isinstance(num_rounds, int):
        raise ValueError('num_rounds must be an integer.')
    if num_rounds < 1:
        raise ValueError('num_rounds must be greater than or equal to 1.')

    if not (metric in ('r2', 'accuracy') or hasattr(metric, '__call__')):
        raise ValueError('metric must be either "r2", "accuracy", '
                         'or a function with signature func(y_true, y_pred).')

    if metric == 'r2':
        def score_func(y_true, y_pred):
            # coefficient of determination: 1 - SS_res / SS_tot
            ss_res = np.sum(np.square(y_true - y_pred))
            ss_tot = np.sum(np.square(y_true - y_true.mean()))
            r2_score = 1. - (ss_res / ss_tot)
            return r2_score

    elif metric == 'accuracy':
        def score_func(y_true, y_pred):
            return np.mean(y_true == y_pred)

    else:
        # user-provided callable with signature func(y_true, y_pred)
        score_func = metric

    rng = np.random.RandomState(seed)

    mean_importance_vals = np.zeros(X.shape[1])
    all_importance_vals = np.zeros((X.shape[1], num_rounds))

    baseline = score_func(y, predict_method(X))

    for round_idx in range(num_rounds):
        for col_idx in range(X.shape[1]):
            # permute one feature column, re-score, then restore the column
            save_col = X[:, col_idx].copy()
            rng.shuffle(X[:, col_idx])
            new_score = score_func(y, predict_method(X))
            X[:, col_idx] = save_col
            importance = baseline - new_score
            mean_importance_vals[col_idx] += importance
            all_importance_vals[col_idx, round_idx] = importance
    mean_importance_vals /= num_rounds

    return mean_importance_vals, all_importance_vals
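Since the docstring allows `metric` to be a callable with signature func(y_true, y_pred), here is a hedged sketch of passing a custom score. Because the importance is computed as baseline score minus permuted score, the custom metric should be "higher is better"; the fitted regressor `reg` and the arrays `X_test`, `y_test` are placeholders, not part of the PR:

import numpy as np
from mlxtend.evaluate import feature_importance_permutation

def neg_mean_abs_error(y_true, y_pred):
    # negative MAE so that larger values mean a better model
    return -np.mean(np.abs(y_true - y_pred))

imp_vals, imp_all = feature_importance_permutation(
    predict_method=reg.predict,   # reg: any fitted regressor (placeholder)
    X=X_test,
    y=y_test,
    metric=neg_mean_abs_error,
    num_rounds=5,
    seed=0)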
@@ -0,0 +1,144 @@
# Sebastian Raschka 2014-2018
# mlxtend Machine Learning Library Extensions
#
# Feature Importance Estimation Through Permutation
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import SVR
from mlxtend.utils import assert_raises
from mlxtend.evaluate import feature_importance_permutation


def test_num_rounds_not_int():
    assert_raises(ValueError,
                  'num_rounds must be an integer.',
                  feature_importance_permutation,
                  lambda x, y: (x, y),
                  np.array([[1], [2], [3]]),
                  np.array([1, 2, 3]),
                  'accuracy',
                  1.23)


def test_num_rounds_negative_int():
    assert_raises(ValueError,
                  'num_rounds must be greater than or equal to 1.',
                  feature_importance_permutation,
                  lambda x, y: (x, y),
                  np.array([[1], [2], [3]]),
                  np.array([1, 2, 3]),
                  'accuracy',
                  -1)


def test_metric_wrong():
    assert_raises(ValueError,
                  ('metric must be either "r2", "accuracy", or a '
                   'function with signature '
                   'func(y_true, y_pred).'),
                  feature_importance_permutation,
                  lambda x, y: (x, y),
                  np.array([[1], [2], [3]]),
                  np.array([1, 2, 3]),
                  'some-metric')


def test_classification():

    X, y = make_classification(n_samples=1000,
                               n_features=6,
                               n_informative=3,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               random_state=0,
                               shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)

    svm = SVC(C=1.0, kernel='rbf', random_state=0)
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='accuracy',
        num_rounds=1,
        seed=1)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert imp_vals[2] > 0.2
    assert sum(imp_vals[3:]) <= 0.02


def test_regression():

    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01


def test_n_rounds():

    X, y = make_classification(n_samples=1000,
                               n_features=6,
                               n_informative=3,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               random_state=0,
                               shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)

    svm = SVC(C=1.0, kernel='rbf', random_state=0)
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='accuracy',
        num_rounds=100,
        seed=1)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 100)
    # average the per-round importances across the 100 rounds
    assert imp_all[0].mean() > 0.2
    assert imp_all[1].mean() > 0.2
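One further check that could be added here (variable names as in the tests above): the docstring states that the first return value is the per-feature mean of the per-round importances, so the two arrays should satisfy the relationship below.

# mean_importance_vals should equal the row-wise mean of all_importance_vals
assert np.allclose(imp_vals, imp_all.mean(axis=1))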