Merge pull request #358 from rasbt/feature-importance
Add a new feature_importance_permutation function
rasbt committed Apr 3, 2018
2 parents 9c8529a + fdbb375 commit d90b513
Showing 12 changed files with 894 additions and 11 deletions.
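
In brief, the new function implements *permutation importance*: the drop in a model's predictive score when a single feature column is randomly shuffled. As a sketch of the quantity computed per permutation round in the code below, with s the scoring function, f̂ the fitted predictor, and X^(j) the dataset with column j permuted:

```latex
\mathrm{importance}_j \,=\, s\bigl(y, \hat{f}(X)\bigr) \;-\; s\bigl(y, \hat{f}(X^{(j)})\bigr)
```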
1 change: 1 addition & 0 deletions docs/mkdocs.yml
@@ -57,6 +57,7 @@ pages:
 - user_guide/evaluate/BootstrapOutOfBag.md
 - user_guide/evaluate/cochrans_q.md
 - user_guide/evaluate/confusion_matrix.md
+- user_guide/evaluate/feature_importance_permutation.md
 - user_guide/evaluate/lift_score.md
 - user_guide/evaluate/mcnemar_table.md
 - user_guide/evaluate/mcnemar_tables.md
9 changes: 4 additions & 5 deletions docs/sources/CHANGELOG.md
@@ -17,11 +17,10 @@ The CHANGELOG for the current development version is available at

 ##### New Features

-
-- The fit method of the ExhaustiveFeatureSelector now optionally accepts
-**fit_params for the estimator that is used for the feature selection. ([#354](https://github.com/rasbt/mlxtend/pull/354) by Zach Griffith)
-- The fit method of the SequentialFeatureSelector now optionally accepts
-**fit_params for the estimator that is used for the feature selection. ([#350](https://github.com/rasbt/mlxtend/pull/350) by Zach Griffith)
+- A new `feature_importance_permutation` function to compute the feature importance in classifiers and regressors via the *permutation importance* method ([#358](https://github.com/rasbt/mlxtend/pull/358))
+- The fit method of the ExhaustiveFeatureSelector now optionally accepts **fit_params for the estimator that is used for the feature selection. ([#354](https://github.com/rasbt/mlxtend/pull/354) by Zach Griffith)
+- The fit method of the SequentialFeatureSelector now optionally accepts
+**fit_params for the estimator that is used for the feature selection. ([#350](https://github.com/rasbt/mlxtend/pull/350) by Zach Griffith)


 - -
1 change: 1 addition & 0 deletions docs/sources/USER_GUIDE_INDEX.md
@@ -29,6 +29,7 @@
 - [BootstrapOutOfBag](user_guide/evaluate/BootstrapOutOfBag.md)
 - [cochrans_q](user_guide/evaluate/cochrans_q.md)
 - [confusion_matrix](user_guide/evaluate/confusion_matrix.md)
+- [feature_importance_permutation](user_guide/evaluate/feature_importance_permutation.md)
 - [lift_score](user_guide/evaluate/lift_score.md)
 - [mcnemar_table](user_guide/evaluate/mcnemar_table.md)
 - [mcnemar_tables](user_guide/evaluate/mcnemar_tables.md)
639 changes: 639 additions & 0 deletions docs/sources/user_guide/evaluate/feature_importance_permutation.ipynb

Large diffs are not rendered by default.

15 changes: 9 additions & 6 deletions mlxtend/evaluate/__init__.py
@@ -4,17 +4,19 @@
 #
 # License: BSD 3 clause

-from .scoring import scoring
-
+from .bootstrap import bootstrap
+from .bootstrap_outofbag import BootstrapOutOfBag
+from .bootstrap_point632 import bootstrap_point632_score
+from .cochrans_q import cochrans_q
 from .confusion_matrix import confusion_matrix
+from .feature_importance import feature_importance_permutation
 from .lift_score import lift_score
 from .mcnemar import mcnemar_table
 from .mcnemar import mcnemar_tables
 from .mcnemar import mcnemar
-from .bootstrap import bootstrap
-from .bootstrap_outofbag import BootstrapOutOfBag
-from .bootstrap_point632 import bootstrap_point632_score
 from .permutation import permutation_test
-from .cochrans_q import cochrans_q
+from .scoring import scoring
 from .ttest import paired_ttest_resampled
 from .ttest import paired_ttest_kfold_cv
 from .ttest import paired_ttest_5x2cv
@@ -26,4 +28,5 @@
         "bootstrap", "permutation_test",
         "BootstrapOutOfBag", "bootstrap_point632_score",
         "cochrans_q", "paired_ttest_resampled",
-        "paired_ttest_kfold_cv", "paired_ttest_5x2cv"]
+        "paired_ttest_kfold_cv", "paired_ttest_5x2cv",
+        "feature_importance_permutation"]
96 changes: 96 additions & 0 deletions mlxtend/evaluate/feature_importance.py
@@ -0,0 +1,96 @@
# Sebastian Raschka 2014-2018
# mlxtend Machine Learning Library Extensions
#
# Feature Importance Estimation Through Permutation
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np


def feature_importance_permutation(X, y, predict_method,
                                   metric, num_rounds=1, seed=None):
    """Feature importance estimation via permutation importance

    Parameters
    ----------
    X : NumPy array, shape = [n_samples, n_features]
        Dataset, where n_samples is the number of samples and
        n_features is the number of features.
    y : NumPy array, shape = [n_samples]
        Target values.
    predict_method : prediction function
        A callable function that predicts the target values
        from X.
    metric : str, callable
        The metric for evaluating the feature importance through
        permutation. By default, the string 'accuracy' is
        recommended for classifiers and the string 'r2' is
        recommended for regressors. Optionally, a custom
        scoring function (e.g., `metric=scoring_func`) can be
        used; it must accept two arguments, y_true and y_pred,
        which have the same shape as the `y` array.
    num_rounds : int (default=1)
        Number of rounds the feature columns are permuted to
        compute the permutation importance.
    seed : int or None (default=None)
        Random seed for permuting the feature columns.

    Returns
    -------
    mean_importance_vals, all_importance_vals : NumPy arrays.
        The first array, mean_importance_vals, has shape [n_features, ] and
        contains the importance values for all features.
        The shape of the second array is [n_features, num_rounds] and
        contains the feature importance for each repetition. If num_rounds=1,
        it contains the same values as the first array, mean_importance_vals.
    """

    if not isinstance(num_rounds, int):
        raise ValueError('num_rounds must be an integer.')
    if num_rounds < 1:
        raise ValueError('num_rounds must be at least 1.')

    if not (metric in ('r2', 'accuracy') or hasattr(metric, '__call__')):
        raise ValueError('metric must be either "r2", "accuracy", '
                         'or a function with signature func(y_true, y_pred).')

    if metric == 'r2':
        def score_func(y_true, y_pred):
            # R^2 = 1 - SS_res / SS_tot
            ss_res = np.sum(np.square(y_true - y_pred))
            ss_tot = np.sum(np.square(y_true - y_true.mean()))
            return 1. - (ss_res / ss_tot)

    elif metric == 'accuracy':
        def score_func(y_true, y_pred):
            return np.mean(y_true == y_pred)

    else:
        # custom metric: a callable with signature func(y_true, y_pred)
        score_func = metric

    rng = np.random.RandomState(seed)

    mean_importance_vals = np.zeros(X.shape[1])
    all_importance_vals = np.zeros((X.shape[1], num_rounds))

    # score on the intact dataset; each permuted score is compared to this
    baseline = score_func(y, predict_method(X))

    for round_idx in range(num_rounds):
        for col_idx in range(X.shape[1]):
            save_col = X[:, col_idx].copy()
            rng.shuffle(X[:, col_idx])  # permute one feature column in place
            new_score = score_func(y, predict_method(X))
            X[:, col_idx] = save_col  # restore the original column
            importance = baseline - new_score
            mean_importance_vals[col_idx] += importance
            all_importance_vals[col_idx, round_idx] = importance
    mean_importance_vals /= num_rounds

    return mean_importance_vals, all_importance_vals
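
For quick reference, a minimal usage sketch of the function added above (the synthetic data, the SVC model, and the `my_accuracy` scorer are illustrative assumptions mirroring the tests below, not part of this commit):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from mlxtend.evaluate import feature_importance_permutation

# Illustrative setup: any fitted model with a .predict method works.
X, y = make_classification(n_samples=500, n_features=4,
                           n_informative=2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)
model = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

# A custom metric is any callable f(y_true, y_pred) -> float;
# this one is equivalent to passing metric='accuracy'.
def my_accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

mean_imp, all_imp = feature_importance_permutation(
    X=X_test, y=y_test,
    predict_method=model.predict,
    metric=my_accuracy,
    num_rounds=10,
    seed=1)

print(mean_imp)        # shape: (n_features,), mean drop in score per feature
print(all_imp.shape)   # (n_features, num_rounds)
```

Note that the feature columns of `X` are shuffled in place and restored after each round, so `X` must be a writable NumPy array.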
144 changes: 144 additions & 0 deletions mlxtend/evaluate/tests/test_feature_importance.py
@@ -0,0 +1,144 @@
# Sebastian Raschka 2014-2018
# mlxtend Machine Learning Library Extensions
#
# Feature Importance Estimation Through Permutation
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import SVR
from mlxtend.utils import assert_raises
from mlxtend.evaluate import feature_importance_permutation


def test_num_rounds_not_int():
    assert_raises(ValueError,
                  'num_rounds must be an integer.',
                  feature_importance_permutation,
                  np.array([[1], [2], [3]]),
                  np.array([1, 2, 3]),
                  lambda x, y: (x, y),  # dummy predict_method; never called
                  'accuracy',
                  1.23)


def test_num_rounds_negative_int():
    assert_raises(ValueError,
                  'num_rounds must be at least 1.',
                  feature_importance_permutation,
                  np.array([[1], [2], [3]]),
                  np.array([1, 2, 3]),
                  lambda x, y: (x, y),  # dummy predict_method; never called
                  'accuracy',
                  -1)


def test_metric_wrong():
    assert_raises(ValueError,
                  ('metric must be either "r2", "accuracy", or a '
                   'function with signature '
                   'func(y_true, y_pred).'),
                  feature_importance_permutation,
                  np.array([[1], [2], [3]]),
                  np.array([1, 2, 3]),
                  lambda x, y: (x, y),  # dummy predict_method; never called
                  'some-metric')


def test_classification():

    X, y = make_classification(n_samples=1000,
                               n_features=6,
                               n_informative=3,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               random_state=0,
                               shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)

    svm = SVC(C=1.0, kernel='rbf', random_state=0)
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='accuracy',
        num_rounds=1,
        seed=1)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert imp_vals[2] > 0.2
    assert sum(imp_vals[3:]) <= 0.02


def test_regression():

    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01


def test_n_rounds():

    X, y = make_classification(n_samples=1000,
                               n_features=6,
                               n_informative=3,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               random_state=0,
                               shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)

    svm = SVC(C=1.0, kernel='rbf', random_state=0)
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='accuracy',
        num_rounds=100,
        seed=1)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 100)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
