From 2d8d103c7faf4b870e5dc68de11a844cf72c3a38 Mon Sep 17 00:00:00 2001
From: Scott Lundberg
Date: Tue, 29 May 2018 09:58:47 -0700
Subject: [PATCH] Support multi-class LightGBM output and the LightGBM sklearn
 interface

Fixes #103
---
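A short usage sketch of what this patch enables. It mirrors the new
tests below; the dataset and default model settings are illustrative
only:

    import lightgbm
    import shap

    X, Y = shap.datasets.iris()
    model = lightgbm.sklearn.LGBMClassifier()
    model.fit(X, Y)

    # TreeExplainer now accepts the sklearn wrapper directly; for a
    # multi-class model it returns one SHAP value matrix per class
    shap_values = shap.TreeExplainer(model).shap_values(X)
    shap.dependence_plot(0, shap_values[0], X, show=False)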
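On the tree.py change: for a k-class model, LightGBM's pred_contrib
output comes back flattened to (# samples, k * (# features + 1)). A
minimal numpy sketch of the reshape, with toy sizes standing in for
real data (the names here are illustrative, not part of the LightGBM
API):

    import numpy as np

    n, m, k = 4, 3, 2  # toy sample, feature, and class counts
    # stand-in for the flat pred_contrib output
    phi = np.arange(n * k * (m + 1), dtype=float).reshape(n, k * (m + 1))
    if phi.shape[1] != m + 1:
        # unflatten into one (# features + 1)-wide block per class
        phi = phi.reshape(n, phi.shape[1] // (m + 1), m + 1)
    per_class = [phi[:, i, :] for i in range(phi.shape[1])]  # k matrices
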
 setup.py                 |  4 +-
 shap/explainers/mimic.py | 94 ++++++++++++++++++++++++++++++++++++++++++++
 shap/explainers/tree.py  |  9 +++++
 tests/test_basic.py      | 27 ++++++++++++
 4 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 shap/explainers/mimic.py

diff --git a/setup.py b/setup.py
index ba6702458..cb97101ab 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ def run_setup(with_binary):
     setup(
         name='shap',
-        version='0.16.2',
+        version='0.17.0',
         description='A unified approach to explain the output of any machine learning model.',
         url='http://github.com/slundberg/shap',
         author='Scott Lundberg',
@@ -23,7 +23,7 @@ def run_setup(with_binary):
         packages=['shap', 'shap.explainers'],
         install_requires=['numpy', 'scipy', 'iml>=0.6.0', 'scikit-learn', 'matplotlib', 'pandas', 'tqdm'],
         test_suite='nose.collector',
-        tests_require=['nose', 'xgboost'],
+        tests_require=['nose', 'xgboost', 'lightgbm'],
         ext_modules = ext_modules,
         zip_safe=False
     )
diff --git a/shap/explainers/mimic.py b/shap/explainers/mimic.py
new file mode 100644
index 000000000..96fb1c15e
--- /dev/null
+++ b/shap/explainers/mimic.py
@@ -0,0 +1,94 @@
+import numpy as np
+import logging
+
+from iml.common import convert_to_model, match_model_to_data
+from iml.links import convert_to_link, IdentityLink
+from iml.datatypes import convert_to_data, DenseData
+
+try:
+    import xgboost
+except ImportError:
+    pass
+except:
+    print("xgboost is installed...but failed to load!")
+    pass
+
+log = logging.getLogger('shap')
+
+
+class MimicExplainer:
+    """Fits a mimic model to the original model and then explains predictions using the mimic model.
+
+    Tree SHAP allows for very fast SHAP value explanations of flexible gradient boosted decision
+    tree (GBDT) models. Since GBDT models are so flexible, we can train them to mimic any black-box
+    model and then explain the mimic model with Tree SHAP. This won't work well for images, but for
+    any type of problem that GBDTs do reasonably well on, they should also be able to learn how to
+    explain black-box models on the data. This mimic explainer also allows you to use a linear
+    model, but keep in mind that a linear model will not do as well at explaining typical
+    non-linear black-box models. In the future we could include other mimic model types given
+    enough demand/help. Finally, we would like to note that this explainer is vaguely inspired by
+    https://arxiv.org/abs/1802.07814, where they learn an explainer that can be applied to any input.
+    """
+
+    def __init__(self, model, data, mimic_model="xgboost", mimic_model_params=None,
+                 link=IdentityLink(), **kwargs):
+        self.mimic_model_type = mimic_model
+        # avoid sharing a mutable default dict between instances
+        self.mimic_model_params = mimic_model_params if mimic_model_params is not None else {}
+
+        # convert incoming inputs to standardized iml objects
+        self.link = convert_to_link(link)
+        self.model = convert_to_model(model)
+        self.keep_index = kwargs.get("keep_index", False)
+        self.data = convert_to_data(data, keep_index=self.keep_index)
+        match_model_to_data(self.model, self.data)
+
+        # the original model's output on the background data is what the mimic model learns
+        self.model_out = self.model.f(self.data.data)
+
+        # enforce our current input type limitations
+        assert isinstance(self.data, DenseData), "Shap explainer only supports the DenseData input currently."
+        assert not self.data.transposed, "Shap explainer does not support transposed DenseData currently."
+
+        # warn users about small background data sets
+        if len(self.data.weights) < 100:
+            log.warning("Using only " + str(len(self.data.weights)) + " training data samples could cause " +
+                        "the mimic model to fit the real model poorly. Consider using more training samples " +
+                        "or, if you don't have more samples, using shap.inflate(data, N) to generate more.")
+
+        self._train_mimic_model()
+
+    def _train_mimic_model(self):
+        if self.mimic_model_type == "xgboost":
+            # train the mimic model to reproduce the original model's output
+            self.mimic_model = xgboost.train(self.mimic_model_params,
+                                             xgboost.DMatrix(self.data.data, label=self.model_out))
+
+    def shap_values(self, X, **kwargs):
+        """ Estimate the SHAP values for a set of samples.
+
+        Parameters
+        ----------
+        X : numpy.array or pandas.DataFrame
+            A matrix of samples (# samples x # features) on which to explain the model's output.
+
+        Returns
+        -------
+        For models with a single output this returns a matrix of SHAP values
+        (# samples x # features + 1). The last column is the base value of the model, which is
+        the expected value of the model applied to the background dataset. This causes each row
+        to sum to the model output for that sample. For models with vector outputs this returns
+        a list of such matrices, one for each output.
+        """
+
+        phi = None
+        if self.mimic_model_type == "xgboost":
+            if not str(type(X)).endswith("xgboost.core.DMatrix'>"):
+                X = xgboost.DMatrix(X)
+            phi = self.mimic_model.predict(X, pred_contribs=True)
+
+        if phi is not None:
+            if len(phi.shape) == 3:
+                return [phi[:, i, :] for i in range(phi.shape[1])]
+            else:
+                return phi
diff --git a/shap/explainers/tree.py b/shap/explainers/tree.py
index 241f7992f..1cd35595c 100644
--- a/shap/explainers/tree.py
+++ b/shap/explainers/tree.py
@@ -66,6 +66,12 @@ def __init__(self, model, **kwargs):
         elif str(type(model)).endswith("lightgbm.basic.Booster'>"):
             self.model_type = "lightgbm"
             self.trees = model
+        elif str(type(model)).endswith("lightgbm.sklearn.LGBMRegressor'>"):
+            self.model_type = "lightgbm"
+            self.trees = model.booster_
+        elif str(type(model)).endswith("lightgbm.sklearn.LGBMClassifier'>"):
+            self.model_type = "lightgbm"
+            self.trees = model.booster_
         elif str(type(model)).endswith("catboost.core.CatBoostRegressor'>"):
             self.model_type = "catboost"
             self.trees = model
@@ -100,6 +106,9 @@ def shap_values(self, X, **kwargs):
             phi = self.trees.predict(X, pred_contribs=True)
         elif self.model_type == "lightgbm":
             phi = self.trees.predict(X, pred_contrib=True)
+            # multi-class contributions come back flattened, so unflatten them to (# samples, # classes, # features + 1)
+            if phi.shape[1] != X.shape[1] + 1:
+                phi = phi.reshape(X.shape[0], phi.shape[1]//(X.shape[1]+1), X.shape[1]+1)
         elif self.model_type == "catboost": # thanks to the CatBoost team for implementing this...
             phi = self.trees.get_feature_importance(data=catboost.Pool(X), fstr_type='ShapValues')
diff --git a/tests/test_basic.py b/tests/test_basic.py
index f8d09ad91..d33ce353b 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -118,3 +118,30 @@ def test_mixed_types():
     bst = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 1000)
     shap_values = shap.TreeExplainer(bst).shap_values(X)
     shap.dependence_plot(0, shap_values, X, show=False)
+
+def test_lightgbm():
+    import lightgbm
+    import shap
+
+    # train a LightGBM regression model
+    X, y = shap.datasets.boston()
+    model = lightgbm.sklearn.LGBMRegressor()
+    model.fit(X, y)
+
+    # explain the model's predictions using SHAP values
+    shap_values = shap.TreeExplainer(model).shap_values(X)
+
+def test_lightgbm_multiclass():
+    import lightgbm
+    import shap
+
+    # train a multi-class LightGBM model
+    X, Y = shap.datasets.iris()
+    model = lightgbm.sklearn.LGBMClassifier()
+    model.fit(X, Y)
+
+    # explain the model's predictions using SHAP values
+    shap_values = shap.TreeExplainer(model).shap_values(X)
+
+    # ensure the dependence plot works for the first class
+    shap.dependence_plot(0, shap_values[0], X, show=False)