# EBM 알고리즘 구현
- Paper link: https://arxiv.org/abs/1909.09223
- Code link1: https://github.com/interpretml/interpret/tree/develop/examples/python/notebooks
- Code link2: https://medium.com/analytics-vidhya/model-interpretation-with-microsofts-interpret-ml-85aa0ad697ae
- Data link: https://archive.ics.uci.edu/ml/datasets/bank+marketing

In [7]:
# pip install interpret

## EBM을 통한 Regression 구현
- 보스턴 데이터 가격 예측

In [1]:
## 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
## Load the Boston housing data and split it into train / test sets
# NOTE(review): `load_boston` was removed in scikit-learn 1.2; this cell
# presumably targets an older scikit-learn -- confirm the pinned version.
boston = load_boston()
feature_names = list(boston.feature_names)

# One frame holding every feature, with the regression target appended as
# the last column.
df = pd.DataFrame(boston.data, columns=feature_names)
df['target'] = boston.target

# Every column except the last is a feature; the last column is the label.
label = df.columns[-1]
train_cols = df.columns[:-1]
X, y = df[train_cols], df[label]

# Fixed seed so the 80 / 20 split is reproducible.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

In [3]:
# Summary statistics of the target variable (training split only)
y_train.describe()

count    404.000000
mean      22.522277
std        8.998991
min        5.000000
25%       17.175000
50%       21.050000
75%       25.225000
max       50.000000
Name: target, dtype: float64

In [4]:
"""Marginal(): marginal plots between X and y before any model is fitted"""
# The Pearson correlation coefficient can be checked here as well
from interpret import show
from interpret.data import Marginal

# Build the data explanation from the training split and render it.
marginal = Marginal().explain_data(X_train, y_train, name = 'Train Data')
show(marginal)

In [7]:
"""Global Explanation"""
## A local explanation shows each feature's contribution for one individual instance,
## whereas a global explanation provides explainability from the overall view the model gained by learning from all instances.
## First, create an EBM object and fit it to the train data.
from interpret.glassbox import ExplainableBoostingRegressor, LinearRegression, RegressionTree

ebm = ExplainableBoostingRegressor(random_state = 42, n_jobs=1)
ebm.fit(X_train, y_train)

## Global Explanation
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

- 전체 Summary는 LSTAT 변수가 가장 중요한 변수라고 말하고 있다.
- `드롭다운을 클릭해보면 변수마다 전체 샘플에 대해 갖는 score의 plot을 볼 수 있다.` LSTAT 변수는 값이 증가함에 따라 score가 감소하는 경향을 보인다.
- LSTAT 변수의 오른쪽 끝을 보면 upper bound와 lower bound 사이의 간격이 매우 넓은 것을 알 수 있다. 이는 해당 구간 score의 분산이 매우 큰 것을 의미한다.
- 아래의 히스토그램은 해당 구간에 존재하는 데이터 샘플의 수(density)를 나타낸다.

In [8]:
# Local Explanations: How an individual prediction was made
# Explain the first five rows of the test split, passing their targets alongside.

ebm_local = ebm.explain_local(X_test[:5], y_test[:5], name='EBM')
show(ebm_local)

- 이는 각각의 feature가 모델의 예측 결과에 미친 영향을 나타낸다.
- 파란 막대는 음의 기여도를, 주황 막대는 양의 기여도를 나타낸다.
- 이 모든 기여도가 더해져 최종 예측 값이 계산된다.

----

## EBM을 통한 Classification 구현

In [52]:
"""
Marketing campaign data from a Portuguese banking institution.
target (y): whether the client subscribes to the product (a bank term
deposit) as part of the campaign.
"""
# ColumnTransformer and OneHotEncoder are used below; import them here so
# this cell does not rely on an import cell that only appears later in the
# notebook (running top-to-bottom would otherwise raise NameError).
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('bank.csv')

# Encode the "no"/"yes" label as 0/1 and separate it from the features.
y = df["y"].map({"no": 0, "yes": 1})
X = df.drop("y", axis=1)

num_features = ["age", "campaign", "pdays", "previous"]

cat_features = ["job", "marital", "education","default", "housing", "loan", "contact", "month", "day", "poutcome"]

# Numerical columns are passed through unchanged; categorical columns are
# one-hot encoded. No `remainder` is given, so columns listed in neither
# group are not forwarded by the transformer.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; the
# keyword below matches older releases -- confirm the pinned version.
preprocessor = ColumnTransformer([("numerical", "passthrough", num_features), 
                                ("categorical", OneHotEncoder(sparse=False, handle_unknown="ignore"),cat_features)])

# Stratified 70 / 30 split so both splits keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.3, random_state=42)


In [53]:
# Fit the column transformer on the training split only.
preprocessor.fit(X_train)

# Transform the training split; as the last expression of the cell, the
# resulting array is what gets displayed.
preprocessor.transform(X_train)

array([[37.,  1., -1., ...,  0.,  0.,  1.],
       [25.,  1., -1., ...,  0.,  0.,  1.],
       [53.,  3., -1., ...,  0.,  0.,  1.],
       ...,
       [35.,  2., -1., ...,  0.,  0.,  1.],
       [37.,  4., -1., ...,  0.,  0.,  1.],
       [63.,  1., -1., ...,  0.,  0.,  1.]])

In [54]:
# Categories found by the fitted one-hot encoder: one sequence of values
# per categorical column.
ohe_categories = preprocessor.named_transformers_["categorical"].categories_

# Readable "<column>__<category>" names, column by column.
# NOTE(review): assumes `ohe_categories` follows the order of
# `cat_features` -- confirm against the transformer definition.
new_ohe_features = []
for col, vals in zip(cat_features, ohe_categories):
    new_ohe_features.extend(f"{col}__{val}" for val in vals)

# Full list of feature names: numerical names first, then the one-hot names.
all_features = num_features + new_ohe_features

In [55]:
# Replace both splits with DataFrames of the transformed values, labelled
# with the readable feature names.
# NOTE(review): no `index=` is passed, so the new frames get a fresh default
# index while y_train / y_test keep the index they had after the split --
# confirm nothing downstream aligns features and labels on index.
X_train = pd.DataFrame(preprocessor.transform(X_train), columns=all_features)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=all_features)

In [56]:
"""ClassHistogram(): performs EDA on the dataset.
- The data must not contain missing values."""

from interpret import show
from interpret.data import ClassHistogram

# Build the data explanation from the training split and render it.
hist = ClassHistogram().explain_data(X_train, y_train, name = 'Train Data')
show(hist)

- 고객이 제품을 구독했는지(1) 구독하지 않았는지(0)에 대해서 Plotly Histogram이 있는 대시보드 생성 

In [57]:
from interpret.glassbox import ExplainableBoostingClassifier

# Fit an EBM classifier on the encoded training frame.
# Note: this rebinds `ebm`, which held the regressor in the regression section.
ebm = ExplainableBoostingClassifier(random_state=42, n_jobs=1)
ebm.fit(X_train, y_train)

ExplainableBoostingClassifier(feature_names=['age', 'campaign', 'pdays',
                                             'previous', 'job__admin.',
                                             'job__blue-collar',
                                             'job__entrepreneur',
                                             'job__housemaid',
                                             'job__management', 'job__retired',
                                             'job__self-employed',
                                             'job__services', 'job__student',
                                             'job__technician',
                                             'job__unemployed', 'job__unknown',
                                             'marital__divorced',
                                             'marital__married',
                                             'marital__single',
                                             'educa...
                                          

In [58]:
# Global explanation of the fitted EBM classifier, rendered as a dashboard.
ebm_global = ebm.explain_global(name = 'EBM')
show(ebm_global)

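- duration과 contact가 중요한 변수임을 알 수 있다. (참고: 위 전처리 코드의 `num_features`/`cat_features`에는 duration이 포함되어 있지 않으므로, 이 서술이 현재 학습된 모델의 입력 변수와 일치하는지 확인이 필요하다.)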
- age 변수의 시각화 결과는 나이가 30대 이하이거나 60대 이상일수록 캠페인이 효과적일 수 있음을 보여준다.

In [59]:
# Local explanations for the first four rows of the test split.
ebm_local = ebm.explain_local(X_test[:4], y_test[:4], name='EBM') 
show(ebm_local)

- PrScore는 예측 score를 의미한다. (이 경우에 0을 예측할 확률이 0.964이고, 실제값은 0이다.)

In [60]:
"""Check the ROC curve"""
from interpret.perf import ROC
# ROC is given the classifier's predict_proba and evaluated on the test split.
ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name = 'EBM')
show(ebm_perf)

## 타 모델들과 비교

In [61]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, classification_report
from lightgbm import LGBMClassifier
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression, ClassificationTree, DecisionListClassifier

In [62]:
# Preview the first rows of the encoded training frame.
X_train.head()

Unnamed: 0,age,campaign,pdays,previous,job__admin.,job__blue-collar,job__entrepreneur,job__housemaid,job__management,job__retired,...,day__26,day__27,day__28,day__29,day__30,day__31,poutcome__failure,poutcome__other,poutcome__success,poutcome__unknown
0,37.0,1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,25.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,53.0,3.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,31.0,2.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,43.0,2.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [63]:
"""Comparison with other models"""
from interpret.glassbox import LogisticRegression, ClassificationTree 
from interpret.perf import ROC
from lightgbm import LGBMClassifier

# Logistic Regression
lr_model = LogisticRegression()

lr_model.fit(X_train, y_train)
# Classification tree: despite the `rf_model` name, this is interpret's
# ClassificationTree, not a random forest.
rf_model = ClassificationTree()

rf_model.fit(X_train, y_train)

# LightGBM
lgb_model = LGBMClassifier()

lgb_model.fit(X_train, y_train)

LGBMClassifier()

In [64]:
"""Other model objects can be shown as dashboards as well."""


def _roc_perf(model, label):
    """Return the ROC performance explanation of `model` on the test split."""
    return ROC(model.predict_proba).explain_perf(X_test, y_test, name=label)


lr_perf = _roc_perf(lr_model, 'Logistic Regression')
tree_perf = _roc_perf(rf_model, 'Classification Tree')
lgbm_perf = _roc_perf(lgb_model, 'Light GBM')
ebm_perf = _roc_perf(ebm, 'EBM')

# Render one dashboard per model: logistic regression, tree, EBM, LightGBM.
for perf in (lr_perf, tree_perf, ebm_perf, lgbm_perf):
    show(perf)

In [65]:
lr_global = lr_model.explain_global(name='LR')
tree_global = rf_model.explain_global(name='Tree')

show(lr_global)
show(tree_global)
show(ebm_global)

In [66]:
# The two global explanations are recomputed here with the same calls as in
# the preceding cell.
lr_global = lr_model.explain_global(name='LR')
tree_global = rf_model.explain_global(name='Tree')
# Pass every explanation object to a single show() call as one list
# (with share_tables=True).
show([hist, lr_global, lr_perf, tree_global, tree_perf,ebm_global,ebm_local,ebm_perf], share_tables=True)