In [2]:
# Print the kernel's working directory via the shell. load_heart_data below
# reads its CSV from an absolute path under /notebook.
!echo $PWD

/notebook


In [3]:
# To run this benchmark notebook you need XGBoost and interpret installed
# (pip install xgboost interpret)

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer


def load_adult_data(
        source="https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"):
    """Load the UCI Adult data as a classification dataset.

    Parameters
    ----------
    source : str or file-like, optional
        Anything ``pandas.read_csv`` accepts (URL, local path, open file).
        Defaults to the UCI download URL, so existing zero-argument calls
        behave as before; pass a local copy to run without network access.

    Returns
    -------
    dict
        ``{'problem': 'classification', 'full': {'X': ..., 'y': ...}}`` where
        ``X`` is a DataFrame of every column except the last and ``y`` is the
        last column (``Income``) as a Series.

    Raises
    ------
    ValueError
        Raised by pandas when the file does not have exactly 15 columns,
        because the column names below are assigned positionally.
    """
    column_names = [
        "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
    ]

    # The raw file carries no header row, so names are attached afterwards.
    df = pd.read_csv(source, header=None)
    df.columns = column_names

    # Last column is the label; everything before it is a feature.
    feature_cols = df.columns[:-1]
    label_col = df.columns[-1]

    return {
        'problem': 'classification',
        'full': {
            'X': df[feature_cols],
            'y': df[label_col],
        },
    }

def load_heart_data(path=r'/notebook/data/heart.csv'):
    """Load the heart-disease CSV as a classification dataset.

    Data source: https://www.kaggle.com/ronitf/heart-disease-uci

    Parameters
    ----------
    path : str or file-like, optional
        Location of the CSV, passed straight to ``pandas.read_csv``. Defaults
        to the original hard-coded location so existing zero-argument calls
        are unaffected; override it when the notebook is not mounted at
        ``/notebook``.

    Returns
    -------
    dict
        ``{'problem': 'classification', 'full': {'X': ..., 'y': ...}}`` where
        ``X`` is a DataFrame of every column except the last and ``y`` is the
        last column as a Series.
    """
    df = pd.read_csv(path)

    # Last column is the label; everything before it is a feature.
    feature_cols = df.columns[:-1]
    label_col = df.columns[-1]

    return {
        'problem': 'classification',
        'full': {
            'X': df[feature_cols],
            'y': df[label_col],
        },
    }



In [4]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression

from interpret.glassbox import ExplainableBoostingClassifier


def format_n(x):
    """Return ``x`` rendered as a string with three digits after the decimal point."""
    return format(x, ".3f")

def process_model(clf, name, X, y, n_splits=3):
    """Cross-validate ``clf`` on ``X``/``y`` and summarise timing and ROC AUC.

    The splitter is a StratifiedShuffleSplit with ``n_splits`` splits, a 25%
    test share and a fixed seed of 1337.

    Returns a dict with ``model_name`` (set to ``name``) followed by the mean
    and standard deviation of the fit time and of the ROC AUC test score, each
    already formatted as a string by ``format_n``.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=n_splits, test_size=0.25, random_state=1337)

    # The fitted estimators are requested from cross_validate but are not
    # read anywhere in this function.
    cv_result = cross_validate(
        clf, X, y,
        scoring='roc_auc',
        cv=splitter,
        n_jobs=None,
        return_estimator=True,
    )

    record = {'model_name': name}
    for metric in ('fit_time', 'test_score'):
        values = cv_result[metric]
        record[metric + '_mean'] = format_n(np.mean(values))
        record[metric + '_std'] = format_n(np.std(values))

    return record



def _dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn renamed the ``sparse`` argument to ``sparse_output``; newer
    releases reject the old name and older ones reject the new one, so the
    new spelling is tried first with a fallback on TypeError.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def benchmark_models(dataset_name, X, y, ct=None, n_splits=3, random_state=1337):
    """Cross-validate a fixed set of classifiers on one dataset.

    Parameters
    ----------
    dataset_name : str
        Label printed in the banner and stored on every record.
    X : pandas.DataFrame
        Feature table. Object-dtype columns are treated as categorical when
        the default column transformer is built.
    y : array-like
        Target labels.
    ct : transformer, optional
        Preprocessing step placed first in every pipeline. When omitted, a
        ColumnTransformer is built that one-hot encodes object columns and
        passes all other columns through unchanged.
    n_splits : int, optional
        Number of cross-validation splits forwarded to ``process_model``.
    random_state : int, optional
        Seed handed to every classifier.

    Returns
    -------
    list of dict
        One record per model, in the order linear-sgd, lr, rf-100, xgb. Each
        is the dict returned by ``process_model`` with ``dataset_name`` added.
    """
    if ct is None:
        # dtype=bool keeps the mask invertible even when X has no columns.
        is_cat = np.array([dt.kind == 'O' for dt in X.dtypes], dtype=bool)
        cat_cols = X.columns.values[is_cat]
        num_cols = X.columns.values[~is_cat]

        cat_pipe = Pipeline([('ohe', _dense_one_hot_encoder())])
        num_pipe = Pipeline([('identity', FunctionTransformer())])
        ct = ColumnTransformer(transformers=[
            ('cat', cat_pipe, cat_cols),
            ('num', num_pipe, num_cols),
        ])

    summary_record = {'dataset_name': dataset_name}
    print()
    print('-' * 78)
    print(dataset_name)
    print('-' * 78)
    print(summary_record)
    print()

    # (reported name, pipeline steps) for every benchmarked model, in run order.
    candidates = [
        ('linear-sgd', [
            ('ct', ct),
            ('std', StandardScaler()),
            ('linear-sgd', SGDClassifier(random_state=random_state)),
        ]),
        ('lr', [
            ('ct', ct),
            ('std', StandardScaler()),
            ('lr', LogisticRegression(random_state=random_state)),
        ]),
        ('rf-100', [
            ('ct', ct),
            # n_estimators is set explicitly to 100 (previously 10) to match
            # the default used by newer scikit-learn releases.
            ('rf-100', RandomForestClassifier(
                n_estimators=100, n_jobs=-1, random_state=random_state)),
        ]),
        ('xgb', [
            ('ct', ct),
            ('xgb', XGBClassifier(random_state=random_state)),
        ]),
    ]

    records = []
    for model_name, steps in candidates:
        record = process_model(Pipeline(steps), model_name, X, y, n_splits=n_splits)
        # Print the bare scores first; the dataset name is attached afterwards.
        print(record)
        record.update(summary_record)
        records.append(record)

    # The EBM run is currently disabled. EBM handles string datatypes itself,
    # so it would be handed to process_model directly rather than in a Pipeline:
    #   ExplainableBoostingClassifier(n_jobs=-1, interactions=0, random_state=random_state)
    #   reported under the name 'ebm main'.

    return records

In [5]:
# One entry per benchmarked dataset; each entry is the list of per-model
# records returned by benchmark_models.
results = list()

# Number of cross-validation splits shared by every benchmark run below.
n_splits = 3

In [6]:
# Benchmark every model on the heart dataset and keep its records.
dataset = load_heart_data()
result = benchmark_models(
    dataset_name='heart',
    X=dataset['full']['X'],
    y=dataset['full']['y'],
    n_splits=n_splits,
)
results.append(result)


------------------------------------------------------------------------------
heart
------------------------------------------------------------------------------
{'dataset_name': 'heart'}

{'model_name': 'linear-sgd', 'fit_time_mean': '0.007', 'fit_time_std': '0.002', 'test_score_mean': '0.882', 'test_score_std': '0.020'}
{'model_name': 'lr', 'fit_time_mean': '0.009', 'fit_time_std': '0.000', 'test_score_mean': '0.895', 'test_score_std': '0.030'}
{'model_name': 'rf-100', 'fit_time_mean': '0.615', 'fit_time_std': '0.549', 'test_score_mean': '0.890', 'test_score_std': '0.008'}
{'model_name': 'xgb', 'fit_time_mean': '0.033', 'fit_time_std': '0.002', 'test_score_mean': '0.851', 'test_score_std': '0.018'}


In [7]:
# Benchmark every model on the adult dataset and keep its records.
dataset = load_adult_data()
result = benchmark_models(
    dataset_name='adult',
    X=dataset['full']['X'],
    y=dataset['full']['y'],
    n_splits=n_splits,
)
results.append(result)


------------------------------------------------------------------------------
adult
------------------------------------------------------------------------------
{'dataset_name': 'adult'}

{'model_name': 'linear-sgd', 'fit_time_mean': '0.889', 'fit_time_std': '0.029', 'test_score_mean': '0.890', 'test_score_std': '0.004'}
{'model_name': 'lr', 'fit_time_mean': '0.829', 'fit_time_std': '0.062', 'test_score_mean': '0.906', 'test_score_std': '0.003'}
{'model_name': 'rf-100', 'fit_time_mean': '2.465', 'fit_time_std': '0.016', 'test_score_mean': '0.903', 'test_score_std': '0.002'}
{'model_name': 'xgb', 'fit_time_mean': '5.010', 'fit_time_std': '0.119', 'test_score_mean': '0.927', 'test_score_std': '0.001'}


In [8]:
# Flatten the per-dataset result lists into a single list of model records.
records = [record for dataset_records in results for record in dataset_records]

# Only the score columns go into the report; fit times are left out.
report_columns = ['dataset_name', 'model_name', 'test_score_mean', 'test_score_std']
record_df = pd.DataFrame.from_records(records)[report_columns]

# Persist the summary, then display it by reading the same columns back.
record_df.to_csv('ebm-perf-classification-overnight.csv')
display(
    pd.read_csv('ebm-perf-classification-overnight.csv', usecols=report_columns)
)

Unnamed: 0,dataset_name,model_name,test_score_mean,test_score_std
0,heart,linear-sgd,0.882,0.02
1,heart,lr,0.895,0.03
2,heart,rf-100,0.89,0.008
3,heart,xgb,0.851,0.018
4,adult,linear-sgd,0.89,0.004
5,adult,lr,0.906,0.003
6,adult,rf-100,0.903,0.002
7,adult,xgb,0.927,0.001


In [19]:
def ct(dataset_name, X, y):
    """Build the default column transformer for ``X``.

    Object-dtype columns are one-hot encoded (dense output, unseen categories
    ignored); every other column is passed through an identity
    FunctionTransformer. ``dataset_name`` and ``y`` are accepted but not used.

    Returns an unfitted ColumnTransformer.
    """
    # dtype=bool keeps the mask invertible even when X has no columns.
    is_cat = np.array([dt.kind == 'O' for dt in X.dtypes], dtype=bool)
    cat_cols = X.columns.values[is_cat]
    num_cols = X.columns.values[~is_cat]

    # scikit-learn renamed ``sparse`` to ``sparse_output``; newer releases
    # reject the old name and older ones reject the new one, so try the new
    # spelling first and fall back on TypeError.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    cat_pipe = Pipeline([('ohe', encoder)])
    num_pipe = Pipeline([('identity', FunctionTransformer())])
    transformers = [
        ('cat', cat_pipe, cat_cols),
        ('num', num_pipe, num_cols)
    ]
    return ColumnTransformer(transformers=transformers)

def fit_logreg(dataset_name, X, y):
    """Fit a standardise-then-logistic-regression pipeline on ``X`` and ``y``.

    ``dataset_name`` is accepted but not used. The classifier is seeded with
    1337. Returns whatever ``Pipeline.fit`` returns.
    """
    scaler_step = ('std', StandardScaler())
    classifier_step = ('lr', LogisticRegression(random_state=1337))
    pipeline = Pipeline([scaler_step, classifier_step])
    return pipeline.fit(X, y)


from sklearn.model_selection import train_test_split
# Hold out 20% of the heart data, then fit the logistic-regression pipeline
# on the remaining 80%.
dataset = load_heart_data()
X_train, X_test, y_train, y_test = train_test_split(
    dataset['full']['X'],
    dataset['full']['y'],
    test_size=0.20,
    random_state=1337,
)

model = fit_logreg(dataset_name='heart', X=X_train, y=y_train)




In [20]:
import interpret
from interpret import show
from interpret.perf import ROC

# ROC performance explanation of the fitted model on the held-out split.
roc_explainer = ROC(model.predict_proba)
blackbox_perf = roc_explainer.explain_perf(X_test, y_test, name='Blackbox')

# Set the address used by show() before rendering.
# NOTE(review): 0.0.0.0 presumably exposes the dashboard on every network
# interface -- confirm this is intended outside a container.
interpret.set_show_addr(("0.0.0.0", 7001))
show(blackbox_perf)




In [None]:


from interpret.blackbox import LimeTabular
from interpret import show

# Blackbox explainers need a predict function, and optionally a dataset.
# `model` is the logistic-regression pipeline fitted on X_train earlier in
# the notebook; it is the same predictor the ROC cell explains.
lime = LimeTabular(predict_fn=model.predict_proba, data=X_train, random_state=1)

# Pick the instances to explain, optionally pass in labels if you have them.
lime_local = lime.explain_local(X_test[:5], y_test[:5], name='LIME')

show(lime_local)

