# Saving/loading InterpretML EBMs to/from JSON

This demonstration assumes the following:
* no pairwise interaction terms (the model is trained with `interactions=0`)
* only continuous features
* the model being serialized has already been fit
* no missing values
* global explanations only
* no data is stored in the serialized format

## Set up a classification experiment

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split  

# Load the raw CSV. The file ships without a header row, so column names are
# assigned explicitly below.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    header=None,
)

df.columns = [
    "sample_code_number", "clump_thickness", "uniformity_of_cell_size", "uniformity_of_cell_shape",
    "marginal_adhesion", "single_epithelial_cell_size", "bare_nuclei", "bland_chromatin",
    "normal_nucleoli", "mitoses", "class",
]

# Drop any rows that have missing values (marked with '?' in the raw file).
# `axis` is passed by keyword because pandas 2.0+ rejects the positional form,
# and `.copy()` gives an independent frame so the column assignment below does
# not trigger SettingWithCopyWarning.
df = df[~df.eq('?').any(axis=1)].copy()

# Force the bare_nuclei column to int64 now that the '?' values are gone.
df['bare_nuclei'] = df['bare_nuclei'].astype('int64')

#print(df.head(n=10).to_string(index=False))

# Features are every column except the first (sample_code_number) and the
# last (class); the last column is the label.
train_cols = df.columns[1:-1]
label = df.columns[-1]
X = df[train_cols]
# Binarize the label: class value 2 becomes 0, every other value becomes 1.
y = (df[label] != 2).astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)


## Explore the dataset

In [3]:
from interpret import show
from interpret.data import ClassHistogram

# Build a class-histogram explanation of the training split and render it.
histogram_explainer = ClassHistogram()
hist = histogram_explainer.explain_data(X_train, y_train, name='Train Data')
show(hist)

In [6]:
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression, ClassificationTree, DecisionListClassifier

# Train the reference EBM. `interactions=0` disables pairwise terms, matching
# the "no pairs" assumption stated at the top of this notebook.
ebm_orig = ExplainableBoostingClassifier(
    interactions=0,
    n_jobs=-1,
    random_state=seed,
)
# Kept as the final expression so the notebook displays the fitted estimator.
ebm_orig.fit(X_train, y_train)

ExplainableBoostingClassifier(feature_names=['clump_thickness',
                                             'uniformity_of_cell_size',
                                             'uniformity_of_cell_shape',
                                             'marginal_adhesion',
                                             'single_epithelial_cell_size',
                                             'bare_nuclei', 'bland_chromatin',
                                             'normal_nucleoli', 'mitoses'],
                              feature_types=['continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous'],
                              interactions=0, n_jobs=-1, random_state=1)

## Serialize the trained EBM to JSON and deserialize into a new EBM object

In [7]:
from interpret.utils.ebm_dto import EBMDTO

# Serialize: fitted EBM -> DTO -> JSON string.
ebm_dto_orig = EBMDTO.from_ebm(ebm_orig)
json_str = ebm_dto_orig.to_json()

# Deserialize: JSON string -> DTO. The rebuilt DTO must compare equal to the
# one the JSON was produced from.
ebm_dto_deserialized = EBMDTO.load_json(json_str)
assert ebm_dto_orig == ebm_dto_deserialized

# Turn the deserialized DTO back into an EBM object.
ebm_deserialized = ebm_dto_deserialized.to_ebm()

## Do the original and deserialized EBMs give the same predictions?

In [8]:
import numpy as np

# Class predictions from both models must match element for element.
ebm_orig_predictions = ebm_orig.predict(X_test)
ebm_deserialized_predictions = ebm_deserialized.predict(X_test)
assert np.array_equal(ebm_orig_predictions, ebm_deserialized_predictions)

# Predicted probabilities are compared with exact equality (no tolerance).
ebm_orig_probabilities = ebm_orig.predict_proba(X_test)
ebm_deserialized_probabilities = ebm_deserialized.predict_proba(X_test)
assert np.array_equal(ebm_orig_probabilities, ebm_deserialized_probabilities)

## Global explanations based on the deserialized EBM

In [None]:
# Build the global explanation from the deserialized model and render it.
ebm_deserialized_global = ebm_deserialized.explain_global(
    name='EBM',
)
show(ebm_deserialized_global)