First, we load in the information Sibyl needs. This includes the model, the dataset, and the hand-written mappings object that allows for conversions to a categorical format.

In [6]:
import sys
import os
import pandas as pd
import numpy as np
import importlib
sys.path.insert(1, '../../../pyreal')

# Every input file lives in the same data folder, so each path is built from
# its base name and the folder is spelled out only once.
directory = "data/"
(
    dataset_filename,
    features_filename,
    model_filename,
    feature_mappings_filename,
) = (
    os.path.join(directory, basename)
    for basename in (
        "Model_Dataset_Version_01_01.csv",
        "Design_Specification_DOUGLAS_v1.4.csv",
        "weights_model_feb2019.csv",
        "mappings.csv",
    )
)

# LOAD IN THE BASE MODEL

from pyreal.utils import model_utils, transformer

# Reload both helper modules so edits made to pyreal after the kernel started
# are picked up.
importlib.reload(model_utils)
importlib.reload(transformer)

# The weights file supplies a "weight" column (fed to the model loader) and a
# "name" column (used to choose the feature columns).
weights_df = pd.read_csv(model_filename)
weights = weights_df["weight"]
model = model_utils.load_model_from_weights(weights, model_type="linear_regression")

# Every name except the one in the first row is used as a model feature
# (presumably the first row is the intercept/bias term — TODO confirm).
model_features = weights_df["name"].iloc[1:]
feature_select = transformer.FeatureSelectTransformer(model_features)

# LOAD IN THE MAPPINGS OBJECT

# Import the module under an alias so that the `mappings` object created below
# does not overwrite the module it was built from.
from pyreal.utils import mappings as mappings_module
importlib.reload(mappings_module)

# Build the mappings object from the hand-written mappings CSV.
mappings = mappings_module.Mappings.generate_mappings(
    dataframe=pd.read_csv(feature_mappings_filename)
)

# LOAD IN THE DATASET

# Keep the full table as `dataset`: this cell selects the model features from
# it, and later cells sample rows from it and transform it.
dataset = pd.read_csv(dataset_filename)
# The explainer's input: only the model's feature columns, cast to float.
x_orig = dataset[model_features].astype("float")

(2, 460) (460,)


Columns (222) have mixed types. Specify dtype option on import or set low_memory=False.


We now have everything we need to set up a FeatureContributionExplainer object and begin using it.

In [8]:
from pyreal.explainers import local_feature_explanation as lfe
importlib.reload(lfe)

# Construct the explainer with fit_on_init=False; fit_contributions() is
# called explicitly in a later cell so that fitting can be timed on its own.
lce = lfe.FeatureContributionExplainer(
    model,
    x_orig,
    y_orig=None,
    e_transforms=feature_select,
    m_transforms=None,
    e_algorithm="shap",
    fit_on_init=False,
)

We can now fit the contribution explainer and time how long fitting takes.

In [12]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the system wall clock, which can be
# adjusted while the cell is running.
start_time = time.perf_counter()
lce.fit_contributions()
end_time = time.perf_counter()

print("Total time taken to fit full dataset: %f seconds" % (end_time-start_time))

Total time taken to fit full dataset: 0.113702 seconds


Now, we can get contributions, and time the process

In [13]:
# Number of rows to explain in one batch.
d = 1000
# Fixed seed so that re-running the cell draws the same rows.
x_orig_daily = dataset.sample(d, random_state=0)

# perf_counter() is the monotonic, high-resolution clock meant for timing
# short operations such as this one.
start_time = time.perf_counter()
contributions = lce.get_contributions(x_orig_daily)
end_time = time.perf_counter()

print("Time taken to get contributions on %s items: %f seconds" % (d, end_time-start_time))

Time time taken to get contributions on 1000 items: 0.007009 seconds


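Now, we can take a look at the contributions found and see if they make sense. First, we convert the dataset to its categorical format using the mappings object, and time the conversion.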

In [14]:
# Time how long it takes to convert the full dataset to categorical format
from pyreal.utils import transformer
importlib.reload(transformer)
import time

cat_transformer = transformer.MappingsDecoderTransformer(mappings)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations, unlike the wall-clock time.time().
start_time = time.perf_counter()
transformed = cat_transformer.transform(dataset)
end_time = time.perf_counter()
print(transformed.columns)

print("Time taken to encode full dataset: %f seconds" % (end_time-start_time))

Index(['PRI_CBMS_FOCUS_CD', 'PRI_CBMS_FOCUS_CH', 'PRI_CBMS_FOCUS_CP',
       'PRI_CBMS_FOCUS_CW', 'PRI_CBMS_FOCUS_DF', 'PRI_CBMS_FOCUS_EX',
       'PRI_CBMS_FOCUS_FM', 'PRI_CBMS_FOCUS_FP', 'PRI_CBMS_FOCUS_FS',
       'PRI_CBMS_FOCUS_FT',
       ...
       'PRI_OTHA_REF_NEGLECT_COUNT', 'PRI_OTHA_REF_EMOTIONAL_COUNT',
       'PRI_OTHA_REF_PHYSICAL_COUNT', 'PRI_OTHA_REF_DRUG_COUNT',
       'PRI_OTHA_REF_SEXUAL_COUNT', 'PRI_OTHA_REF_OTHER_COUNT',
       'PRI_OTHA_REF_DOMESTIC_VIOLENCE_COUNT', 'PRI_OTHA_CYF_ACTIVE',
       'PRI_OTHA_JUVENILE_JUSTICE', 'PRI_OTHA_COURT_ACTIVE'],
      dtype='object', length=358)
Time taken to encode full dataset: 0.781941 seconds
