Setup (fitting the model requires data transformers)

In [1]:
import time
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
from pyreal.transformers import Transformer, OneHotEncoder, Mappings, MappingsOneHotDecoder, FeatureSelectTransformer
from pyreal.transformers import fit_transformers

# Load the trinket dataset, using the first CSV column as the row index
data_orig = pd.read_csv("trinket_data.csv", index_col=0)
# Split into the feature columns and the "price" column the model is fitted against
X_orig = data_orig.drop(columns="price")
y_orig = data_orig["price"]

def hex_to_color_name(h):
    """Name the dominant RGB channel of a hex color string.

    Args:
        h (str): hex color such as ``"#ff8000"`` or ``"ff8000"``; leading
            ``#`` characters are ignored. Three-digit shorthand
            (``"#f80"``) is expanded to its six-digit form first.

    Returns:
        str: ``"red"``, ``"green"`` or ``"blue"`` -- the channel with the
        largest value. Ties go to the first channel in red, green, blue order.

    Raises:
        ValueError: if a channel cannot be parsed as hexadecimal.
    """
    h = h.lstrip('#')
    if len(h) == 3:
        # Shorthand notation: each digit stands for a doubled pair ("f80" -> "ff8800")
        h = "".join(digit * 2 for digit in h)
    rgb = tuple(int(h[i:i+2], 16) for i in (0, 2, 4))
    # index() returns the first maximum, which gives the documented tie-breaking
    return ["red", "green", "blue"][rgb.index(max(rgb))]


class ColorTransformer(Transformer):
    """Transformer that maps hex color strings to a coarse color name.

    Every column named in ``columns`` is passed element-wise through
    ``hex_to_color_name``.
    """

    def __init__(self, columns, **kwargs):
        """Store the target columns and forward ``kwargs`` to ``Transformer``.

        Args:
            columns (iterable of str): names of the columns holding hex color strings.
            **kwargs: passed through unchanged to ``Transformer.__init__``.
        """
        self.columns = columns
        super().__init__(**kwargs)

    def fit(self, x):
        """Nothing is learned from ``x``; return ``self``."""
        return self

    def data_transform(self, x):
        """Return a copy of ``x`` whose configured columns hold color names.

        Args:
            x (DataFrame): data containing every column in ``self.columns``.

        Returns:
            DataFrame: a new frame; ``x`` itself is left unmodified.
        """
        # Copy first: assigning into ``x`` directly would overwrite the
        # caller's original hex values.
        x = x.copy()
        for col in self.columns:
            x[col] = x[col].apply(hex_to_color_name)
        return x


class MaxAggregator(Transformer):
    """Transformer that collapses several columns into their row-wise maximum.

    The source columns are removed and replaced by one column named
    ``MAX(<col1>,<col2>,...)``.
    """

    def __init__(self, columns, **kwargs):
        """Store the columns to aggregate and forward ``kwargs`` to ``Transformer``.

        Args:
            columns (list of str): names of the columns to aggregate.
            **kwargs: passed through unchanged to ``Transformer.__init__``.
        """
        self.columns = columns
        super().__init__(**kwargs)

    def data_transform(self, x):
        """Return a new frame with ``self.columns`` replaced by their row-wise max.

        Args:
            x (DataFrame): data containing every column in ``self.columns``.

        Returns:
            DataFrame: a new frame; ``x`` itself is left unmodified.
        """
        column_name = "MAX(" + ",".join(self.columns) + ")"
        row_max = x[self.columns].max(axis=1)
        # drop() and assign() both return new frames, so the caller's
        # DataFrame never gains the aggregate column as a side effect.
        return x.drop(self.columns, axis=1).assign(**{column_name: row_max})

# Transformers that turn the original data into the model's feature space
colorTransformer = ColorTransformer(columns=["color"], interpret=True)
colorEncoder = OneHotEncoder(columns=["color"])
maxAggregator = MaxAggregator(columns=["width", "height"], interpret=True)
featureSelect = FeatureSelectTransformer(['age', 'type_bar', 'type_foo', 'type_foobar',
                                          'color_blue', 'color_green', 'color_red',
                                          'MAX(width,height)'])
mappings = Mappings.generate_mappings(
    categorical_to_one_hot={"type": {"type_foo": "foo", "type_bar": "bar", "type_foobar": "foobar"}})

model_transformers = [maxAggregator, colorTransformer, colorEncoder, featureSelect]
X_model = fit_transformers(model_transformers, X_orig)

# Train on the first N_TRAIN rows and score on every remaining row.
# The held-out slice starts at N_TRAIN (not N_TRAIN + 1) so that no row is
# silently left out of both the training and the evaluation sets.
N_TRAIN = 400
model = LinearRegression()
model.fit(X_model[:N_TRAIN], y_orig[:N_TRAIN])
print("Model r-squared: %.4f" % model.score(X_model[N_TRAIN:], y_orig[N_TRAIN:]))


Model r-squared: 0.9998


Local feature contributions - with Pyreal

In [2]:
from pyreal.transformers import Transformer, OneHotEncoder, Mappings, MappingsOneHotDecoder, FeatureSelectTransformer
from pyreal.transformers import fit_transformers
from pyreal.explainers import LocalFeatureContribution

# Wall-clock start time; the elapsed time is printed at the end of this cell.
start = time.time()
class ColorTransformer(Transformer): #****
    """Transformer that maps hex color strings to a coarse color name.

    Every column named in ``columns`` is passed element-wise through
    ``hex_to_color_name``.
    """

    def __init__(self, columns, **kwargs): #****
        """Store the target columns and forward ``kwargs`` to ``Transformer``.

        Args:
            columns (iterable of str): names of the columns holding hex color strings.
            **kwargs: passed through unchanged to ``Transformer.__init__``.
        """
        self.columns = columns
        super().__init__(**kwargs)

    def fit(self, x): #****
        """Nothing is learned from ``x``; return ``self``."""
        return self #****

    def data_transform(self, x): #****
        """Return a copy of ``x`` whose configured columns hold color names.

        Args:
            x (DataFrame): data containing every column in ``self.columns``.

        Returns:
            DataFrame: a new frame; ``x`` itself is left unmodified.
        """
        # Copy first: assigning into ``x`` directly would overwrite the
        # caller's original hex values.
        x = x.copy()
        for col in self.columns:
            x[col] = x[col].apply(hex_to_color_name)
        return x #****
class MaxAggregator(Transformer): #****
    """Transformer that collapses several columns into their row-wise maximum.

    The source columns are removed and replaced by one column named
    ``MAX(<col1>,<col2>,...)``.
    """

    def __init__(self, columns, **kwargs): #****
        """Store the columns to aggregate and forward ``kwargs`` to ``Transformer``.

        Args:
            columns (list of str): names of the columns to aggregate.
            **kwargs: passed through unchanged to ``Transformer.__init__``.
        """
        self.columns = columns
        super().__init__(**kwargs)

    def data_transform(self, x): #****
        """Return a new frame with ``self.columns`` replaced by their row-wise max.

        Args:
            x (DataFrame): data containing every column in ``self.columns``.

        Returns:
            DataFrame: a new frame; ``x`` itself is left unmodified.
        """
        column_name = "MAX(" + ",".join(self.columns) + ")"
        row_max = x[self.columns].max(axis=1)
        # drop() and assign() both return new frames, so the caller's
        # DataFrame never gains the aggregate column as a side effect.
        return x.drop(self.columns, axis=1).assign(**{column_name: row_max}) #****
# Rebuild the transformers, declared in the same order as the pipeline list below
maxAggregator = MaxAggregator(columns=["width", "height"], interpret=True)
colorTransformer = ColorTransformer(columns=["color"], interpret=True)
colorEncoder = OneHotEncoder(columns=["color"])
featureSelect = FeatureSelectTransformer(
    [
        'age',
        'type_bar', 'type_foo', 'type_foobar',
        'color_blue', 'color_green', 'color_red',
        'MAX(width,height)',
    ]
)
# Relates the categorical "type" feature to its one-hot encoded columns
mappings = Mappings.generate_mappings(
    categorical_to_one_hot={
        "type": {"type_foo": "foo", "type_bar": "bar", "type_foobar": "foobar"},
    }
)
typeDecoder = MappingsOneHotDecoder(mappings, model=False, interpret=True)
model_transformers = [
    maxAggregator,
    colorTransformer,
    colorEncoder,
    featureSelect,
    typeDecoder,
]
fit_transformers(model_transformers, X_orig)

# Produce local feature contributions for the first 100 rows
lfc = LocalFeatureContribution(
    model,
    X_orig,
    transformers=model_transformers,
    fit_on_init=True,
)
explanation_pyreal = lfc.produce(X_orig.iloc[:100])
print(explanation_pyreal)
print("runtime:", time.time() - start)


(          age  MAX(width,height)      color        type
0  -15.656246           8.279705 -20.622256   -8.060027
1    8.368684          39.201880 -20.622256   92.056552
2   23.584472          13.853105  -0.876739   -8.060027
3  -22.863724          37.544861  19.097510 -108.013055
4  -13.253753         -20.753272  -0.876739   92.056552
..        ...                ...        ...         ...
95  28.389458          50.433364 -20.622256   92.056552
96   9.169515         -32.254449  -0.876739 -108.013055
97 -27.668710          35.592235 -20.622256   92.056552
98  -2.842950         -78.169232  19.097510   -8.060027
99  17.978655           3.187405  19.097510   -8.060027

[100 rows x 4 columns],     color  age    type  MAX(width,height)
0     red   41     bar          68.784567
1     red   71  foobar          84.261218
2   green   90     bar          71.574072
3    blue   32     foo          83.431874
4   green   44  foobar          54.253466
..    ...  ...     ...                ...
95    re

Local feature contributions - no Pyreal

In [3]:
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
import shap

# Wall-clock start time; the elapsed time is printed at the end of this cell.
start = time.time()
def color_transform(x, columns):  #****
    """Return a copy of ``x`` with hex color columns converted to color names.

    Args:
        x (DataFrame): data containing the columns to convert.
        columns (iterable of str): names of columns holding hex color strings.

    Returns:
        DataFrame: a new frame; ``x`` itself is left unmodified.
    """
    # Copy so the conversion does not overwrite the caller's hex values
    x = x.copy()
    for col in columns:
        x[col] = x[col].apply(hex_to_color_name)
    return x #****
def max_aggregate(x, columns):  #****
    """Replace ``columns`` with a single column holding their row-wise maximum.

    Args:
        x (DataFrame): data containing every column in ``columns``.
        columns (list of str): names of the columns to aggregate.

    Returns:
        DataFrame: a new frame without ``columns`` and with a column named
        ``MAX(<col1>,<col2>,...)``; ``x`` itself is left unmodified.
    """
    column_name = "MAX(" + ",".join(columns) + ")"
    row_max = x[columns].max(axis=1)
    # drop() and assign() both return new frames, so the caller's DataFrame
    # never gains the aggregate column as a side effect.
    return x.drop(columns, axis=1).assign(**{column_name: row_max})  #****
# Pass a copy so X_orig stays untouched even if the helpers assign columns in place.
x_1 = max_aggregate(X_orig.copy(), columns=["width", "height"])
x_2 = color_transform(x_1, columns=["color"])
to_encode = x_2[["color"]]
# The `sparse` constructor argument and `get_feature_names` were removed in newer
# scikit-learn releases. Use the encoder's default (sparse) output and densify
# it, and prefer `get_feature_names_out` when the installed version has it.
color_ohe = SklearnOneHotEncoder().fit(to_encode)
if hasattr(color_ohe, "get_feature_names_out"):
    encoded_columns = color_ohe.get_feature_names_out(to_encode.columns)
else:
    encoded_columns = color_ohe.get_feature_names(to_encode.columns)
index = to_encode.index
encoded = color_ohe.transform(to_encode).toarray()
encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=index)
x_3 = pd.concat([x_2.drop(["color"], axis="columns"), encoded_df], axis=1)
x_explain = x_3[['age', 'type_bar', 'type_foo', 'type_foobar',
          'color_blue', 'color_green', 'color_red','MAX(width,height)']]
columns = x_explain.columns
# SHAP values for the first 100 rows, one column per model feature
explainer = shap.Explainer(model, x_explain)
explanation = explainer(x_explain.iloc[0:100])
explanation_df = pd.DataFrame(explanation.values, columns=columns)
# Fold the one-hot columns back into one contribution per categorical feature
for col in ["color", "type"]:
    encoded_features = [item for item in columns if item.startswith(col+'_')]
    summed_contribution = explanation_df[encoded_features].sum(axis=1)
    explanation_df = explanation_df.drop(encoded_features, axis="columns")
    explanation_df[col] = summed_contribution
print(explanation_df)
print("runtime:", time.time()-start)


          age  MAX(width,height)      color        type
0  -15.656246           8.279705 -20.622256   -8.060027
1    8.368684          39.201880 -20.622256   92.056552
2   23.584472          13.853105  -0.876739   -8.060027
3  -22.863724          37.544861  19.097510 -108.013055
4  -13.253753         -20.753272  -0.876739   92.056552
..        ...                ...        ...         ...
95  28.389458          50.433364 -20.622256   92.056552
96   9.169515         -32.254449  -0.876739 -108.013055
97 -27.668710          35.592235 -20.622256   92.056552
98  -2.842950         -78.169232  19.097510   -8.060027
99  17.978655           3.187405  19.097510   -8.060027

[100 rows x 4 columns]
runtime: 0.020306825637817383


Global feature importance - with Pyreal

In [4]:
from pyreal.transformers import Transformer, OneHotEncoder, Mappings, MappingsOneHotDecoder, FeatureSelectTransformer
from pyreal.transformers import fit_transformers
from pyreal.explainers import GlobalFeatureImportance

# Wall-clock start time; the elapsed time is printed at the end of this cell.
start = time.time()
class ColorTransformer(Transformer):  # ****
    """Transformer that maps hex color strings to a coarse color name.

    Every column named in ``columns`` is passed element-wise through
    ``hex_to_color_name``.
    """

    def __init__(self, columns, **kwargs):  # ****
        """Store the target columns and forward ``kwargs`` to ``Transformer``.

        Args:
            columns (iterable of str): names of the columns holding hex color strings.
            **kwargs: passed through unchanged to ``Transformer.__init__``.
        """
        self.columns = columns
        super().__init__(**kwargs)

    def fit(self, x):  # ****
        """Nothing is learned from ``x``; return ``self``."""
        return self  # ****

    def data_transform(self, x):  # ****
        """Return a copy of ``x`` whose configured columns hold color names.

        Args:
            x (DataFrame): data containing every column in ``self.columns``.

        Returns:
            DataFrame: a new frame; ``x`` itself is left unmodified.
        """
        # Copy first: assigning into ``x`` directly would overwrite the
        # caller's original hex values.
        x = x.copy()
        for col in self.columns:
            x[col] = x[col].apply(hex_to_color_name)
        return x  # ****
class MaxAggregator(Transformer):  # ****
    """Transformer that collapses several columns into their row-wise maximum.

    The source columns are removed and replaced by one column named
    ``MAX(<col1>,<col2>,...)``.
    """

    def __init__(self, columns, **kwargs):  # ****
        """Store the columns to aggregate and forward ``kwargs`` to ``Transformer``.

        Args:
            columns (list of str): names of the columns to aggregate.
            **kwargs: passed through unchanged to ``Transformer.__init__``.
        """
        self.columns = columns
        super().__init__(**kwargs)

    def data_transform(self, x):  # ****
        """Return a new frame with ``self.columns`` replaced by their row-wise max.

        Args:
            x (DataFrame): data containing every column in ``self.columns``.

        Returns:
            DataFrame: a new frame; ``x`` itself is left unmodified.
        """
        column_name = "MAX(" + ",".join(self.columns) + ")"
        row_max = x[self.columns].max(axis=1)
        # drop() and assign() both return new frames, so the caller's
        # DataFrame never gains the aggregate column as a side effect.
        return x.drop(self.columns, axis=1).assign(**{column_name: row_max})  # ****
# Rebuild the transformers, declared in the same order as the pipeline list below
maxAggregator = MaxAggregator(columns=["width", "height"], interpret=True)
colorTransformer = ColorTransformer(columns=["color"], interpret=True)
colorEncoder = OneHotEncoder(columns=["color"])
featureSelect = FeatureSelectTransformer(
    [
        'age',
        'type_bar', 'type_foo', 'type_foobar',
        'color_blue', 'color_green', 'color_red',
        'MAX(width,height)',
    ]
)
# Relates the categorical "type" feature to its one-hot encoded columns
mappings = Mappings.generate_mappings(
    categorical_to_one_hot={
        "type": {"type_foo": "foo", "type_bar": "bar", "type_foobar": "foobar"},
    }
)
typeDecoder = MappingsOneHotDecoder(mappings, model=False, interpret=True)
model_transformers = [
    maxAggregator,
    colorTransformer,
    colorEncoder,
    featureSelect,
    typeDecoder,
]
fit_transformers(model_transformers, X_orig)

# Produce the global feature importance explanation
gfi = GlobalFeatureImportance(
    model,
    X_orig,
    transformers=model_transformers,
    fit_on_init=True,
)
explanation_pyreal = gfi.produce()
print(explanation_pyreal)
print("runtime:", time.time() - start)


         age  MAX(width,height)      color       type
0  15.962804          30.875724  18.201658  89.316062
runtime: 0.0370330810546875


Global feature importance - no Pyreal

In [5]:
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
import shap

# Wall-clock start time; the elapsed time is printed at the end of this cell.
start = time.time()
def color_transform(x, columns):  # ****
    """Return a copy of ``x`` with hex color columns converted to color names.

    Args:
        x (DataFrame): data containing the columns to convert.
        columns (iterable of str): names of columns holding hex color strings.

    Returns:
        DataFrame: a new frame; ``x`` itself is left unmodified.
    """
    # Copy so the conversion does not overwrite the caller's hex values
    x = x.copy()
    for col in columns:
        x[col] = x[col].apply(hex_to_color_name)
    return x  # ****
def max_aggregate(x, columns):  # ****
    """Replace ``columns`` with a single column holding their row-wise maximum.

    Args:
        x (DataFrame): data containing every column in ``columns``.
        columns (list of str): names of the columns to aggregate.

    Returns:
        DataFrame: a new frame without ``columns`` and with a column named
        ``MAX(<col1>,<col2>,...)``; ``x`` itself is left unmodified.
    """
    column_name = "MAX(" + ",".join(columns) + ")"
    row_max = x[columns].max(axis=1)
    # drop() and assign() both return new frames, so the caller's DataFrame
    # never gains the aggregate column as a side effect.
    return x.drop(columns, axis=1).assign(**{column_name: row_max})  # ****
# Pass a copy so X_orig stays untouched even if the helpers assign columns in place.
x_1 = max_aggregate(X_orig.copy(), columns=["width", "height"])
x_2 = color_transform(x_1, columns=["color"])
to_encode = x_2[["color"]]
# The `sparse` constructor argument and `get_feature_names` were removed in newer
# scikit-learn releases. Use the encoder's default (sparse) output and densify
# it, and prefer `get_feature_names_out` when the installed version has it.
color_ohe = SklearnOneHotEncoder().fit(to_encode)
if hasattr(color_ohe, "get_feature_names_out"):
    encoded_columns = color_ohe.get_feature_names_out(to_encode.columns)
else:
    encoded_columns = color_ohe.get_feature_names(to_encode.columns)
index = to_encode.index
encoded = color_ohe.transform(to_encode).toarray()
encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=index)
x_3 = pd.concat([x_2.drop(["color"], axis="columns"), encoded_df], axis=1)
x_explain = x_3[['age', 'type_bar', 'type_foo', 'type_foobar',
                 'color_blue', 'color_green', 'color_red', 'MAX(width,height)']]
columns = x_explain.columns
explainer = shap.Explainer(model, x_explain)
explanation = explainer(x_explain)
# Global importance = mean absolute SHAP value per feature, kept as a single row
explanation = np.mean(np.absolute(explanation.values), axis=0).reshape(1, -1)
explanation_df = pd.DataFrame(explanation, columns=columns)
# Fold the one-hot columns back into one importance per categorical feature
for col in ["color", "type"]:
    encoded_features = [item for item in columns if item.startswith(col+'_')]
    summed_contribution = explanation_df[encoded_features].sum(axis=1)
    explanation_df = explanation_df.drop(encoded_features, axis="columns")
    explanation_df[col] = summed_contribution
print(explanation_df)
print("runtime:", time.time()-start)


         age  MAX(width,height)      color       type
0  15.962804          30.875724  18.201658  89.316062
runtime: 0.03751492500305176


Decision Tree Explainer - with Pyreal

In [6]:
from pyreal.transformers import Transformer, OneHotEncoder, Mappings, MappingsOneHotDecoder, FeatureSelectTransformer
from pyreal.transformers import fit_transformers
from pyreal.explainers import DecisionTreeExplainer

# Wall-clock start time; the elapsed time is printed at the end of this cell.
start = time.time()
class ColorTransformer(Transformer):  # ****
    """Transformer that maps hex color strings to a coarse color name.

    Every column named in ``columns`` is passed element-wise through
    ``hex_to_color_name``.
    """

    def __init__(self, columns, **kwargs):  # ****
        """Store the target columns and forward ``kwargs`` to ``Transformer``.

        Args:
            columns (iterable of str): names of the columns holding hex color strings.
            **kwargs: passed through unchanged to ``Transformer.__init__``.
        """
        self.columns = columns
        super().__init__(**kwargs)

    def fit(self, x):  # ****
        """Nothing is learned from ``x``; return ``self``."""
        return self  # ****

    def data_transform(self, x):  # ****
        """Return a copy of ``x`` whose configured columns hold color names.

        Args:
            x (DataFrame): data containing every column in ``self.columns``.

        Returns:
            DataFrame: a new frame; ``x`` itself is left unmodified.
        """
        # Copy first: assigning into ``x`` directly would overwrite the
        # caller's original hex values.
        x = x.copy()
        for col in self.columns:
            x[col] = x[col].apply(hex_to_color_name)
        return x  # ****
class MaxAggregator(Transformer):  # ****
    """Transformer that collapses several columns into their row-wise maximum.

    The source columns are removed and replaced by one column named
    ``MAX(<col1>,<col2>,...)``.
    """

    def __init__(self, columns, **kwargs):  # ****
        """Store the columns to aggregate and forward ``kwargs`` to ``Transformer``.

        Args:
            columns (list of str): names of the columns to aggregate.
            **kwargs: passed through unchanged to ``Transformer.__init__``.
        """
        self.columns = columns
        super().__init__(**kwargs)

    def data_transform(self, x):  # ****
        """Return a new frame with ``self.columns`` replaced by their row-wise max.

        Args:
            x (DataFrame): data containing every column in ``self.columns``.

        Returns:
            DataFrame: a new frame; ``x`` itself is left unmodified.
        """
        column_name = "MAX(" + ",".join(self.columns) + ")"
        row_max = x[self.columns].max(axis=1)
        # drop() and assign() both return new frames, so the caller's
        # DataFrame never gains the aggregate column as a side effect.
        return x.drop(self.columns, axis=1).assign(**{column_name: row_max})  # ****
# Rebuild the transformers, declared in the same order as the pipeline list below
maxAggregator = MaxAggregator(columns=["width", "height"], interpret=True)
colorTransformer = ColorTransformer(columns=["color"], interpret=True)
colorEncoder = OneHotEncoder(columns=["color"])
featureSelect = FeatureSelectTransformer(
    [
        'age',
        'type_bar', 'type_foo', 'type_foobar',
        'color_blue', 'color_green', 'color_red',
        'MAX(width,height)',
    ]
)
# Relates the categorical "type" feature to its one-hot encoded columns
mappings = Mappings.generate_mappings(
    categorical_to_one_hot={
        "type": {"type_foo": "foo", "type_bar": "bar", "type_foobar": "foobar"},
    }
)
typeDecoder = MappingsOneHotDecoder(mappings, model=False, interpret=True)
model_transformers = [
    maxAggregator,
    colorTransformer,
    colorEncoder,
    featureSelect,
    typeDecoder,
]
fit_transformers(model_transformers, X_orig)

# Produce the decision tree explanation (regression setting, depth capped at 4)
dte = DecisionTreeExplainer(
    model,
    X_orig,
    is_classifier=False,
    max_depth=4,
    transformers=model_transformers,
    fit_on_init=True,
)
explanation_pyreal = dte.produce()
print("runtime:", time.time() - start)


runtime: 0.03746485710144043


Decision Tree Explainer - no Pyreal

In [7]:
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from sklearn import tree

# Wall-clock start time; the elapsed time is printed at the end of this cell.
start = time.time()
def color_transform(x, columns):  # ****
    """Return a copy of ``x`` with hex color columns converted to color names.

    Args:
        x (DataFrame): data containing the columns to convert.
        columns (iterable of str): names of columns holding hex color strings.

    Returns:
        DataFrame: a new frame; ``x`` itself is left unmodified.
    """
    # Copy so the conversion does not overwrite the caller's hex values
    x = x.copy()
    for col in columns:
        x[col] = x[col].apply(hex_to_color_name)
    return x  # ****
def max_aggregate(x, columns):  # ****
    """Replace ``columns`` with a single column holding their row-wise maximum.

    Args:
        x (DataFrame): data containing every column in ``columns``.
        columns (list of str): names of the columns to aggregate.

    Returns:
        DataFrame: a new frame without ``columns`` and with a column named
        ``MAX(<col1>,<col2>,...)``; ``x`` itself is left unmodified.
    """
    column_name = "MAX(" + ",".join(columns) + ")"
    row_max = x[columns].max(axis=1)
    # drop() and assign() both return new frames, so the caller's DataFrame
    # never gains the aggregate column as a side effect.
    return x.drop(columns, axis=1).assign(**{column_name: row_max})  # ****
# Pass a copy so X_orig stays untouched even if the helpers assign columns in place.
x_1 = max_aggregate(X_orig.copy(), columns=["width", "height"])
x_2 = color_transform(x_1, columns=["color"])
to_encode = x_2[["color"]]
# The `sparse` constructor argument and `get_feature_names` were removed in newer
# scikit-learn releases. Use the encoder's default (sparse) output and densify
# it, and prefer `get_feature_names_out` when the installed version has it.
color_ohe = SklearnOneHotEncoder().fit(to_encode)
if hasattr(color_ohe, "get_feature_names_out"):
    encoded_columns = color_ohe.get_feature_names_out(to_encode.columns)
else:
    encoded_columns = color_ohe.get_feature_names(to_encode.columns)
index = to_encode.index
encoded = color_ohe.transform(to_encode).toarray()
encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=index)
x_3 = pd.concat([x_2.drop(["color"], axis="columns"), encoded_df], axis=1)
x_explain = x_3[['age', 'type_bar', 'type_foo', 'type_foobar',
                 'color_blue', 'color_green', 'color_red', 'MAX(width,height)']]
columns = x_explain.columns
# Fit a surrogate tree on the model's own predictions. The depth is capped at 4
# to match the max_depth=4 passed to DecisionTreeExplainer in the Pyreal cell,
# so the two cells build comparable trees.
sklearn_explanation = tree.DecisionTreeRegressor(max_depth=4)
sklearn_explanation.fit(x_explain, model.predict(x_explain))
print("runtime:", time.time()-start)


runtime: 0.011514902114868164
