The imports and the code that load the data and model are used by both conditions and are included in the analysis.

In [None]:
import pandas as pd
from urllib.parse import urljoin
import lightgbm
import time
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

# --- Load the housing data ---------------------------------------------------
AWS_BASE_URL = 'https://pyreal-data.s3.amazonaws.com/'
data_url = urljoin(AWS_BASE_URL, "usability_study/california.csv")
data = pd.read_csv(data_url)

# Keep only the rows whose target value is strictly below 500000.
data = data.loc[data["median_house_value"] < 500000]

# Split the table into the target column and the remaining feature columns.
y = data["median_house_value"]
X_orig = data.drop(columns="median_house_value")

# --- Build the model-ready feature table -------------------------------------
# One-hot encode "ocean_proximity" and put the encoded columns in its place.
# NOTE(review): `sparse=` and `get_feature_names` are spelled as in older
# scikit-learn releases — confirm against the version this notebook is pinned to.
x_to_encode = X_orig[["ocean_proximity"]]
ohe = SklearnOneHotEncoder(sparse=False)
ohe.fit(x_to_encode)
encoded_columns = ohe.get_feature_names(x_to_encode.columns)
index = x_to_encode.index
ocean_encoded = pd.DataFrame(
    ohe.transform(x_to_encode), columns=encoded_columns, index=index
)
X_explain = pd.concat(
    [X_orig.drop(columns="ocean_proximity"), ocean_encoded], axis=1
)

# --- Train the model that both conditions explain ----------------------------
model = lightgbm.LGBMRegressor()
model.fit(X_explain, y)

# --- Load the city lookup table used to build the "city" feature -------------
cities_url = urljoin(AWS_BASE_URL, "usability_study/cal_cities_lat_long.csv")
cities = pd.read_csv(cities_url)

Control Condition: No use of Pyreal, local feature contributions

In [None]:
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
import shap

start = time.time()

# Step 1: model-ready features — "ocean_proximity" replaced by its one-hot columns.
x_to_encode = X_orig[["ocean_proximity"]]
ohe = SklearnOneHotEncoder(sparse=False)
ohe.fit(x_to_encode)
encoded_columns = ohe.get_feature_names(x_to_encode.columns)
index = x_to_encode.index
ocean_encoded = pd.DataFrame(
    ohe.transform(x_to_encode), columns=encoded_columns, index=index
)
X_explain = pd.concat(
    [X_orig.drop(columns="ocean_proximity"), ocean_encoded], axis=1
)

# Step 2: interpretable features — a "city" label instead of latitude/longitude.
# Each city labels the rows lying strictly within 0.1 of it on both axes;
# a later city overwrites an earlier one where their boxes overlap.
X_interpret = X_orig.copy()
for index, row in cities.iterrows():
    lat = row["Latitude"]
    lon = row["Longitude"]
    near_lat = (X_interpret["latitude"] > lat-0.1) & (X_interpret["latitude"] < lat+0.1)
    near_lon = (X_interpret["longitude"] > lon-0.1) & (X_interpret["longitude"] < lon+0.1)
    X_interpret.loc[near_lat & near_lon, "city"] = row["Name"]
X_interpret = X_interpret.drop(columns=["latitude", "longitude"])

# Step 3: SHAP values for the first 500 rows, in the model's feature space.
columns = X_explain.columns
explainer = shap.Explainer(model, X_explain)
explanation = explainer(X_explain.iloc[:500])
explanation_df = pd.DataFrame(explanation.values, columns=columns)

# Step 4: fold the one-hot contributions back into a single "ocean_proximity" column.
encoded_features = [name for name in encoded_columns if name.startswith("ocean_proximity_")]
summed_contribution = explanation_df[encoded_features].sum(axis=1)
explanation_df = explanation_df.drop(columns=encoded_features)
explanation_df["ocean_proximity"] = summed_contribution

# Step 5: the "city" contribution is the sum of the longitude and latitude contributions.
explanation_df["city"] = explanation_df["longitude"] + explanation_df["latitude"]
explanation_df = explanation_df.drop(columns=["longitude", "latitude"])

shap_explanation = explanation_df #***
print(shap_explanation)
print("Runtime: ", time.time()-start)

Experimental Condition: Using Pyreal

In [None]:
from pyreal.transformers import Transformer, fit_transformers, OneHotEncoder
from pyreal.explainers import LocalFeatureContribution
from pyreal.types.explanations.dataframe import AdditiveFeatureContributionExplanation

start = time.time()
class CityConverter(Transformer): #****
  """Transformer that replaces the latitude/longitude features with one "city" feature.

  The city lookup comes from the notebook-level `cities` table, which is read
  through its "Latitude", "Longitude" and "Name" columns.
  """
  def __init__(self, **kwargs): #****
    # Hold on to the lookup table; every keyword argument is forwarded to Transformer.
    self.cities = cities #****
    super().__init__(**kwargs) #****
  def data_transform(self, x): #****
    """Return a copy of `x` with "latitude"/"longitude" replaced by "city".

    Each city labels the rows lying strictly within 0.1 of it on both axes.
    Cities are applied in table order, so a later city overwrites an earlier
    one where their boxes overlap. Rows matching no city are not assigned a name.
    """
    # Work on a copy: the .loc assignment below would otherwise add a "city"
    # column to the DataFrame the caller passed in.
    x = x.copy()
    for index, row in self.cities.iterrows():
      lat = row["Latitude"]
      lon = row["Longitude"]
      x.loc[(x["latitude"] > lat-0.1) & (x["latitude"] < lat+0.1) & (x["longitude"] > lon-0.1) & (x["longitude"] < lon+0.1), "city"] = row["Name"]
    x = x.drop(["latitude", "longitude"], axis=1)
    return x #****
  def transform_explanation_additive_contributions(self, explanation): #****
    """Combine the "longitude" and "latitude" contributions into a "city" contribution.

    The two contributions are added together and the original two columns are
    removed. The result is wrapped in a new AdditiveFeatureContributionExplanation.
    """
    # NOTE(review): assumes `explanation.get()` returns a DataFrame with one column
    # per feature — confirm against the pyreal explanation type.
    contributions = explanation.get()
    city_contribution = contributions["longitude"] + contributions["latitude"]
    # drop() returns a new frame, so the frame held by `explanation` is not modified.
    contributions = contributions.drop(["longitude", "latitude"], axis=1)
    contributions["city"] = city_contribution
    return AdditiveFeatureContributionExplanation(contributions)
# Transformer pipeline: one-hot encode "ocean_proximity", then convert coordinates to a city.
# NOTE(review): the meaning of the `model=` / `interpret=` flags is defined by pyreal's
# Transformer base class — confirm there.
one_hot_encoder = OneHotEncoder(columns=["ocean_proximity"])
city_converter = CityConverter(model=False, interpret=True)
transformers = [one_hot_encoder, city_converter]
fit_transformers(transformers, X_orig)

# Build the SHAP-based local explainer on the original (untransformed) data.
local_explainer = LocalFeatureContribution(
    model,
    x_train_orig=X_orig,
    y_orig=y,
    e_algorithm="shap",
    transformers=transformers,
    fit_on_init=True,
)

# Explain the same first 500 rows as the control condition.
pyreal_explanation = local_explainer.produce(X_orig.iloc[0:500])
print(pyreal_explanation)
print("Runtime: ", time.time()-start)

In [None]:
# Print a heading, then compare the Pyreal result with the plain-SHAP result.
# The comparison is the last expression, so its value is what the cell displays.
# NOTE(review): what `pyreal_explanation[0]` selects depends on the type returned
# by `produce` — confirm that it is an object exposing `.compare`.
print("Difference in explanations:  ")
pyreal_explanation[0].compare(shap_explanation)

Global Feature Importance

In [None]:
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
import shap
import numpy as np

start = time.time()

# Step 1: model-ready features — "ocean_proximity" replaced by its one-hot columns.
x_to_encode = X_orig[["ocean_proximity"]]
ohe = SklearnOneHotEncoder(sparse=False)
ohe.fit(x_to_encode)
encoded_columns = ohe.get_feature_names(x_to_encode.columns)
index = x_to_encode.index
ocean_encoded = pd.DataFrame(
    ohe.transform(x_to_encode), columns=encoded_columns, index=index
)
X_explain = pd.concat(
    [X_orig.drop(columns="ocean_proximity"), ocean_encoded], axis=1
)

# Step 2: interpretable features — a "city" label instead of latitude/longitude.
# Each city labels the rows lying strictly within 0.1 of it on both axes;
# a later city overwrites an earlier one where their boxes overlap.
X_interpret = X_orig.copy()
for index, row in cities.iterrows():
    lat = row["Latitude"]
    lon = row["Longitude"]
    near_lat = (X_interpret["latitude"] > lat-0.1) & (X_interpret["latitude"] < lat+0.1)
    near_lon = (X_interpret["longitude"] > lon-0.1) & (X_interpret["longitude"] < lon+0.1)
    X_interpret.loc[near_lat & near_lon, "city"] = row["Name"]
X_interpret = X_interpret.drop(columns=["latitude", "longitude"])

# Step 3: SHAP values for every row, reduced to one mean-absolute value per feature
# and kept as a single-row table.
columns = X_explain.columns
explainer = shap.Explainer(model, X_explain)
explanation = explainer(X_explain)
explanation = np.abs(explanation.values).mean(axis=0).reshape(1, -1)
explanation_df = pd.DataFrame(explanation, columns=columns)

# Step 4: fold the one-hot importances back into a single "ocean_proximity" column.
encoded_features = [name for name in encoded_columns if name.startswith("ocean_proximity_")]
summed_contribution = explanation_df[encoded_features].sum(axis=1)
explanation_df = explanation_df.drop(columns=encoded_features)
explanation_df["ocean_proximity"] = summed_contribution

# Step 5: the "city" importance is the sum of the longitude and latitude importances.
explanation_df["city"] = explanation_df["longitude"] + explanation_df["latitude"]
explanation_df = explanation_df.drop(columns=["longitude", "latitude"])

shap_explanation = explanation_df #***
print(shap_explanation)
print("Runtime: ", time.time()-start)



   housing_median_age  total_rooms  total_bedrooms    population   households  \
0         5703.277746  8538.330061     7876.322111  14273.961072  2172.921944   

   median_income  ocean_proximity          city  
0   34165.340693     28072.309062  75501.488753  
Runtime:  48.12432837486267


Experimental Condition: Using Pyreal

In [None]:
from pyreal.transformers import Transformer, fit_transformers, OneHotEncoder
from pyreal.explainers import LocalFeatureContribution, GlobalFeatureImportance
from pyreal.types.explanations.dataframe import AdditiveFeatureContributionExplanation

start = time.time()
class CityConverter(Transformer): #****
  """Transformer that replaces the latitude/longitude features with one "city" feature.

  The city lookup comes from the notebook-level `cities` table, which is read
  through its "Latitude", "Longitude" and "Name" columns.
  """
  def __init__(self, **kwargs): #****
    # Hold on to the lookup table; every keyword argument is forwarded to Transformer.
    self.cities = cities #****
    super().__init__(**kwargs) #****
  def data_transform(self, x): #****
    """Return a copy of `x` with "latitude"/"longitude" replaced by "city".

    Each city labels the rows lying strictly within 0.1 of it on both axes.
    Cities are applied in table order, so a later city overwrites an earlier
    one where their boxes overlap. Rows matching no city are not assigned a name.
    """
    # Work on a copy: the .loc assignment below would otherwise add a "city"
    # column to the DataFrame the caller passed in.
    x = x.copy()
    for index, row in self.cities.iterrows():
      lat = row["Latitude"]
      lon = row["Longitude"]
      x.loc[(x["latitude"] > lat-0.1) & (x["latitude"] < lat+0.1) & (x["longitude"] > lon-0.1) & (x["longitude"] < lon+0.1), "city"] = row["Name"]
    x = x.drop(["latitude", "longitude"], axis=1)
    return x #****
  def transform_explanation_additive_contributions(self, explanation): #****
    """Combine the "longitude" and "latitude" contributions into a "city" contribution.

    The two contributions are added together and the original two columns are
    removed. The result is wrapped in a new AdditiveFeatureContributionExplanation.
    """
    # NOTE(review): assumes `explanation.get()` returns a DataFrame with one column
    # per feature — confirm against the pyreal explanation type.
    contributions = explanation.get()
    city_contribution = contributions["longitude"] + contributions["latitude"]
    # drop() returns a new frame, so the frame held by `explanation` is not modified.
    contributions = contributions.drop(["longitude", "latitude"], axis=1)
    contributions["city"] = city_contribution
    return AdditiveFeatureContributionExplanation(contributions)
# Transformer pipeline: one-hot encode "ocean_proximity", then convert coordinates to a city.
# NOTE(review): the meaning of the `model=` / `interpret=` flags is defined by pyreal's
# Transformer base class — confirm there.
one_hot_encoder = OneHotEncoder(columns=["ocean_proximity"])
city_converter = CityConverter(model=False, interpret=True)
transformers = [one_hot_encoder, city_converter]
fit_transformers(transformers, X_orig)

# Build the SHAP-based global explainer on the original (untransformed) data.
global_explainer = GlobalFeatureImportance(
    model,
    x_train_orig=X_orig,
    y_orig=y,
    e_algorithm="shap",
    transformers=transformers,
    fit_on_init=True,
)

# produce() is called without an input here.
pyreal_explanation = global_explainer.produce()
print(pyreal_explanation)
print("Runtime: ", time.time()-start)



   housing_median_age  total_rooms  total_bedrooms    population   households  \
0         5703.277746  8538.330061     7876.322111  14273.961072  2172.921944   

   median_income  ocean_proximity          city  
0   34165.340693     28072.309062  75501.488753  
Runtime:  48.10335326194763


In [None]:
from sklearn import tree
from sklearn.impute import SimpleImputer
import pickle

start = time.time()

# Step 1: impute missing values — column mean for numeric columns, most frequent
# value for the remaining columns.
numeric_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
categorical_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
# Columns that are entirely missing are excluded; compute that view once and
# derive both column lists from it.
non_empty_columns = X_orig.dropna(axis="columns", how="all")
numeric_cols = non_empty_columns.select_dtypes(include="number").columns
categorical_cols = non_empty_columns.select_dtypes(exclude="number").columns
numeric_imputer.fit(X_orig[numeric_cols])
categorical_imputer.fit(X_orig[categorical_cols])
new_numeric_cols = numeric_imputer.transform(X_orig[numeric_cols])
new_categorical_cols = categorical_imputer.transform(X_orig[categorical_cols])
X_explain = pd.concat([pd.DataFrame(new_numeric_cols, columns=numeric_cols, index=X_orig.index),
                          pd.DataFrame(new_categorical_cols, columns=categorical_cols,
                                       index=X_orig.index)], axis=1)

# Step 2: one-hot encode "ocean_proximity" on the imputed table.
x_to_encode = X_explain[["ocean_proximity"]]
ohe = SklearnOneHotEncoder(sparse=False).fit(x_to_encode)
encoded_columns = ohe.get_feature_names(x_to_encode.columns)
index = x_to_encode.index
ocean_encoded = ohe.transform(x_to_encode)
ocean_encoded = pd.DataFrame(ocean_encoded, columns=encoded_columns, index=index)
X_explain = pd.concat([X_explain.drop("ocean_proximity", axis="columns"), ocean_encoded], axis=1)

# Step 3: fit a decision tree to the model's own predictions (a surrogate of the model).
sklearn_explanation = tree.DecisionTreeRegressor()
# Predict once and reuse the result; the original ran model.predict a second
# time inside fit() and left `results` unused.
results = model.predict(X_explain)
sklearn_explanation.fit(X_explain, results)
print("Runtime: ", time.time()-start)

Runtime:  0.1740279197692871


In [None]:
from pyreal.explainers import DecisionTreeExplainer
from pyreal.transformers import MultiTypeImputer

start = time.time()

# Transformer pipeline: impute first, then one-hot encode "ocean_proximity".
one_hot_encoder = OneHotEncoder(columns=["ocean_proximity"])
imputer = MultiTypeImputer()
transformers = [imputer, one_hot_encoder]
fit_transformers(transformers, X_orig)

# Build the decision-tree explainer for the regression model.
local_explainer = DecisionTreeExplainer(
    model,
    x_train_orig=X_orig,
    y_orig=y,
    transformers=transformers,
    fit_on_init=True,
    is_classifier=False,
)

# NOTE(review): the output saved with this cell is
# "TypeError: object of type 'NoneType' has no len()". The cause is not visible
# here — confirm whether produce() needs an input or the explainer needs more arguments.
pyreal_explanation = local_explainer.produce()
print("Runtime: ", time.time()-start)

TypeError: object of type 'NoneType' has no len()