The imports and the code that loads the data and model are used by both conditions and are included in the analysis.

In [50]:
import pandas as pd
from urllib.parse import urljoin
import requests
import lightgbm

# Base location of every remote artifact used by this notebook.
AWS_BASE_URL = 'https://pyreal-data.s3.amazonaws.com/'

# Load the housing table and keep only rows whose target value is below 500000.
data_url = urljoin(AWS_BASE_URL, "usability_study/california.csv")
data = pd.read_csv(data_url)

data = data[data["median_house_value"] < 500000]

# Split into features (X_orig) and target (y).
X_orig = data.drop("median_house_value", axis=1)
y = data["median_house_value"]

# Download the serialized model to a local file, because the Booster is loaded from a path.
model_url = urljoin(AWS_BASE_URL, "usability_study/model.model")
r = requests.get(model_url, allow_redirects=True)
# Fail loudly on an HTTP error instead of saving an error page as the model file.
r.raise_for_status()
# The context manager guarantees the bytes are flushed and the handle is closed
# before LightGBM opens the file.
with open('model.model', 'wb') as model_file:
    model_file.write(r.content)

model = lightgbm.Booster(model_file='model.model')

# City lookup table; later cells read its "Latitude", "Longitude" and "Name" columns.
cities_url = urljoin(AWS_BASE_URL, "usability_study/cal_cities_lat_long.csv")
cities = pd.read_csv(cities_url)

Control Condition: No use of Pyreal

In [51]:
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
import shap

# --- Model-ready features (X_explain): one-hot ocean_proximity + per-household averages ---
x_to_encode = X_orig[["ocean_proximity"]]
# Fit with the encoder's default output and densify by hand below: the keyword that
# requests dense output was renamed across scikit-learn releases (`sparse` ->
# `sparse_output`), so passing either one breaks on some installed versions.
ohe = SklearnOneHotEncoder().fit(x_to_encode)
# `get_feature_names` was replaced by `get_feature_names_out`; use whichever exists.
if hasattr(ohe, "get_feature_names_out"):
    encoded_columns = ohe.get_feature_names_out(x_to_encode.columns)
else:
    encoded_columns = ohe.get_feature_names(x_to_encode.columns)
index = x_to_encode.index
ocean_encoded = ohe.transform(x_to_encode)
if hasattr(ocean_encoded, "toarray"):
    # Sparse result -> dense array so it can be wrapped in a DataFrame.
    ocean_encoded = ocean_encoded.toarray()
ocean_encoded = pd.DataFrame(ocean_encoded, columns=encoded_columns, index=index)
X_explain = pd.concat([X_orig.drop("ocean_proximity", axis="columns"), ocean_encoded], axis=1)
X_explain["average_rooms"] = X_explain["total_rooms"] / X_explain["households"]
X_explain["average_bedrooms"] = X_explain["total_bedrooms"] / X_explain["households"]

# --- Human-readable features (X_interpret): averages + city name instead of coordinates ---
X_interpret = X_orig.copy()
X_interpret["average_rooms"] = X_interpret["total_rooms"] / X_interpret["households"]
X_interpret["average_bedrooms"] = X_interpret["total_bedrooms"] / X_interpret["households"]

# Half-width of the latitude/longitude box used to match a row to a city.
CITY_BOX_HALF_WIDTH = 0.1
# The loop label is deliberately discarded so it does not overwrite `index` above.
# Rows inside several boxes keep the last matching city; rows inside none are not assigned.
for _, row in cities.iterrows():
    lat = row["Latitude"]
    lon = row["Longitude"]
    in_city_box = (
        (X_interpret["latitude"] > lat - CITY_BOX_HALF_WIDTH)
        & (X_interpret["latitude"] < lat + CITY_BOX_HALF_WIDTH)
        & (X_interpret["longitude"] > lon - CITY_BOX_HALF_WIDTH)
        & (X_interpret["longitude"] < lon + CITY_BOX_HALF_WIDTH)
    )
    X_interpret.loc[in_city_box, "city"] = row["Name"]
X_interpret = X_interpret.drop(["latitude", "longitude"], axis=1)

columns = X_explain.columns

# --- SHAP contributions for the first row, expressed in the X_explain feature space ---
explainer = shap.Explainer(model, X_explain)
explanation = explainer(X_explain.iloc[0:1])
explanation_df = pd.DataFrame(explanation.values, columns=columns)

# Collapse the one-hot columns back into a single "ocean_proximity" contribution.
encoded_features = [item for item in encoded_columns if item.startswith("ocean_proximity_")]
summed_contribution = explanation_df[encoded_features].sum(axis=1)
explanation_df = explanation_df.drop(encoded_features, axis="columns")
explanation_df["ocean_proximity"] = summed_contribution

# Report the coordinate contributions as one "city" contribution (their sum).
explanation_df["city"] = explanation_df["longitude"] + explanation_df["latitude"]
explanation_df = explanation_df.drop(["longitude", "latitude"], axis=1)

shap_explanation = explanation_df
print(shap_explanation)

   housing_median_age  total_rooms  total_bedrooms    population   households  \
0         6240.231167 -6519.575265    -5066.954466  20706.379627 -9398.551951   

   median_income  average_rooms  average_bedrooms  ocean_proximity  \
0   147004.07603   19999.074403       -650.607882     21483.632595   

           city  
0  28633.157172  


Experimental Condition: Using Pyreal

In [52]:
from pyreal.transformers import Transformer, fit_transformers, OneHotEncoder
from pyreal.explainers import LocalFeatureContribution
from pyreal.types.explanations.dataframe import AdditiveFeatureContributionExplanation

class CityConverter(Transformer):
  """Replaces the "latitude"/"longitude" columns with a single "city" column.

  The city table is taken from the notebook-level `cities` DataFrame; its
  "Latitude", "Longitude" and "Name" columns are read.
  """

  def __init__(self):
    self.cities = cities

  def data_transform(self, x):
    """Return a copy of `x` with a "city" column and without the coordinate columns.

    A row is labelled with a city when its latitude and longitude both lie strictly
    within 0.1 of that city's coordinates. When several cities match, the last one
    in the table wins; rows that match no city are not assigned a value.
    """
    # Work on a copy: assigning through .loc would otherwise add a "city" column
    # to the caller's DataFrame as a hidden side effect.
    x = x.copy()
    for _, row in self.cities.iterrows():
      lat = row["Latitude"]
      lon = row["Longitude"]
      in_city_box = (
        (x["latitude"] > lat-0.1) & (x["latitude"] < lat+0.1)
        & (x["longitude"] > lon-0.1) & (x["longitude"] < lon+0.1)
      )
      x.loc[in_city_box, "city"] = row["Name"]
    return x.drop(["latitude", "longitude"], axis=1)

  def transform_explanation_additive_contributions(self, explanation):
    """Merge the longitude and latitude contributions into one "city" contribution.

    NOTE(review): the output recorded in this notebook reports that CityConverter
    "does not have the required inverse explanation transform" and still lists
    longitude/latitude, so this hook was not applied in that run. Confirm the hook
    name expected by the installed Pyreal version.
    """
    # Copy so the frame held by the incoming explanation object is left untouched
    # (assumes `explanation.get()` returns a pandas DataFrame - TODO confirm).
    contributions = explanation.get().copy()
    contributions["city"] = contributions["longitude"] + contributions["latitude"]
    contributions = contributions.drop(["longitude", "latitude"], axis=1)
    return AdditiveFeatureContributionExplanation(contributions)

class PerHouseholdAverager(Transformer):
  """Adds a per-household average of one "total_*" column.

  The new column is named by replacing "total" with "average" in the source
  column name (e.g. "total_rooms" -> "average_rooms"). If the source name does
  not contain "total", the source column itself is overwritten.
  """

  def __init__(self, column):
    # Name of the column to divide by "households".
    self.column = column

  def data_transform(self, x):
    """Return a copy of `x` with the averaged column added."""
    name = self.column.replace("total", "average")
    # Copy first: assigning a column directly would modify the caller's DataFrame
    # (the notebook passes X_orig straight into fit_transformers).
    x = x.copy()
    x[name] = x[self.column] / x["households"]
    return x

  def inverse_transform_explanation(self, explanation):
    """Pass the explanation through unchanged."""
    return explanation

  def transform_explanation(self, explanation):
    """Pass the explanation through unchanged."""
    return explanation

# Individual transformers; the two averagers are shared by both lists below.
room_averager = PerHouseholdAverager("total_rooms")
bedroom_averager = PerHouseholdAverager("total_bedrooms")
one_hot_encoder = OneHotEncoder(columns=["ocean_proximity"])
city_converter = CityConverter()

# Transformer lists handed to the explainer as `e_transformers` / `i_transformers`
# (presumably model-facing vs. reader-facing feature spaces - confirm against Pyreal docs).
e_transformers = [one_hot_encoder, room_averager, bedroom_averager]
i_transformers = [room_averager, bedroom_averager, city_converter]

# Fit each list on the original data, in the same order as before.
for transformer_chain in (e_transformers, i_transformers):
    fit_transformers(transformer_chain, X_orig)

local_explainer = LocalFeatureContribution(
    model,
    x_train_orig=X_orig,
    y_orig=y,
    e_algorithm="shap",
    e_transformers=e_transformers,
    i_transformers=i_transformers,
    fit_on_init=True,
)

# Explain the first row only.
first_row = X_orig.iloc[0:1]
pyreal_explanation = local_explainer.produce(first_row)
print(pyreal_explanation)

Transformer class CityConverter does not have the required inverse explanation transform
Stopping explanation transform process
     longitude      latitude  housing_median_age  total_rooms  total_bedrooms  \
0  46879.62999 -18246.472817         6240.231167 -6519.575265    -5066.954466   

     population   households  median_income  average_rooms  average_bedrooms  \
0  20706.379627 -9398.551951   147004.07603   19999.074403       -650.607882   

   ocean_proximity  
0     21483.632595  


In [54]:
# Show the Pyreal result first, then the hand-written SHAP result, for a visual comparison.
for explanation_frame in (pyreal_explanation, shap_explanation):
    print(explanation_frame)
# Programmatic comparison is left disabled, as in the original cell:
#pyreal_explanation.compare(shap_explanation)

     longitude      latitude  housing_median_age  total_rooms  total_bedrooms  \
0  46879.62999 -18246.472817         6240.231167 -6519.575265    -5066.954466   

     population   households  median_income  average_rooms  average_bedrooms  \
0  20706.379627 -9398.551951   147004.07603   19999.074403       -650.607882   

   ocean_proximity  
0     21483.632595  
   housing_median_age  total_rooms  total_bedrooms    population   households  \
0         6240.231167 -6519.575265    -5066.954466  20706.379627 -9398.551951   

   median_income  average_rooms  average_bedrooms  ocean_proximity  \
0   147004.07603   19999.074403       -650.607882     21483.632595   

           city  
0  28633.157172  
