In [1]:
# Some code taken from:
#   https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset

In [2]:
import pandas as pd
import numpy as np
import sys
import pickle
import importlib
sys.path.insert(1, '../../../sibyl')

In this tutorial, we use Sibyl to get feature contribution explanations for the AmesHousing dataset

First, we load in the data. Sibyl expects all data as DataFrames, where columns have the feature names.

In [3]:
data = pd.read_csv("data.csv")
y_orig = data["SalePrice"]
X_orig = data.drop("SalePrice", axis="columns")

print(y_orig.shape, X_orig.shape)

(1460,) (1460, 80)


Next, we prepare the transfomers. There are three kinds of transformers: e_transforms, m_transforms, and i_transforms. In this case, the original data is already interpretable, and the explanation algorithm will take the data in its model_ready form.

Therefore, i_transforms=None and e_transforms=m_transforms.

The first kind of transform we need will adjust the feature set and includes encoding categorical values and one-hot-encoding

In [42]:
from sibyl.utils.transformer import BaseTransformer
from sklearn.preprocessing import OneHotEncoder
class AmesHousingImputer:
    def transform(self, x_orig):
            x_transform = x_orig.copy()
            # Alley : data description says NA means "no alley access"
            x_transform.loc[:, "Alley"] = x_transform.loc[:, "Alley"].fillna("None")
            # BedroomAbvGr : NA most likely means 0
            x_transform.loc[:, "BedroomAbvGr"] = x_transform.loc[:, "BedroomAbvGr"].fillna(0)
            # BsmtQual etc : x_transform description says NA for basement features is "no basement"
            x_transform.loc[:, "BsmtQual"] = x_transform.loc[:, "BsmtQual"].fillna("No")
            x_transform.loc[:, "BsmtCond"] = x_transform.loc[:, "BsmtCond"].fillna("No")
            x_transform.loc[:, "BsmtExposure"] = x_transform.loc[:, "BsmtExposure"].fillna("No")
            x_transform.loc[:, "BsmtFinType1"] = x_transform.loc[:, "BsmtFinType1"].fillna("No")
            x_transform.loc[:, "BsmtFinType2"] = x_transform.loc[:, "BsmtFinType2"].fillna("No")
            x_transform.loc[:, "BsmtFullBath"] = x_transform.loc[:, "BsmtFullBath"].fillna(0)
            x_transform.loc[:, "BsmtHalfBath"] = x_transform.loc[:, "BsmtHalfBath"].fillna(0)
            x_transform.loc[:, "BsmtUnfSF"] = x_transform.loc[:, "BsmtUnfSF"].fillna(0)
            # CentralAir : NA most likely means No
            x_transform.loc[:, "CentralAir"] = x_transform.loc[:, "CentralAir"].fillna("N")
            # Condition : NA most likely means Normal
            x_transform.loc[:, "Condition1"] = x_transform.loc[:, "Condition1"].fillna("Norm")
            x_transform.loc[:, "Condition2"] = x_transform.loc[:, "Condition2"].fillna("Norm")
            # EnclosedPorch : NA most likely means no enclosed porch
            x_transform.loc[:, "EnclosedPorch"] = x_transform.loc[:, "EnclosedPorch"].fillna(0)
            # External stuff : NA most likely means average
            x_transform.loc[:, "ExterCond"] = x_transform.loc[:, "ExterCond"].fillna("TA")
            x_transform.loc[:, "ExterQual"] = x_transform.loc[:, "ExterQual"].fillna("TA")
            # Fence : x_transform description says NA means "no fence"
            x_transform.loc[:, "Fence"] = x_transform.loc[:, "Fence"].fillna("No")
            # FireplaceQu : x_transform description says NA means "no fireplace"
            x_transform.loc[:, "FireplaceQu"] = x_transform.loc[:, "FireplaceQu"].fillna("No")
            x_transform.loc[:, "Fireplaces"] = x_transform.loc[:, "Fireplaces"].fillna(0)
            # Functional : x_transform description says NA means typical
            x_transform.loc[:, "Functional"] = x_transform.loc[:, "Functional"].fillna("Typ")
            # GarageType etc : x_transform description says NA for garage features is "no garage"
            x_transform.loc[:, "GarageType"] = x_transform.loc[:, "GarageType"].fillna("No")
            x_transform.loc[:, "GarageFinish"] = x_transform.loc[:, "GarageFinish"].fillna("No")
            x_transform.loc[:, "GarageQual"] = x_transform.loc[:, "GarageQual"].fillna("No")
            x_transform.loc[:, "GarageCond"] = x_transform.loc[:, "GarageCond"].fillna("No")
            x_transform.loc[:, "GarageArea"] = x_transform.loc[:, "GarageArea"].fillna(0)
            x_transform.loc[:, "GarageCars"] = x_transform.loc[:, "GarageCars"].fillna(0)
            # HalfBath : NA most likely means no half baths above grade
            x_transform.loc[:, "HalfBath"] = x_transform.loc[:, "HalfBath"].fillna(0)
            # HeatingQC : NA most likely means typical
            x_transform.loc[:, "HeatingQC"] = x_transform.loc[:, "HeatingQC"].fillna("TA")
            # KitchenAbvGr : NA most likely means 0
            x_transform.loc[:, "KitchenAbvGr"] = x_transform.loc[:, "KitchenAbvGr"].fillna(0)
            # KitchenQual : NA most likely means typical
            x_transform.loc[:, "KitchenQual"] = x_transform.loc[:, "KitchenQual"].fillna("TA")
            # LotFrontage : NA most likely means no lot frontage
            x_transform.loc[:, "LotFrontage"] = x_transform.loc[:, "LotFrontage"].fillna(0)
            # LotShape : NA most likely means regular
            x_transform.loc[:, "LotShape"] = x_transform.loc[:, "LotShape"].fillna("Reg")
            # MasVnrType : NA most likely means no veneer
            x_transform.loc[:, "MasVnrType"] = x_transform.loc[:, "MasVnrType"].fillna("None")
            x_transform.loc[:, "MasVnrArea"] = x_transform.loc[:, "MasVnrArea"].fillna(0)
            # MiscFeature : x_transform description says NA means "no misc feature"
            x_transform.loc[:, "MiscFeature"] = x_transform.loc[:, "MiscFeature"].fillna("No")
            x_transform.loc[:, "MiscVal"] = x_transform.loc[:, "MiscVal"].fillna(0)
            # OpenPorchSF : NA most likely means no open porch
            x_transform.loc[:, "OpenPorchSF"] = x_transform.loc[:, "OpenPorchSF"].fillna(0)
            # PavedDrive : NA most likely means not paved
            x_transform.loc[:, "PavedDrive"] = x_transform.loc[:, "PavedDrive"].fillna("N")
            # PoolQC : x_transform description says NA means "no pool"
            x_transform.loc[:, "PoolQC"] = x_transform.loc[:, "PoolQC"].fillna("No")
            x_transform.loc[:, "PoolArea"] = x_transform.loc[:, "PoolArea"].fillna(0)
            # SaleCondition : NA most likely means normal sale
            x_transform.loc[:, "SaleCondition"] = x_transform.loc[:, "SaleCondition"].fillna("Normal")
            # ScreenPorch : NA most likely means no screen porch
            x_transform.loc[:, "ScreenPorch"] = x_transform.loc[:, "ScreenPorch"].fillna(0)
            # TotRmsAbvGrd : NA most likely means 0
            x_transform.loc[:, "TotRmsAbvGrd"] = x_transform.loc[:, "TotRmsAbvGrd"].fillna(0)
            # Utilities : NA most likely means all public utilities
            x_transform.loc[:, "Utilities"] = x_transform.loc[:, "Utilities"].fillna("AllPub")
            # WoodDeckSF : NA most likely means no wood deck
            x_transform.loc[:, "WoodDeckSF"] = x_transform.loc[:, "WoodDeckSF"].fillna(0)
            
            return x_transform

class AmesHousingCategorizer:
    def transform(self, x_orig):
            x_transform = x_orig.copy()
            x_transform = x_transform.drop("Id", axis="columns")
            int_to_cat = {"Alley" : {"Grvl" : 1, "Pave" : 2},
                      "BsmtCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                      "BsmtExposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
                      "BsmtFinType1" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4,
                                        "ALQ" : 5, "GLQ" : 6},
                      "BsmtFinType2" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4,
                                        "ALQ" : 5, "GLQ" : 6},
                      "BsmtQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                      "ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                      "ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                      "FireplaceQu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                      "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5,
                               "Min2" : 6, "Min1" : 7, "Typ" : 8},
                      "GarageCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                      "GarageQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                      "HeatingQC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                      "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                      "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
                      "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
                      "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
                      "PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                      "Street" : {"Grvl" : 1, "Pave" : 2},
                      "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4}}
            
            x_transform = x_transform.replace(int_to_cat)
            x_transform = x_transform.replace({"MSSubClass": {20: "SC20", 30: "SC30", 40: "SC40", 45: "SC45",
                                                  50: "SC50", 60: "SC60", 70: "SC70", 75: "SC75",
                                                  80: "SC80", 85: "SC85", 90: "SC90", 120: "SC120",
                                                  150: "SC150", 160: "SC160", 180: "SC180", 190: "SC190"},
                                                "MoSold": {1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
                                                  7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"}})
            return x_transform

class NumericMean:
    def transform(self, x_orig):
        x_num = x_orig.select_dtypes(exclude=["object"])
        x_cat = x_orig.select_dtypes(include=["object"])
        return pd.concat([x_num.fillna(x_num.median()), x_cat], axis=1).dropna()

class OneHotEncoderWrapper:
    def __init__(self):
        self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False, dtype=pd.DataFrame)
        
    def fit(self, x_orig):
        self.ohe.fit(x_orig.select_dtypes(include=["object"]).astype("str"))
        
    def transform(self, x_orig):
        x_num = x_orig.select_dtypes(exclude=["object"])
        x_cat = x_orig.select_dtypes(include=["object"]).astype("str")
        x_cat_ohe = self.ohe.transform(x_cat)
        return pd.concat([x_num, x_cat_ohe], axis=1)

The next kind of transformer we need is a standardizer.
todo: move fitting and training model out of this tutorial and just load in pickle files

In [43]:
from sklearn.preprocessing import StandardScaler

ahi = AmesHousingImputer()
ahc = AmesHousingCategorizer()
ohe = OneHotEncoderWrapper()
nm = NumericMean()

X_numeric = ahi.transform(X_orig)
X_numeric = ahc.transform(X_numeric)
X_numeric = nm.transform(X_numeric)
ohe.fit(X_numeric)
X_numeric = ohe.transform(X_numeric)

standard_transformer = StandardScaler()
standard_transformer.fit(X_numeric)

ValueError: unsupported data types in input

In [None]:
from sklearn.linear_model import Ridge

X_model = standard_transformer.transform(X_numeric)
model = Ridge()
model.fit(X_model, np.log1p(y_orig))

pickle.dump(model, open("model.pkl", "wb"))

In [None]:
from sibyl.explainers import local_feature_explanation
importlib.reload(local_feature_explanation)

m_transforms = [AmesHousingImputer(), AmesHousingCategorizer(), OneHotEncoderWrapper(), NumericMean(), standard_scaler]
lfc = local_feature_explanation.LocalFeatureContribution(model_pickle_filepath="model.pkl", 
                                                         X_orig=X_orig, e_transforms=m_transforms,
                                                         m_transforms=m_transforms)

In [None]:
preds = lfc.model_predict(X_orig)

In [None]:
lfc.fit()

In [None]:
contributions = lfc.produce(X_orig.iloc[0])
print(contributions)