# House Experiment

This notebook contains the code to reproduce the House experiment. The dataset can be downloaded from [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques). 

**Run the following cells in order to reproduce the experiment from the paper.**

In [None]:
from boexplain import fmin
import numpy as np
import pandas as pd
from scipy.stats import skew, boxcox_normmax
from scipy.special import boxcox1p
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error

## Preprocess the data and prepare it for training

In [None]:
# Load the Kaggle train and test splits from the working directory.
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Inject the error to be explained: multiply SalePrice by 10 for every training
# row that satisfies the predicate below (Neighborhood == CollgCr,
# Exterior1st == VinylSd, 2000 <= YearBuilt <= 2010).
condition = ((train["Neighborhood"] == "CollgCr") &
             (train["Exterior1st"] == "VinylSd") & (train["YearBuilt"] >= 2000)
             & (train["YearBuilt"] <= 2010))
train.loc[condition, "SalePrice"] *= 10

# Log-transform the target (log1p); predictions are mapped back with expm1
# in the objective function. The target column is then removed from train.
y = np.log1p(train["SalePrice"])
train = train.drop(columns='SalePrice')

# Stack train on top of test so both receive identical featurization; the
# index is reset so rows 0..len(y)-1 are the training rows.
df = pd.concat([train, test]).reset_index(drop=True)
df = df.drop(columns=['Utilities', 'Street', 'PoolQC'])

# Cast these columns to strings so they are treated as categorical
# (and one-hot encoded) rather than numeric.
strs = ['MSSubClass', 'YrSold', 'MoSold']
df[strs] = df[strs].astype(str)

# Impute missing values: 'None' for object columns, 0 for numeric columns.
objects = [c for c in df.columns if df[c].dtype == object]
df[objects] = df[objects].fillna('None')
numerics = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
df[numerics] = df[numerics].fillna(0)
# NOTE(review): if LotFrontage is numeric, its NaNs were already replaced by 0
# in the line above, so this per-neighborhood median fill finds nothing left to
# fill. The order is left untouched because the target value used by the
# objective function was computed with this exact pipeline.
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

# Reduce skew: apply a Box-Cox transform (boxcox1p, with lambda chosen by
# boxcox_normmax) to every numeric column whose skew exceeds 0.5. This runs on
# the combined train + test frame, not on the training rows alone.
skew_features = df[numerics].apply(lambda x: skew(x))
skew_index = skew_features[skew_features > 0.5].index
for i in skew_index:
    df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))

# Feature engineering: aggregate area, year, bathroom and porch columns.
# These sums are computed after the skew transform above.
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['YrBltAndRemod'] = df['YearBuilt'] + df['YearRemodAdd']
df['Total_sqr_footage'] = (df['BsmtFinSF1'] + df['BsmtFinSF2'] +
                           df['1stFlrSF'] + df['2ndFlrSF'])
df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath']) +
                         df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))
df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +
                        df['EnclosedPorch'] + df['ScreenPorch'] +
                        df['WoodDeckSF'])

# One-hot encode the remaining categorical columns.
df = pd.get_dummies(df)  # 336 features

# Split back into training rows (X) and test rows; `features` is the list of
# model input columns used by the objective function.
X = df[:len(y)]
test = df[len(X):]
features = X.columns

# Build the frame handed to BOExplain: the encoded training features plus the
# log-transformed target.
df = X.copy()
df["SalePrice"] = y
# Re-attach the raw (un-encoded) categorical columns so BOExplain can search
# over them; the assignment aligns on the row index shared by `train` and `X`.
df[["Neighborhood", "Exterior1st"]] = train[["Neighborhood", "Exterior1st"]]

## The objective function

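The objective function trains a support vector regression model on the filtered training data, predicts the sale prices of the test set, and returns the absolute difference between the mean predicted sale price and 177748.79624911005. The value 177748.79624911005 is the average house sale price after removing the corrupted tuples.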

In [None]:
def obj(train_filtered):
    """Objective minimized by BOExplain.

    Fits a RobustScaler + SVR pipeline on ``train_filtered`` (the training data
    after removing the tuples that satisfy a candidate predicate), predicts the
    test set, and returns how far the mean predicted sale price is from the
    fixed target value.

    Parameters
    ----------
    train_filtered : pandas.DataFrame
        Training rows containing the ``features`` columns and ``"SalePrice"``.

    Returns
    -------
    float
        Absolute difference between the mean predicted price and
        177748.79624911005.
    """
    model = make_pipeline(RobustScaler(), SVR())
    inputs = train_filtered[features]
    target = train_filtered["SalePrice"]
    model.fit(inputs, target)

    # The target was log1p-transformed during preprocessing, so map the
    # predictions back with expm1 before averaging.
    predicted_prices = np.expm1(model.predict(test))
    mean_price = predicted_prices.mean()
    return abs(mean_price - 177748.79624911005)

## BOExplain API call

The function *fmin* is used to minimize the objective function. The columns Neighborhood, Exterior1st and YearBuilt are searched for an explanation. The runtime is 200 seconds, and the results are averaged over 10 runs. The correct predicate is provided so that F-score, precision and recall can be calculated. Statistics about the run are saved to the file house.json, which the plotting cells below read from results/house.json.

In [None]:
# Run BOExplain: search for the predicate over Neighborhood, Exterior1st and
# YearBuilt whose removal from `df` minimizes `obj`.
# NOTE(review): df_rem is presumably the training data with the best predicate's
# tuples removed — confirm against the BOExplain documentation.
df_rem = fmin(
    data=df,
    f=obj,
    # categorical and numerical columns that make up the search space
    cat_cols=["Neighborhood", "Exterior1st"],
    num_cols=["YearBuilt"],
    # categorical-search variants to compare
    cat_alg=["individual_contribution", "categorical_warm_start", "categorical"],
    runtime=200,  # seconds
    runs=10,
    random=True,  # perform a random iteration
    name="house",
    # the plotting cells read results/house.json, so the file is presumably
    # written under a results/ directory — confirm
    file="house.json",
    # Ground-truth predicate, matching the corruption applied during
    # preprocessing (YearBuilt between 2000 and 2010); used to compute
    # F-score, precision and recall.
    # NOTE(review): 'YearBuilt_len' presumably is the interval length starting
    # at 'YearBuilt_min' — confirm.
    correct_pred={
        'Neighborhood': "CollgCr",
        'Exterior1st': "VinylSd",
        'YearBuilt_min': 2000,
        'YearBuilt_len': 10
    },
    # presumably reuses the random seeds from the paper's runs — confirm
    use_seeds_from_paper=True,
)

# Recreate Figure 7

From the output of the above call to the BOExplain API, the following two cells can be used to recreate Figure 7 from the paper.

In [2]:
from json import loads

import altair as alt
import numpy as np
import pandas as pd

alt.data_transformers.disable_max_rows()

# Each line of the results file is one JSON record (one experiment); key the
# parsed records by line number. The context manager closes the file even if
# parsing fails.
experiments = {}
with open("results/house.json", "r") as fo:
    for i, raw in enumerate(fo):
        experiments[i] = loads(raw)

# Build one single-row frame per experiment (each cell holds the algorithm
# name, the list of time stamps, or the nested list of objective values) and
# concatenate them once. DataFrame.append was removed in pandas 2.0, so
# pd.concat is used instead.
columns = ["Algorithm", "Time (seconds)", "Value"]
frames = []
for i in range(len(experiments)):
    frames.append(
        pd.DataFrame.from_dict(
            {
                "Algorithm": experiments[i]["cat_enc"],
                # one sample every 5 seconds, up to and including the runtime
                "Time (seconds)": list(range(5, experiments[i]["runtime"] + 5, 5)),
                "Value": experiments[i]["time_array"],
            },
            orient='index',
        ).T
    )
df = pd.concat(frames) if frames else pd.DataFrame({}, columns=columns)

# First explode the outer level of "Value", then explode "Time (seconds)" and
# the inner level of "Value" together so each row is one (time, value) sample.
df = df.explode("Value")
df = df.set_index(['Algorithm']).apply(pd.Series.explode).reset_index()

# Map the stored algorithm identifiers to the labels used in the figure;
# records without an algorithm name are labelled "Random".
df["Algorithm"] = df["Algorithm"].replace({
    "categorical_warm_start": "BOExplain (w/o IC)",
    "categorical": "BOExplain (w/o IC and WS)",
    "individual_contribution_warm_start_top1": "BOExplain",
    np.nan: "Random",
})

# Fixed label order and colors for the legend.
domain = ["BOExplain", "BOExplain (w/o IC)", "BOExplain (w/o IC and WS)", "Random"]
range_ = ["#1f77b4", "#2ca02c", "#d62728", "#ff7f0e"]

# Mean objective value over time, one line per algorithm.
line = alt.Chart(df).mark_line().encode(
    x=alt.X('Time (seconds)', axis=alt.Axis(labels=False, title=None, tickSize=0)),
    y=alt.Y('mean(Value)', title=['Mean Objective', 'Function Value']),
    color=alt.Color("Algorithm", scale=alt.Scale(domain=domain, range=range_))
).properties(
    width=225,
    height=90
)
# Standard-deviation band; defined here but not added to the displayed chart.
band = alt.Chart(df).mark_errorband(extent='stdev').encode(
    x='Time (seconds)',
    y=alt.Y('Value', title='Mean Objective Function Value'),
    color=alt.Color("Algorithm")
)

chart = line.configure_title(
    anchor='start',
)
# The last expression of the cell is the chart that gets rendered.
# NOTE(review): the trailing configure_title(fontSize=15) may replace the
# anchor='start' title configuration set above — confirm against Altair.
chart.configure_legend(
    title=None,
    orient='none',
    legendX=0,
    legendY=175,
    columns=2,
    labelFontSize=15,
    symbolSize=1000,
    labelLimit=275,
).configure_axis(
    labelFontSize=15,
    titleFontSize=15
).configure_title(
    fontSize=15
)

In [4]:
import re
from json import loads

import altair as alt
import numpy as np
import pandas as pd

# Each line of the results file is one JSON record (one experiment); key the
# parsed records by line number. The context manager closes the file even if
# parsing fails.
experiments = {}
with open("results/house.json", "r") as fo:
    for i, raw in enumerate(fo):
        experiments[i] = loads(raw)

# Build one single-row frame per experiment and concatenate them once.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
columns = ["Algorithm", "Time (seconds)", "Precision", "Recall", "F-score", "Jaccard"]
frames = []
for i in range(len(experiments)):
    frames.append(
        pd.DataFrame.from_dict(
            {
                "Algorithm": experiments[i]["cat_enc"],
                # kept as a tuple (hashable) because it is used as an index
                # level in the first explode step below
                "Time (seconds)": tuple(range(5, experiments[i]["runtime"] + 5, 5)),
                "Precision": experiments[i]["precision_time_array"],
                "Recall": experiments[i]["recall_time_array"],
                "F-score": experiments[i]["f_score_time_array"],
                "Jaccard": experiments[i]["jaccard_time_array"],
            },
            orient='index',
        ).T
    )
df = pd.concat(frames) if frames else pd.DataFrame({}, columns=columns)

# Explode the outer level of the metric columns first (the time tuple is held
# fixed in the index), then explode the time tuple together with the inner
# level of each metric so each row is one (time, metrics) sample.
df = df.set_index(['Algorithm', "Time (seconds)"]).apply(pd.Series.explode).reset_index()
df = df.set_index(['Algorithm']).apply(pd.Series.explode).reset_index()

# Map the stored algorithm identifiers to the labels used in the figure;
# records without an algorithm name are labelled "Random".
df["Algorithm"] = df["Algorithm"].replace({
    "categorical_warm_start": "BOExplain (w/o IC)",
    "categorical": "BOExplain (w/o IC and WS)",
    "individual_contribution_warm_start_top1": "BOExplain",
    np.nan: "Random",
})

# Human-readable summary of the experiment, taken from the first record.
meta = experiments[0]
num_cols = f"{len(meta['num_cols'])} numerical columns: "
for i, col in enumerate(meta["num_cols"]):
    num_cols += f"{col} (range {meta['num_cols_range'][i][0]} to {meta['num_cols_range'][i][1]}), "
# Report the number of categorical columns (previously the whole list was
# interpolated here, unlike the numerical-column count above).
cat_cols = f"{len(meta['cat_cols'])} categorical columns: "
for i, col in enumerate(meta["cat_cols"]):
    cat_cols += f"{col} ({meta['cat_cols_n_uniq'][i]} unique values), "

out_str = f"Experiment: {meta['name']}. Completed {meta['n_trials']} iterations for {meta['runs']} runs. Search space includes "

if len(meta['num_cols']) > 0:
    out_str += num_cols
    if len(meta['cat_cols']) > 0:
        out_str += "and "

if len(meta['cat_cols']) > 0:
    out_str += cat_cols

# Drop the trailing ", " and end the sentence.
out_str = f"{out_str[:-2]}."

# Wrap the summary into lines of at most 140 characters.
out_lst = [line.strip() for line in re.findall(r'.{1,140}(?:\s+|$)', out_str)]

# Fixed label order and colors for the legend.
domain = ["BOExplain", "BOExplain (w/o IC)", "BOExplain (w/o IC and WS)", "Random"]
range_ = ["#1f77b4", "#2ca02c", "#d62728", "#ff7f0e"]
metric = "Metric"


def _metric_chart(data, column, title, show_x_axis, show_y_axis):
    """Return a 225x90 line chart of mean(`column`) over time, colored by algorithm.

    `show_x_axis` / `show_y_axis` control whether the axis labels and ticks of
    the panel are drawn; hidden axes still share the same scale.
    """
    if show_x_axis:
        x = 'Time (seconds)'
    else:
        x = alt.X('Time (seconds)', axis=alt.Axis(labels=False, title=None, tickSize=0))
    if show_y_axis:
        y_axis = alt.Axis(values=[0.2, 0.5, 0.8])
    else:
        y_axis = alt.Axis(values=[0.2, 0.5, 0.8], labels=False, title=None, tickSize=0)
    return alt.Chart(data).mark_line().encode(
        x=x,
        y=alt.Y(f'mean({column})', scale=alt.Scale(domain=[0, 1]), axis=y_axis, title=None),
        color=alt.Color("Algorithm", scale=alt.Scale(domain=domain, range=range_))
    ).properties(
        title=title,
        width=225,
        height=90
    )


f1_score = _metric_chart(df, "F-score", "F-score", show_x_axis=False, show_y_axis=True)
jaccard = _metric_chart(df, "Jaccard", "Jaccard Similarity", show_x_axis=False, show_y_axis=False)
prec = _metric_chart(df, "Precision", "Precision", show_x_axis=True, show_y_axis=True)
recall = _metric_chart(df, "Recall", "Recall", show_x_axis=True, show_y_axis=True)

# 2x2 grid: Jaccard and F-score on top, precision and recall below.
first = alt.hconcat(jaccard, f1_score, spacing=0)
second = alt.hconcat(prec, recall, spacing=0)
# The last expression of the cell is the chart that gets rendered.
alt.vconcat(first, second, spacing=0).resolve_scale(x='shared', y='shared').configure_legend(
    title=None,
    orient='none',
    legendX=200,
    legendY=370,
    labelFontSize=15,
    symbolSize=700,
    columns=2,
    labelLimit=275,
).configure_axis(
    labelFontSize=15,
    titleFontSize=15
).configure_title(
    fontSize=15
)