# Credit Experiment

This notebook contains the code to reproduce the Credit experiment. The datasets are stored in the _data_ folder in the repo.

**Run the following cells in order to reproduce the experiment from the paper.**

In [None]:
from boexplain import fmax
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

## The objective function

The objective function takes the filtered source data as input and then proceeds through an ML pipeline that includes joining with the record table, one-hot encoding, binning numerical variables using quantile and equi-range binning, and grouping categorical values. It returns the accuracy of a decision tree, trained on the resulting data, on the held-out test set.

In [None]:
def obj(source_filtered):
    """Train a decision tree on the filtered source data and score it.

    The filtered application records are joined with per-ID default labels
    derived from the module-level ``record_orig`` table, feature-engineered
    (binary encoding, binning, category grouping, one-hot encoding), balanced
    with SMOTE and used to fit a ``DecisionTreeClassifier``.

    Args:
        source_filtered: DataFrame of application records (a filtered version
            of the source data). It is copied before use.

    Returns:
        The accuracy of the fitted tree on the module-level test set
        (``X_test_final`` / ``y_test_final``), or ``0`` when SMOTE raises a
        ``ValueError`` for the filtered data.
    """
    data = source_filtered.copy()
    record = record_orig.copy()

    # determine if the user defaults or not (create labels)
    record['target'] = 0
    record.loc[record['STATUS'].isin({'2', '3', '4', '5'}), 'target'] = 1

    # determine all IDs associated with a loan default: an ID is labelled 1
    # as soon as any of its records is a default. Only the label column is
    # aggregated, so non-numeric columns such as STATUS are never summed.
    default_by_id = record.groupby('ID')[['target']].max()
    new_data = pd.merge(data, default_by_id, how='inner', on='ID')
    new_data = new_data.dropna()

    # feature engineering
    new_data['CODE_GENDER'] = new_data['CODE_GENDER'].replace(['F', 'M'],
                                                              [0, 1])
    new_data['FLAG_OWN_CAR'] = new_data['FLAG_OWN_CAR'].replace(['N', 'Y'],
                                                                [0, 1])
    new_data['FLAG_OWN_REALTY'] = new_data['FLAG_OWN_REALTY'].replace(
        ['N', 'Y'], [0, 1])

    # Cast to object before writing a string label: recent pandas versions
    # no longer upcast a numeric column silently on such an assignment.
    many_children = new_data['CNT_CHILDREN'] >= 2
    new_data['CNT_CHILDREN'] = new_data['CNT_CHILDREN'].astype(object)
    new_data.loc[many_children, 'CNT_CHILDREN'] = 'over2'
    new_data = convert_dummy(new_data, 'CNT_CHILDREN')

    new_data = get_category(new_data,
                            'AMT_INCOME_TOTAL',
                            3, ["low", "medium", "high"],
                            qcut=True)
    new_data = convert_dummy(new_data, 'AMT_INCOME_TOTAL')

    # DAYS_BIRTH is negated before the division, giving a positive age
    new_data['Age'] = -new_data['DAYS_BIRTH'] // 365
    new_data = get_category(new_data, 'Age', 5,
                            ["lowest", "low", "medium", "high", "highest"])
    new_data = convert_dummy(new_data, 'Age')

    many_phones = new_data['FLAG_PHONE'] >= 3
    new_data['FLAG_PHONE'] = new_data['FLAG_PHONE'].astype(object)
    new_data.loc[many_phones, 'FLAG_PHONE'] = '3more'
    new_data = convert_dummy(new_data, 'FLAG_PHONE')

    new_data.loc[new_data['NAME_INCOME_TYPE'].isin({'Pensioner', 'Student'}),
                 'NAME_INCOME_TYPE'] = 'State servant'
    new_data = convert_dummy(new_data, 'NAME_INCOME_TYPE')

    new_data.loc[new_data['OCCUPATION_TYPE'].isin({
        'Cleaning staff', 'Cooking staff', 'Drivers', 'Laborers',
        'Low-skill Laborers', 'Security staff', 'Waiters/barmen staff'
    }), 'OCCUPATION_TYPE'] = 'Laborwk'
    new_data.loc[new_data['OCCUPATION_TYPE'].isin({
        'Accountants', 'Core staff', 'HR staff', 'Medicine staff',
        'Private service staff', 'Realty agents', 'Sales staff', 'Secretaries'
    }), 'OCCUPATION_TYPE'] = 'officewk'
    new_data.loc[new_data['OCCUPATION_TYPE'].
                 isin({'Managers', 'High skill tech staff', 'IT staff'}),
                 'OCCUPATION_TYPE'] = 'hightecwk'
    new_data = convert_dummy(new_data, 'OCCUPATION_TYPE')

    new_data = convert_dummy(new_data, 'NAME_HOUSING_TYPE')

    new_data.loc[new_data['NAME_EDUCATION_TYPE'] == 'Academic degree',
                 'NAME_EDUCATION_TYPE'] = 'Higher education'
    new_data = convert_dummy(new_data, 'NAME_EDUCATION_TYPE')

    new_data = convert_dummy(new_data, 'NAME_FAMILY_STATUS')

    # negative work times are treated as missing and replaced by the mean
    new_data['work_time'] = -new_data['DAYS_EMPLOYED'] // 365
    new_data.loc[new_data['work_time'] < 0, "work_time"] = np.nan
    new_data['work_time'] = new_data['work_time'].fillna(
        new_data['work_time'].mean())
    new_data = get_category(new_data, 'work_time', 5,
                            ["lowest", "low", "medium", "high", "highest"])
    new_data = convert_dummy(new_data, 'work_time')

    for col in new_data.select_dtypes(include=np.number):
        new_data[col] = new_data[col].astype(float)

    Y = new_data['target'].astype('int')
    # a filtered subset may lack some categories; add their dummy columns as
    # zeros so that the training columns always equal ``features``
    for col in features:
        if col not in new_data.columns:
            new_data[col] = 0.0
    X = new_data[features]

    sampler = SMOTE(random_state=0)
    # ``fit_sample`` was renamed to ``fit_resample`` and later removed from
    # imbalanced-learn; prefer the new name, fall back on old releases.
    resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
    try:
        X_balance, Y_balance = resample(X, Y)
    except ValueError:
        # presumably the filtered data cannot be oversampled (e.g. too few
        # rows of one class) -- score such a filter as 0
        return 0
    X_balance = pd.DataFrame(X_balance, columns=X.columns)

    # train the model
    dt = DecisionTreeClassifier(random_state=0)
    dtfit = dt.fit(X_balance, Y_balance)
    y_predict = dtfit.predict(X_test_final)
    # return the accuracy of the predictions on the test set
    return accuracy_score(y_test_final, y_predict)


def convert_dummy(df, feature):
    """Replace column ``feature`` of ``df`` with its one-hot encoding.

    Returns a new frame in which ``feature`` has been dropped and the
    indicator columns produced by ``pd.get_dummies`` (prefixed with the
    feature name) have been joined on the index. ``df`` is not modified.
    """
    indicators = pd.get_dummies(df[feature], prefix=feature)
    return df.drop(columns=[feature]).join(indicators)


def get_category(df, col, nbins, labels, qcut=False):
    """Bin the numeric column ``col`` of ``df`` into ``nbins`` labelled bins.

    With ``qcut=True`` the bins are quantile-based (``pd.qcut``); otherwise
    they are equal-width (``pd.cut``). The column is overwritten in place
    and the same ``df`` object is returned.
    """
    binned = (pd.qcut(df[col], q=nbins, labels=labels) if qcut
              else pd.cut(df[col], bins=nbins, labels=labels))
    df[col] = binned
    return df


# Columns, in order, that obj() selects as the model input. The suffixed
# names are the dummy columns that convert_dummy() derives from each feature.
features = [
    # columns used as-is
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'FLAG_MOBIL',
    'FLAG_WORK_PHONE',
    'FLAG_EMAIL',
    'CNT_FAM_MEMBERS',
    # number of children
    'CNT_CHILDREN_0',
    'CNT_CHILDREN_1',
    'CNT_CHILDREN_over2',
    # income bins
    'AMT_INCOME_TOTAL_low',
    'AMT_INCOME_TOTAL_medium',
    'AMT_INCOME_TOTAL_high',
    # age bins
    'Age_lowest',
    'Age_low',
    'Age_medium',
    'Age_high',
    'Age_highest',
    # phone flag
    'FLAG_PHONE_0',
    'FLAG_PHONE_1',
    # income type
    'NAME_INCOME_TYPE_Commercial associate',
    'NAME_INCOME_TYPE_State servant',
    'NAME_INCOME_TYPE_Working',
    # occupation group
    'OCCUPATION_TYPE_Laborwk',
    'OCCUPATION_TYPE_hightecwk',
    'OCCUPATION_TYPE_officewk',
    # housing type
    'NAME_HOUSING_TYPE_Co-op apartment',
    'NAME_HOUSING_TYPE_House / apartment',
    'NAME_HOUSING_TYPE_Municipal apartment',
    'NAME_HOUSING_TYPE_Office apartment',
    'NAME_HOUSING_TYPE_Rented apartment',
    'NAME_HOUSING_TYPE_With parents',
    # education
    'NAME_EDUCATION_TYPE_Higher education',
    'NAME_EDUCATION_TYPE_Incomplete higher',
    'NAME_EDUCATION_TYPE_Lower secondary',
    'NAME_EDUCATION_TYPE_Secondary / secondary special',
    # family status
    'NAME_FAMILY_STATUS_Civil marriage',
    'NAME_FAMILY_STATUS_Married',
    'NAME_FAMILY_STATUS_Separated',
    'NAME_FAMILY_STATUS_Single / not married',
    'NAME_FAMILY_STATUS_Widow',
    # work-time bins
    'work_time_lowest',
    'work_time_low',
    'work_time_medium',
    'work_time_high',
    'work_time_highest',
]

## Load and corrupt the source data

In [None]:
# Training tables, plus the held-out test features and labels that obj()
# scores against.
data = pd.read_csv("data/application_record_train.csv", encoding='utf-8')
record_orig = pd.read_csv("data/credit_record_train.csv", encoding='utf-8')
X_test_final = pd.read_csv("data/credit_test.csv")
y_test_final = pd.read_csv("data/credit_labels.csv")
data["CNT_FAM_MEMBERS"] = data["CNT_FAM_MEMBERS"].astype(int)

# corrupt the source data: every credit record that belongs to an applicant
# with DAYS_BIRTH in [-23000, -17000] and CNT_FAM_MEMBERS in [2, 3] gets its
# STATUS overwritten with '5' (both ranges are inclusive)
in_corrupted_range = (data["DAYS_BIRTH"].between(-23e3, -17e3)
                      & data["CNT_FAM_MEMBERS"].between(2, 3))
corrupted_ids = data.loc[in_corrupted_range, "ID"].unique()
record_orig.loc[record_orig["ID"].isin(corrupted_ids), "STATUS"] = '5'

## BOExplain API call

The function *fmax* is used to maximize the objective function. The columns DAYS_BIRTH and CNT_FAM_MEMBERS are searched for an explanation. The experiment uses a runtime of 200 seconds, and the results are averaged over 10 runs; note that the cell below sets `runtime=7`, so set it to 200 to match this description. The correct predicate is provided so that F-score, precision and recall can be calculated. Statistics about the run are saved to the file credit.json, which the plotting cells below read from the results folder.

In [None]:
# The predicate matching the corruption applied to the source data, passed
# to fmax as ``correct_pred``. Presumably each column's range is given as a
# minimum plus a length (e.g. DAYS_BIRTH from -23000 over a length of 6000)
# -- confirm against the BOExplain documentation.
planted_predicate = dict(
    DAYS_BIRTH_min=-23e3,
    DAYS_BIRTH_len=6e3,
    CNT_FAM_MEMBERS_min=2,
    CNT_FAM_MEMBERS_len=1,
)

# NOTE(review): the accompanying text describes a 200-second runtime while
# this call uses runtime=7 -- confirm which value is intended.
df_rem = fmax(
    data=data,
    f=obj,
    num_cols=["DAYS_BIRTH", "CNT_FAM_MEMBERS"],
    runtime=7,
    runs=10,
    random=True,  # perform a random iteration
    correct_pred=planted_predicate,
    name="credit",
    file="credit.json",
    use_seeds_from_paper=True,
)

# Recreate Figure 8

From the output of the above call to the BOExplain API, the following two cells can be used to recreate Figure 8 from the paper.

In [1]:
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
import numpy as np

from json import loads
# Load the run statistics: each line of the results file is parsed as one
# JSON document. The context manager closes the file even if parsing fails.
with open("results/credit.json", "r") as fo:
    experiments = {i: loads(row) for i, row in enumerate(fo)}

# One single-row frame per experiment; each cell holds a whole sequence
# (the time grid / the recorded values) and is exploded further below.
value_columns = ["Algorithm", "Time (seconds)", "Value"]
frames = [
    pd.DataFrame.from_dict({"Algorithm": exp["cat_enc"],
                            "Time (seconds)": list(range(5, exp["runtime"]+5, 5)),
                            "Value": exp["time_array"]},
                           orient='index').T
    for exp in experiments.values()
]
# DataFrame.append was removed in pandas 2.0, so concatenate once instead.
df = pd.concat(frames) if frames else pd.DataFrame({}, columns=value_columns)

df = df.explode("Value")
df = df.set_index(['Algorithm']).apply(pd.Series.explode).reset_index()
df["Algorithm"] = df["Algorithm"].replace({"individual_contribution_warm_start_top1": "BOExplain", np.nan: "Random"})

# Mean objective value over time, one line per algorithm.
line = alt.Chart(df).mark_line().encode(
    x='Time (seconds)',
    y=alt.Y('mean(Value)', title=['Mean Objective', 'Function Value'], scale=alt.Scale(domain=[0.7, 0.9])),
    color="Algorithm"
).properties(
    width=225,
    height=90
)
# Standard-deviation band; built but currently not added to the chart.
band = alt.Chart(df).mark_errorband(extent='stdev').encode(
    x='Time (seconds)',
    y=alt.Y('Value', title='Mean Objective Function Value', scale=alt.Scale(domain=[0.7, 0.9])),
    color=alt.Color("Algorithm")
)
chart = line  # + band
chart = chart.configure_title(
    anchor='start',
)
# NOTE(review): the configure_title call at the end of this chain may
# replace the anchor setting made just above -- confirm the intended title
# configuration.
chart.configure_legend(
    title=None,
    orient='none',
    legendX=30,
    legendY=163,
    columns=4,
    labelFontSize=15,
    symbolSize=700,
    labelLimit=275,
).configure_axis(
    labelFontSize=15,
    titleFontSize=15,
    titlePadding=2
).configure_title(
    fontSize=15
)

In [2]:
from json import loads
import altair as alt
import re

# Load the run statistics: each line of the results file is parsed as one
# JSON document. The context manager closes the file even if parsing fails.
with open("results/credit.json", "r") as fo:
    experiments = {i: loads(row) for i, row in enumerate(fo)}

# One single-row frame per experiment; each cell holds a whole sequence
# (the time grid / a metric's recorded values) and is exploded further below.
metric_columns = ["Algorithm", "Time (seconds)", "Precision", "Recall", "F-score", "Jaccard"]
frames = [
    pd.DataFrame.from_dict({"Algorithm": exp["cat_enc"],
                            "Time (seconds)": tuple(range(5, exp["runtime"]+5, 5)),
                            "Precision": exp["precision_time_array"],
                            "Recall": exp["recall_time_array"],
                            "F-score": exp["f_score_time_array"],
                            "Jaccard": exp["jaccard_time_array"]
                            }, orient='index').T
    for exp in experiments.values()
]
# DataFrame.append was removed in pandas 2.0, so concatenate once instead.
df = pd.concat(frames) if frames else pd.DataFrame({}, columns=metric_columns)

df = df.set_index(['Algorithm', "Time (seconds)"]).apply(pd.Series.explode).reset_index()
df = df.set_index(['Algorithm']).apply(pd.Series.explode).reset_index()
df["Algorithm"] = df["Algorithm"].replace({"individual_contribution_warm_start_top1": "BOExplain", np.nan: "Random"})


# Build a textual summary of the experiment from the first record.
first = experiments[0]
num_cols = f"{len(first['num_cols'])} numerical columns: "
for i, col in enumerate(first["num_cols"]):
    num_cols += f"{col} (range {first['num_cols_range'][i][0]} to {first['num_cols_range'][i][1]}), "
# Report the number of categorical columns (not the list itself), mirroring
# the numerical summary above.
cat_cols = f"{len(first['cat_cols'])} categorical columns: "
for i, col in enumerate(first["cat_cols"]):
    cat_cols += f"{col} ({first['cat_cols_n_uniq'][i]} unique values), "

out_str = f"Experiment: {first['name']}. Completed {first['n_trials']} iterations for {first['runs']} runs. Search space includes "

if len(first['num_cols']) > 0:
    out_str += num_cols
    if len(first['cat_cols']) > 0:
        out_str += "and "

if len(first['cat_cols']) > 0:
    out_str += cat_cols

# Drop the trailing ", " separator (or trailing space) before the full stop.
out_str = f"{out_str.rstrip(', ')}."

# Wrap the summary into lines of at most 140 characters.
out_lst = [line.strip() for line in re.findall(r'.{1,140}(?:\s+|$)', out_str)]

# altair
metric = "Metric"
f1_score = alt.Chart(df).mark_line().encode(
    x=alt.X('Time (seconds)', axis=alt.Axis()),
    y=alt.Y('mean(F-score)', scale=alt.Scale(domain=[0, 1]), axis=alt.Axis(values=[0.2, 0.5, 0.8]), title=None),
    color=alt.Color("Algorithm")
).properties(
    title="F-score",
    width=225,
    height=90
)
# Built for completeness; not part of the concatenated figure below.
jaccard = alt.Chart(df).mark_line().encode(
    x=alt.X('Time (seconds)', axis=alt.Axis(labels=False, title=None,  tickSize=0)),
    y=alt.Y('mean(Jaccard)', scale=alt.Scale(domain=[0, 1]), axis=alt.Axis(values=[0.2, 0.5, 0.8], labels=False, title=None,  tickSize=0), title=None),
    color=alt.Color("Algorithm")
).properties(
    title="Jaccard Similarity",
    width=225,
    height=90
)
prec = alt.Chart(df).mark_line().encode(
    x='Time (seconds)',
    y=alt.Y('mean(Precision)', scale=alt.Scale(domain=[0, 1]), axis=alt.Axis(values=[0.2, 0.5, 0.8], labels=False, title=None,  tickSize=0), title=None),
    color=alt.Color("Algorithm")
).properties(
    title="Precision",
    width=225,
    height=90
)
recall = alt.Chart(df).mark_line().encode(
    x='Time (seconds)',
    y=alt.Y('mean(Recall)', scale=alt.Scale(domain=[0, 1]), axis=alt.Axis(values=[0.2, 0.5, 0.8], labels=False, title=None,  tickSize=0), title=None),
    color=alt.Color("Algorithm")
).properties(
    title="Recall",
    width=225,
    height=90
)
# F-score, precision and recall side by side with shared axes.
alt.hconcat(f1_score, prec,recall, spacing=0).resolve_scale(x='shared', y='shared').configure_legend(
    title=None,
    orient='none',
    legendX=200,
    legendY=135,
    labelFontSize=15,
    symbolSize=700,
    columns=2,
    labelLimit=275,
).configure_axis(
    labelFontSize=15,
    titleFontSize=15
).configure_title(
    fontSize=15
)