In [1]:
import sys
from pathlib import Path

# Append the parent of the current working directory to sys.path.
# Presumably this is what makes the local `fge` package importable below
# (assumes the notebook is launched from a sub-folder of the repo — TODO confirm).
main_path = Path().absolute().parent
sys.path.append(str(main_path))

import seaborn as sns
import matplotlib.pyplot as plt

import shap
import numpy as np
import pandas as pd
from IPython.display import display

from fge import Dataset, ModelBuilder, TreeBuilder
# Load SHAP's JavaScript into the notebook; the interactive force plot below needs it.
shap.initjs()

In [2]:
import pickle
import os
from collections import defaultdict

def load_cache(cache_path, datasets=('titanic', 'adult', 'boston', 'california')):
    """Load pickled per-dataset results and attach SHAP values to each entry.

    For every name in ``datasets`` the file ``<name>.pickle`` is read from
    ``cache_path`` (``<name>_win.pickle`` when ``os.name == 'nt'``). Each
    unpickled entry must provide a ``'dataset'`` object exposing
    ``.data['X_train']`` and an ``'explainer'`` object exposing
    ``.shap_values(data)``; the explainer's output on ``X_train`` is stored
    in the entry under ``'shap_values'``.

    Parameters
    ----------
    cache_path : str or pathlib.Path
        Directory that contains the pickle files.
    datasets : iterable of str, optional
        Dataset names to load. Defaults to the four datasets used in this
        notebook.

    Returns
    -------
    dict
        Maps each dataset name to its unpickled entry, extended with the
        ``'shap_values'`` key.

    Raises
    ------
    FileNotFoundError
        If one of the expected pickle files does not exist.
    KeyError
        If an entry lacks the ``'dataset'`` or ``'explainer'`` key.
    """
    cache_dir = Path(cache_path)
    # Caches written on Windows are stored under a separate file name.
    system = '_win' if os.name == 'nt' else ''
    # A plain dict: the original `defaultdict()` had no default factory and
    # therefore already behaved exactly like one.
    cache = {}
    for ds in datasets:
        # SECURITY: pickle.load can execute arbitrary code. Only point this at
        # cache files that were produced locally by this project.
        with (cache_dir / f'{ds}{system}.pickle').open('rb') as file:
            entry = pickle.load(file)
        data = entry['dataset'].data['X_train']
        entry['shap_values'] = entry['explainer'].shap_values(data)
        cache[ds] = entry
    return cache

# Load every cached dataset from ../cache (resolved against the current working directory).
cache = load_cache(Path('../cache').resolve())

In [10]:
# Pick one cached dataset and unpack its entry into notebook-level names.
ds_name= 'adult'
dataset = cache[ds_name]['dataset']
siv = cache[ds_name]['siv']  # presumably SHAP interaction values — TODO confirm
shap_values = cache[ds_name]['shap_values']  # computed in load_cache from X_train
explainer = cache[ds_name]['explainer']
features = dataset.data['X_train']
shap.initjs()

In [22]:

# Summary plot of the cached SHAP values against the training features.
shap.summary_plot(shap_values, features=features)

In [None]:
# Interactive force plot, restricted to the first 1000 rows.
shap.force_plot(explainer.expected_value, shap_values[:1000], features.iloc[:1000])

In [5]:
# Display the training feature table of the selected dataset.
dataset.data['X_train']

Unnamed: 0,Age,Workclass,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country
22149,22.0,4,9.0,2,7,4,4,1,0.0,0.0,40.0,39
30351,68.0,4,4.0,2,3,4,4,1,0.0,0.0,40.0,39
2292,18.0,0,7.0,4,0,3,4,1,0.0,0.0,15.0,39
31407,24.0,4,13.0,4,10,0,2,1,0.0,0.0,40.0,39
20563,62.0,0,14.0,2,0,4,4,1,0.0,0.0,2.0,39
...,...,...,...,...,...,...,...,...,...,...,...,...
2181,34.0,4,13.0,2,10,4,4,1,0.0,0.0,40.0,0
26985,38.0,4,10.0,2,3,4,4,1,7688.0,0.0,40.0,39
18417,24.0,4,10.0,4,8,0,4,0,0.0,0.0,53.0,39
25940,24.0,4,7.0,2,3,4,4,1,0.0,0.0,50.0,39


# Ames house prices (OpenML)

In [None]:
# Alternative loader, kept for reference:
#   from fge.dataset import ames_house_prices
#   dataset = ames_house_prices(data_home='../data', as_frame=True, display=False)

# The same seed is handed to both the dataset and the trainer.
seed = 8
dataset = Dataset(dataset_name='ames', data_folder='../data', seed=seed)

model_builder = ModelBuilder()
results = model_builder.train(
    dataset,
    eta=0.1,
    max_depth=8,
    subsample=1.0,
    seed=seed,
    num_rounds=300,
)

# Unpack the training result and report the score.
model, performance = results['model'], results['score']
print(f'test performance is {performance:.4f}')

# The reported score is forwarded to the tree builder as `original_score`.
tree_builder = TreeBuilder(model, dataset, original_score=performance)

In [None]:
# No group selected; the build cell that follows describes this as "all train dataset".
group_id = None
shap_interactions = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# get tree from all train dataset
# The return value of `build` is iterated below: every tree is rendered with
# `show_tree` and displayed in turn.
trees = tree_builder.build(
    score_method='abs_interaction', 
    shap_interactions=shap_interactions, 
    n_select_scores=10,  # select nodes_to_run & keys to filter 
    n_select_gap=5, 
    max_iter=35,
    nodes_to_run_method='random',  # random / sort / full
    filter_method='random',  # random / sort / prob
    rt_only_best=True,
    verbose=False,
    thres_random=0.25
)
for tree in trees:
    img = tree.show_tree(dataset.feature_names)
    display(img)

# Titanic

In [None]:
# The same seed is handed to both the dataset and the trainer.
seed = 8
dataset = Dataset(dataset_name='titanic', data_folder='../data', seed=seed)

model_builder = ModelBuilder()
results = model_builder.train(
    dataset,
    eta=0.1,
    max_depth=8,
    subsample=1.0,
    seed=seed,
    num_rounds=100,
)

# Unpack the training result and report the score.
model, performance = results['model'], results['score']
print(f'test performance is {performance:.4f}')

# The reported score is forwarded to the tree builder as `original_score`.
tree_builder = TreeBuilder(model, dataset, original_score=performance)

In [None]:
# No group selected; the build cell further down describes this as "all train dataset".
group_id = None
shap_interactions = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# Sum the interaction values over axis 2 before plotting.
# Presumably this collapses pairwise interactions into per-feature values — TODO confirm array layout.
shap.summary_plot(shap_interactions.sum(2), dataset.data['X_train'])

In [None]:
# Summary plot called on the un-reduced interaction array.
shap.summary_plot(shap_interactions, dataset.data['X_train'])

In [None]:
# get tree from all train dataset
# The return value of `build` is iterated below: every tree is rendered with
# `show_tree` and displayed in turn.
trees = tree_builder.build(
    score_method='abs_interaction', 
    shap_interactions=shap_interactions, 
    n_select_scores=10,  # select nodes_to_run & keys to filter 
    n_select_gap=5, 
    max_iter=35,
    nodes_to_run_method='random',  # random / sort / full
    filter_method='random',  # random / sort / prob
    rt_only_best=True,
    verbose=False,
    thres_random=0.25
)
for tree in trees:
    img = tree.show_tree(dataset.feature_names)
    display(img)

# Adult

In [None]:
# The same seed is handed to both the dataset and the trainer.
seed = 8
dataset = Dataset(dataset_name='adult', data_folder='../data', seed=seed)

model_builder = ModelBuilder()
results = model_builder.train(
    dataset,
    eta=0.3,
    max_depth=8,
    subsample=1.0,
    seed=seed,
    num_rounds=200,
)

# Unpack the training result and report the score.
model, performance = results['model'], results['score']
print(f'test performance is {performance:.4f}')

# The reported score is forwarded to the tree builder as `original_score`.
tree_builder = TreeBuilder(model, dataset, original_score=performance)

In [None]:
# No group selected; the build cell that follows describes this as "all train dataset".
group_id = None
shap_interactions = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# get tree from all train dataset
# NOTE: despite the singular name, `tree` is iterated in the next cell,
# so `build` is expected to return an iterable of trees here.
tree = tree_builder.build(
    score_method='abs', 
    shap_interactions=shap_interactions, 
    n_select_scores=5,
    n_select_gap=5, 
    max_iter=None,
    nodes_to_run_method='full',  # random / sort / full
    filter_method='prob',  # random / sort / prob
    rt_only_best=False,
    verbose=False
)

In [None]:
# Render and display every tree returned by the previous build call.
for t in tree:
    img = t.show_tree(dataset.feature_names)
    display(img)

---

## Group with label equal to 0

In [None]:
# get tree from group 0
# Interaction values restricted to group 0 (see the section header: label 0).
group_id = 0
siv_adult_g0 = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# Build from the group-0 interaction values and render the result.
# NOTE(review): the all-data Adult cell iterates over the return value of
# `build` with the same keyword arguments, whereas this cell calls `show_tree`
# on it directly — confirm what `build` returns for a per-group input.
tree = tree_builder.build(
    score_method='abs', 
    shap_interactions=siv_adult_g0, 
    n_select_scores=5,
    n_select_gap=5, 
    max_iter=None,
    nodes_to_run_method='full',  # random / sort / full
    filter_method='prob',  # random / sort / prob
    rt_only_best=False,
    verbose=False
)
img = tree.show_tree(feature_names=dataset.feature_names)

In [None]:
# Show the image produced by the most recent `show_tree` call.
display(img)

## Group with label equal to 1

In [None]:
# get tree from group 1
group_id = 1
siv_adult_g1 = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# Build from the group-1 interaction values and render the result.
# NOTE(review): the all-data Adult cell iterates over the return value of
# `build` with the same keyword arguments, whereas this cell calls `show_tree`
# on it directly — confirm what `build` returns for a per-group input.
tree = tree_builder.build(
    score_method='abs', 
    shap_interactions=siv_adult_g1, 
    n_select_scores=5,
    n_select_gap=5, 
    max_iter=None,
    nodes_to_run_method='full',  # random / sort / full
    filter_method='prob',  # random / sort / prob
    rt_only_best=False,
    verbose=False
)
img = tree.show_tree(feature_names=dataset.feature_names)

In [None]:
# Show the image produced by the most recent `show_tree` call.
display(img)

# California

In [None]:
from sklearn.datasets import fetch_california_housing
data_path = Path('../data').resolve()
# Fetch the California housing data as pandas objects, using ../data/california as data_home.
dataset = fetch_california_housing(data_home=data_path / 'california', as_frame=True)
X = dataset['data'].copy()
y = dataset['target']
# NOTE(review): `dataset` is reassigned to a `Dataset(...)` object in the next cell,
# and `X` / `y` are not referenced again in the visible cells.

In [None]:
# The same seed is handed to both the dataset and the trainer.
seed = 8
dataset = Dataset(dataset_name='california', data_folder='../data', seed=seed)

model_builder = ModelBuilder()
results = model_builder.train(
    dataset,
    eta=0.3,
    max_depth=8,
    subsample=1.0,
    seed=seed,
    num_rounds=200,
)

# Unpack the training result and report the score.
model, performance = results['model'], results['score']
print(f'test performance is {performance:.4f}')

# The reported score is forwarded to the tree builder as `original_score`.
tree_builder = TreeBuilder(model, dataset, original_score=performance)

In [None]:
# get tree from group 0
# Interaction values restricted to group 0 of the California dataset.
group_id = 0
siv_cali_g0 = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# Build from the group-0 interaction values and render the result as a single tree.
# NOTE(review): the all-data Adult cell iterates over what `build` returns — confirm the return type here.
tree = tree_builder.build(
    score_method='abs', 
    shap_interactions=siv_cali_g0, 
    n_select_scores=5,
    n_select_gap=5, 
    max_iter=None,
    nodes_to_run_method='full',  # random / sort / full
    filter_method='prob',  # random / sort / prob
    rt_only_best=False,
    verbose=False
)
img = tree.show_tree(feature_names=dataset.feature_names)

In [None]:
# Histogram of the `y_train` entry for the current group.
# NOTE(review): indexes the `Dataset` object directly, while other cells read `dataset.data[...]` — confirm `Dataset` supports this.
sns.histplot(dataset[group_id]['y_train'])

In [None]:
# Show the image produced by the most recent `show_tree` call.
display(img)

In [None]:
# get tree from group 3
# Interaction values restricted to group 3 of the California dataset.
group_id = 3
siv_cali_g3 = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# Build from the group-3 interaction values and render the result as a single tree.
# NOTE(review): the all-data Adult cell iterates over what `build` returns — confirm the return type here.
tree = tree_builder.build(
    score_method='abs', 
    shap_interactions=siv_cali_g3, 
    n_select_scores=5,
    n_select_gap=5, 
    max_iter=None,
    nodes_to_run_method='full',  # random / sort / full
    filter_method='prob',  # random / sort / prob
    rt_only_best=False,
    verbose=False
)
img = tree.show_tree(feature_names=dataset.feature_names)

In [None]:
# Histogram of the `y_train` entry for the current group.
# NOTE(review): indexes the `Dataset` object directly, while other cells read `dataset.data[...]` — confirm `Dataset` supports this.
sns.histplot(dataset[group_id]['y_train'])

In [None]:
# Show the image produced by the most recent `show_tree` call.
display(img)

# Boston

In [None]:
# The same seed is handed to both the dataset and the trainer.
seed = 8
dataset = Dataset(dataset_name='boston', data_folder='../data', seed=seed)

model_builder = ModelBuilder()
results = model_builder.train(
    dataset,
    eta=0.1,
    max_depth=8,
    subsample=1.0,
    seed=seed,
    num_rounds=200,
)

# Unpack the training result and report the score.
model, performance = results['model'], results['score']
print(f'test performance is {performance:.4f}')

# The reported score is forwarded to the tree builder as `original_score`.
tree_builder = TreeBuilder(model, dataset, original_score=performance)

* CRIM - per capita crime rate by town
* ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
* INDUS - proportion of non-retail business acres per town.
* CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
* NOX - nitric oxides concentration (parts per 10 million)
* RM - average number of rooms per dwelling
* AGE - proportion of owner-occupied units built prior to 1940
* DIS - weighted distances to five Boston employment centres
* RAD - index of accessibility to radial highways
* TAX - full-value property-tax rate per $10,000
* PTRATIO - pupil-teacher ratio by town
* B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* LSTAT - % lower status of the population
* MEDV - Median value of owner-occupied homes in $1000's

In [None]:
# get tree from group 0
# Interaction values restricted to group 0 of the Boston dataset.
group_id = 0
siv_bos_g0 = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# Histogram of the `y_train` entry for the current group.
# NOTE(review): indexes the `Dataset` object directly, while other cells read `dataset.data[...]` — confirm `Dataset` supports this.
sns.histplot(dataset[group_id]['y_train'])

In [None]:
# Build from the group-0 interaction values, then render and display the result as a single tree.
# NOTE(review): the all-data Adult cell iterates over what `build` returns — confirm the return type here.
tree = tree_builder.build(
    score_method='abs', 
    shap_interactions=siv_bos_g0, 
    n_select_scores=5,
    n_select_gap=5, 
    max_iter=None,
    nodes_to_run_method='full',  # random / sort / full
    filter_method='prob',  # random / sort / prob
    rt_only_best=False,
    verbose=False
)
img = tree.show_tree(feature_names=dataset.feature_names)
display(img)

In [None]:
# get tree from group 3
# Interaction values restricted to group 3 of the Boston dataset.
group_id = 3
siv_bos_g3 = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# Histogram of the `y_train` entry for the current group.
# NOTE(review): indexes the `Dataset` object directly, while other cells read `dataset.data[...]` — confirm `Dataset` supports this.
sns.histplot(dataset[group_id]['y_train'])

In [None]:
# Build from the group-3 interaction values, then render and display the result as a single tree.
# NOTE(review): the all-data Adult cell iterates over what `build` returns — confirm the return type here.
tree = tree_builder.build(
    score_method='abs', 
    shap_interactions=siv_bos_g3, 
    n_select_scores=5,
    n_select_gap=5, 
    max_iter=None,
    nodes_to_run_method='full',  # random / sort / full
    filter_method='prob',  # random / sort / prob
    rt_only_best=False,
    verbose=False
)
img = tree.show_tree(feature_names=dataset.feature_names)
display(img)

---

# Not ready yet

# Crime

In [None]:
# The same seed is handed to both the dataset and the trainer.
seed = 8
dataset = Dataset(dataset_name='crime', data_folder='../data', seed=seed)

model_builder = ModelBuilder()
results = model_builder.train(
    dataset,
    eta=0.3,
    max_depth=8,
    subsample=1.0,
    seed=seed,
    num_rounds=400,
)

# Unpack the training result and report the score.
model, performance = results['model'], results['score']
print(f'test performance is {performance:.4f}')

# The reported score is forwarded to the tree builder as `original_score`.
tree_builder = TreeBuilder(model, dataset, original_score=performance)

In [None]:
# get tree from group 0
# Interaction values restricted to group 0 of the crime dataset.
group_id = 0
shap_interactions = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# Histogram of the `y_train` entry for the current group.
# NOTE(review): indexes the `Dataset` object directly, while other cells read `dataset.data[...]` — confirm `Dataset` supports this.
sns.histplot(dataset[group_id]['y_train'])

In [None]:
# Build, render and display a tree for the crime dataset.
# NOTE(review): this call passes `n_select_performance` and `initialize`, whereas
# every build cell in the earlier sections passes `n_select_gap`,
# `nodes_to_run_method` and `filter_method`. This section sits under
# "Not ready yet" — confirm the keywords against the current `TreeBuilder.build`.
tree = tree_builder.build(
    score_method='abs', 
    shap_interactions=shap_interactions, 
    n_select_scores=5,
    n_select_performance=5, 
    max_iter=None,
    initialize=None,
    rt_only_best=True
)
img = tree.show_tree(feature_names=dataset.feature_names)
display(img)

In [None]:
# Number of keys in the first node mapping stored under infos[97].
len(tree_builder.infos[97]['nodes'][0])

# nhanesi

In [None]:
# The same seed is handed to both the dataset and the trainer.
seed = 8
dataset = Dataset(dataset_name='nhanesi', data_folder='../data', seed=seed)

model_builder = ModelBuilder()
results = model_builder.train(
    dataset,
    eta=0.05,
    max_depth=8,
    subsample=1.0,
    seed=seed,
    num_rounds=400,
)

# Unpack the training result and report the score.
model, performance = results['model'], results['score']
print(f'test performance is {performance:.4f}')

# The reported score is forwarded to the tree builder as `original_score`.
tree_builder = TreeBuilder(model, dataset, original_score=performance)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
import itertools

# Work on copies so the frames held by `dataset` are left untouched.
X_train, X_test = dataset.data['X_train'].copy(), dataset.data['X_test'].copy()

# Degree-2 expansion restricted to interaction terms, with a leading bias column.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=True)

# Fit the transformer on the training split only, then reuse the fitted
# transformer for the test split instead of re-fitting it on test data.
X_train_full = poly.fit_transform(X_train)
X_test_full = poly.transform(X_test)

In [None]:
# First three columns of the original training features.
X_train.iloc[:, :3]

In [None]:
# Count the unordered pairs of feature indices by exhausting the combinations iterator.
sum(1 for _ in itertools.combinations(np.arange(X_train.shape[1]), 2))

In [None]:
# First three columns of the expanded training matrix (the first is the bias column, since include_bias=True).
X_train_full[:, :3]

In [None]:
# get tree from group 0
# Interaction values restricted to group 0 of the nhanesi dataset.
group_id = 0
shap_interactions = tree_builder.shap_interaction_values(group_id=group_id)

In [None]:
# Build, render and display a tree for the nhanesi dataset.
# NOTE(review): this call passes `n_select_performance` and `initialize`, whereas
# every build cell in the earlier sections passes `n_select_gap`,
# `nodes_to_run_method` and `filter_method`. This section sits under
# "Not ready yet" — confirm the keywords against the current `TreeBuilder.build`.
tree = tree_builder.build(
    score_method='abs', 
    shap_interactions=shap_interactions, 
    n_select_scores=5,
    n_select_performance=5, 
    max_iter=25,
    initialize='random',
    rt_only_best=True
)
img = tree.show_tree(feature_names=dataset.feature_names)
display(img)

In [None]:
# Inspect the first `nodes_to_run` entry stored under infos[1], from index 25 onwards.
tree_builder.infos[1]['nodes_to_run'][0][25:]

## Debug

In [None]:
from anytree import Node

In [None]:
# Manual walk through the first steps of a build, for debugging.
tree_builder.reset_tree(False)
k = 0
g_fn = tree_builder.score_methods['abs']

# feature settings
# Only 2-D (single tree) and 3-D (global tree) interaction arrays are accepted.
if shap_interactions.ndim not in (2, 3):
    raise ValueError('number of dimension of `shap_interactions` should be 2 or 3')
build_global = shap_interactions.ndim == 3

siv_scores = g_fn(shap_interactions, build_global)
max_iter = 10

# The diagonal of the score matrix is taken as each feature's main effect.
r_diag, c_diag = np.diag_indices(len(tree_builder.feature_names))
main_effect = siv_scores[r_diag, c_diag]

# Reset entry `k` of `infos`, then register one parentless node per feature.
tree_builder.infos[k]['nodes'] = [{}]
tree_builder.infos[k]['done'] = [set()]
for i, name in enumerate(tree_builder.feature_names):
    tree_builder.infos[k]['nodes'][0][i] = Node(
        name=name,
        parent=None,
        score=main_effect[i],
        interaction=0.0,
        k=0,
        drop=0.0,
    )

---