# Experiment everything

# Setup

## Importings

In [1]:
from src.xtrees.VizTree import *
from src.xtrees.TreePlot import *
from src.xtrees.TreeDash import *
from src.xtrees.ForestBasedTree import *

import pandas as pd

from jupyter_dash import JupyterDash

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.datasets import load_iris, load_breast_cancer, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

seed = 42

## Data loading

In [2]:
iris_data = load_iris()
# data = load_breast_cancer()
iris_X = iris_data.data
iris_y = iris_data.target

iris_class_names = iris_data.target_names
iris_feature_names = iris_data.feature_names
iris_X = pd.DataFrame(iris_X, columns=iris_feature_names)

iris_feature_types = pd.DataFrame(iris_data, columns=iris_feature_names).dtypes

iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(iris_X, iris_y, test_size=0.3, random_state=seed)


In [3]:
cancer_data = load_breast_cancer()
cancer_X = cancer_data.data
cancer_y = cancer_data.target

cancer_class_names = cancer_data.target_names
cancer_feature_names = cancer_data.feature_names
cancer_X = pd.DataFrame(cancer_X, columns=cancer_feature_names)

cancer_feature_types = pd.DataFrame(cancer_X, columns=cancer_feature_names).dtypes

cancer_X_train, cancer_X_test, cancer_y_train, cancer_y_test = train_test_split(cancer_X, cancer_y, test_size=0.3, random_state=seed)

In [4]:

calif_data = fetch_california_housing()
calif_X = calif_data.data
calif_y = calif_data.target

calif_feature_names = calif_data.feature_names
calif_X = pd.DataFrame(calif_X, columns=calif_feature_names)

calif_feature_types = pd.DataFrame(calif_X, columns=calif_feature_names).dtypes

calif_X_train, calif_X_test, calif_y_train, calif_y_test = train_test_split(calif_X, calif_y, test_size=0.2, random_state=seed)


## Model traning

In [5]:
# rf parameters
num_of_estimators = 20
max_depth = 5
min_sample_leaf = max(1, int(0.02 * len(iris_X_train)))

iris_rf = RandomForestClassifier(n_estimators=num_of_estimators, max_depth=max_depth, min_samples_leaf=min_sample_leaf, random_state=seed)
iris_rf.fit(iris_X_train, iris_y_train)

minimal_forest_size=10
max_number_of_branches=50
exclusion_threshold=0.8

iris_fbt = ForestBasedTree(random_state=seed)
iris_fbt.fit(iris_rf, iris_X_train, iris_y_train, iris_feature_types, iris_feature_names, 
        minimal_forest_size=minimal_forest_size, amount_of_branches_threshold=max_number_of_branches, exclusion_threshold=exclusion_threshold)

print(iris_fbt.cs_df.head())


   0_upper  0_lower  1_upper  1_lower  2_upper  2_lower  3_upper  3_lower  \
0     5.45     -inf      inf     -inf     2.50     -inf      0.7     -inf   
1     5.45     -inf      inf     -inf     4.45     2.50      inf      0.8   
2     5.75     5.45     2.95     -inf     4.30     2.50      inf      0.8   
3     5.45     -inf      inf     -inf     4.85     4.45      inf      0.8   
4     5.45     -inf     2.90     -inf     5.20     4.85      1.7      0.8   

   number_of_samples  branch_probability                 probas    0       1  \
0           20.20610             0.02989        [1.0, 0.0, 0.0]  1.0  0.0000   
1            7.42538             0.05753        [0.0, 1.0, 0.0]  0.0  1.0000   
2            9.82285             0.00785        [0.0, 1.0, 0.0]  0.0  1.0000   
3            4.96565             0.02531    [0.0, 0.875, 0.125]  0.0  0.8750   
4            3.13017             0.00526  [0.0, 0.8125, 0.1875]  0.0  0.8125   

        2  
0  0.0000  
1  0.0000  
2  0.0000  
3  0.125

In [6]:

# rf parameters
num_of_estimators = 20
max_depth = 5
min_sample_leaf = max(1,int(0.02*len(cancer_X_train)))

cancer_rf = RandomForestClassifier(n_estimators=num_of_estimators,max_depth=max_depth,min_samples_leaf=min_sample_leaf,random_state=seed)
cancer_rf.fit(cancer_X_train, cancer_y_train)

minimal_forest_size=10
max_number_of_branches=50
exclusion_threshold=0.8

cancer_fbt = ForestBasedTree(random_state=seed)
cancer_fbt.fit(cancer_rf, cancer_X_train, cancer_y_train, cancer_feature_types, cancer_feature_names, 
        minimal_forest_size=minimal_forest_size, amount_of_branches_threshold=max_number_of_branches, exclusion_threshold=exclusion_threshold)


print(cancer_fbt.cs_df.head())


   0_upper  0_lower  1_upper  1_lower  2_upper  2_lower  3_upper  3_lower  \
0      inf     -inf    19.36     -inf      inf     -inf      inf     -inf   
1      inf     -inf    19.36     -inf      inf     -inf      inf     -inf   
2      inf     -inf    19.36     -inf      inf     -inf      inf     -inf   
3      inf     -inf    19.36     -inf      inf     -inf      inf     -inf   
4      inf     -inf    19.36     -inf      inf     -inf      inf     -inf   

   4_upper  4_lower  ...  27_lower  28_upper  28_lower  29_upper  29_lower  \
0      inf     -inf  ...      -inf       inf      -inf       inf      -inf   
1      inf     -inf  ...      -inf       inf      -inf       inf      -inf   
2      inf     -inf  ...      -inf       inf      -inf       inf      -inf   
3      inf     -inf  ...      -inf       inf      -inf       inf      -inf   
4      inf     -inf  ...      -inf       inf      -inf       inf      -inf   

   number_of_samples  branch_probability  \
0           97.85811    

In [7]:
feature_names = calif_feature_names
feature_types = calif_feature_types

# rf parameters
num_of_estimators = 20
max_depth = 5
min_sample_leaf = max(1, int(0.02 * len(calif_X_train)))

calif_rf = RandomForestRegressor(n_estimators=num_of_estimators, max_depth=max_depth, 
                                 min_samples_leaf=min_sample_leaf, random_state=seed)
calif_rf.fit(calif_X_train, calif_y_train)


minimal_forest_size=10
max_number_of_branches=50
exclusion_threshold=0.8

calif_fbt = ForestBasedTree(random_state=seed)
calif_fbt.fit(calif_rf, calif_X_train, calif_y_train, feature_types=feature_types, feature_names=feature_names, 
        minimal_forest_size=minimal_forest_size, amount_of_branches_threshold=max_number_of_branches, exclusion_threshold=exclusion_threshold)

print(calif_fbt.cs_df.head())

   0_upper  0_lower  1_upper  1_lower  2_upper  2_lower  3_upper  3_lower  \
0  3.07415     -inf      inf     -inf  4.20698     -inf      inf     -inf   
1  1.95400     -inf      inf     -inf  4.20698     -inf      inf     -inf   
2  2.15150  1.95400      inf     -inf  4.20698     -inf      inf     -inf   
3  3.07415  2.19195      inf     -inf  4.20698     -inf      inf     -inf   
4  2.31775     -inf      inf     -inf      inf  4.31419      inf     -inf   

   4_upper  4_lower  5_upper  5_lower  6_upper  6_lower  7_upper  7_lower  \
0      inf     -inf  2.44150     -inf      inf     -inf      inf     -inf   
1      inf     -inf      inf  2.51066      inf     -inf      inf     -inf   
2      inf     -inf      inf  2.51066      inf     -inf      inf     -inf   
3      inf     -inf      inf  2.51066      inf     -inf      inf     -inf   
4      inf     -inf  2.43244     -inf   35.295     -inf      inf     -inf   

   number_of_samples  branch_probability  regressions  
0          490.840

In [8]:
X_train = cancer_X_train
X_test = cancer_X_test
y_train = cancer_y_train
y_test = cancer_y_test
X = cancer_X
class_names = cancer_class_names

cancer_dt = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)

y_pred = cancer_dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.95


In [9]:
X_train = calif_X_train
X_test = calif_X_test
y_train = calif_y_train
y_test = calif_y_test
X = calif_X

calif_dt = DecisionTreeRegressor(random_state=42)
calif_dt.fit(X_train, y_train)

y_pred = calif_dt.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")


Mean Squared Error: 0.50


# Model evaluation

In [10]:
X_test = iris_X_test

print(X_test.dtypes)

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
dtype: object


In [11]:
X_train = iris_X_train
X_test = iris_X_test
y_train = iris_y_train
y_test = iris_y_test
X = iris_X
class_names = iris_class_names

fbt_ypred = iris_fbt.predict(X_test)

accuracy = accuracy_score(y_test, fbt_ypred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.9778


In [12]:
iris_fbt_viz = VizTree(iris_fbt, class_names=class_names)
print(iris_fbt_viz.is_classifier)
iris_fbt_viz.print_nodes()

True


node 0
id                  : 0
feature             : 2
threshold           : 4.75
value               : [0.19839816 0.44701505 0.35458679]
parent              : None
is_left             : None
left                : 1
right               : 22
n_train             : 0
n_samples           : 0

node 1
id                  : 1
feature             : 0
threshold           : 5.45
value               : [0.3286967  0.59428994 0.07701335]
parent              : 0
is_left             : True
left                : 2
right               : 11
n_train             : 0
n_samples           : 0

node 2
id                  : 2
feature             : 2
threshold           : 2.5
value               : [0.36360677 0.52282263 0.1135706 ]
parent              : 1
is_left             : True
left                : 3
right               : 8
n_train             : 0
n_samples           : 0

node 3
id                  : 3
feature             : 3
threshold           : 1.75
value               : [0.34695871 0.51706659 0

## Viz trees

In [13]:
iris_fbt_viz = VizTree(iris_fbt, X, class_names)

fbtviz_ypred = iris_fbt_viz.predict(X_test)

print(fbtviz_ypred)
accuracy = accuracy_score(y_test, fbtviz_ypred)
print(f"Accuracy:  {accuracy:.4f}")

iris_fbt_viz.print_nodes('before number predict')

[1 0 2 1 2 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
Accuracy:  0.9778
before number predict

node 0
id                  : 0
feature             : 2
threshold           : 4.75
value               : [0.37602945 0.31327576 0.3106948 ]
parent              : None
is_left             : None
left                : 1
right               : 22
n_train             : 150
n_samples           : 0

node 1
id                  : 1
feature             : 0
threshold           : 5.45
value               : [0.58204913 0.38105782 0.03689305]
parent              : 0
is_left             : True
left                : 2
right               : 11
n_train             : 95
n_samples           : 0

node 2
id                  : 2
feature             : 2
threshold           : 2.5
value               : [0.86538462 0.12620192 0.00841346]
parent              : 1
is_left             : True
left                : 3
right               : 8
n_train             : 52
n_samples           : 

In [14]:
X_train = cancer_X_train
X_test = cancer_X_test
y_train = cancer_y_train
y_test = cancer_y_test
X = cancer_X
class_names = cancer_class_names

cancer_fbt_viz = VizTree(cancer_fbt, X, class_names)

fbtviz_ypred = cancer_fbt_viz.predict(X_test)
accuracy = accuracy_score(y_test, fbtviz_ypred)
print(f"Accuracy:  {accuracy:.4f}")

cancer_fbt_viz.print_nodes('before number predict')



Accuracy:  0.9591
before number predict

node 0
id                  : 0
feature             : 27
threshold           : 0.1563
value               : [0.36137104 0.63862896]
parent              : None
is_left             : None
left                : 1
right               : 8
n_train             : 569
n_samples           : 0

node 1
id                  : 1
feature             : 20
threshold           : 16.83
value               : [0.17661913 0.82338087]
parent              : 0
is_left             : True
left                : 2
right               : 3
n_train             : 414
n_samples           : 0

node 2
id                  : 2
feature             : None
threshold           : None
value               : [0.10154501 0.89845499]
parent              : 1
is_left             : True
left                : None
right               : None
n_train             : 357
n_samples           : 0

node 3
id                  : 3
feature             : 13
threshold           : 31.245
value               : [

In [15]:
cancer_dt_viz = VizTree(cancer_dt, X, class_names)

dtviz_ypred = cancer_dt_viz.predict(X_test)
accuracy = accuracy_score(y_test, dtviz_ypred)
print(f"Accuracy:  {accuracy:.4f}")

cancer_fbt_viz.print_nodes('before number predict')


Accuracy:  0.9532
before number predict

node 0
id                  : 0
feature             : 27
threshold           : 0.1563
value               : [0.36137104 0.63862896]
parent              : None
is_left             : None
left                : 1
right               : 8
n_train             : 569
n_samples           : 0

node 1
id                  : 1
feature             : 20
threshold           : 16.83
value               : [0.17661913 0.82338087]
parent              : 0
is_left             : True
left                : 2
right               : 3
n_train             : 414
n_samples           : 0

node 2
id                  : 2
feature             : None
threshold           : None
value               : [0.10154501 0.89845499]
parent              : 1
is_left             : True
left                : None
right               : None
n_train             : 357
n_samples           : 0

node 3
id                  : 3
feature             : 13
threshold           : 31.245
value               : [

# Visualization

In [16]:
pruned_fbt = cancer_fbt_viz.prune(10)
fbt_sankey = SankeyTreePlot(pruned_fbt)

fbt_sankey.show()

In [17]:
fbt_sankey = SankeyTreePlot(pruned_fbt)
fbt_sankey.show()


In [18]:
X_test = iris_X_test
y_test = iris_y_test

pruned_fbt = iris_fbt_viz.prune(100)
fbt_dash = VizTreeDashboard(pruned_fbt, X_test, y_test)
fbt_dash.run(port=8060)

In [19]:
print("Original Tree:")
tree_plot = SankeyTreePlot(cancer_dt_viz)
tree_plot.show()

Original Tree:


In [20]:
viz_tree = cancer_dt_viz
X_test = cancer_X_test
y_test = cancer_y_test

tree_dash = VizTreeDashboard(viz_tree, X_test, y_test)
tree_dash.run(port=8061)


# Random forest dash

In [21]:
X_train = cancer_X_train
X_test = cancer_X_test
y_train = cancer_y_train
y_test = cancer_y_test
X = cancer_X
class_names = cancer_class_names

# rf parameters
num_of_estimators = 20
max_depth = 10
min_sample_leaf = max(1,int(0.02*len(X_train)))

cancer_rf = RandomForestClassifier(n_estimators=num_of_estimators,max_depth=max_depth,min_samples_leaf=min_sample_leaf)
cancer_rf.fit(cancer_X_train, cancer_y_train)


In [22]:
rf = cancer_rf

rf_dash = RFDashboard(X, X_test, y_test, rf, class_names)
rf_dash.run(port=8062)

In [23]:


X_train = iris_X_train
X_test = iris_X_test
y_train = iris_y_train
y_test = iris_y_test
X = iris_X
class_names = iris_class_names

# Example usage
combined_dashboard = CombinedDashboard(iris_fbt_viz, X_test, y_test, X, iris_rf, class_names)
combined_dashboard.run(port=8063)


# Regression generalization

In [24]:
calif_dt_viz = VizTree(calif_dt, calif_X)


In [25]:
pruned_calif_viz = calif_dt_viz.prune(4)

calif_sankey = SankeyTreePlot(pruned_calif_viz, show_text=False)
calif_sankey.show()

In [26]:

X_train = calif_X_train
X_test = calif_X_test
y_train = calif_y_train
y_test = calif_y_test
X = calif_X

calif_fbt_viz = VizTree(calif_fbt, X)

# Example usage
combined_dashboard = CombinedDashboard(calif_fbt_viz, X_test, y_test, X, calif_rf, class_names)
combined_dashboard.run(port=8064)
