## Decision Trees example

#### Classification dataset example

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the synthetic data is identical on every run
np.random.seed(42)

# Size of the synthetic dataset
num_samples = 1000

# Two independent features, each drawn uniformly from [0, 1)
feature1 = np.random.rand(num_samples)
feature2 = np.random.rand(num_samples)

# Two artificial classes separated by the line feature1 + feature2 = 0.8:
# class 1 lies on or above the line, class 0 strictly below it
feature_sum = feature1 + feature2
class_1_indices = feature_sum >= 0.8
class_0_indices = ~class_1_indices

# Encode class membership as integer labels (False -> 0, True -> 1)
labels = class_1_indices.astype(int)

# Assemble the features and labels into a single DataFrame
data = pd.DataFrame({'Feature1': feature1, 'Feature2': feature2, 'Label': labels})


In [2]:
# Show the generated DataFrame; as the cell's last expression it is rendered by the notebook
data

Unnamed: 0,Feature1,Feature2,Label
0,0.374540,0.185133,0
1,0.950714,0.541901,1
2,0.731994,0.872946,1
3,0.598658,0.732225,1
4,0.156019,0.806561,1
...,...,...,...
995,0.091582,0.656955,0
996,0.917314,0.956615,1
997,0.136819,0.068958,0
998,0.950237,0.057055,1


In [20]:
from tensorflow_ml.classification.decision_tree import DecisionTree

# Instantiate one verbose DecisionTree wrapper per learner type.
# The construction order (gbt, rf, cart) is kept so that the per-model
# "temporary training directory" messages appear in the same order.
gbt, rf, cart = (
    DecisionTree(model=learner, verbose=True)
    for learner in ("gbt", "rf", "cart")
)

Use /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmpgykfh8cu as temporary training directory
Use /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmpzcvdoeh7 as temporary training directory
Use /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmpvr64nunq as temporary training directory




In [21]:
# Map each learner's short name to its DecisionTree wrapper, then report
# the parameters every wrapper was configured with.
models = dict(cart=cart, rf=rf, gbt=gbt)

for model_name, model in models.items():
    print(f"\nModel: {model_name}")
    print(f"Params: {model.get_params()}")


Model: cart
Params: {'task': 1}

Model: rf
Params: {'task': 1}

Model: gbt
Params: {'task': 1}


In [22]:
# Run the same load -> fit -> evaluate -> predict sequence for every model,
# framing each model's log output between two separator lines.
separator = "-" * 50

for model_name, model in models.items():
    print(separator)
    print(f"Now training model: {model_name}")
    model.load_dataset(data, 'Label')  # 'Label' is the target column
    model.fit()
    model.evaluate()
    # print(f'Info: {model.info()}')
    # NOTE: `predictions` is overwritten on every iteration, so after the
    # loop it only holds the result of the last model ("gbt").
    predictions = model.predict(length=5, split="test")
    print(separator)
    print("-"*50)



--------------------------------------------------
Now training model: cart
Reading training dataset...
Training dataset read in 0:00:00.072817. Found 675 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(154, shape=(), dtype=int32)
Validation dataset read in 0:00:00.058348. Found 154 examples.
Training model...
Model trained in 0:00:00.004544
Compiling model...
Model compiled.


[INFO 23-08-07 02:53:31.0423 +08 kernel.cc:1243] Loading model from path /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmpvr64nunq/model/ with prefix 45c51ca53a0e4887
[INFO 23-08-07 02:53:31.0425 +08 decision_forest.cc:660] Model loaded with 1 root(s), 13 node(s), and 2 input feature(s).
[INFO 23-08-07 02:53:31.0425 +08 abstract_model.cc:1312] Engine "RandomForestOptPred" built
[INFO 23-08-07 02:53:31.0425 +08 kernel.cc:1075] Use fast generic engine


loss: 0.0000
accuracy: 0.9181




--------------------------------------------------
--------------------------------------------------
Now training model: rf
Reading training dataset...
Training dataset read in 0:00:00.061636. Found 660 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(146, shape=(), dtype=int32)
Validation dataset read in 0:00:00.059887. Found 146 examples.
Training model...
Model trained in 0:00:00.032003
Compiling model...
Model compiled.


[INFO 23-08-07 02:53:31.3087 +08 kernel.cc:1243] Loading model from path /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmpzcvdoeh7/model/ with prefix 9c63eaf2ddb04e6b
[INFO 23-08-07 02:53:31.3172 +08 decision_forest.cc:660] Model loaded with 300 root(s), 7820 node(s), and 2 input feature(s).
[INFO 23-08-07 02:53:31.3172 +08 kernel.cc:1075] Use fast generic engine


loss: 0.0000
accuracy: 0.9639




--------------------------------------------------
--------------------------------------------------
Now training model: gbt
Reading training dataset...
Training dataset read in 0:00:00.061175. Found 636 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(166, shape=(), dtype=int32)
Validation dataset read in 0:00:00.057103. Found 166 examples.
Training model...
Model trained in 0:00:00.073194
Compiling model...


[INFO 23-08-07 02:53:31.6300 +08 kernel.cc:1243] Loading model from path /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmpgykfh8cu/model/ with prefix 4029eea5acab4c68
[INFO 23-08-07 02:53:31.6334 +08 kernel.cc:1075] Use fast generic engine


Model compiled.
loss: 0.0000
accuracy: 0.9646
--------------------------------------------------


#### Regression dataset example

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the synthetic data is identical on every run
np.random.seed(42)

# Size of the synthetic dataset
num_samples = 1000

# Single explanatory feature, drawn uniformly from [0, 1)
feature = np.random.rand(num_samples)

# Target follows the line y = 2x + 1, perturbed by standard-normal noise
noise = np.random.randn(num_samples)
target = 2 * feature + 1 + noise

# Assemble the feature and target columns into a single DataFrame
columns = {'Feature': feature, 'Target': target}
data = pd.DataFrame(columns)

In [2]:
# Show the first five rows of the regression DataFrame
data.head()

Unnamed: 0,Feature,Target
0,0.37454,1.926781
1,0.950714,1.566084
2,0.731994,2.844186
3,0.598658,2.807903
4,0.156019,1.871828


In [7]:
from tensorflow_ml.classification.decision_tree import DecisionTree

# Instantiate one verbose DecisionTree wrapper per learner type, this time
# configured for the regression task. The construction order
# (gbt, rf, cart) is kept so that the per-model "temporary training
# directory" messages appear in the same order.
gbt, rf, cart = (
    DecisionTree(model=learner, verbose=True, _task='regression')
    for learner in ("gbt", "rf", "cart")
)

Use /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmp2vbh6s7j as temporary training directory
Use /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmpk4f_mv28 as temporary training directory
Use /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmph3rk0opn as temporary training directory




In [8]:
# Map each learner's short name to its regression DecisionTree wrapper,
# then report the parameters every wrapper was configured with.
models = dict(cart=cart, rf=rf, gbt=gbt)

for model_name, model in models.items():
    print(f"\nModel: {model_name}")
    print(f"Params: {model.get_params()}")


Model: cart
Params: {'task': 2}

Model: rf
Params: {'task': 2}

Model: gbt
Params: {'task': 2}


In [9]:
# Run the same load -> fit -> evaluate -> predict sequence for every
# regression model, framing each model's log output between two separators.
separator = "-" * 50

for model_name, model in models.items():
    print(separator)
    print(f"Now training model: {model_name}")
    model.load_dataset(data, 'Target')  # 'Target' is the column to predict
    model.fit(_metrics = ['mse'])  # request mean squared error as the metric
    model.evaluate()
    # print(f'Info: {model.info()}')
    # NOTE: `predictions` is overwritten on every iteration, so after the
    # loop it only holds the result of the last model ("gbt").
    predictions = model.predict(length=5, split="test")
    print(separator)
    print("-"*50)



--------------------------------------------------
Now training model: cart
Reading training dataset...
Training dataset read in 0:00:00.057140. Found 660 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(139, shape=(), dtype=int32)
Validation dataset read in 0:00:00.047689. Found 139 examples.
Training model...
Model trained in 0:00:00.004739
Compiling model...
Model compiled.


[INFO 23-08-07 03:10:18.3637 +08 kernel.cc:1243] Loading model from path /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmph3rk0opn/model/ with prefix f23e419c73f14d5a
[INFO 23-08-07 03:10:18.3639 +08 decision_forest.cc:660] Model loaded with 1 root(s), 45 node(s), and 1 input feature(s).
[INFO 23-08-07 03:10:18.3639 +08 abstract_model.cc:1312] Engine "RandomForestOptPred" built
[INFO 23-08-07 03:10:18.3639 +08 kernel.cc:1075] Use fast generic engine


loss: 0.0000
mse: 1.0441




--------------------------------------------------
--------------------------------------------------
Now training model: rf
Reading training dataset...




Training dataset read in 0:00:00.047939. Found 630 examples.
Reading validation dataset...




Num validation examples: tf.Tensor(158, shape=(), dtype=int32)
Validation dataset read in 0:00:00.048464. Found 158 examples.
Training model...


[INFO 23-08-07 03:10:18.6269 +08 kernel.cc:1243] Loading model from path /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmpk4f_mv28/model/ with prefix 128a8023414e42b6


Model trained in 0:00:00.131851
Compiling model...


[INFO 23-08-07 03:10:18.6888 +08 decision_forest.cc:660] Model loaded with 300 root(s), 56724 node(s), and 1 input feature(s).
[INFO 23-08-07 03:10:18.6888 +08 kernel.cc:1075] Use fast generic engine


Model compiled.
loss: 0.0000
mse: 1.1126




--------------------------------------------------
--------------------------------------------------
Now training model: gbt
Reading training dataset...




Training dataset read in 0:00:00.048993. Found 625 examples.
Reading validation dataset...




Num validation examples: tf.Tensor(160, shape=(), dtype=int32)
Validation dataset read in 0:00:00.048584. Found 160 examples.
Training model...
Model trained in 0:00:00.029657
Compiling model...
Model compiled.


[INFO 23-08-07 03:10:18.9676 +08 kernel.cc:1243] Loading model from path /var/folders/ll/h2ldzwfj6kl5qkxyrsrp33p80000gn/T/tmp2vbh6s7j/model/ with prefix 2d96fdcfd4164213
[INFO 23-08-07 03:10:18.9688 +08 kernel.cc:1075] Use fast generic engine


loss: 0.0000
mse: 0.9410
--------------------------------------------------
