In [1]:
import sys
sys.path.append("..")

import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import examples.util as util
import dforest_class
import query

In [2]:
# Numpy options
np.random.seed(12345)
np.set_printoptions(precision=1, suppress=True)

In [3]:
n_samples = 50_000

# Load dataset
data_X, data_y = fetch_covtype(return_X_y=True, shuffle=True)

print(np.unique_counts(data_y))

data_X, data_y = data_X[:n_samples], data_y[:n_samples]

print(np.unique_counts(data_y))

# Shuffle and split data
train_X, train_y, test_X, test_y = util.split(data_X, data_y)
print(f"Training shape: {train_X.shape}")
print(f"Testing shape: {test_X.shape}")

UniqueCountsResult(values=array([1, 2, 3, 4, 5, 6, 7], dtype=int32), counts=array([211840, 283301,  35754,   2747,   9493,  17367,  20510]))
UniqueCountsResult(values=array([1, 2, 3, 4, 5, 6, 7], dtype=int32), counts=array([18151, 24473,  3015,   258,   854,  1518,  1731]))
Training shape: (40000, 54)
Testing shape: (10000, 54)


In [None]:
# Train models
rf_model = RandomForestClassifier(max_features=0.2)
rf2_model = RandomForestClassifier(max_features=0.2, max_depth=20)
bag_model = RandomForestClassifier()
bag2_model = RandomForestClassifier(max_depth=20)
gb_model = GradientBoostingClassifier(max_depth=12)
gb2_model = GradientBoostingClassifier(max_depth=8)

rf_model.fit(train_X, train_y)
rf2_model.fit(train_X, train_y)
bag_model.fit(train_X, train_y)
bag2_model.fit(train_X, train_y)
gb_model.fit(train_X, train_y)
gb2_model.fit(train_X, train_y)

In [None]:
# Eval models
_ = util.eval_multiclass_model(rf_model, train_X, train_y, test_X, test_y, "Random Forest")
_ = util.eval_multiclass_model(rf2_model, train_X, train_y, test_X, test_y, "Random Forest (Max Depth = 20)")
_ = util.eval_multiclass_model(bag_model, train_X, train_y, test_X, test_y, "Bagging")
_ = util.eval_multiclass_model(bag2_model, train_X, train_y, test_X, test_y, "Bagging (Max Depth = 20)")
_ = util.eval_multiclass_model(gb_model, train_X, train_y, test_X, test_y, "Gradient Boosting (Max Depth = 12)")
_ = util.eval_multiclass_model(gb2_model, train_X, train_y, test_X, test_y, "Gradient Boosting (Max Depth = 8)")

Random Forest - Train Accuracy: 1.0
Random Forest - Test Accuracy: 0.8827

Random Forest (Max Depth = 18) - Train Accuracy: 0.9218
Random Forest (Max Depth = 18) - Test Accuracy: 0.8392

Bagging - Train Accuracy: 1.0
Bagging - Test Accuracy: 0.8769

Bagging (Max Depth = 18) - Train Accuracy: 0.9133
Bagging (Max Depth = 18) - Test Accuracy: 0.833

Gradient Boosting (Max Depth = 10) - Train Accuracy: 0.998525
Gradient Boosting (Max Depth = 10) - Test Accuracy: 0.8913

Gradient Boosting (Max Depth = 7) - Train Accuracy: 0.940825
Gradient Boosting (Max Depth = 7) - Test Accuracy: 0.856



In [None]:
rf = dforest_class.make_forest_classifier_sklearn(rf_model)
rf2 = dforest_class.make_forest_classifier_sklearn(rf2_model)
bag = dforest_class.make_forest_classifier_sklearn(bag_model)
bag2 = dforest_class.make_forest_classifier_sklearn(bag2_model)
gb = dforest_class.make_forest_classifier_sklearn(gb_model, gb=True)
gb2 = dforest_class.make_forest_classifier_sklearn(gb2_model, gb=True)

In [8]:
rf.print_summary()
rf2.print_summary()

# classes: 7
Size of forests: [100, 100, 100, 100, 100, 100, 100]
Average Tree Size: 5842.03
Avg Max Depth: 39.93
Class 1:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 2:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 3:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 4:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 5:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 6:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 7:
Minimum: [0.0, None]
Maximum: [None, 1.0]
# classes: 7
Size of forests: [100, 100, 100, 100, 100, 100, 100]
Average Tree Size: 2598.32
Avg Max Depth: 18.0
Class 1:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 2:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 3:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 4:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 5:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 6:
Minimum: [0.0, None]
Maximum: [None, 1.0]
Class 7:
Minimum: [0.0, None]
Maximum: [None, 1.0]


In [11]:
n_trials = 25
kwargs = {"pso_max_iters": 5, "merge_limit": 15}
clip_min = np.min(data_X, axis=0)
clip_max = np.max(data_X, axis=0)
delta = np.std(train_X, axis=0) / 20
rf_res, cexs = query.multiclass_rob_query_many(rf2, test_X[0:n_trials], delta, clip_min, clip_max, **kwargs)

print(f"True: {len(rf_res[0])}")
print(f"False: {len(rf_res[1])}")
print(f"None: {len(rf_res[2])}")

True: 17
False: 7
None: 1
