Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NOCATS: Categorical splits for tree-based learners (ctnd.) #12866

Open
wants to merge 59 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
61e28ec
Remove inverted dependence of _utils.pyx on _tree.pyx because it was …
jblackburne Dec 22, 2016
926d48f
Tree constructor now checks for mismatched struct sizes.
jblackburne Oct 6, 2016
82e932e
Created SplitValue datatype to generalize the concept of a threshold …
jblackburne Oct 8, 2016
5cfa6c2
Added attribute n_categories to Splitter and Tree, an array of ints t…
jblackburne Oct 8, 2016
a65b848
Added a goes_left function to replace threshold comparisons during pr…
jblackburne Oct 20, 2016
2bd5633
BestSplitter now calculates the best categorical split.
jblackburne Jan 8, 2017
e3f0a99
Added categorical split code to RandomSplitter.node_split
jblackburne Oct 29, 2016
35dad26
Added an implementation of the Breiman sorting shortcut for finding t…
jblackburne Jan 8, 2017
2312ce0
Added categorical constructor parameter and error checking to BaseDec…
jblackburne Nov 4, 2016
e0068f8
Added the categorical keyword to forest constructors.
jblackburne Nov 23, 2016
1b77326
Added some unit tests.
jblackburne Dec 5, 2016
149b7bf
Refactored _partial_dependence_tree a little.
jblackburne Dec 21, 2016
b65da1c
Added categorical support to gradient boosting.
jblackburne Dec 21, 2016
04c5fa9
compile with recent cython
adrinjalali Nov 29, 2018
1c6f5b9
Merge remote-tracking branch 'upstream/master' into tree/nocats
adrinjalali Nov 29, 2018
0becd7b
Merge remote-tracking branch 'upstream/master' into tree/nocats
adrinjalali Dec 23, 2018
95a0bd2
compile goes a bit further
adrinjalali Dec 25, 2018
28f42b1
compiles
adrinjalali Dec 26, 2018
724e720
fix yield based tests
adrinjalali Dec 26, 2018
614e51c
add cat_split to NODE_DTYPE
adrinjalali Dec 26, 2018
51e51cf
compare float arrays with almost_equal
adrinjalali Dec 26, 2018
7840004
remove extra import
adrinjalali Dec 26, 2018
3d12660
remove commented extra lines
adrinjalali Dec 26, 2018
bb02337
fix some docstrings
adrinjalali Dec 26, 2018
56cdb72
remove overlapping DTYPE
adrinjalali Dec 26, 2018
d49ff0e
fix forest doctest
adrinjalali Dec 26, 2018
081dd83
remove input validation from __init__, it's done in fit
adrinjalali Dec 26, 2018
51b908c
fix extra tree param docstring
adrinjalali Dec 26, 2018
45d1f33
improve tests for invalid categorical input
adrinjalali Dec 26, 2018
4f1f360
use pytest.raises instead in invalid categorical sparse test
adrinjalali Dec 27, 2018
ea263ea
add cython code coverage to see uncovered code.
adrinjalali Dec 27, 2018
e173dee
add language_level and profiling directives to cython files
adrinjalali Dec 27, 2018
3eafa0a
Merge remote-tracking branch 'upstream/master' into tree/nocats
adrinjalali Dec 27, 2018
a4400db
revert linetrace directive
adrinjalali Dec 27, 2018
b88cef2
revert coveragerc cython support
adrinjalali Dec 28, 2018
f6fef44
Merge branch 'master' into tree/nocats
adrinjalali Jan 12, 2019
c9b263c
benchmark added
adrinjalali Jan 12, 2019
6ae188d
some benchmark cleanup
adrinjalali Jan 12, 2019
9bae7d0
more benchmark cleanup
adrinjalali Jan 12, 2019
b1bd2d7
remove extra import
adrinjalali Jan 12, 2019
05d2985
more benchmark touches
adrinjalali Jan 13, 2019
428206c
pep8
adrinjalali Jan 13, 2019
2974d5a
pep8
adrinjalali Jan 13, 2019
286b04b
add some tests
adrinjalali Jan 17, 2019
90d7365
tests too hard
adrinjalali Jan 19, 2019
475ea7b
add some forest tests
adrinjalali Jan 21, 2019
1e2bcfe
mostly cosmetics
adrinjalali Jan 21, 2019
35f273d
fix typo in _splitters.pyx
adrinjalali Jan 30, 2019
b0d73e0
n_categories as memview
adrinjalali Feb 3, 2019
bb4abfe
minor cleanup
adrinjalali Feb 3, 2019
532061f
cat_split as a BitSet
adrinjalali Feb 3, 2019
15a184e
minor fix (n_categories < 0)
adrinjalali Feb 4, 2019
38e7b95
move from class BitSet to BITSET_t and functions
adrinjalali Feb 4, 2019
1a78f1a
better docstring for cache functions
adrinjalali Feb 6, 2019
646a86a
(gbc) fix realloc size param
adrinjalali Feb 6, 2019
6be8edd
bs_* with no pointers
adrinjalali Feb 6, 2019
5d04b18
fix SplitValue description
adrinjalali Feb 6, 2019
f44152b
fix silly == bug
adrinjalali Feb 25, 2019
d3925f0
Merge remote-tracking branch 'upstream/master' into tree/nocats
adrinjalali Feb 25, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
65 changes: 47 additions & 18 deletions sklearn/ensemble/_gradient_boosting.pyx
Expand Up @@ -20,10 +20,13 @@ from scipy.sparse import csr_matrix

from sklearn.tree._tree cimport Node
from sklearn.tree._tree cimport Tree
from sklearn.tree._tree cimport CategoryCacheMgr
from sklearn.tree._tree cimport DTYPE_t
from sklearn.tree._tree cimport SIZE_t
from sklearn.tree._tree cimport INT32_t
from sklearn.tree._tree cimport UINT32_t
from sklearn.tree._utils cimport safe_realloc
from sklearn.tree._utils cimport goes_left

ctypedef np.int32_t int32
ctypedef np.float64_t float64
Expand All @@ -48,6 +51,8 @@ cdef void _predict_regression_tree_inplace_fast_dense(DTYPE_t *X,
Py_ssize_t K,
Py_ssize_t n_samples,
Py_ssize_t n_features,
INT32_t* n_categories,
UINT32_t** cachebits,
float64 *out):
"""Predicts output for regression tree and stores it in ``out[i, k]``.

Expand Down Expand Up @@ -82,20 +87,32 @@ cdef void _predict_regression_tree_inplace_fast_dense(DTYPE_t *X,
``n_samples == X.shape[0]``.
n_features : int
The number of features; ``n_samples == X.shape[1]``.
n_categories : INT32_t pointer
Array of length n_features containing the number of categories
(for categorical features) or -1 (for non-categorical features)
cachebits : UINT32_t pointer pointer
Array of length node_count containing category cache buffers
for categorical features
out : np.float64_t pointer
The pointer to the data array where the predictions are stored.
``out`` is assumed to be a two-dimensional array of
shape ``(n_samples, K)``.
"""
cdef Py_ssize_t i
cdef Node *node
cdef UINT32_t* node_cache

for i in range(n_samples):
node = root_node
node_cache = cachebits[0]
# While node not a leaf
while node.left_child != TREE_LEAF:
if X[i * n_features + node.feature] <= node.threshold:
if goes_left(X[i * n_features + node.feature], node.split_value,
n_categories[node.feature], node_cache):
node_cache = cachebits[node.left_child]
node = root_node + node.left_child
else:
node_cache = cachebits[node.right_child]
node = root_node + node.right_child
out[i * K + k] += scale * value[node - root_node]

Expand Down Expand Up @@ -130,8 +147,8 @@ def _predict_regression_tree_stages_sparse(np.ndarray[object, ndim=2] estimators
cdef Tree tree
cdef Node** nodes = NULL
cdef double** values = NULL
safe_realloc(&nodes, n_stages * n_outputs)
safe_realloc(&values, n_stages * n_outputs)
safe_realloc(<void ***>&nodes, n_stages * n_outputs, sizeof(void*))
safe_realloc(<void ***>&values, n_stages * n_outputs, sizeof(void*))
for stage_i in range(n_stages):
for output_i in range(n_outputs):
tree = estimators[stage_i, output_i].tree_
Expand All @@ -147,8 +164,8 @@ def _predict_regression_tree_stages_sparse(np.ndarray[object, ndim=2] estimators
# which features are nonzero in the present sample.
cdef SIZE_t* feature_to_sample = NULL

safe_realloc(&X_sample, n_features)
safe_realloc(&feature_to_sample, n_features)
safe_realloc(&X_sample, n_features, sizeof(DTYPE_t))
safe_realloc(&feature_to_sample, n_features, sizeof(SIZE_t))

memset(feature_to_sample, -1, n_features * sizeof(SIZE_t))

Expand All @@ -174,7 +191,7 @@ def _predict_regression_tree_stages_sparse(np.ndarray[object, ndim=2] estimators
else:
feature_value = 0.

if feature_value <= node.threshold:
if feature_value <= node.split_value.threshold:
node = root_node + node.left_child
else:
node = root_node + node.right_child
Expand Down Expand Up @@ -216,13 +233,18 @@ def predict_stages(np.ndarray[object, ndim=2] estimators,
for k in range(K):
tree = estimators[i, k].tree_

# Make category cache buffers for this tree's nodes
cache_mgr = CategoryCacheMgr()
cache_mgr.populate(tree.nodes, tree.node_count, tree.n_categories)

# avoid buffer validation by casting to ndarray
# and get data pointer
# need brackets because of casting operator priority
_predict_regression_tree_inplace_fast_dense(
<DTYPE_t*> (<np.ndarray> X).data,
tree.nodes, tree.value,
scale, k, K, X.shape[0], X.shape[1],
tree.n_categories, cache_mgr.bits,
<float64 *> (<np.ndarray> out).data)
## out += scale * tree.predict(X).reshape((X.shape[0], 1))

Expand Down Expand Up @@ -293,27 +315,34 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X,
cdef SIZE_t node_count = tree.node_count

cdef SIZE_t stack_capacity = node_count * 2
cdef Node **node_stack
cdef double[::1] weight_stack = np_ones((stack_capacity,), dtype=np_float64)
cdef SIZE_t stack_size = 1
cdef double left_sample_frac
cdef double current_weight
cdef double total_weight = 0.0
cdef Node *current_node
underlying_stack = np_zeros((stack_capacity,), dtype=np.intp)
node_stack = <Node **>(<np.ndarray> underlying_stack).data
cdef SIZE_t[::1] node_stack = np_zeros((stack_capacity,), dtype=np.intp)
cdef UINT32_t** cachebits
cdef UINT32_t* node_cache

# Make category cache buffers for this tree's nodes
cache_mgr = CategoryCacheMgr()
cache_mgr.populate(root_node, node_count, tree.n_categories)
cachebits = cache_mgr.bits

for i in range(X.shape[0]):
# init stacks for new example
stack_size = 1
node_stack[0] = root_node
node_stack[0] = 0
node_cache = cachebits[0]
weight_stack[0] = 1.0
total_weight = 0.0

while stack_size > 0:
# get top node on stack
stack_size -= 1
current_node = node_stack[stack_size]
current_node = root_node + node_stack[stack_size]
node_cache = cachebits[node_stack[stack_size]]

if current_node.left_child == TREE_LEAF:
out[i] += weight_stack[stack_size] * value[current_node - root_node] * \
Expand All @@ -325,21 +354,21 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X,
if feature_index != -1:
# split feature in target set
# push left or right child on stack
if X[i, feature_index] <= current_node.threshold:
if goes_left(X[i, feature_index], current_node.split_value,
tree.n_categories[current_node.feature],
node_cache):
# left
node_stack[stack_size] = (root_node +
current_node.left_child)
node_stack[stack_size] = current_node.left_child
else:
# right
node_stack[stack_size] = (root_node +
current_node.right_child)
node_stack[stack_size] = current_node.right_child
stack_size += 1
else:
# split feature in complement set
# push both children onto stack

# push left child
node_stack[stack_size] = root_node + current_node.left_child
node_stack[stack_size] = current_node.left_child
current_weight = weight_stack[stack_size]
left_sample_frac = root_node[current_node.left_child].n_node_samples / \
<double>current_node.n_node_samples
Expand All @@ -354,7 +383,7 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X,
stack_size +=1

# push right child
node_stack[stack_size] = root_node + current_node.right_child
node_stack[stack_size] = current_node.right_child
weight_stack[stack_size] = current_weight * \
(1.0 - left_sample_frac)
stack_size +=1
Expand Down