diff --git a/src/cutpoints.jl b/src/cutpoints.jl new file mode 100644 index 0000000..3ffa830 --- /dev/null +++ b/src/cutpoints.jl @@ -0,0 +1,61 @@ +""" +Return a rough estimate for the index of the cutpoint. +Choose the highest suitable index if there is more than one suitable index. +The reason is that this will split the data nicely in combination with the `<` used later on. +For example, for [1, 2, 3, 4], both 2 and 3 satisfy the 0.5 quantile. +In this case, we pick the ceil, so 3. +Next, the tree will split on 3, causing left (<) to contain 1 and 2 and right (≥) to contain 3 and 4. +""" +_rough_cutpoint_index_estimate(n::Int, quantile::Real) = Int(ceil(quantile * n)) + +"Return the empirical `quantile` for data `V`." +function _empirical_quantile(V::AbstractVector, quantile::Real) + @assert 0.0 ≤ quantile ≤ 1.0 + n = length(V) + index = _rough_cutpoint_index_estimate(n, quantile) + if index == 0 + index = 1 + end + if index == n + 1 + index = n + end + sorted = sort(V) + return Float(sorted[index]) +end + +"Return a vector of `q` cutpoints taken from the empirical distribution from data `V`." +function _cutpoints(V::AbstractVector, q::Int) + @assert 2 ≤ q + # Taking 2 extra to avoid getting minimum(V) and maximum(V) becoming cutpoints. + # Tree on left and right have always respectively length 0 and 1 then anyway. + length = q + 2 + quantiles = range(0.0; stop=1.0, length)[2:end-1] + return Float[_empirical_quantile(V, quantile) for quantile in quantiles] +end + +"Return the number of features `p` in a dataset `X`." +_p(X::AbstractMatrix) = size(X, 2) +_p(X) = length(Tables.columnnames(X)) + +"Set of possible cutpoints, that is, numbers from the empirical quantiles." +const Cutpoints = Vector{Float} + +_view_feature(X::AbstractMatrix, feature::Int) = view(X, :, feature) +_view_feature(X, feature::Int) = X[feature] + +""" +Return a vector of vectors containing +- one inner vector for each feature in the dataset and +- inner vectors containing the unique cutpoints, that is, `length(V[i])` ≤ `q` for all i in V. + +Using unique here to avoid checking splits twice. +""" +function _cutpoints(X, q::Int)::Vector{Cutpoints} + p = _p(X) + cutpoints = Vector{Cutpoints}(undef, p) + for feature in 1:p + V = _view_feature(X, feature) + cutpoints[feature] = unique(_cutpoints(V, q)) + end + return cutpoints +end diff --git a/src/forest.jl b/src/forest.jl index e46b8de..2ecd39d 100644 --- a/src/forest.jl +++ b/src/forest.jl @@ -96,17 +96,6 @@ function _cutpoints(X, q::Int) return cutpoints end -""" -Return a view on all `y` for which the `comparison` holds in `data`. -`indexes_in_region` is re-used between calls. -""" -function _view_y!(mask, data, y, comparison, cutpoint) - for i in eachindex(data) - mask[i] = comparison(data[i], cutpoint) - end - return view(y, mask) -end - """ SplitPoint(feature::Int, value::Float, feature_name::String) @@ -131,9 +120,14 @@ _feature(sp::SplitPoint) = sp.feature _value(sp::SplitPoint) = sp.value _feature_name(sp::SplitPoint) = sp.feature_name -function _information_gain(y, y_left, y_right, classes) +function _information_gain( + y, + y_left, + y_right, + classes, + starting_impurity::Real + ) p = length(y_left) / length(y) - starting_impurity = _gini(y, classes) impurity_change = p * _gini(y_left, classes) + (1 - p) * _gini(y_right, classes) return starting_impurity - impurity_change end @@ -143,6 +137,18 @@ function _rand_subset(rng::AbstractRNG, V::AbstractVector, n::Int) return view(shuffle(rng, V), 1:n) end +""" +Return a view on all `y` for which the `comparison` holds in `data`. +`mask` is re-used between calls. +""" +function _view_y!(mask, X, feature::Int, y, comparison, cutpoint) + for i in eachindex(y) + value = @inbounds X[i, feature] + mask[i] = comparison(value, cutpoint) + end + return @inbounds view(y, mask) +end + """ Return the split for which the gini index is maximized. This function receives the cutpoints for the whole dataset `D` because `X` can be a subset of `D`. @@ -157,21 +163,21 @@ function _split( cutpoints::Vector{Cutpoints}; max_split_candidates::Int=_p(X) ) - best_score = Float(0) + best_score = 0.0 best_score_feature = 0 - best_score_cutpoint = Float(0) + best_score_cutpoint = 0.0 p = _p(X) mc = max_split_candidates possible_features = mc == p ? (1:p) : _rand_subset(rng, 1:p, mc) mask = Vector{Bool}(undef, length(y)) + starting_impurity = _gini(y, classes) for feature in possible_features - data = view(X, :, feature) for cutpoint in cutpoints[feature] - y_left = _view_y!(mask, data, y, <, cutpoint) + y_left = _view_y!(mask, X, feature, y, <, cutpoint) isempty(y_left) && continue - y_right = _view_y!(mask, data, y, ≥, cutpoint) + y_right = _view_y!(mask, X, feature, y, ≥, cutpoint) isempty(y_right) && continue - gain = _information_gain(y, y_left, y_right, classes) + gain = _information_gain(y, y_left, y_right, classes, starting_impurity) if best_score ≤ gain best_score = gain best_score_feature = feature diff --git a/test/forest.jl b/test/forest.jl index 8ebfe92..8659120 100644 --- a/test/forest.jl +++ b/test/forest.jl @@ -8,8 +8,9 @@ X = [1 2; y = [1, 2] indexes = Vector{Bool}(undef, 2) -@test collect(ST._view_y!(indexes, X[:, 1], [1 2], <, 2)) == [1] -@test collect(ST._view_y!(indexes, X[:, 1], [1 2], >, 2)) == [2] +feature = 1 +@test collect(ST._view_y!(indexes, X, feature, [1 2], <, 2)) == [1] +@test collect(ST._view_y!(indexes, X, feature, [1 2], >, 2)) == [2] @test ST._cutpoints([3, 1, 2], 2) == Float[1, 2] @test ST._cutpoints(1:9, 3) == Float[3, 5, 7] diff --git a/test/mlj.jl b/test/mlj.jl index d7fda27..811adaa 100644 --- a/test/mlj.jl +++ b/test/mlj.jl @@ -177,12 +177,13 @@ let e = _evaluate_baseline!(results, "boston") end -let +function _evaluate_boston() hyper = (; rng=_rng(), n_trees=1_500) e = _evaluate!(results, "boston", StableForestClassifier, hyper) e = _evaluate!(results, "boston", StableRulesClassifier, hyper) end +_evaluate_boston() pretty = rename(results, :se => "1.96*SE") rename!(pretty, :nfolds => "`nfolds`")