diff --git a/src/dependent.jl b/src/dependent.jl
index cd394fd..056f8e4 100644
--- a/src/dependent.jl
+++ b/src/dependent.jl
@@ -266,20 +266,3 @@ function _filter_linearly_dependent(rules::Vector{Rule})::Vector{Rule}
     end
     return out
 end
-
-"""
-Return a linearly independent subset of `rules` of length ≤ `max_rules`.
-
-!!! note
-    This doesn't use p0 like is done in the paper.
-    The problem, IMO, with p0 is that it is very difficult to decide beforehand what p0 is suitable and so it requires hyperparameter tuning.
-    Instead, luckily, the linearly dependent filter is quite fast here, so passing a load of rules into that and then selecting the first `max_rules` is feasible.
-"""
-function _process_rules(
-        rules::Vector{Rule},
-        max_rules::Int
-    )::Vector{Rule}
-    simplified = _simplify_single_rules(rules)
-    filtered = _filter_linearly_dependent(simplified)
-    return first(filtered, max_rules)
-end
diff --git a/src/mlj.jl b/src/mlj.jl
index 7b3fb4a..357bd17 100644
--- a/src/mlj.jl
+++ b/src/mlj.jl
@@ -151,7 +151,7 @@ Base.@kwdef mutable struct StableRulesClassifier <: Probabilistic
     q::Int=10
     min_data_in_leaf::Int=5
     max_rules::Int=10
-    lambda::Float64=5
+    lambda::Float64=1.0
 end
 
 """
diff --git a/src/rules.jl b/src/rules.jl
index d721db1..80d5d60 100644
--- a/src/rules.jl
+++ b/src/rules.jl
@@ -189,12 +189,6 @@ function _else_output!(
     end
 end
 
-function _frequency_sort(V::AbstractVector)
-    counts = _count_unique(V)
-    sorted = sort(collect(counts); by=last, rev=true)
-    return first.(sorted)
-end
-
 function Rule(
     root::Node,
     node::Union{Node, Leaf},
@@ -286,16 +280,6 @@ function Base.hash(rule::Rule)
     hash([rule.path.splits, rule.then, rule.otherwise])
 end
 
-function _count_unique(V::AbstractVector{T}) where T
-    U = unique(V)
-    l = length(U)
-    counts = Dict{T,Int}(zip(U, zeros(l)))
-    for v in V
-        counts[v] += 1
-    end
-    return counts
-end
-
 struct StableRules{T} <: StableModel
     rules::Vector{Rule}
     algo::Algorithm
@@ -320,6 +304,43 @@ function _remove_zero_weights(rules::Vector{Rule}, weights::Vector{Float16})
     return filtered_rules, filtered_weights
 end
 
+function _count_unique(V::AbstractVector{T}) where T
+    U = unique(V)
+    l = length(U)
+    counts = Dict{T,Int}(zip(U, zeros(l)))
+    for v in V
+        counts[v] += 1
+    end
+    return counts
+end
+
+"Return a vector of unique values in `V` sorted by frequency."
+function _sort_by_frequency(V::AbstractVector{T}) where T
+    counts = _count_unique(V)::Dict{T, Int}
+    sorted = sort(collect(counts), by=last, rev=true)
+    return first.(sorted)
+end
+
+"""
+Apply _rule selection_ and _rule set post-treatment_
+(Bénard et al., [2021](http://proceedings.mlr.press/v130/benard21a)).
+
+Rule selection, here, denotes sorting the set by frequency.
+Next, linearly dependent rules are removed from the set.
+To ensure the size of the final set is equal to `max_rules` in most cases, we ignore the
+p0 parameter and instead pass all rules directly to the linearly dependent filter.
+This is possible because the filter for linear dependencies is quite fast.
+"""
+function _process_rules(
+        rules::Vector{Rule},
+        max_rules::Int
+    )::Vector{Rule}
+    simplified = _simplify_single_rules(rules)
+    sorted = _sort_by_frequency(simplified)
+    filtered = _filter_linearly_dependent(sorted)
+    return first(filtered, max_rules)
+end
+
 function StableRules(
     rules::Vector{Rule},
     algo::Algorithm,
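For readers skimming the patch, here is a minimal standalone sketch of what the relocated helpers compute. The names `count_unique` and `sort_by_frequency` below are local stand-ins re-implemented outside the package (plain integers instead of `Rule`s, since both functions are generic over the element type); the expected values match the assertions added in `test/rules.jl`.

```julia
# Local stand-ins for the helpers added to src/rules.jl above.
function count_unique(V::AbstractVector{T}) where T
    counts = Dict{T,Int}()
    for v in V
        counts[v] = get(counts, v, 0) + 1
    end
    return counts
end

function sort_by_frequency(V::AbstractVector)
    # Most frequent value first; `_process_rules` relies on this ordering
    # before handing the rules to the linear-dependence filter.
    sorted = sort(collect(count_unique(V)); by=last, rev=true)
    return first.(sorted)
end

count_unique([1, 1, 1, 2])      # Dict(1 => 3, 2 => 1)
sort_by_frequency([1, 1, 1, 2]) # [1, 2]
```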
+""" +function _process_rules( + rules::Vector{Rule}, + max_rules::Int + )::Vector{Rule} + simplified = _simplify_single_rules(rules) + sorted = _sort_by_frequency(simplified) + filtered = _filter_linearly_dependent(sorted) + return first(filtered, max_rules) +end + function StableRules( rules::Vector{Rule}, algo::Algorithm, diff --git a/src/weights.jl b/src/weights.jl index 9e9fa13..1f4501f 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -49,7 +49,7 @@ Note that the exact weights do not matter for the classification case, since the highest class will be selected anyway. For regression however, the weights should sum to roughly one. """ -function _estimate_coefficients( +function _estimate_coefficients!( algo::Algorithm, binary_feature_data::Matrix{Float16}, outcome::Vector{Float16}, @@ -63,15 +63,17 @@ function _estimate_coefficients( # Also, ElasticNet shows no clear benefit in accuracy. # According to Clément, avoid Lasso since it would introduce additional # sparsity and then instability in the rule selection. - model = RidgeRegressor(; fit_intercept=false, model.lambda) + lambda = model.lambda + model = RidgeRegressor(; fit_intercept=false, lambda) coefs = MLJLinearModels.fit(glr(model), binary_feature_data, outcome)::Vector + # Avoid negative coefficients. + coefs = max.(coefs, 0) if algo isa Regression # Ensure that coefs sum roughly to one. total = sum(coefs) coefs = coefs ./ total end - # Avoid negative coefficients. - return max.(coefs, 0) + return coefs end """ @@ -90,6 +92,6 @@ function _weights( ) binary_feature_data = _binary_features(rules, data) y = convert(Vector{Float16}, outcome) - coefficients = _estimate_coefficients(algo, binary_feature_data, y, model) + coefficients = _estimate_coefficients!(algo, binary_feature_data, y, model) return coefficients::Vector{Float16} end diff --git a/test/mlj.jl b/test/mlj.jl index a6c96d5..15e068d 100644 --- a/test/mlj.jl +++ b/test/mlj.jl @@ -25,7 +25,7 @@ datasets = Dict{String,Tuple}( end, "boston" => boston(), "make_regression" => let - make_regression(200, 3; noise=0.0, sparse=0.0, outliers=0.0, rng=_rng()) + make_regression(600, 3; noise=0.0, sparse=0.0, outliers=0.0, rng=_rng()) end ) @@ -126,13 +126,19 @@ preds = predict(rulesmach) @test 0.95 < auc(preds, y) let - hyper = (; rng=_rng(), n_trees=50) + hyper = (; rng=_rng(), max_depth=2) + e = _evaluate!(results, "blobs", StableForestClassifier, hyper) + + hyper = (; rng=_rng(), max_depth=2, max_rules=30) e = _evaluate!(results, "blobs", StableRulesClassifier, hyper) - @test 0.95 < _score(e) for fitted in e.fitted_params_per_fold @test fitted.fitresult.classes == [1, 2] end + + hyper = (; rng=_rng(), max_depth=2, max_rules=10) + e = _evaluate!(results, "blobs", StableRulesClassifier, hyper) + @test 0.95 < _score(e) end n_trees = 40 @@ -151,10 +157,15 @@ let end let - hyper = (; rng=_rng(), n_trees=1_500) + hyper = (; rng=_rng(), max_depth=2) e = _evaluate!(results, "titanic", StableForestClassifier, hyper) @test 0.80 < _score(e) + hyper = (; rng=_rng(), max_depth=2, max_rules=30) + e = _evaluate!(results, "titanic", StableRulesClassifier, hyper) + @test 0.79 < _score(e) + + hyper = (; rng=_rng(), max_depth=2, max_rules=10) e = _evaluate!(results, "titanic", StableRulesClassifier, hyper) @test 0.79 < _score(e) end @@ -185,66 +196,74 @@ let end let - hyper = (; rng=_rng(), n_trees=1_500) + hyper = (; rng=_rng(), max_depth=2) e = _evaluate!(results, "haberman", StableForestClassifier, hyper) @test 0.60 < _score(e) + hyper = (; rng=_rng(), max_depth=2, max_rules=30) e = 
diff --git a/test/mlj.jl b/test/mlj.jl
index a6c96d5..15e068d 100644
--- a/test/mlj.jl
+++ b/test/mlj.jl
@@ -25,7 +25,7 @@ datasets = Dict{String,Tuple}(
     end,
     "boston" => boston(),
     "make_regression" => let
-        make_regression(200, 3; noise=0.0, sparse=0.0, outliers=0.0, rng=_rng())
+        make_regression(600, 3; noise=0.0, sparse=0.0, outliers=0.0, rng=_rng())
     end
 )
 
@@ -126,13 +126,19 @@ preds = predict(rulesmach)
 @test 0.95 < auc(preds, y)
 
 let
-    hyper = (; rng=_rng(), n_trees=50)
+    hyper = (; rng=_rng(), max_depth=2)
+    e = _evaluate!(results, "blobs", StableForestClassifier, hyper)
+
+    hyper = (; rng=_rng(), max_depth=2, max_rules=30)
     e = _evaluate!(results, "blobs", StableRulesClassifier, hyper)
-    @test 0.95 < _score(e)
     for fitted in e.fitted_params_per_fold
         @test fitted.fitresult.classes == [1, 2]
     end
+
+    hyper = (; rng=_rng(), max_depth=2, max_rules=10)
+    e = _evaluate!(results, "blobs", StableRulesClassifier, hyper)
+    @test 0.95 < _score(e)
 end
 
 n_trees = 40
@@ -151,10 +157,15 @@
 let
-    hyper = (; rng=_rng(), n_trees=1_500)
+    hyper = (; rng=_rng(), max_depth=2)
     e = _evaluate!(results, "titanic", StableForestClassifier, hyper)
     @test 0.80 < _score(e)
 
+    hyper = (; rng=_rng(), max_depth=2, max_rules=30)
+    e = _evaluate!(results, "titanic", StableRulesClassifier, hyper)
+    @test 0.79 < _score(e)
+
+    hyper = (; rng=_rng(), max_depth=2, max_rules=10)
     e = _evaluate!(results, "titanic", StableRulesClassifier, hyper)
     @test 0.79 < _score(e)
 end
@@ -185,66 +196,74 @@ end
 
 let
-    hyper = (; rng=_rng(), n_trees=1_500)
+    hyper = (; rng=_rng(), max_depth=2)
     e = _evaluate!(results, "haberman", StableForestClassifier, hyper)
     @test 0.60 < _score(e)
 
+    hyper = (; rng=_rng(), max_depth=2, max_rules=30)
     e = _evaluate!(results, "haberman", StableRulesClassifier, hyper)
     @test 0.60 < _score(e)
     fitresult = string(first(e.fitted_params_per_fold).fitresult)
     @test contains(fitresult, ":x1")
+
+    hyper = (; rng=_rng(), max_depth=2, max_rules=10)
+    e = _evaluate!(results, "haberman", StableRulesClassifier, hyper)
+    @test 0.60 < _score(e)
 end
 
-rulesmodel = StableRulesRegressor(; max_depth=1, n_trees=50, rng=_rng())
+rulesmodel = StableRulesRegressor(; max_depth=2, max_rules=30, rng=_rng())
 X, y = datasets["boston"]
 rulesmach = machine(rulesmodel, X, y)
 fit!(rulesmach; verbosity=0)
 preds = predict(rulesmach)
 @test preds isa Vector{Float64}
+# @show rsq(preds, y)
 # @test 0.95 < rsq(preds, y)
 
 emr = let
-    hyper = (;)
     measure = rsq
-    _evaluate!(results, "make_regression", LGBMRegressor, hyper; measure)
-    _evaluate!(results, "make_regression", LinearRegressor, hyper; measure)
+    data = "make_regression"
+    hyper = (;)
+    _evaluate!(results, data, LGBMRegressor, hyper; measure)
 
-    hyper = (; rng=_rng(), n_trees=1_500)
-    _evaluate!(results, "make_regression", StableForestRegressor, hyper; measure)
+    hyper = (; max_depth=2)
+    _evaluate!(results, data, LGBMRegressor, hyper; measure)
 
-    hyper = (; rng=_rng(), n_trees=1_500, max_rules=100)
-    _evaluate!(results, "make_regression", StableRulesRegressor, hyper; measure)
+    hyper = (; max_depth=2, rng=_rng())
+    _evaluate!(results, data, StableForestRegressor, hyper; measure)
 
-    hyper = (; rng=_rng(), n_trees=1_500, max_rules=30)
-    _evaluate!(results, "make_regression", StableRulesRegressor, hyper; measure)
+    hyper = (; rng=_rng(), max_depth=2, max_rules=30)
+    _evaluate!(results, data, StableRulesRegressor, hyper; measure)
 
-    hyper = (; rng=_rng(), n_trees=1_500, max_rules=10)
-    _evaluate!(results, "make_regression", StableRulesRegressor, hyper; measure)
+    hyper = (; rng=_rng(), max_depth=2, max_rules=10)
+    er = _evaluate!(results, data, StableRulesRegressor, hyper; measure)
+    @test 0.50 < _score(er)
 end
 
 el = let
     hyper = (;)
     measure = rsq
     elgbm = _evaluate!(results, "boston", LGBMRegressor, hyper; measure)
-    el = _evaluate!(results, "boston", LinearRegressor, hyper; measure)
-    hyper = (; rng=_rng(), n_trees=1_500)
+
+    hyper = (; max_depth=2)
+    el = _evaluate!(results, "boston", LGBMRegressor, hyper; measure)
+
+    hyper = (; max_depth=2, rng=_rng())
     ef = _evaluate!(results, "boston", StableForestRegressor, hyper; measure)
-    @test 0.65 < _score(el)
+    @test 0.62 < _score(el)
     @test _score(el) ≈ _score(ef) atol=0.05
     @test 0.65 < _score(elgbm)
     el
 end
 
 er = let
-    hyper = (; rng=_rng(), n_trees=1_500, max_rules=100)
-    _evaluate!(results, "boston", StableRulesRegressor, hyper; measure=rsq)
-
-    hyper = (; rng=_rng(), n_trees=1_500, max_rules=30)
+    hyper = (; rng=_rng(), max_depth=2, max_rules=30)
     er = _evaluate!(results, "boston", StableRulesRegressor, hyper; measure=rsq)
 
-    hyper = (; rng=_rng(), n_trees=1_500, max_rules=10)
+    hyper = (; rng=_rng(), max_depth=2, max_rules=10)
     er = _evaluate!(results, "boston", StableRulesRegressor, hyper; measure=rsq)
+    @test 0.55 < _score(er)
 end
 
 pretty = rename(results, :se => "1.96*SE")
diff --git a/test/rules.jl b/test/rules.jl
index cf46fe5..bec7ab1 100644
--- a/test/rules.jl
+++ b/test/rules.jl
@@ -55,7 +55,7 @@ rules = S._rules(forest)
 
 @test S._count_unique([1, 1, 1, 2]) == Dict(1 => 3, 2 => 1)
 
-weights = [0.395, 0.197, 0.187, 0.057, 0.054, 0.043, 0.027, 0.02, 0.01, 0.01]
+@test S._sort_by_frequency([r1, r5, r1]) == [r1, r5]
 
 algo = SIRUS.Classification()
 empty_model = S.StableRules(S.Rule[], algo, [1], Float16[0.1])
diff --git a/test/weights.jl b/test/weights.jl
index 14fe10c..0a39a0b 100644
--- a/test/weights.jl
+++ b/test/weights.jl
@@ -5,14 +5,16 @@ data = [0.0 2.5
 r1 = S.Rule(S.TreePath(" X[i, 1] < 1 "), [0.1], [0.0])
 r2 = S.Rule(S.TreePath(" X[i, 2] < 2 "), [0.2], [0.0])
 
-binary_feature_data = Float16[1 0; 0 0; 1 1]
+binary_feature_data = Float16[1 0;
+                              0 0;
+                              1 1]
 @test S._binary_features([r1, r2], data) == binary_feature_data
 
 y = Float16[0.5, 0, 1]
 model = StableRulesClassifier()
 algo = S.Classification()
-@test S._estimate_coefficients(algo, binary_feature_data, y, model) isa Vector
+@test S._estimate_coefficients!(algo, binary_feature_data, copy(y), model) isa Vector
 
 @test SIRUS._normalize!([1.0, 2.0, 3.0]) == [0.0, 0.5, 1.0]
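Finally, a usage sketch with the hyperparameters the updated tests converge on (`max_depth=2` plus a `max_rules` budget instead of large `n_trees`). The dataset call mirrors the one in `test/mlj.jl`; the `StableRNG(1)` seed and the `StableRNGs` import are illustrative assumptions standing in for the test suite's internal `_rng()` helper.

```julia
using MLJ: fit!, machine, predict, make_regression
using SIRUS: StableRulesRegressor
using StableRNGs: StableRNG

# Synthetic regression data, mirroring the call used in test/mlj.jl.
X, y = make_regression(600, 3; noise=0.0, sparse=0.0, outliers=0.0, rng=StableRNG(1))

model = StableRulesRegressor(; max_depth=2, max_rules=10, rng=StableRNG(1))
mach = machine(model, X, y)
fit!(mach; verbosity=0)
preds = predict(mach)  # predictions on the training rows, as in the tests
```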