Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Also sort by rule frequency #23

Merged
merged 5 commits into from
Jun 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 0 additions & 17 deletions src/dependent.jl
Original file line number Diff line number Diff line change
Expand Up @@ -266,20 +266,3 @@ function _filter_linearly_dependent(rules::Vector{Rule})::Vector{Rule}
end
return out
end

"""
Return a linearly independent subset of `rules` of length ≤ `max_rules`.

!!! note
    This doesn't use p0 like is done in the paper.
    The problem, IMO, with p0 is that it is very difficult to decide beforehand what p0 is suitable and so it requires hyperparameter tuning.
    Instead, luckily, the linearly dependent filter is quite fast here, so passing a load of rules into that and then selecting the first `max_rules` is feasible.
"""
function _process_rules(rules::Vector{Rule}, max_rules::Int)::Vector{Rule}
    # Simplify first, then drop linear dependencies, and only truncate at the end.
    independent = _filter_linearly_dependent(_simplify_single_rules(rules))
    return first(independent, max_rules)
end
2 changes: 1 addition & 1 deletion src/mlj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ Base.@kwdef mutable struct StableRulesClassifier <: Probabilistic
q::Int=10
min_data_in_leaf::Int=5
max_rules::Int=10
lambda::Float64=5
lambda::Float64=1.0
end

"""
Expand Down
53 changes: 37 additions & 16 deletions src/rules.jl
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,6 @@ function _else_output!(
end
end

"Return the unique values of `V` ordered from most to least frequent."
function _frequency_sort(V::AbstractVector)
    tally = _count_unique(V)
    # Rank the (value => count) pairs by their count, highest first.
    ranked = sort(collect(tally); by=last, rev=true)
    return map(first, ranked)
end

function Rule(
root::Node,
node::Union{Node, Leaf},
Expand Down Expand Up @@ -286,16 +280,6 @@ function Base.hash(rule::Rule)
hash([rule.path.splits, rule.then, rule.otherwise])
end

"""
    _count_unique(V::AbstractVector{T}) where T

Return a `Dict{T,Int}` mapping each distinct value in `V` to the number of
times it occurs. An empty input yields an empty `Dict{T,Int}`.
"""
function _count_unique(V::AbstractVector{T}) where T
    counts = Dict{T,Int}()
    for v in V
        # Single pass with `get` replaces the previous two-pass version, which
        # first called `unique` and seeded the Dict from `zeros(l)` — a
        # Float64 vector allocated only to be converted to Int.
        counts[v] = get(counts, v, 0) + 1
    end
    return counts
end

struct StableRules{T} <: StableModel
rules::Vector{Rule}
algo::Algorithm
Expand All @@ -320,6 +304,43 @@ function _remove_zero_weights(rules::Vector{Rule}, weights::Vector{Float16})
return filtered_rules, filtered_weights
end

"""
    _count_unique(V::AbstractVector{T}) where T

Return a `Dict{T,Int}` mapping each distinct value in `V` to the number of
times it occurs. An empty input yields an empty `Dict{T,Int}`.
"""
function _count_unique(V::AbstractVector{T}) where T
    counts = Dict{T,Int}()
    for v in V
        # Single pass with `get` replaces the previous two-pass version, which
        # first called `unique` and seeded the Dict from `zeros(l)` — a
        # Float64 vector allocated only to be converted to Int.
        counts[v] = get(counts, v, 0) + 1
    end
    return counts
end

"Return a vector of unique values in `V` sorted by frequency."
function _sort_by_frequency(V::AbstractVector{T}) where T
    tally = _count_unique(V)::Dict{T, Int}
    # Order the (value => count) pairs by descending count and keep the values.
    ranked = sort!(collect(tally); by=last, rev=true)
    return map(first, ranked)
end

"""
Apply _rule selection_ and _rule set post-treatment_
(Bénard et al., [2021](http://proceedings.mlr.press/v130/benard21a)).

Rule selection, here, denotes sorting the set by frequency.
Next, linearly dependent rules are removed from the set.
To ensure the size of the final set is equal to `max_rules` in most cases, we ignore the
p0 parameter and instead pass all rules directly to the linearly dependent filter.
This is possible because the filter for linear dependencies is quite fast.
"""
function _process_rules(rules::Vector{Rule}, max_rules::Int)::Vector{Rule}
    # Pipeline: simplify -> rank by frequency -> drop linear dependencies -> truncate.
    by_frequency = _sort_by_frequency(_simplify_single_rules(rules))
    independent = _filter_linearly_dependent(by_frequency)
    return first(independent, max_rules)
end

function StableRules(
rules::Vector{Rule},
algo::Algorithm,
Expand Down
12 changes: 7 additions & 5 deletions src/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ Note that the exact weights do not matter for the classification case, since
the highest class will be selected anyway. For regression however, the weights
should sum to roughly one.
"""
function _estimate_coefficients(
function _estimate_coefficients!(
algo::Algorithm,
binary_feature_data::Matrix{Float16},
outcome::Vector{Float16},
Expand All @@ -63,15 +63,17 @@ function _estimate_coefficients(
# Also, ElasticNet shows no clear benefit in accuracy.
# According to Clément, avoid Lasso since it would introduce additional
# sparsity and then instability in the rule selection.
model = RidgeRegressor(; fit_intercept=false, model.lambda)
lambda = model.lambda
model = RidgeRegressor(; fit_intercept=false, lambda)
coefs = MLJLinearModels.fit(glr(model), binary_feature_data, outcome)::Vector
# Avoid negative coefficients.
coefs = max.(coefs, 0)
if algo isa Regression
# Ensure that coefs sum roughly to one.
total = sum(coefs)
coefs = coefs ./ total
end
# Avoid negative coefficients.
return max.(coefs, 0)
return coefs
end

"""
Expand All @@ -90,6 +92,6 @@ function _weights(
)
binary_feature_data = _binary_features(rules, data)
y = convert(Vector{Float16}, outcome)
coefficients = _estimate_coefficients(algo, binary_feature_data, y, model)
coefficients = _estimate_coefficients!(algo, binary_feature_data, y, model)
return coefficients::Vector{Float16}
end
69 changes: 44 additions & 25 deletions test/mlj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ datasets = Dict{String,Tuple}(
end,
"boston" => boston(),
"make_regression" => let
make_regression(200, 3; noise=0.0, sparse=0.0, outliers=0.0, rng=_rng())
make_regression(600, 3; noise=0.0, sparse=0.0, outliers=0.0, rng=_rng())
end
)

Expand Down Expand Up @@ -126,13 +126,19 @@ preds = predict(rulesmach)
@test 0.95 < auc(preds, y)

let
hyper = (; rng=_rng(), n_trees=50)
hyper = (; rng=_rng(), max_depth=2)
e = _evaluate!(results, "blobs", StableForestClassifier, hyper)

hyper = (; rng=_rng(), max_depth=2, max_rules=30)
e = _evaluate!(results, "blobs", StableRulesClassifier, hyper)
@test 0.95 < _score(e)

for fitted in e.fitted_params_per_fold
@test fitted.fitresult.classes == [1, 2]
end

hyper = (; rng=_rng(), max_depth=2, max_rules=10)
e = _evaluate!(results, "blobs", StableRulesClassifier, hyper)
@test 0.95 < _score(e)
end

n_trees = 40
Expand All @@ -151,10 +157,15 @@ let
end

let
hyper = (; rng=_rng(), n_trees=1_500)
hyper = (; rng=_rng(), max_depth=2)
e = _evaluate!(results, "titanic", StableForestClassifier, hyper)
@test 0.80 < _score(e)

hyper = (; rng=_rng(), max_depth=2, max_rules=30)
e = _evaluate!(results, "titanic", StableRulesClassifier, hyper)
@test 0.79 < _score(e)

hyper = (; rng=_rng(), max_depth=2, max_rules=10)
e = _evaluate!(results, "titanic", StableRulesClassifier, hyper)
@test 0.79 < _score(e)
end
Expand Down Expand Up @@ -185,66 +196,74 @@ let
end

let
hyper = (; rng=_rng(), n_trees=1_500)
hyper = (; rng=_rng(), max_depth=2)
e = _evaluate!(results, "haberman", StableForestClassifier, hyper)
@test 0.60 < _score(e)

hyper = (; rng=_rng(), max_depth=2, max_rules=30)
e = _evaluate!(results, "haberman", StableRulesClassifier, hyper)
@test 0.60 < _score(e)
fitresult = string(first(e.fitted_params_per_fold).fitresult)
@test contains(fitresult, ":x1")

hyper = (; rng=_rng(), max_depth=2, max_rules=10)
e = _evaluate!(results, "haberman", StableRulesClassifier, hyper)
@test 0.60 < _score(e)
end

rulesmodel = StableRulesRegressor(; max_depth=1, n_trees=50, rng=_rng())
rulesmodel = StableRulesRegressor(; max_depth=2, max_rules=30, rng=_rng())
X, y = datasets["boston"]
rulesmach = machine(rulesmodel, X, y)
fit!(rulesmach; verbosity=0)
preds = predict(rulesmach)
@test preds isa Vector{Float64}
# @show rsq(preds, y)
# @test 0.95 < rsq(preds, y)

emr = let
hyper = (;)
measure = rsq
_evaluate!(results, "make_regression", LGBMRegressor, hyper; measure)
_evaluate!(results, "make_regression", LinearRegressor, hyper; measure)
data = "make_regression"
hyper = (;)
_evaluate!(results, data, LGBMRegressor, hyper; measure)

hyper = (; rng=_rng(), n_trees=1_500)
_evaluate!(results, "make_regression", StableForestRegressor, hyper; measure)
hyper = (; max_depth=2)
_evaluate!(results, data, LGBMRegressor, hyper; measure)

hyper = (; rng=_rng(), n_trees=1_500, max_rules=100)
_evaluate!(results, "make_regression", StableRulesRegressor, hyper; measure)
hyper = (; max_depth=2, rng=_rng())
_evaluate!(results, data, StableForestRegressor, hyper; measure)

hyper = (; rng=_rng(), n_trees=1_500, max_rules=30)
_evaluate!(results, "make_regression", StableRulesRegressor, hyper; measure)
hyper = (; rng=_rng(), max_depth=2, max_rules=30)
_evaluate!(results, data, StableRulesRegressor, hyper; measure)

hyper = (; rng=_rng(), n_trees=1_500, max_rules=10)
_evaluate!(results, "make_regression", StableRulesRegressor, hyper; measure)
hyper = (; rng=_rng(), max_depth=2, max_rules=10)
er = _evaluate!(results, data, StableRulesRegressor, hyper; measure)
@test 0.50 < _score(er)
end

el = let
hyper = (;)
measure = rsq
elgbm = _evaluate!(results, "boston", LGBMRegressor, hyper; measure)
el = _evaluate!(results, "boston", LinearRegressor, hyper; measure)
hyper = (; rng=_rng(), n_trees=1_500)

hyper = (; max_depth=2)
el = _evaluate!(results, "boston", LGBMRegressor, hyper; measure)

hyper = (; max_depth=2, rng=_rng())
ef = _evaluate!(results, "boston", StableForestRegressor, hyper; measure)

@test 0.65 < _score(el)
@test 0.62 < _score(el)
@test _score(el) ≈ _score(ef) atol=0.05
@test 0.65 < _score(elgbm)
el
end

er = let
hyper = (; rng=_rng(), n_trees=1_500, max_rules=100)
_evaluate!(results, "boston", StableRulesRegressor, hyper; measure=rsq)

hyper = (; rng=_rng(), n_trees=1_500, max_rules=30)
hyper = (; rng=_rng(), max_depth=2, max_rules=30)
er = _evaluate!(results, "boston", StableRulesRegressor, hyper; measure=rsq)

hyper = (; rng=_rng(), n_trees=1_500, max_rules=10)
hyper = (; rng=_rng(), max_depth=2, max_rules=10)
er = _evaluate!(results, "boston", StableRulesRegressor, hyper; measure=rsq)
@test 0.55 < _score(er)
end

pretty = rename(results, :se => "1.96*SE")
Expand Down
2 changes: 1 addition & 1 deletion test/rules.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ rules = S._rules(forest)

@test S._count_unique([1, 1, 1, 2]) == Dict(1 => 3, 2 => 1)

weights = [0.395, 0.197, 0.187, 0.057, 0.054, 0.043, 0.027, 0.02, 0.01, 0.01]
@test S._sort_by_frequency([r1, r5, r1]) == [r1, r5]

algo = SIRUS.Classification()
empty_model = S.StableRules(S.Rule[], algo, [1], Float16[0.1])
Expand Down
6 changes: 4 additions & 2 deletions test/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@ data = [0.0 2.5
r1 = S.Rule(S.TreePath(" X[i, 1] < 1 "), [0.1], [0.0])
r2 = S.Rule(S.TreePath(" X[i, 2] < 2 "), [0.2], [0.0])

binary_feature_data = Float16[1 0; 0 0; 1 1]
binary_feature_data = Float16[1 0;
0 0;
1 1]
@test S._binary_features([r1, r2], data) == binary_feature_data

y = Float16[0.5, 0, 1]

model = StableRulesClassifier()
algo = S.Classification()
@test S._estimate_coefficients(algo, binary_feature_data, y, model) isa Vector
@test S._estimate_coefficients!(algo, binary_feature_data, copy(y), model) isa Vector

@test SIRUS._normalize!([1.0, 2.0, 3.0]) == [0.0, 0.5, 1.0]

Expand Down