From f2fe3eed70158e85288cf558dde131cdc4d5c60f Mon Sep 17 00:00:00 2001
From: Rik Huijzer
Date: Thu, 23 Nov 2023 12:07:42 +0100
Subject: [PATCH] Write down problem

---
 docs/src/api.md |  3 ++-
 src/SIRUS.jl    |  2 +-
 src/extract.jl  | 31 ++++++++++++++++++++++++++++++-
 src/rules.jl    | 10 +++++++---
 test/extract.jl | 16 +++++++++++++---
 5 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/docs/src/api.md b/docs/src/api.md
index 21fa8f1..d32aec9 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -13,10 +13,11 @@ StableForestRegressor
 
 ```@docs
 feature_names
+feature_importance
+feature_importances
 directions
 values(::SIRUS.Rule)
 satisfies
 Cutpoints
 cutpoints
-feature_importance
 ```

diff --git a/src/SIRUS.jl b/src/SIRUS.jl
index fc83be7..32756e4 100644
--- a/src/SIRUS.jl
+++ b/src/SIRUS.jl
@@ -33,7 +33,7 @@ include("ruleshow.jl")
 include("weights.jl")
 include("dependent.jl")
 include("extract.jl")
-export feature_importance
+export feature_importance, feature_importances
 include("mlj.jl")
 
 const StableForestClassifier = MLJImplementation.StableForestClassifier

diff --git a/src/extract.jl b/src/extract.jl
index 7af6ff8..0feb584 100644
--- a/src/extract.jl
+++ b/src/extract.jl
@@ -18,7 +18,8 @@ end
     )
 
 Estimate the importance of the given `feature_name`.
-The aim is to satisfy the following property:
+The aim is to satisfy the following property, so that the features can be
+ordered by importance:
 
 > Given two features A and B, if A has more effect on the outcome, then
 > feature_importance(model, A) > feature_importance(model, B).
@@ -63,3 +64,31 @@ end
 function feature_importance(models::Vector{<:StableRules}, feature_name::AbstractString)
     return feature_importance(models, string(feature_name)::String)
 end
+
+"""
+    feature_importances(
+        models::Union{StableRules, Vector{StableRules}},
+        feature_names
+    )::Vector{NamedTuple{(:feature_name, :importance), Tuple{String, Float64}}}
+
+Return the feature names and importances, sorted by feature importance in descending order.
+"""
+function feature_importances(
+        models::Union{StableRules, Vector{StableRules}},
+        feature_names::Vector{String}
+    )::Vector{NamedTuple{(:feature_name, :importance), Tuple{String, Float64}}}
+    @assert length(unique(feature_names)) == length(feature_names)
+    importances = map(feature_names) do feature_name
+        importance = feature_importance(models, feature_name)
+        (; feature_name, importance)
+    end
+    alg = Helpers.STABLE_SORT_ALG
+    return sort(importances; alg, by=last, rev=true)
+end
+
+function feature_importances(
+        models::Union{StableRules, Vector{StableRules}},
+        feature_names
+    )::Vector{NamedTuple{(:feature_name, :importance), Tuple{String, Float64}}}
+    return feature_importances(models, string.(feature_names))
+end

diff --git a/src/rules.jl b/src/rules.jl
index d29b206..d1476ec 100644
--- a/src/rules.jl
+++ b/src/rules.jl
@@ -359,6 +359,10 @@ function _count_unique(V::AbstractVector{T}) where T
     return counts
 end
 
+# TODO IS THE PROBLEM HERE?
+# MAYBE THE PROBLEM IS THAT ONLY THE UNIQUE RULE CLAUSES SHOULD BE CHECKED
+# NOT THE THEN/OTHERWISE.
+
 """
 Return a vector of unique values in `V` sorted by frequency.
 """
@@ -383,9 +387,9 @@ function _process_rules(
     rules::Vector{Rule},
     max_rules::Int
 )::Vector{Rule}
-    simplified = _simplify_single_rules(rules)
-    sorted = _sort_by_frequency(simplified)
-    filtered = _filter_linearly_dependent(sorted)
+    simplified = _simplify_single_rules(rules)::Vector{Rule}
+    sorted = _sort_by_frequency(simplified)::Vector{Rule}
+    filtered = _filter_linearly_dependent(sorted)::Vector{Rule}
     return first(filtered, max_rules)
 end

diff --git a/test/extract.jl b/test/extract.jl
index 60c872a..b7cf837 100644
--- a/test/extract.jl
+++ b/test/extract.jl
@@ -12,12 +12,22 @@ mach = machine(classifier, X, y)
 fit!(mach)
 model = mach.fitresult::StableRules
 
+# StableRules model with 8 rules:
+# if X[i, :x3] < 8.0 then 0.084 else 0.03 +
+# if X[i, :x3] < 14.0 then 0.147 else 0.098 +
+# if X[i, :x3] < 2.0 then 0.073 else 0.047 +
+# if X[i, :x3] < 4.0 then 0.079 else 0.048 +
+# if X[i, :x3] < 1.0 then 0.076 else 0.06 +
+# if X[i, :x2] < 1959.0 then 0.006 else 0.008 +
+# if X[i, :x1] < 38.0 then 0.029 else 0.024 +
+# if X[i, :x1] < 42.0 then 0.052 else 0.043
+# and 2 classes: [0, 1].
+# Note: showing only the probability for class 1 since class 0 has probability 1 - p.
 importance = feature_importance(model, "x1")
-# Based on the numbers that are printed in the following lines:
-# if X[i, :x1] < 38.0 then 0.029 else 0.024 +
-# if X[i, :x1] < 42.0 then 0.052 else 0.043
+# Based on the numbers above.
 expected = ((0.029 - 0.024) + (0.052 - 0.043))
 @test importance ≈ expected atol=0.01
 @test feature_importance([model, model], "x1") ≈ expected atol=0.01
+@test only(feature_importances(model, ["x1"])).importance ≈ expected atol=0.01