Extend API
rikhuijzer committed Nov 14, 2023
1 parent 73322b3 commit 71abde0
Showing 6 changed files with 70 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "SIRUS"
uuid = "cdeec39e-fb35-4959-aadb-a1dd5dede958"
authors = ["Rik Huijzer <github@huijzer.xyz>"]
version = "1.3.4"
version = "2.0.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
2 changes: 2 additions & 0 deletions src/SIRUS.jl
@@ -34,6 +34,8 @@ include("ruleshow.jl")
include("weights.jl")
export TreePath
include("dependent.jl")
include("extract.jl")
export sum_weights

include("mlj.jl")
const StableForestClassifier = MLJImplementation.StableForestClassifier
41 changes: 41 additions & 0 deletions src/extract.jl
@@ -0,0 +1,41 @@
"Estimate the importance of a rule."
function _rule_importance(weight::Number, rule::Rule)
    importance = 0.0
    thens = rule.then::Vector{Float64}
    otherwises = rule.otherwise::Vector{Float64}
    for (then, otherwise) in zip(thens, otherwises)
        importance += weight * abs(then - otherwise)
    end
    return importance
end

"""
    feature_importance(
        model::StableRules,
        feature_name::AbstractString
    )

Estimate the importance of the given `feature_name`.
The aim of this function is to satisfy the following property:

> Given two features X and Y, if X has more effect on the outcome, then
> feature_importance(model, X) > feature_importance(model, Y).

This function provides only an estimate of the importance because
the effect on the outcome depends on the data.
"""
function feature_importance(
    model::StableRules,
    feature_name::AbstractString
)
    importance = 0.0
    for (i, rule) in enumerate(model.rules)
        for clause::Split in rule.path.splits
            if _feature_name(clause) == feature_name
                weight = model.weights[i]
                importance += _rule_importance(weight, rule)
            end
        end
    end
    return importance
end
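
As a side note (not part of the commit), the arithmetic in _rule_importance can be traced with made-up numbers: a rule with weight 0.2, then = [0.1, 0.9], and otherwise = [0.3, 0.7] contributes 0.2 * (|0.1 - 0.3| + |0.9 - 0.7|) = 0.08. A minimal, hypothetical call on a fitted model, mirroring the test added below:

    model = mach.fitresult::StableRules  # assumes a fitted MLJ machine `mach`
    importance = SIRUS.feature_importance(model, "x1")  # sums contributions of all rules whose clauses mention "x1"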
8 changes: 6 additions & 2 deletions src/rules.jl
@@ -7,8 +7,12 @@ Each rule is based on one or more splits.
Data can be accessed via `_feature`, `_value`, `_feature_name`, `_direction`, and `_reverse`.
"""
struct Split
    splitpoint::SplitPoint
struct SubClause
    # Removed the splitpoint::SplitPoint field in favor of
    # storing the split data directly in the fields below.
    feature::Int
    feature_name::String
    splitval::Float32
    direction::Symbol # :L or :R
end

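Purely as an illustration (not part of the diff), a value of the renamed struct now carries the split data directly; the field values below are invented and use the default constructor:

    clause = SubClause(1, "x1", 32.0f0, :L)  # feature index, feature name, split value, direction (go left)
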
16 changes: 16 additions & 0 deletions test/extract.jl
@@ -0,0 +1,16 @@
function _haberman_data()
    df = haberman()
    X = MLJBase.table(MLJBase.matrix(df[:, Not(:survival)]))
    y = categorical(df.survival)
    (X, y)
end

X, y = _haberman_data()

classifier = StableRulesClassifier(; max_depth=2, max_rules=8, n_trees=1000, rng=_rng())
mach = machine(classifier, X, y)
fit!(mach)

model = mach.fitresult::StableRules

importance = SIRUS.feature_importance(model, "x1")
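
A hedged follow-up that this test could add (it is not part of the commit): assuming the fitted rule weights are non-negative, the estimate sums weight * abs(then - otherwise) terms and therefore can never be negative.

    @test 0.0 <= importance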
4 changes: 4 additions & 0 deletions test/runtests.jl
@@ -44,6 +44,10 @@ if CAN_RUN_R_SIRUS
    end
end

@testset "extract" begin
    include("extract.jl")
end

@testset "mlj" begin
    include("mlj.jl")
end
