diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index f00e33c..67a9496 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -30,6 +30,13 @@ jobs: with: version: ${{ matrix.version }} - uses: julia-actions/cache@v1 + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + r-version: '4' + - run: echo "LD_LIBRARY_PATH=$(R RHOME)/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + if: matrix.os == 'ubuntu-latest' + - run: Rscript -e 'install.packages("sirus")' - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 with: diff --git a/.gitignore b/.gitignore index f302d34..4d58117 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,6 @@ tmp** paper/*.pdf paper/*.jats + +.RData +.Rhistory diff --git a/test/Project.toml b/test/Project.toml index b45ef2c..cf48260 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -9,8 +9,10 @@ MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661" MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692" +MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" MLJTestInterface = "72560011-54dd-4dc2-94f3-c5de45b75ecd" MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91" +RCall = "6f49c342-dc21-5d91-9882-a32aef131414" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" @@ -29,6 +31,7 @@ MLJDecisionTreeInterface = "0.4" MLJLinearModels = "0.9" MLJTestInterface = "0.2" MLJXGBoostInterface = "0.3.8" +RCall = "0.13" StableRNGs = "1" StatisticalMeasures = "0.1" Tables = "1.7" diff --git a/test/mlj.jl b/test/mlj.jl index 720c89c..6bca1e8 100644 --- a/test/mlj.jl +++ b/test/mlj.jl @@ -74,10 +74,6 @@ end @test classes isa Vector{<:Int} end -function _score(e::PerformanceEvaluation) - return round(only(e.measurement); sigdigits=2) -end - function _with_trailing_zero(score::Real)::String text = string(score)::String if length(text) == 3 @@ -87,12 +83,6 @@ function _with_trailing_zero(score::Real)::String end end -function _evaluate(model, X, y, nfolds=10, measure=auc) - resampling = CV(; nfolds, shuffle=true, rng=_rng()) - acceleration = MLJBase.CPUThreads() - evaluate(model, X, y; acceleration, verbosity=0, resampling, measure) -end - results = DataFrame(; Dataset=String[], Model=String[], @@ -117,7 +107,7 @@ function _evaluate!( X, y = datasets[dataset] nfolds = 10 model = modeltype(; hyperparameters...) - e = _evaluate(model, X, y, nfolds, measure) + e = _evaluate(model, X, y; nfolds, measure) score = _with_trailing_zero(_score(e)) se = let val = round(only(MLJBase._standard_errors(e)); digits=2) @@ -207,6 +197,11 @@ let hyper = (; rng=_rng(), max_depth=2, max_rules=10) e = _evaluate!(results, data, StableRulesClassifier, hyper) @test 0.60 < _score(e) + + if CAN_RUN_R_SIRUS + hyper = (; max_depth=2, max_rules=10) + e = _evaluate!(results, data, RSirusClassifier, hyper) + end end let @@ -234,6 +229,11 @@ let hyper = (; rng=_rng(), max_depth=2, max_rules=10) e = _evaluate!(results, data, StableRulesClassifier, hyper) @test 0.79 < _score(e) + + if CAN_RUN_R_SIRUS + hyper = (; max_depth=2, max_rules=10) + e = _evaluate!(results, data, RSirusClassifier, hyper) + end end let @@ -260,6 +260,11 @@ let hyper = (; rng=_rng(), max_depth=2, max_rules=10) e = _evaluate!(results, data, StableRulesClassifier, hyper; measure) + + if CAN_RUN_R_SIRUS + hyper = (; max_depth=2, max_rules=10) + e = _evaluate!(results, data, RSirusClassifier, hyper; measure) + end end let @@ -284,9 +289,14 @@ let hyper = (; rng=_rng(), max_depth=2, max_rules=10) e = _evaluate!(results, data, StableRulesClassifier, hyper) + + if CAN_RUN_R_SIRUS + hyper = (; max_depth=2, max_rules=10) + e = _evaluate!(results, data, RSirusClassifier, hyper) + end end -e_iris = let +let data = "iris" measure = accuracy @@ -312,7 +322,8 @@ e_iris = let hyper = (; rng=_rng(), max_depth=2, max_rules=10) e = _evaluate!(results, data, StableRulesClassifier, hyper; measure) @test 0.62 < _score(e) - e + + # R sirus doesn't appear to support multiclass classification. end rulesmodel = StableRulesRegressor(; max_depth=2, max_rules=30, rng=_rng()) @@ -351,6 +362,11 @@ let hyper = (; rng=_rng(), max_depth=2, max_rules=10) er = _evaluate!(results, data, StableRulesRegressor, hyper; measure=rsq) @test 0.55 < _score(er) + + if CAN_RUN_R_SIRUS + hyper = (; max_depth=2, max_rules=10) + _evaluate!(results, data, RSirusRegressor, hyper; measure=rsq) + end end emr = let @@ -377,6 +393,11 @@ emr = let hyper = (; rng=_rng(), max_depth=2, max_rules=10) er = _evaluate!(results, data, StableRulesRegressor, hyper; measure) @test 0.50 < _score(er) + + if CAN_RUN_R_SIRUS + hyper = (; max_depth=2, max_rules=10) + _evaluate!(results, data, RSirusRegressor, hyper; measure=rsq) + end end pretty = rename(results, :se => "1.96*SE") diff --git a/test/preliminaries.jl b/test/preliminaries.jl index f4dbbde..15b1ce3 100644 --- a/test/preliminaries.jl +++ b/test/preliminaries.jl @@ -2,6 +2,8 @@ import Base ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" +const CAN_RUN_R_SIRUS = v"1.8" < VERSION + using CategoricalArrays: CategoricalValue, CategoricalVector, @@ -53,6 +55,16 @@ using Test const S = SIRUS _rng(seed::Int=1) = StableRNG(seed) +function _score(e::PerformanceEvaluation) + return round(only(e.measurement); sigdigits=2) +end + +function _evaluate(model, X, y; nfolds::Number=10, measure=auc) + resampling = CV(; nfolds, shuffle=true, rng=_rng()) + acceleration = MLJBase.CPUThreads() + evaluate(model, X, y; acceleration, verbosity=0, resampling, measure) +end + if !haskey(ENV, "REGISTERED_CANCER") name = "Cancer" message = "Wisconsin Diagnostic Breast Cancer (WDBC) dataset" diff --git a/test/rcall.jl b/test/rcall.jl new file mode 100644 index 0000000..e049df0 --- /dev/null +++ b/test/rcall.jl @@ -0,0 +1,184 @@ +# +# This file defines the MLJ wrappers around R sirus and tests them. +# Actual comparisons against other models are done in test/mlj.jl. +# + +import MLJModelInterface: + MLJModelInterface, + fit, + predict, + metadata_model, + metadata_pkg + +using CategoricalArrays: + CategoricalArray, + CategoricalPool, + CategoricalValue +using MLJModelInterface: + MLJModelInterface, + UnivariateFinite, + Continuous, + Count, + Deterministic, + Finite, + Probabilistic, + Table +using RCall + +const MMI = MLJModelInterface + +# @rlibrary sirus + +MMI.@mlj_model mutable struct RSirusRegressor <: Deterministic + max_depth::Int=2 + max_rules::Int=10 +end + +R"library('sirus')" + +n = 100 +A = rand(_rng(), n) +B = rand(_rng(), n) +X = DataFrame(; A, B) +y = rand(_rng(), n) + +function fit( + model::RSirusRegressor, + verbosity::Int, + X, + y + ) + if !Tables.istable(X) + error("Expected a Table but got $(typeof(Xnew))") + end + df = DataFrame(X) + fitted_model = R""" + fitted.model <- sirus.fit( + $df, + $y, + type="reg", + num.rule=$(model.max_rules), + p0=NULL, + num.rule.max=$(model.max_rules), + q=4, + max.depth=$(model.max_depth), + num.trees=NULL, + num.threads=1, + verbose=FALSE, + seed=1 + ) + # print(sirus.print(fitted.model)) + fitted.model + """ + fitresult = fitted_model + cache = nothing + report = nothing + return fitresult, cache, report +end + +verbosity = 0 + +model = RSirusRegressor() +mach = machine(model, X, y, verbosity) +fit!(mach; verbosity) +# mach.fitresult + +function predict( + model::RSirusRegressor, + fitresult::RObject, + Xnew + ) + if !Tables.istable(Xnew) + error("Expected a Table but got $(typeof(Xnew))") + end + df = DataFrame(Xnew) + predictions = R""" + sirus.predict($fitresult, $df) + """ + return rcopy(predictions) +end + +predict(mach, X) +e = _evaluate(model, X, y; measure=rsq) +@test 0.6 < _score(e) + +MMI.@mlj_model mutable struct RSirusClassifier <: Probabilistic + max_depth::Int=2 + max_rules::Int=10 +end + +function fit( + model::RSirusClassifier, + verbosity::Int, + X, + y::CategoricalArray + ) + # Based on MLJXGBoostInterface. + a_target_element = y[1] + @assert a_target_element isa CategoricalValue + + if !Tables.istable(X) + error("Expected a Table but got $(typeof(Xnew))") + end + df = DataFrame(X) + outcomes = get.(y) + fitted_model = R""" + fitted.model <- sirus.fit( + $df, + $outcomes, + type="classif", + num.rule=$(model.max_rules), + p0=NULL, + num.rule.max =$(model.max_rules), + q=4, + max.depth=$(model.max_depth), + num.trees=NULL, + num.threads=1, + verbose=FALSE, + seed=1 + ) + # print(sirus.print(fitted.model)) + fitted.model + """ + fitresult = (fitted_model, a_target_element) + cache = nothing + report = nothing + return fitresult, cache, report +end + +y = categorical(rand(_rng(), [0, 1], n)) + +model = RSirusClassifier() +mach = machine(model, X, y, verbosity) +fit!(mach; verbosity) + +function predict( + model::RSirusClassifier, + fitresult::Tuple{RObject, CategoricalValue}, + Xnew + ) + fitted_model, a_target_element = fitresult + if !Tables.istable(Xnew) + error("Expected a Table but got $(typeof(Xnew))") + end + df = DataFrame(Xnew) + rpredictions = R""" + sirus.predict($fitted_model, $df) + """ + classes = MMI.classes(a_target_element) + predictions = rcopy(rpredictions) + augment = ndims(predictions) == 1 + @show classes + @show predictions + @show augment + return UnivariateFinite(classes, predictions; augment) +end + +predict(mach, X) + +e = _evaluate(model, X, y; measure=auc) +@test 0.5 < _score(e) + +# Looks like sirus does not support multiclass classification. +# y = categorical(rand(_rng(), [0, 1, 2], n)) +# _evaluate(model, X, y; measure=accuracy) diff --git a/test/runtests.jl b/test/runtests.jl index 8545ef0..72b363f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -38,6 +38,12 @@ end include("weights.jl") end +if CAN_RUN_R_SIRUS + @testset "rcall" begin + include("rcall.jl") + end +end + @testset "mlj" begin include("mlj.jl") end