Skip to content

Commit

Permalink
adds support for bagging muTC classifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
sadit committed Feb 12, 2020
1 parent 020fcde commit 8e6ae3d
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 12 deletions.
9 changes: 9 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ notifications:
email:
recipients:
- donsadit@gmail.com
- kyriox@gmail.com
- mgraffg@gmail.com
- sabinomiranda@gmail.com
- dmocteo@gmail.com


on_success: change # options: [always|never|change] default: always
on_failure: always # options: [always|never|change] default: always
Expand All @@ -43,6 +48,10 @@ env:
# script:
# - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
# - julia -e 'using Pkg; pkg"add https://github.com/sadit/TextSearch.jl"'

before_install:
- julia -e 'using Pkg; pkg"add https://github.com/sadit/SimilaritySearch.jl https://github.com/sadit/KCenters.jl https://github.com/sadit/TextSearch.jl"'

after_success:
- julia -e 'cd(Pkg.dir("TextClassification")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
- julia -e 'cd(Pkg.dir("TextClassification")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ authors = ["Eric S. Tellez <donsadit@gmail.com>"]
version = "0.2.1"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
KCenters = "5d8de97f-65f8-4dd6-a15b-0f89c36a43ce"
Expand All @@ -16,11 +18,11 @@ TextSearch = "7f6f6c8a-3b03-11e9-223d-e7d88259bd6c"

[compat]
IterTools = "1.3.0"
KCenters = "0.1.12"
KCenters = "0.1.13"
MLDataUtils = "0.5.0"
SimilaritySearch = "0.3.19"
StatsBase = "0.30.0, 0.32.0"
TextSearch = "0.3.4"
TextSearch = "0.3.5"
julia = "0.7, 1"

[extras]
Expand Down
1 change: 1 addition & 0 deletions src/TextClassification.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
module TextClassification

include("microtc.jl")
include("multi.jl")
end # module
34 changes: 26 additions & 8 deletions src/microtc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import Base: hash, isequal
export microtc_search_params, search_params, random_configurations, combine_configurations, filtered_power_set, fit, predict, vectorize, transform, μTC_Configuration, μTC, MicroTC, after_load
import Base: hash, isequal

struct μTC_Configuration{Kind,VKind}
mutable struct μTC_Configuration{Kind,VKind}
p::Float64

del_diac::Bool
Expand Down Expand Up @@ -95,6 +95,10 @@ mutable struct μTC{Kind,VKind,T}
kernel::Function
end

"""
    broadcastable(tc::μTC)

Make a `μTC` classifier behave as a scalar under broadcasting by
wrapping it in a one-element tuple.
"""
broadcastable(tc::μTC) = (tc,)

const MicroTC = μTC

function filtered_power_set(set, lowersize=0, uppersize=5)
Expand Down Expand Up @@ -127,8 +131,7 @@ function create_textmodel(config::μTC_Configuration{EntModel,VKind}, train_corp
end

function create_textmodel(config::μTC_Configuration{VectorModel,VKind}, train_corpus, train_y) where VKind
model = fit(VectorModel, create_textconfig(config), train_corpus,
minocc=config.minocc)
model = fit(VectorModel, create_textconfig(config), train_corpus, minocc = config.minocc)
if config.p < 1.0
model = prune_select_top(model, config.p)
end
Expand All @@ -137,8 +140,12 @@ function create_textmodel(config::μTC_Configuration{VectorModel,VKind}, train_c
end

function fit(::Type{μTC}, config::μTC_Configuration{Kind,VKind}, train_corpus, train_y; verbose=true) where {Kind,VKind}
model = create_textmodel(config, train_corpus, train_y)
train_X = [vectorize(model, config.vkind, text) for text in train_corpus]
textmodel = create_textmodel(config, train_corpus, train_y)
fit(μTC, config, textmodel, train_corpus, train_y; verbose=verbose)
end

function fit(::Type{μTC}, config::μTC_Configuration{Kind,VKind}, textmodel::Kind, train_corpus, train_y; verbose=true) where {Kind,VKind}
train_X = [vectorize(textmodel, config.vkind, text) for text in train_corpus]

if config.ncenters == 0
C = kcenters(config.dist, train_X, train_y, TextSearch.centroid)
Expand All @@ -153,7 +160,7 @@ function fit(::Type{μTC}, config::μTC_Configuration{Kind,VKind}, train_corpus,
verbose=verbose)
end

μTC(cls, model, config, config.kernel(config.dist))
μTC(cls, textmodel, config, config.kernel(config.dist))
end

fit(config::μTC_Configuration, train_corpus, train_y; verbose=true) = fit(μTC, config, train_corpus, train_y; verbose=verbose)
Expand All @@ -167,8 +174,19 @@ function after_load(tc::μTC)
tc.kernel = tc.config.kernel(tc.config.dist)
end

function predict(tc::μTC, X)
ypred = predict(tc.nc, tc.kernel, X, tc.config.k)
"""
    predict(tc::μTC, X::AbstractString, k=0)

Predict the label of a single text `X`: the text is vectorized with the
classifier's text model and classified with `k` neighbors. `k == 0`
(the default) falls back to the configured `tc.config.k`.
"""
function predict(tc::μTC, X::AbstractString, k=0)
    k = k == 0 ? tc.config.k : k
    # BUG FIX: the original call ended with a dangling comma and never
    # passed `k`, so the resolved neighbor count was silently discarded;
    # forward it as the sibling vector methods do.
    predict(tc.nc, tc.kernel, [vectorize(tc, X)], k)
end

"""
    predict(tc::μTC, X::AbstractVector{D}, k=0) where {D<:DVEC}

Predict labels for a collection of already-vectorized documents
(`DVEC`). `k == 0` (the default) selects the configured number of
neighbors `tc.config.k`.
"""
function predict(tc::μTC, X::AbstractVector{D}, k=0) where D <: DVEC
    nneighbors = iszero(k) ? tc.config.k : k
    predict(tc.nc, tc.kernel, X, nneighbors)
end

"""
    predict(tc::μTC, X::AbstractVector, k=0)

Predict labels for a collection of raw documents: each item is first
vectorized with the classifier's text model. `k == 0` (the default)
selects the configured number of neighbors `tc.config.k`.
"""
function predict(tc::μTC, X::AbstractVector, k=0)
    nneighbors = iszero(k) ? tc.config.k : k
    vectors = map(x -> vectorize(tc, x), X)
    predict(tc.nc, tc.kernel, vectors, nneighbors)
end

function vectorize(tc::μTC{Kind,VKind}, text) where {Kind,VKind}
Expand Down
79 changes: 79 additions & 0 deletions src/multi.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# This file is a part of TextClassification.jl
# License is Apache 2.0: https://www.apache.org/licenses/LICENSE-2.0.txt

using Random, TextSearch
import KCenters: bagging, glue
import SimilaritySearch: optimize!
export glue, bagging, optimize!

"""
glue(arr::AbstractVector{μTC})
Joins a list of text classifiers into a single one classifier.
"""
function glue(arr::AbstractVector{μTC})
centers = []
class_map = Int[]
dmax = Float64[]
for a in arr
for c in a.nc.centers
push!(centers, bow(a.model, c))
end

append!(class_map, a.nc.class_map)
append!(dmax, a.nc.dmax)
end

item = first(arr)
model = glue([a.model for a in arr])
centers_ = [dvec(model, c) for c in centers]
nc = KNC(centers_, dmax, class_map, item.nc.nclasses)
config = item.config
kernel = item.kernel
μTC(nc, model, config, kernel)
end

"""
bagging(config::μTC_Configuration, X::AbstractVector, y::AbstractVector{I}; b=13, ratio=0.5) where {I<:Integer}
Creates `b` text classifiers, each trained with a random `ratio` of the dataset;
the resulting classifiers are joint into a single classifier.
"""
function bagging(config::μTC_Configuration, X::AbstractVector, y::AbstractVector{I}; b=13, ratio=0.5) where {I<:Integer}
indexes = collect(1:length(X))
m = ceil(Int, ratio * length(X))

L = Vector{μTC}(undef, b)
for i in 1:b
shuffle!(indexes)
sample = @view indexes[1:m]
L[i] = fit(μTC, config, X[sample], y[sample]; verbose=true)
end

glue(L)
end


"""
optimize!(model::μTC, X, y; k=[1, 3, 5, 7], kernel=[direct_kernel, relu_kernel, laplacian_kernel, gaussian_kernel])
Selects `k` and `kernel` to adjust better to the given score and the dataset ``(X, y)``.
"""
function optimize!(model::μTC, X, y, score::Function=recall_score; k=[1, 3, 5, 7], kernel=[direct_kernel, relu_kernel, laplacian_kernel, gaussian_kernel], verbose=true)
L = []
for k_ in k, kernel_ in kernel
kernel_fun = kernel_(model.config.dist)
model.config.k = k_
model.kernel = kernel_fun
ypred = predict(model, X)
s = score(y, ypred)
push!(L, (score=s, k=k_, kernel=kernel_, kernel_fun=kernel_fun))
verbose && println(stderr, L[end])
end

sort!(L, by=x->x.score, rev=true)
c = first(L)
model.config.k = c.k
model.config.kernel = c.kernel
model.kernel = c.kernel_fun
L
end
27 changes: 25 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Test, TextClassification, StatsBase
using Test, StatsBase, KCenters, TextSearch, TextClassification

@testset "microtc" begin
!isfile("emotions.csv") && download("http://ingeotec.mx/~sadit/emotions.csv", "emotions.csv")
Expand All @@ -10,20 +10,43 @@ using Test, TextClassification, StatsBase
le = labelenc(labels)
y = label2ind.(labels, le)
corpus = X.text

(Xtrain, ytrain), (Xtest, ytest) = splitobs(shuffleobs((corpus, y)), at=0.7)
best_list = microtc_search_params(
corpus, y, 16;
Xtrain, ytrain, 8;
# search hyper-parameters
tol=0.01, search_maxiters=3, folds=3, verbose=true,
# configuration space
ncenters=[0],
qlist=filtered_power_set([3, 5], 1, 2),
nlist=filtered_power_set([1, 2], 0, 1),
slist=[],
kind = [EntModel]
)

for (i, b) in enumerate(best_list)
@info i, b[1], b[2]
end

cls = fit(μTC, best_list[1][1], Xtrain, ytrain)
sc = scores(predict(cls, Xtest), ytest)
@info "*** Performance on test: " sc
@test sc.accuracy > 0.7

cls = bagging(best_list[1][1], Xtrain, ytrain; b=15, ratio=0.85)
sc = scores(predict(cls, Xtest), ytest)
@info "*** Bagging performance on test: " sc
@test sc.accuracy > 0.7

sc = scores(predict(cls, Xtest, 3), ytest)
@info "*** Bagging performance on test (k=3): " sc
@test sc.accuracy > 0.7

B = optimize!(cls, Xtest, ytest; verbose=false)
@info "best config for optimize!" B[1]
sc = scores(predict(cls, Xtest), ytest)
@info "*** Bagging performance on test (after calling optimize!): " sc
@test sc.accuracy > 0.7
end


Expand Down

2 comments on commit 8e6ae3d

@sadit
Copy link
Owner Author

@sadit sadit commented on 8e6ae3d Feb 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error while trying to register: "Tag with name 0.2.1 already exists and points to a different commit"

Please sign in to comment.