Skip to content

Commit

Permalink
adds support for bagging muTC classifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
sadit committed Feb 12, 2020
1 parent 020fcde commit 8e6ae3d
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 12 deletions.
9 changes: 9 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ notifications:
email:
recipients:
- donsadit@gmail.com
- kyriox@gmail.com
- mgraffg@gmail.com
- sabinomiranda@gmail.com
- dmocteo@gmail.com


on_success: change # options: [always|never|change] default: always
on_failure: always # options: [always|never|change] default: always
Expand All @@ -43,6 +48,10 @@ env:
# script:
# - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
# - julia -e 'using Pkg; pkg"add https://github.com/sadit/TextSearch.jl"'

before_install:
- julia -e 'using Pkg; pkg"add https://github.com/sadit/SimilaritySearch.jl https://github.com/sadit/KCenters.jl https://github.com/sadit/TextSearch.jl"'

after_success:
- julia -e 'cd(Pkg.dir("TextClassification")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
- julia -e 'cd(Pkg.dir("TextClassification")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ authors = ["Eric S. Tellez <donsadit@gmail.com>"]
version = "0.2.1"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
KCenters = "5d8de97f-65f8-4dd6-a15b-0f89c36a43ce"
Expand All @@ -16,11 +18,11 @@ TextSearch = "7f6f6c8a-3b03-11e9-223d-e7d88259bd6c"

[compat]
IterTools = "1.3.0"
KCenters = "0.1.12"
KCenters = "0.1.13"
MLDataUtils = "0.5.0"
SimilaritySearch = "0.3.19"
StatsBase = "0.30.0, 0.32.0"
TextSearch = "0.3.4"
TextSearch = "0.3.5"
julia = "0.7, 1"

[extras]
Expand Down
1 change: 1 addition & 0 deletions src/TextClassification.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
module TextClassification

include("microtc.jl")
include("multi.jl")
end # module
34 changes: 26 additions & 8 deletions src/microtc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import Base: hash, isequal
export microtc_search_params, search_params, random_configurations, combine_configurations, filtered_power_set, fit, predict, vectorize, transform, μTC_Configuration, μTC, MicroTC, after_load
import Base: hash, isequal

struct μTC_Configuration{Kind,VKind}
mutable struct μTC_Configuration{Kind,VKind}
p::Float64

del_diac::Bool
Expand Down Expand Up @@ -95,6 +95,10 @@ mutable struct μTC{Kind,VKind,T}
kernel::Function
end

"""
    broadcastable(tc::μTC)

Make a `μTC` classifier behave as a scalar under broadcasting by
wrapping it in a one-element tuple.
"""
broadcastable(tc::μTC) = (tc,)

const MicroTC = μTC

function filtered_power_set(set, lowersize=0, uppersize=5)
Expand Down Expand Up @@ -127,8 +131,7 @@ function create_textmodel(config::μTC_Configuration{EntModel,VKind}, train_corp
end

function create_textmodel(config::μTC_Configuration{VectorModel,VKind}, train_corpus, train_y) where VKind
model = fit(VectorModel, create_textconfig(config), train_corpus,
minocc=config.minocc)
model = fit(VectorModel, create_textconfig(config), train_corpus, minocc = config.minocc)
if config.p < 1.0
model = prune_select_top(model, config.p)
end
Expand All @@ -137,8 +140,12 @@ function create_textmodel(config::μTC_Configuration{VectorModel,VKind}, train_c
end

function fit(::Type{μTC}, config::μTC_Configuration{Kind,VKind}, train_corpus, train_y; verbose=true) where {Kind,VKind}
model = create_textmodel(config, train_corpus, train_y)
train_X = [vectorize(model, config.vkind, text) for text in train_corpus]
textmodel = create_textmodel(config, train_corpus, train_y)
fit(μTC, config, textmodel, train_corpus, train_y; verbose=verbose)
end

function fit(::Type{μTC}, config::μTC_Configuration{Kind,VKind}, textmodel::Kind, train_corpus, train_y; verbose=true) where {Kind,VKind}
train_X = [vectorize(textmodel, config.vkind, text) for text in train_corpus]

if config.ncenters == 0
C = kcenters(config.dist, train_X, train_y, TextSearch.centroid)
Expand All @@ -153,7 +160,7 @@ function fit(::Type{μTC}, config::μTC_Configuration{Kind,VKind}, train_corpus,
verbose=verbose)
end

μTC(cls, model, config, config.kernel(config.dist))
μTC(cls, textmodel, config, config.kernel(config.dist))
end

fit(config::μTC_Configuration, train_corpus, train_y; verbose=true) = fit(μTC, config, train_corpus, train_y; verbose=verbose)
Expand All @@ -167,8 +174,19 @@ function after_load(tc::μTC)
tc.kernel = tc.config.kernel(tc.config.dist)
end

function predict(tc::μTC, X)
ypred = predict(tc.nc, tc.kernel, X, tc.config.k)
"""
    predict(tc::μTC, X::AbstractString, k=0)

Predict the label of a single text `X`: the text is vectorized with the
classifier's text model and classified with `k` neighbors. `k == 0`
(the default) falls back to the configured `tc.config.k`.
"""
function predict(tc::μTC, X::AbstractString, k=0)
    k = k == 0 ? tc.config.k : k
    # BUG FIX: the original call ended with a dangling comma and never
    # passed `k`, so the resolved neighbor count was silently discarded;
    # forward it as the sibling vector methods do.
    predict(tc.nc, tc.kernel, [vectorize(tc, X)], k)
end

"""
    predict(tc::μTC, X::AbstractVector{D}, k=0) where {D<:DVEC}

Predict labels for a collection of already-vectorized documents
(`DVEC`). `k == 0` (the default) selects the configured number of
neighbors `tc.config.k`.
"""
function predict(tc::μTC, X::AbstractVector{D}, k=0) where D <: DVEC
    nneighbors = iszero(k) ? tc.config.k : k
    predict(tc.nc, tc.kernel, X, nneighbors)
end

"""
    predict(tc::μTC, X::AbstractVector, k=0)

Predict labels for a collection of raw documents: each item is first
vectorized with the classifier's text model. `k == 0` (the default)
selects the configured number of neighbors `tc.config.k`.
"""
function predict(tc::μTC, X::AbstractVector, k=0)
    nneighbors = iszero(k) ? tc.config.k : k
    vectors = map(x -> vectorize(tc, x), X)
    predict(tc.nc, tc.kernel, vectors, nneighbors)
end

function vectorize(tc::μTC{Kind,VKind}, text) where {Kind,VKind}
Expand Down
79 changes: 79 additions & 0 deletions src/multi.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# This file is a part of TextClassification.jl
# License is Apache 2.0: https://www.apache.org/licenses/LICENSE-2.0.txt

using Random, TextSearch
import KCenters: bagging, glue
import SimilaritySearch: optimize!
export glue, bagging, optimize!

"""
glue(arr::AbstractVector{μTC})
Joins a list of text classifiers into a single one classifier.
"""
function glue(arr::AbstractVector{μTC})
centers = []
class_map = Int[]
dmax = Float64[]
for a in arr
for c in a.nc.centers
push!(centers, bow(a.model, c))
end

append!(class_map, a.nc.class_map)
append!(dmax, a.nc.dmax)
end

item = first(arr)
model = glue([a.model for a in arr])
centers_ = [dvec(model, c) for c in centers]
nc = KNC(centers_, dmax, class_map, item.nc.nclasses)
config = item.config
kernel = item.kernel
μTC(nc, model, config, kernel)
end

"""
bagging(config::μTC_Configuration, X::AbstractVector, y::AbstractVector{I}; b=13, ratio=0.5) where {I<:Integer}
Creates `b` text classifiers, each trained with a random `ratio` of the dataset;
the resulting classifiers are joint into a single classifier.
"""
function bagging(config::μTC_Configuration, X::AbstractVector, y::AbstractVector{I}; b=13, ratio=0.5) where {I<:Integer}
indexes = collect(1:length(X))
m = ceil(Int, ratio * length(X))

L = Vector{μTC}(undef, b)
for i in 1:b
shuffle!(indexes)
sample = @view indexes[1:m]
L[i] = fit(μTC, config, X[sample], y[sample]; verbose=true)
end

glue(L)
end


"""
optimize!(model::μTC, X, y; k=[1, 3, 5, 7], kernel=[direct_kernel, relu_kernel, laplacian_kernel, gaussian_kernel])
Selects `k` and `kernel` to adjust better to the given score and the dataset ``(X, y)``.
"""
function optimize!(model::μTC, X, y, score::Function=recall_score; k=[1, 3, 5, 7], kernel=[direct_kernel, relu_kernel, laplacian_kernel, gaussian_kernel], verbose=true)
L = []
for k_ in k, kernel_ in kernel
kernel_fun = kernel_(model.config.dist)
model.config.k = k_
model.kernel = kernel_fun
ypred = predict(model, X)
s = score(y, ypred)
push!(L, (score=s, k=k_, kernel=kernel_, kernel_fun=kernel_fun))
verbose && println(stderr, L[end])
end

sort!(L, by=x->x.score, rev=true)
c = first(L)
model.config.k = c.k
model.config.kernel = c.kernel
model.kernel = c.kernel_fun
L
end
27 changes: 25 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Test, TextClassification, StatsBase
using Test, StatsBase, KCenters, TextSearch, TextClassification

@testset "microtc" begin
!isfile("emotions.csv") && download("http://ingeotec.mx/~sadit/emotions.csv", "emotions.csv")
Expand All @@ -10,20 +10,43 @@ using Test, TextClassification, StatsBase
le = labelenc(labels)
y = label2ind.(labels, le)
corpus = X.text

(Xtrain, ytrain), (Xtest, ytest) = splitobs(shuffleobs((corpus, y)), at=0.7)
best_list = microtc_search_params(
corpus, y, 16;
Xtrain, ytrain, 8;
# search hyper-parameters
tol=0.01, search_maxiters=3, folds=3, verbose=true,
# configuration space
ncenters=[0],
qlist=filtered_power_set([3, 5], 1, 2),
nlist=filtered_power_set([1, 2], 0, 1),
slist=[],
kind = [EntModel]
)

for (i, b) in enumerate(best_list)
@info i, b[1], b[2]
end

cls = fit(μTC, best_list[1][1], Xtrain, ytrain)
sc = scores(predict(cls, Xtest), ytest)
@info "*** Performance on test: " sc
@test sc.accuracy > 0.7

cls = bagging(best_list[1][1], Xtrain, ytrain; b=15, ratio=0.85)
sc = scores(predict(cls, Xtest), ytest)
@info "*** Bagging performance on test: " sc
@test sc.accuracy > 0.7

sc = scores(predict(cls, Xtest, 3), ytest)
@info "*** Bagging performance on test (k=3): " sc
@test sc.accuracy > 0.7

B = optimize!(cls, Xtest, ytest; verbose=false)
@info "best config for optimize!" B[1]
sc = scores(predict(cls, Xtest), ytest)
@info "*** Bagging performance on test (after calling optimize!): " sc
@test sc.accuracy > 0.7
end


Expand Down

2 comments on commit 8e6ae3d

@sadit
Copy link
Owner Author

@sadit sadit commented on 8e6ae3d Feb 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error while trying to register: "Tag with name 0.2.1 already exists and points to a different commit"

Please sign in to comment.