Commit d581f8d: work around for batch
Author: sadit
Date: Mar 30, 2023
Parent: 5712a71
Showing 7 changed files with 43 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
 name = "TextSearch"
 uuid = "7f6f6c8a-3b03-11e9-223d-e7d88259bd6c"
 authors = ["Eric S. Tellez <donsadit@gmail.com>"]
-version = "0.16.1"
+version = "0.16.2"

 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
6 changes: 5 additions & 1 deletion src/emodel.jl
@@ -26,6 +26,10 @@ function entropy_(dist)
     e
 end

+categorical_labels(labels::AbstractVector{<:CategoricalValue}) = labels
+categorical_labels(labels::AbstractVector{T}) where {T<:Union{AbstractString,Integer,Symbol}} = categorical(labels)
+categorical_labels(labels::AbstractCategoricalVector) = labels
+
 """
     VectorModel(ent::EntropyWeighting, lw::LocalWeighting, corpus::BOW, labels;
         mindocs::Integer=1,
@@ -43,7 +47,7 @@ function VectorModel(ent::EntropyWeighting, lw::LocalWeighting, voc::Vocabulary,
         minbatch=0
     )
     @assert length(labels) == length(corpus)
-    labels = categorical(labels)
+    labels = categorical_labels(labels)
     n = length(labels)
     nclasses = length(levels(labels))
     D = fill(smooth, nclasses, vocsize(voc))
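The emodel.jl change replaces an unconditional categorical(labels) call with a small dispatch helper, so labels that are already categorical pass through untouched instead of being re-pooled. A minimal sketch of the same dispatch idea, assuming CategoricalArrays.jl (the helper name to_categorical is illustrative, not TextSearch's):

    using CategoricalArrays

    to_categorical(x::AbstractCategoricalVector) = x   # already pooled: no-op
    to_categorical(x::AbstractVector{<:Union{AbstractString,Integer,Symbol}}) = categorical(x)

    raw = ["pos", "neg", "pos"]
    c1 = to_categorical(raw)   # converts the raw strings
    c2 = to_categorical(c1)    # returns the very same object, no copy
    @assert c1 === c2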
4 changes: 2 additions & 2 deletions src/io.jl
@@ -45,9 +45,9 @@ function loadmodel(::Type{SemanticVocabulary}, file::JLDFile; parent="/", static
     meta = file[joinpath(parent, "meta")]
     voc = file[joinpath(parent, "voc")]
     knns = file[joinpath(parent, "knns")]
-
     sel = file[joinpath(parent, "sel")]
     lexidx, _ = loadindex(file; parent=joinpath(parent, "lexidx"), staticgraph)
+
-    SemanticVocabulary(tvoc, lexidx, knns, sel), meta
+    SemanticVocabulary(voc, lexidx, knns, sel), meta
 end

6 changes: 3 additions & 3 deletions src/semvoc.jl
@@ -22,10 +22,10 @@ struct SelectAllTokens <: AbstractTokenSelection
 end

 SemanticVocabulary(C::SemanticVocabulary;
-        voc=C.voc, lexidx=C.lexidx, semidx=C.semidx, sel=C.sel) =
-    SemanticVocabulary(voc, lexidx, semidx, C.sel)
+        voc=C.voc, lexidx=C.lexidx, knns=C.knns, sel=C.sel) =
+    SemanticVocabulary(voc, lexidx, knns, sel)

-vocsize(model::SemanticVocabulary) = vocsize(voc)
+vocsize(model::SemanticVocabulary) = vocsize(model.voc)

 function SemanticVocabulary(voc::Vocabulary, sel::AbstractTokenSelection=SelectCentralToken(16, 8);
         textconfig::TextConfig=TextConfig(nlist=[1], qlist=[4]),
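Two bugs are fixed here: the keyword copy-constructor now forwards the sel keyword instead of hard-coding C.sel (and uses the renamed knns field), and vocsize now reads model.voc rather than an undefined global voc. A self-contained sketch of the keyword copy-constructor pattern, with illustrative names rather than TextSearch's real struct:

    struct Model
        voc::Vector{String}
        knns::Matrix{Int32}
    end

    # Defaults pull from the existing value, so callers override only the
    # fields they want; passing m.knns instead of the keyword would silently
    # ignore the caller's override, which is the bug fixed above.
    Model(m::Model; voc=m.voc, knns=m.knns) = Model(voc, knns)

    m  = Model(["a", "b"], Int32[1 2; 2 1])
    m2 = Model(m; knns=Int32[2 1; 1 2])   # new knns, same voc
    @assert m2.voc === m.voc && m2.knns != m.knns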
20 changes: 16 additions & 4 deletions src/semvocbow.jl
@@ -2,7 +2,7 @@

 function vectorize_knns!(D::Dict, model::SemanticVocabulary, tok)
     klex = model.sel.klex
-    ksem = model.sel.ksem
+    ksem = min(model.sel.ksem, size(model.knns, 1))

     res = getknnresult(klex)
     search(model, tok, res)
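This clamp (repeated below in token2id and vectorize) guards against a configured ksem larger than the number of neighbors actually stored per token, i.e., the row count of the knns matrix; without it, neighbor loops index past the end of a column. A small illustration, under the assumption that neighbors are stored one column per token:

    knns = Int32[1 2 3; 4 5 6]                 # 2 stored neighbors, 3 tokens
    requested_ksem = 5                         # caller asked for more than stored
    ksem = min(requested_ksem, size(knns, 1))  # -> 2
    for j in 1:ksem
        @assert checkbounds(Bool, knns, j, 1)  # every access stays in bounds
    end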
@@ -24,7 +24,7 @@ function token2id(model::SemanticVocabulary, tok::AbstractString)::UInt32

     if id == 0
         klex = model.sel.klex
-        ksem = model.sel.ksem
+        ksem = min(model.sel.ksem, size(model.knns, 1))

         if ksem == 0
             res = getknnresult(klex)
@@ -58,12 +58,23 @@ end

 function tokenize(model::SemanticVocabulary, text)
     tokenize!(model, tokenize(model.voc.textconfig, text))
-
 end

+function tokenize_corpus(model::SemanticVocabulary, corpus)
+    n = length(corpus)
+    arr = Vector{TokenizedText}(undef, n)
+    Threads.@threads for i in 1:n
+        arr[i] = tokenize!(model, tokenize(model.voc.textconfig, corpus[i]))
+    end
+
+    arr
+end
+
 function bagofwords!(bow::BOW, model::SemanticVocabulary{SelectCentralToken}, tokens::TokenizedText)
     for t in tokens
         id = token2id(model, t)
-        bow[id] = get(bow, id, 1)
+        bow[id] = get(bow, id, 0) + 1
     end

     bow
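Two substantive changes land in this hunk. The new tokenize_corpus fills a preallocated vector under Threads.@threads, which is race-free because each iteration writes only its own slot arr[i]. And bagofwords! fixes a counting bug: get(bow, id, 1) returns the stored count unchanged (or 1 when absent), so writing it back could never increment. The bug in miniature, using a plain Dict as a stand-in for BOW:

    bow = Dict{UInt32,Int}()
    for id in UInt32[7, 7, 7]
        # buggy:  bow[id] = get(bow, id, 1)    # sticks at 1 forever
        bow[id] = get(bow, id, 0) + 1          # counts occurrences
    end
    @assert bow[UInt32(7)] == 3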
@@ -79,7 +90,7 @@ end

 function vectorize(model::SemanticVocabulary, text; normalize=true)
     klex = model.sel.klex
-    ksem = model.sel.ksem
+    ksem = min(model.sel.ksem, size(model.knns, 1))

     res = getknnresult(klex)
     search(model, text, res)
@@ -100,6 +111,7 @@ function vectorize(model::SemanticVocabulary, text; normalize=true)
     end

     normalize && normalize!(D)
+
     D
 end

12 changes: 9 additions & 3 deletions src/vmodel.jl
@@ -147,7 +147,13 @@ function Base.getindex(model::VectorModel, tokenID::Integer)
     end
 end

-Base.show(io::IO, model::VectorModel) = print(io, "{VectorModel global_weighting=$(model.global_weighting), local_weighting=$(model.local_weighting), train-voc=$(vocsize(model)), train-n=$(trainsize(model)), maxoccs=$(model.maxoccs)}")
+Base.show(io::IO, model::VectorModel) = print(io, """{VectorModel
+    global_weighting: $(model.global_weighting)
+    local_weighting: $(model.local_weighting)
+    vocsize: $(vocsize(model))
+    trainsize=$(trainsize(model))
+    maxoccs=$(model.maxoccs)
+}""")

 function filter_tokens(pred::Function, model::VectorModel)
     voc = model.voc
@@ -222,8 +228,8 @@ function vectorize_corpus(model::VectorModel, corpus::AbstractVector; normalize=
     resize!(V, n)
     minbatch = getminbatch(minbatch, n)

-    @batch minbatch=minbatch per=thread for i in 2:n
-    # Threads.@threads for i in 2:n
+    #@batch minbatch=minbatch per=thread for i in 2:n
+    Threads.@threads for i in 2:n
         V[i] = vectorize(model, corpus[i]; normalize, minweight)
     end
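This hunk and the matching one in voc.jl below are the commit's namesake: the @batch loops are commented out in favor of Base's Threads.@threads, keeping the old line as a comment so it can be restored later. A runnable sketch of the swap, assuming @batch comes from Polyester.jl (the function and its names are illustrative):

    using Polyester   # provides @batch

    function fill_squares!(out; use_batch::Bool=false)
        n = length(out)
        if use_batch
            @batch minbatch=2 per=thread for i in 1:n
                out[i] = i^2
            end
        else
            Threads.@threads for i in 1:n   # the workaround: Base threading
                out[i] = i^2
            end
        end
        out
    end

    @assert fill_squares!(zeros(Int, 8)) == [i^2 for i in 1:8]

Both forms are race-free here because each iteration writes a distinct slot; the trade-off is Polyester's lower scheduling overhead versus the better-tested Base scheduler.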
9 changes: 7 additions & 2 deletions src/voc.jl
@@ -1,7 +1,7 @@
 # This file is a part of TextSearch.jl

 export Vocabulary, occs, ndocs, token, vocsize, trainsize, filter_tokens, tokenize_and_append!, merge_voc, update_voc!, vocabulary_from_thesaurus, token2id,
-       encode, decode
+       encode, decode, totable


 struct Vocabulary
@@ -23,6 +23,10 @@ function encode(voc::Vocabulary, bow::Dict)
     Dict(token2id(voc, k) => v for (k, v) in bow)
 end

+function totable(voc::Vocabulary, TableConstructor)
+    TableConstructor(; voc.token, voc.ndocs, voc.occs)
+end
+
 function vocabulary_from_thesaurus(textconfig::TextConfig, tokens::AbstractVector)
     n = length(tokens)
     token2id = Dict{String,UInt32}()
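The new totable exporter relies on Julia's property-shorthand keyword arguments: TableConstructor(; voc.token, voc.ndocs, voc.occs) expands to TableConstructor(token=voc.token, ndocs=voc.ndocs, occs=voc.occs), so any table type taking keyword columns works. The mechanism in isolation, assuming DataFrames.jl and a named-tuple stand-in for Vocabulary:

    using DataFrames

    voc = (token=["a", "b"], ndocs=[2, 1], occs=[5, 3])  # same field names as Vocabulary
    df = DataFrame(; voc.token, voc.ndocs, voc.occs)     # columns :token, :ndocs, :occs
    sort!(df, :occs, rev=true)                           # e.g., most frequent tokens first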
@@ -95,7 +99,8 @@ function tokenize_and_append!(voc::Vocabulary, textconfig::TextConfig, corpus; m
     n = length(corpus)
     minbatch = getminbatch(minbatch, n)

+
-    @batch per=thread minbatch=minbatch for i in 1:n
+    Threads.@threads for i in 1:n # @batch per=thread minbatch=minbatch for i in 1:n
         doc = corpus[i]

         buff = take!(TEXT_SEARCH_CACHES)