Commit d581f8d: work around for batch
Author: sadit
Date: Mar 30, 2023
Parent: 5712a71
Showing 7 changed files with 43 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
 name = "TextSearch"
 uuid = "7f6f6c8a-3b03-11e9-223d-e7d88259bd6c"
 authors = ["Eric S. Tellez <donsadit@gmail.com>"]
-version = "0.16.1"
+version = "0.16.2"

 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
6 changes: 5 additions & 1 deletion src/emodel.jl
@@ -26,6 +26,10 @@ function entropy_(dist)
     e
 end

+categorical_labels(labels::AbstractVector{<:CategoricalValue}) = labels
+categorical_labels(labels::AbstractVector{T}) where {T<:Union{AbstractString,Integer,Symbol}} = categorical(labels)
+categorical_labels(labels::AbstractCategoricalVector) = labels
+
 """
     VectorModel(ent::EntropyWeighting, lw::LocalWeighting, corpus::BOW, labels;
         mindocs::Integer=1,
@@ -43,7 +47,7 @@ function VectorModel(ent::EntropyWeighting, lw::LocalWeighting, voc::Vocabulary,
         minbatch=0
     )
     @assert length(labels) == length(corpus)
-    labels = categorical(labels)
+    labels = categorical_labels(labels)
     n = length(labels)
     nclasses = length(levels(labels))
     D = fill(smooth, nclasses, vocsize(voc))
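The emodel.jl change replaces an unconditional categorical(labels) call with a small dispatch helper, so labels that are already categorical pass through untouched instead of being re-pooled. A minimal sketch of the same dispatch idea, assuming CategoricalArrays.jl (the helper name to_categorical is illustrative, not TextSearch's):

    using CategoricalArrays

    to_categorical(x::AbstractCategoricalVector) = x   # already pooled: no-op
    to_categorical(x::AbstractVector{<:Union{AbstractString,Integer,Symbol}}) = categorical(x)

    raw = ["pos", "neg", "pos"]
    c1 = to_categorical(raw)   # converts the raw strings
    c2 = to_categorical(c1)    # returns the very same object, no copy
    @assert c1 === c2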
4 changes: 2 additions & 2 deletions src/io.jl
@@ -45,9 +45,9 @@ function loadmodel(::Type{SemanticVocabulary}, file::JLDFile; parent="/", static
     meta = file[joinpath(parent, "meta")]
     voc = file[joinpath(parent, "voc")]
     knns = file[joinpath(parent, "knns")]
-
     sel = file[joinpath(parent, "sel")]
     lexidx, _ = loadindex(file; parent=joinpath(parent, "lexidx"), staticgraph)
+
-    SemanticVocabulary(tvoc, lexidx, knns, sel), meta
+    SemanticVocabulary(voc, lexidx, knns, sel), meta
 end

6 changes: 3 additions & 3 deletions src/semvoc.jl
@@ -22,10 +22,10 @@ struct SelectAllTokens <: AbstractTokenSelection
 end

 SemanticVocabulary(C::SemanticVocabulary;
-        voc=C.voc, lexidx=C.lexidx, semidx=C.semidx, sel=C.sel) =
-    SemanticVocabulary(voc, lexidx, semidx, C.sel)
+        voc=C.voc, lexidx=C.lexidx, knns=C.knns, sel=C.sel) =
+    SemanticVocabulary(voc, lexidx, knns, sel)

-vocsize(model::SemanticVocabulary) = vocsize(voc)
+vocsize(model::SemanticVocabulary) = vocsize(model.voc)

 function SemanticVocabulary(voc::Vocabulary, sel::AbstractTokenSelection=SelectCentralToken(16, 8);
         textconfig::TextConfig=TextConfig(nlist=[1], qlist=[4]),
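Two bugs are fixed here: the keyword copy-constructor now forwards the sel keyword instead of hard-coding C.sel (and uses the renamed knns field), and vocsize now reads model.voc rather than an undefined global voc. A self-contained sketch of the keyword copy-constructor pattern, with illustrative names rather than TextSearch's real struct:

    struct Model
        voc::Vector{String}
        knns::Matrix{Int32}
    end

    # Defaults pull from the existing value, so callers override only the
    # fields they want; passing m.knns instead of the keyword would silently
    # ignore the caller's override, which is the bug fixed above.
    Model(m::Model; voc=m.voc, knns=m.knns) = Model(voc, knns)

    m  = Model(["a", "b"], Int32[1 2; 2 1])
    m2 = Model(m; knns=Int32[2 1; 1 2])   # new knns, same voc
    @assert m2.voc === m.voc && m2.knns != m.knns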
20 changes: 16 additions & 4 deletions src/semvocbow.jl
@@ -2,7 +2,7 @@

 function vectorize_knns!(D::Dict, model::SemanticVocabulary, tok)
     klex = model.sel.klex
-    ksem = model.sel.ksem
+    ksem = min(model.sel.ksem, size(model.knns, 1))

     res = getknnresult(klex)
     search(model, tok, res)
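This clamp (repeated below in token2id and vectorize) guards against a configured ksem larger than the number of neighbors actually stored per token, i.e., the row count of the knns matrix; without it, neighbor loops index past the end of a column. A small illustration, under the assumption that neighbors are stored one column per token:

    knns = Int32[1 2 3; 4 5 6]                 # 2 stored neighbors, 3 tokens
    requested_ksem = 5                         # caller asked for more than stored
    ksem = min(requested_ksem, size(knns, 1))  # -> 2
    for j in 1:ksem
        @assert checkbounds(Bool, knns, j, 1)  # every access stays in bounds
    end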
@@ -24,7 +24,7 @@ function token2id(model::SemanticVocabulary, tok::AbstractString)::UInt32

     if id == 0
         klex = model.sel.klex
-        ksem = model.sel.ksem
+        ksem = min(model.sel.ksem, size(model.knns, 1))

         if ksem == 0
             res = getknnresult(klex)
@@ -58,12 +58,23 @@ end

 function tokenize(model::SemanticVocabulary, text)
     tokenize!(model, tokenize(model.voc.textconfig, text))
-
 end

+function tokenize_corpus(model::SemanticVocabulary, corpus)
+    n = length(corpus)
+    arr = Vector{TokenizedText}(undef, n)
+    Threads.@threads for i in 1:n
+        arr[i] = tokenize!(model, tokenize(model.voc.textconfig, corpus[i]))
+    end
+
+    arr
+end
+
 function bagofwords!(bow::BOW, model::SemanticVocabulary{SelectCentralToken}, tokens::TokenizedText)
     for t in tokens
         id = token2id(model, t)
-        bow[id] = get(bow, id, 1)
+        bow[id] = get(bow, id, 0) + 1
     end

     bow
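Two substantive changes land in this hunk. The new tokenize_corpus fills a preallocated vector under Threads.@threads, which is race-free because each iteration writes only its own slot arr[i]. And bagofwords! fixes a counting bug: get(bow, id, 1) returns the stored count unchanged (or 1 when absent), so writing it back could never increment. The bug in miniature, using a plain Dict as a stand-in for BOW:

    bow = Dict{UInt32,Int}()
    for id in UInt32[7, 7, 7]
        # buggy:  bow[id] = get(bow, id, 1)    # sticks at 1 forever
        bow[id] = get(bow, id, 0) + 1          # counts occurrences
    end
    @assert bow[UInt32(7)] == 3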
@@ -79,7 +90,7 @@ end

 function vectorize(model::SemanticVocabulary, text; normalize=true)
     klex = model.sel.klex
-    ksem = model.sel.ksem
+    ksem = min(model.sel.ksem, size(model.knns, 1))

     res = getknnresult(klex)
     search(model, text, res)
@@ -100,6 +111,7 @@ function vectorize(model::SemanticVocabulary, text; normalize=true)
     end

     normalize && normalize!(D)
+
     D
 end

12 changes: 9 additions & 3 deletions src/vmodel.jl
@@ -147,7 +147,13 @@ function Base.getindex(model::VectorModel, tokenID::Integer)
     end
 end

-Base.show(io::IO, model::VectorModel) = print(io, "{VectorModel global_weighting=$(model.global_weighting), local_weighting=$(model.local_weighting), train-voc=$(vocsize(model)), train-n=$(trainsize(model)), maxoccs=$(model.maxoccs)}")
+Base.show(io::IO, model::VectorModel) = print(io, """{VectorModel
+    global_weighting: $(model.global_weighting)
+    local_weighting: $(model.local_weighting)
+    vocsize: $(vocsize(model))
+    trainsize=$(trainsize(model))
+    maxoccs=$(model.maxoccs)
+}""")

 function filter_tokens(pred::Function, model::VectorModel)
     voc = model.voc
@@ -222,8 +228,8 @@ function vectorize_corpus(model::VectorModel, corpus::AbstractVector; normalize=
     resize!(V, n)
     minbatch = getminbatch(minbatch, n)

-    @batch minbatch=minbatch per=thread for i in 2:n
-    # Threads.@threads for i in 2:n
+    #@batch minbatch=minbatch per=thread for i in 2:n
+    Threads.@threads for i in 2:n
         V[i] = vectorize(model, corpus[i]; normalize, minweight)
     end
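This hunk and the matching one in voc.jl below are the commit's namesake: the @batch loops are commented out in favor of Base's Threads.@threads, keeping the old line as a comment so it can be restored later. A runnable sketch of the swap, assuming @batch comes from Polyester.jl (the function and its names are illustrative):

    using Polyester   # provides @batch

    function fill_squares!(out; use_batch::Bool=false)
        n = length(out)
        if use_batch
            @batch minbatch=2 per=thread for i in 1:n
                out[i] = i^2
            end
        else
            Threads.@threads for i in 1:n   # the workaround: Base threading
                out[i] = i^2
            end
        end
        out
    end

    @assert fill_squares!(zeros(Int, 8)) == [i^2 for i in 1:8]

Both forms are race-free here because each iteration writes a distinct slot; the trade-off is Polyester's lower scheduling overhead versus the better-tested Base scheduler.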
9 changes: 7 additions & 2 deletions src/voc.jl
@@ -1,7 +1,7 @@
 # This file is a part of TextSearch.jl

 export Vocabulary, occs, ndocs, token, vocsize, trainsize, filter_tokens, tokenize_and_append!, merge_voc, update_voc!, vocabulary_from_thesaurus, token2id,
-       encode, decode
+       encode, decode, totable


 struct Vocabulary
@@ -23,6 +23,10 @@ function encode(voc::Vocabulary, bow::Dict)
     Dict(token2id(voc, k) => v for (k, v) in bow)
 end

+function totable(voc::Vocabulary, TableConstructor)
+    TableConstructor(; voc.token, voc.ndocs, voc.occs)
+end
+
 function vocabulary_from_thesaurus(textconfig::TextConfig, tokens::AbstractVector)
     n = length(tokens)
     token2id = Dict{String,UInt32}()
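The new totable exporter relies on Julia's property-shorthand keyword arguments: TableConstructor(; voc.token, voc.ndocs, voc.occs) expands to TableConstructor(token=voc.token, ndocs=voc.ndocs, occs=voc.occs), so any table type taking keyword columns works. The mechanism in isolation, assuming DataFrames.jl and a named-tuple stand-in for Vocabulary:

    using DataFrames

    voc = (token=["a", "b"], ndocs=[2, 1], occs=[5, 3])  # same field names as Vocabulary
    df = DataFrame(; voc.token, voc.ndocs, voc.occs)     # columns :token, :ndocs, :occs
    sort!(df, :occs, rev=true)                           # e.g., most frequent tokens first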
@@ -95,7 +99,8 @@ function tokenize_and_append!(voc::Vocabulary, textconfig::TextConfig, corpus; m
     n = length(corpus)
     minbatch = getminbatch(minbatch, n)

+
-    @batch per=thread minbatch=minbatch for i in 1:n
+    Threads.@threads for i in 1:n # @batch per=thread minbatch=minbatch for i in 1:n
         doc = corpus[i]

         buff = take!(TEXT_SEARCH_CACHES)