Skip to content

Commit

Permalink
Merge pull request #23 from sadit/pre17
Browse files Browse the repository at this point in the history
merges 0.17 into main
  • Loading branch information
sadit committed Sep 22, 2023
2 parents b1796f3 + 2fe486c commit 881120b
Show file tree
Hide file tree
Showing 16 changed files with 390 additions and 433 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "TextSearch"
uuid = "7f6f6c8a-3b03-11e9-223d-e7d88259bd6c"
authors = ["Eric S. Tellez <donsadit@gmail.com>"]
version = "0.16.2"
version = "0.17.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand Down
8 changes: 5 additions & 3 deletions src/TextSearch.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,12 @@ function Base.empty!(buff::TextSearchBuffer)
end

function __init__()
for _ in 1:Threads.nthreads()
for _ in 1:2*Threads.nthreads()+4
put!(TEXT_SEARCH_CACHES, TextSearchBuffer())
end
end


@inline function textbuffer(f)
buff = take!(TEXT_SEARCH_CACHES)
try
Expand All @@ -64,10 +65,12 @@ end
end
end

include("tokentrans.jl")
include("textconfig.jl")
include("normalize.jl")
include("tokenize.jl")
include("voc.jl")
include("updatevoc.jl")
include("tokcorpus.jl")
include("bow.jl")
include("sparseconversions.jl")
Expand All @@ -76,8 +79,7 @@ include("emodel.jl")
include("bm25.jl")
include("bm25invfile.jl")
include("bm25invfilesearch.jl")
include("semvoc.jl")
include("semvocbow.jl")
include("approxvoc.jl")
include("io.jl")

end
50 changes: 50 additions & 0 deletions src/approxvoc.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# This file is a part of TextSearch.jl

export QgramsLookup

"""
    QgramsLookup <: AbstractTokenLookup

Lookup data used to resolve tokens approximately when they are not found verbatim.

# Fields
- `voc`: vocabulary used to encode a query token as a bag of words before it is
  searched in `idx` (see `token2id`).
- `idx`: inverted file that is searched for the nearest candidate of a query token.
- `maxdist`: largest distance accepted for a candidate; farther candidates are
  treated as "not found" by `token2id`.
"""
struct QgramsLookup <: AbstractTokenLookup
voc::Vocabulary{TokenLookup}
# NOTE(review): `BinaryInvertedFile` is written without type parameters here; if it is
# a parametric type this field is abstractly typed — confirm whether that matters.
idx::BinaryInvertedFile
maxdist::Float32
end

"""
QgramsLookup(
voc::Vocabulary,
dist::SemiMetric=JaccardDistance();
maxdist::Real = 0.7,
textconfig=TextConfig(qlist=[3]),
doc_min_freq::Integer=1, # any hard vocabulary pruning is expected to be done in `voc`
doc_max_ratio::AbstractFloat=0.4 # very popular tokens are likely to be trash
)
"""
function QgramsLookup(
        voc::Vocabulary,
        dist::SemiMetric=JaccardDistance();
        maxdist::Real=0.7,
        textconfig=TextConfig(qlist=[3]),
        doc_min_freq::Integer=1,          # hard pruning is expected to be done in `voc`
        doc_max_ratio::AbstractFloat=0.4  # overly frequent entries are likely noise
    )
    # Secondary vocabulary built over the tokens of `voc` using `textconfig`.
    qgrams = Vocabulary(textconfig, token(voc))

    # Keep only entries whose document count lies inside the accepted band.
    # NOTE(review): the upper bound is relative to `vocsize(qgrams)`; confirm that this
    # (rather than the number of tokens in `voc`) is the intended denominator.
    keep(t) = doc_min_freq <= t.ndocs <= doc_max_ratio * vocsize(qgrams)
    pruned = filter_tokens(keep, qgrams)

    # Index every token of `voc`, encoded as a bag of words over the pruned vocabulary.
    index = BinaryInvertedFile(vocsize(pruned), dist)
    corpus = bagofwords_corpus(pruned, token(voc))
    append_items!(index, VectorDatabase(corpus))

    return QgramsLookup(pruned, index, maxdist)
end

"""
    token2id(voc::Vocabulary{QgramsLookup}, tok)::UInt32

Resolve `tok` to its identifier in `voc`, falling back to an approximate match.

An exact hit in `voc.token2id` is returned directly. Otherwise `tok` is encoded with
`bagofwords(lookup.voc, tok)` and its single nearest neighbor is searched in
`lookup.idx`, where `lookup = voc.lookup`.

# Returns
The identifier as `UInt32`, or `0` when `tok` is the empty string, when the search
yields no candidate, or when the best candidate is farther than `lookup.maxdist`.
"""
function token2id(voc::Vocabulary{QgramsLookup}, tok)::UInt32
    unknown = zero(UInt32)

    id = get(voc.token2id, tok, unknown)
    id > 0 && return id
    tok == "" && return unknown

    lookup = voc.lookup
    res = KnnResult(1)
    search(lookup.idx, bagofwords(lookup.voc, tok), res)

    # Guard the empty result: indexing `res[1]` without a candidate would throw.
    length(res) == 0 && return unknown

    nearest = res[1]
    return nearest.weight > lookup.maxdist ? unknown : nearest.id
end

17 changes: 0 additions & 17 deletions src/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,10 @@ function savemodel(filename::AbstractString, model; meta=nothing, parent="/")
end
end

"""
    savemodel(file::JLDFile, model::SemanticVocabulary; meta=nothing, parent="/")

Write `model` into the already opened `file` below the path `parent`.

`meta` and the `voc`, `knns` and `sel` fields of `model` are stored under entries of
the same name; the `lexidx` field is handed to `saveindex` with
`joinpath(parent, "lexidx")` as its parent path.
"""
function savemodel(file::JLDFile, model::SemanticVocabulary; meta=nothing, parent="/")
    entries = (
        "meta" => meta,
        "voc" => model.voc,
        "knns" => model.knns,
        "sel" => model.sel,
    )
    for (name, value) in entries
        file[joinpath(parent, name)] = value
    end

    return saveindex(file, model.lexidx; parent=joinpath(parent, "lexidx"))
end

"""
    loadmodel(t::Type, filename::AbstractString; parent="/", staticgraph=false)

Open `filename` with `jldopen` and forward the opened file, together with `parent`
and `staticgraph`, to the `loadmodel` method that accepts a file for type `t`.
Returns whatever that method returns.
"""
function loadmodel(t::Type, filename::AbstractString; parent="/", staticgraph=false)
    return jldopen(filename) do file
        loadmodel(t, file; staticgraph=staticgraph, parent=parent)
    end
end

"""
    loadmodel(::Type{SemanticVocabulary}, file::JLDFile; parent="/", staticgraph=false)

Read a `SemanticVocabulary` from the already opened `file` below the path `parent`.

The `meta`, `voc`, `knns` and `sel` entries are read directly; the lexical index is
obtained from `loadindex` using `joinpath(parent, "lexidx")` and `staticgraph`.

# Returns
A tuple `(model, meta)`.
"""
function loadmodel(::Type{SemanticVocabulary}, file::JLDFile; parent="/", staticgraph=false)
    entry(name) = file[joinpath(parent, name)]

    meta = entry("meta")
    voc = entry("voc")
    knns = entry("knns")
    sel = entry("sel")

    # `loadindex` returns more than the index itself; only its first element is used.
    loaded = loadindex(file; parent=joinpath(parent, "lexidx"), staticgraph=staticgraph)
    lexidx, _ = loaded

    return SemanticVocabulary(voc, lexidx, knns, sel), meta
end

82 changes: 0 additions & 82 deletions src/semvoc.jl

This file was deleted.

117 changes: 0 additions & 117 deletions src/semvocbow.jl

This file was deleted.

Loading

0 comments on commit 881120b

Please sign in to comment.