From de751754eda878136ea604da3a3e030e6d30555b Mon Sep 17 00:00:00 2001 From: "Eric S. Tellez" Date: Mon, 11 Sep 2023 12:10:55 -0600 Subject: [PATCH 1/6] adds collocations --- src/TextSearch.jl | 1 + src/textconfig.jl | 29 +++++++++---- src/tokenize.jl | 46 ++++++++++++++++++-- src/updatevoc.jl | 101 ++++++++++++++++++++++++++++++++++++++++++++ src/vmodel.jl | 5 +++ src/voc.jl | 105 ++++++---------------------------------------- 6 files changed, 184 insertions(+), 103 deletions(-) create mode 100644 src/updatevoc.jl diff --git a/src/TextSearch.jl b/src/TextSearch.jl index 86a0420..721d4e3 100644 --- a/src/TextSearch.jl +++ b/src/TextSearch.jl @@ -68,6 +68,7 @@ include("textconfig.jl") include("normalize.jl") include("tokenize.jl") include("voc.jl") +include("updatevoc.jl") include("tokcorpus.jl") include("bow.jl") include("sparseconversions.jl") diff --git a/src/textconfig.jl b/src/textconfig.jl index 1defbd0..2bc149b 100644 --- a/src/textconfig.jl +++ b/src/textconfig.jl @@ -32,6 +32,14 @@ Return `nothing` to ignore the `tok` occurence (e.g., stop words). """ transform_qgram(::AbstractTokenTransformation, tok) = tok +""" + transform_collocation(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_collocation(::AbstractTokenTransformation, tok) = tok + """ transform_skipgram(::AbstractTokenTransformation, tok) @@ -64,6 +72,7 @@ Base.isequal(a::Skipgram, b::Skipgram) = a.qsize == b.qsize && a.skip == b.skip group_usr::Bool=false, group_emo::Bool=false, lc::Bool=true, + collocations::Int8=0, qlist::Vector=Int8[], nlist::Vector=Int8[], slist::Vector{Skipgram}=Skipgram[], @@ -81,6 +90,11 @@ Defines a preprocessing and tokenization pipeline - `group_usr`: indicates if users (@usr) should be grouped as _usr - `group_emo`: indicates if emojis should be grouped as _emo - `lc`: indicates if the text should be normalized to lower case +- `collocations`: window to expand collocations as tokens, please take into account that: + - 0 => disables collocations + - 1 => will compute words (ignored in favor of use typical unigrams) + - 2 => will compute bigrams (don't use this, but not disabled) + - 3 <= typical values - `qlist`: a list of character q-grams to use - `nlist`: a list of words n-grams to use - `slist`: a list of skip-grams tokenizers to use @@ -98,13 +112,15 @@ Base.@kwdef struct TextConfig group_usr::Bool = false group_emo::Bool = false lc::Bool = true + collocations::Int8 = 0 + mark_token_type::Bool = true qlist::Vector{Int8} = Int8[] nlist::Vector{Int8} = Int8[] slist::Vector{Skipgram} = Skipgram[] - mark_token_type::Bool = true tt::AbstractTokenTransformation = IdentityTokenTransformation() - function TextConfig(del_diac, del_dup, del_punc, group_num, group_url, group_usr, group_emo, lc, qlist, nlist, slist, mark_token_type, tt) + function TextConfig(del_diac::Bool, del_dup::Bool, del_punc::Bool, group_num::Bool, group_url::Bool, group_usr::Bool, group_emo::Bool, lc::Bool, collocations::Integer, + mark_token_type::Bool, qlist::AbstractVector, nlist::AbstractVector, slist::AbstractVector, tt) if length(qlist) == length(nlist) == length(slist) == 0 nlist = [1] end @@ -112,7 +128,7 @@ Base.@kwdef struct TextConfig nlist = sort!(Vector{Int8}(nlist)) slist = sort!(Vector{Skipgram}(slist)) - new(del_diac, del_dup, del_punc, group_num, group_url, group_usr, group_emo, lc, qlist, nlist, slist, mark_token_type, tt) + new(del_diac, del_dup, del_punc, 
group_num, group_url, group_usr, group_emo, lc, collocations, mark_token_type, qlist, nlist, slist, tt) end end @@ -125,16 +141,15 @@ function TextConfig(c::TextConfig; group_usr::Bool=c.group_usr, group_emo::Bool=c.group_emo, lc::Bool=c.lc, + collocations=c.collocations, + mark_token_type=c.mark_token_type, qlist=c.qlist, nlist=c.nlist, slist=c.slist, - mark_token_type=c.mark_token_type, tt::AbstractTokenTransformation=c.tt ) - TextConfig(del_diac, del_dup, del_punc, - group_num, group_url, group_usr, group_emo, - lc, qlist, nlist, slist, mark_token_type, tt) + TextConfig(del_diac, del_dup, del_punc, group_num, group_url, group_usr, group_emo, lc, collocations, mark_token_type, qlist, nlist, slist, tt) end Base.broadcastable(c::TextConfig) = (c,) diff --git a/src/tokenize.jl b/src/tokenize.jl index a3b0093..39b7c53 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -98,7 +98,7 @@ function tokenize_(config::TextConfig, buff::TextSearchBuffer) qgrams(q, buff, config.tt, config.mark_token_type) end - if length(config.nlist) > 0 || length(config.slist) > 0 + if length(config.nlist) > 0 || length(config.slist) > 0 || config.collocations n1 = length(buff.tokens) unigrams(buff, config.tt) # unigrams are always activated if any |nlist| > 0 or |slist| > 0 @@ -113,6 +113,10 @@ function tokenize_(config::TextConfig, buff::TextSearchBuffer) for q in config.slist skipgrams(q, buff, config.tt, config.mark_token_type) end + + if config.collocations > 1 + collocations(config.collocations, buff, config.tt, config.mark_token_type) + end end buff.tokens @@ -173,6 +177,21 @@ function flush_skipgram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation s end +""" + flush_collocations!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) + +Pushes a collocation inside the buffer to the token list; it discards empty strings. 
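A minimal usage sketch of the new `collocations` option (illustrative only; the exact token strings depend on `mark_token_type` and on the `transform_collocation` hook):

    config = TextConfig(collocations=3)   # window of 3; unigram tokenization is enabled as well
    tokenize(config, "the red apple is ripe")
    # yields the usual unigrams plus window-limited word pairs such as
    # "the red", "the apple", "red apple", ...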
+""" +function flush_collocation!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) + buff.io.size == 0 && return nothing + mark_token_type && write(buff.io, '\t', 'c') + s = transform_collocation(tt, String(take!(buff.io))) + s === nothing && return nothing + push!(buff.tokens, s) + s +end + + """ qgrams(q::Integer, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) @@ -233,11 +252,9 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) elseif c == BLANK if p !== BLANK s = flush_unigram!(buff, tt) - #write(buff.io, c) s !== nothing && push!(buff.unigrams, s) end else - ## @show :d write(buff.io, c) end end @@ -267,6 +284,29 @@ function nwords(q::Integer, buff::TextSearchBuffer, tt::AbstractTokenTransformat buff.tokens end + +""" + collocations(q, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) + +Computes a kind of collocations of the given text +""" +function collocations(q::Integer, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) + tokens = buff.unigrams + n = length(tokens) + + for i in 1:n-1 # the upper limit is an implementation detail to discard some entries + for j in i+1:min(i+1+q, n) + write(buff.io, buff.unigrams[i]) + write(buff.io, BLANK) + write(buff.io, buff.unigrams[j]) + flush_collocation!(buff, tt, mark_token_type) + end + end + + buff.tokens +end + + """ skipgrams(q::Skipgram, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) diff --git a/src/updatevoc.jl b/src/updatevoc.jl new file mode 100644 index 0000000..cba4b58 --- /dev/null +++ b/src/updatevoc.jl @@ -0,0 +1,101 @@ +""" + update_voc!(voc::Vocabulary, another::Vocabulary) + update_voc!(pred::Function, voc::Vocabulary, another::Vocabulary) + +Update `voc` vocabulary using another vocabulary. Optionally a predicate can be given to filter vocabularies. + +Note 1: `corpuslen` remains unchanged (the structure is immutable and a new `Vocabulary` should be created to update this field). +Note 2: Both `voc` and `another` vocabularies should had been created with a _compatible_ [`Textconfig`](@ref) to be able to work on them. +""" +update_voc!(voc::Vocabulary, another::Vocabulary) = update_voc!(t->true, voc, another) + +function update_voc!(pred::Function, voc::Vocabulary, another::Vocabulary) + for i in eachindex(another) + v = another[i] + if pred(v) + push_token!(voc, v.token, v.occs, v.ndocs) + end + end + + voc +end + + +# filtering functions +""" + filter_tokens!(voc::Vocabulary, text::TokenizedText) + +Removes tokens from a given tokenized text based using the valid vocabulary +""" +function filter_tokens!(voc::Vocabulary, text::TokenizedText) + j = 0 + for i in eachindex(text.tokens) + t = text.tokens[i] + if haskey(voc.token2id, t) + j += 1 + text.tokens[j] = t + end + end + + resize!(text.tokens, j) + text +end + +""" + filter_tokens!(voc::Vocabulary, text::TokenizedText) + +Removes tokens from text array +""" +function filter_tokens!(voc::Vocabulary, arr::AbstractVector{TokenizedText}) + for t in arr + filter_tokens!(voc, t) + end + + arr +end + +""" + merge_voc(voc1::Vocabulary, voc2::Vocabulary[, ...]) + merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary[, ...]) + +Merges two or more vocabularies into a new one. A predicate function can be used to filter token entries. + +Note: All vocabularies should had been created with a _compatible_ [`Textconfig`](@ref) to be able to work on them. +""" +merge_voc(voc1::Vocabulary, voc2::Vocabulary, voclist...) 
= merge_voc(x->true, voc1, voc2, voclist...) + +function merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary, voclist...) + #all(v -> v isa Vocabulary, voclist) || throw(ArgumentError("arguments should be of type `Vocabulary`")) + + L = [voc1, voc2] + for v in voclist + push!(L, v) + end + + sort!(L, by=vocsize, rev=true) + voc = Vocabulary(voc1.textconfig, sum(v.corpuslen for v in L)) + + for v in L + update_voc!(pred, voc, v) + end + + voc +end + +""" + filter_tokens(pred::Function, voc::Vocabulary) + +Returns a copy of reduced vocabulary based on evaluating `pred` function for each entry in `voc` +""" +function filter_tokens(pred::Function, voc::Vocabulary) + V = Vocabulary(voc.textconfig, voc.corpuslen) + + for i in eachindex(voc) + v = voc[i] + if pred(v) + push_token!(V, v.token, v.occs, v.ndocs) + end + end + + V +end diff --git a/src/vmodel.jl b/src/vmodel.jl index f0b5212..079dec6 100644 --- a/src/vmodel.jl +++ b/src/vmodel.jl @@ -137,6 +137,10 @@ Base.copy(e::VectorModel; kwargs...) = VectorModel(e::VectorModel; kwargs...) @inline ndocs(model::VectorModel) = ndocs(model.voc) @inline token(model::VectorModel) = token(model.voc) +function totable(model::VectorModel, TableConstructor) + TableConstructor(; token=token(model), ndocs=ndocs(model), occs=occs(model), weight=weight(model)) +end + function Base.getindex(model::VectorModel, tokenID::Integer) id = convert(UInt32, tokenID) voc = model.voc @@ -171,6 +175,7 @@ function filter_tokens(pred::Function, model::VectorModel) VectorModel(model; voc=V, weight=W) end + function vectorize_!(buff::TextSearchBuffer, model::VectorModel{G_,L_}, bow::BOW; normalize=true, minweight=1e-9) where {G_,L_} vec = buff.vec numtokens::Int = 0 diff --git a/src/voc.jl b/src/voc.jl index d192a6c..b13ac10 100644 --- a/src/voc.jl +++ b/src/voc.jl @@ -15,6 +15,17 @@ end token2id(voc::Vocabulary, tok::AbstractString) = get(voc.token2id, tok, zero(UInt32)) +function Vocabulary(voc::Vocabulary) + Vocabulary( + voc.deepcopy(voc.textconfig), + voc.copy(voc.token), + voc.copy(voc.occs), + voc.copy(voc.ndocs), + voc.copy(voc.token2id), + voc.corpuslen + ) +end + function decode(voc::Vocabulary, bow::Dict) Dict(voc.token[k] => v for (k, v) in bow) end @@ -98,8 +109,7 @@ function tokenize_and_append!(voc::Vocabulary, corpus; minbatch=0) l = Threads.SpinLock() n = length(corpus) minbatch = getminbatch(minbatch, n) - - + Threads.@threads for i in 1:n # @batch per=thread minbatch=minbatch for i in 1:n doc = corpus[i] @@ -121,79 +131,6 @@ function tokenize_and_append!(voc::Vocabulary, corpus; minbatch=0) voc end -function filter_tokens!(voc::Vocabulary, text::TokenizedText) - j = 0 - for i in eachindex(text.tokens) - t = text.tokens[i] - if haskey(voc.token2id, t) - j += 1 - text.tokens[j] = t - end - end - - resize!(text.tokens, j) - text -end - -function filter_tokens!(voc::Vocabulary, arr::AbstractVector{TokenizedText}) - for t in arr - filter_tokens!(voc, t) - end - - arr -end - -""" - update_voc!(voc::Vocabulary, another::Vocabulary) - update_voc!(pred::Function, voc::Vocabulary, another::Vocabulary) - -Update `voc` vocabulary using another one. -Optionally a predicate can be given to filter vocabularies. - -Note 1: `corpuslen` remains unchanged (the structure is immutable and a new `Vocabulary` should be created to update this field). -Note 2: Both `voc` and `another` vocabularies should had been created with a _compatible_ [`Textconfig`](@ref) to be able to work on them. 
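These helpers now live in src/updatevoc.jl; a small sketch of the intended usage (the corpus literals are placeholders):

    tc = TextConfig(nlist=[1])
    v1 = Vocabulary(tc, ["a red apple", "a ripe pear"])
    v2 = Vocabulary(tc, ["a green apple", "an apple pie"])
    v = merge_voc(v1, v2)                          # occs/ndocs of shared tokens are accumulated
    frequent = filter_tokens(t -> t.occs >= 2, v)  # copy keeping tokens that occur at least twice
    update_voc!(v1, v2)                            # in-place variant; corpuslen stays unchanged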
-""" -update_voc!(voc::Vocabulary, another::Vocabulary) = update_voc!(t->true, voc, another) - -function update_voc!(pred::Function, voc::Vocabulary, another::Vocabulary) - for i in eachindex(another) - v = another[i] - if pred(v) - push_token!(voc, v.token, v.occs, v.ndocs) - end - end - - voc -end - -""" - merge_voc(voc1::Vocabulary, voc2::Vocabulary[, ...]) - merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary[, ...]) - -Merges two or more vocabularies into a new one. A predicate function can be used to filter token entries. - -Note: All vocabularies should had been created with a _compatible_ [`Textconfig`](@ref) to be able to work on them. -""" -merge_voc(voc1::Vocabulary, voc2::Vocabulary, voclist...) = merge_voc(x->true, voc1, voc2, voclist...) - -function merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary, voclist...) - #all(v -> v isa Vocabulary, voclist) || throw(ArgumentError("arguments should be of type `Vocabulary`")) - - L = [voc1, voc2] - for v in voclist - push!(L, v) - end - - sort!(L, by=vocsize, rev=true) - voc = Vocabulary(voc1.textconfig, sum(v.corpuslen for v in L)) - - for v in L - update_voc!(pred, voc, v) - end - - voc -end - Base.length(voc::Vocabulary) = length(voc.occs) Base.eachindex(voc::Vocabulary) = eachindex(voc.occs) vocsize(voc::Vocabulary) = length(voc) @@ -254,21 +191,3 @@ function Base.getindex(voc::Vocabulary, tokenID::Integer) end end - -""" - filter_tokens(pred::Function, voc::Vocabulary) - -Returns a copy of reduced vocabulary based on evaluating `pred` function for each entry in `voc` -""" -function filter_tokens(pred::Function, voc::Vocabulary) - V = Vocabulary(voc.textconfig, voc.corpuslen) - - for i in eachindex(voc) - v = voc[i] - if pred(v) - push_token!(V, v.token, v.occs, v.ndocs) - end - end - - V -end From 9d3d76edd4d3c96249c1dfe668c567c314744ad4 Mon Sep 17 00:00:00 2001 From: "Eric S. Tellez" Date: Thu, 14 Sep 2023 09:18:25 -0500 Subject: [PATCH 2/6] collocations --- src/tokenize.jl | 63 ++++++++++++++++++++++++++----------------------- src/vmodel.jl | 4 +++- src/voc.jl | 49 ++++++++++++++++++++++++++++---------- 3 files changed, 72 insertions(+), 44 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index 39b7c53..6d6e3aa 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -98,7 +98,7 @@ function tokenize_(config::TextConfig, buff::TextSearchBuffer) qgrams(q, buff, config.tt, config.mark_token_type) end - if length(config.nlist) > 0 || length(config.slist) > 0 || config.collocations + if length(config.nlist) > 0 || length(config.slist) > 0 || config.collocations > 1 n1 = length(buff.tokens) unigrams(buff, config.tt) # unigrams are always activated if any |nlist| > 0 or |slist| > 0 @@ -122,6 +122,19 @@ function tokenize_(config::TextConfig, buff::TextSearchBuffer) buff.tokens end +function push_token_from_transform!(tokens, s::Nothing) +end + +function push_token_from_transform!(tokens, s::AbstractString) + push!(tokens, s) +end + +function push_token_from_transform!(tokens, slist::AbstractVector) + for s in slist + push!(tokens, s) + end +end + """ flush_unigram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation) @@ -130,9 +143,7 @@ Pushes the word inside the buffer to the token list; it discards empty strings. 
function flush_unigram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation) buff.io.size == 0 && return nothing s = transform_unigram(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end """ @@ -144,9 +155,7 @@ function flush_nword!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, m buff.io.size == 0 && return nothing mark_token_type && write(buff.io, '\t', 'n') s = transform_nword(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end """ @@ -158,9 +167,7 @@ function flush_qgram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, m buff.io.size == 0 && return nothing mark_token_type && write(buff.io, '\t', 'q') s = transform_qgram(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end """ @@ -172,9 +179,7 @@ function flush_skipgram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation buff.io.size == 0 && return nothing mark_token_type && write(buff.io, '\t', 's') s = transform_skipgram(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end """ @@ -186,9 +191,7 @@ function flush_collocation!(buff::TextSearchBuffer, tt::AbstractTokenTransformat buff.io.size == 0 && return nothing mark_token_type && write(buff.io, '\t', 'c') s = transform_collocation(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end @@ -219,6 +222,7 @@ Performs the word tokenization """ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) n = length(buff.normtext) + mfirst = length(buff.tokens) + 1 # @info buff.normtext @inbounds for i in 2:n # normtext[1] is BLANK c = buff.normtext[i] @@ -227,40 +231,39 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) ## @show i, p, c if ispunct2(c) && !ispunct2(p) && p !== BLANK ## @show :a - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) write(buff.io, c) elseif ispunct2(p) if ispunct2(c) && buff.io.size > 2 - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) write(buff.io, c) elseif !ispunct2(c) && !(p in ('#', '@', '_')) ## @show :b - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) c !== BLANK && write(buff.io, c) else write(buff.io, c) end elseif isemoji(c) - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) write(buff.io, c) - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) elseif c == BLANK if p !== BLANK - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) end else write(buff.io, c) end end - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) + mlast = length(buff.tokens) + + for i in mfirst:mlast + push!(buff.unigrams, buff.tokens[i]) + end + buff.tokens end diff --git a/src/vmodel.jl b/src/vmodel.jl index 079dec6..c20fa1d 100644 --- a/src/vmodel.jl +++ b/src/vmodel.jl @@ -137,10 +137,12 @@ Base.copy(e::VectorModel; kwargs...) = VectorModel(e::VectorModel; kwargs...) 
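Worth noting about the `flush_*` rewrite above: through `push_token_from_transform!`, a transformation hook may now return `nothing` (drop the token), a single string, or a vector of strings (emit several tokens). A sketch with a made-up `SynonymExpansion` transformation, not part of this patch:

    struct SynonymExpansion <: AbstractTokenTransformation
        synonyms::Dict{String,Vector{String}}
    end

    # emit the original word together with its synonyms; unknown words pass through unchanged
    function TextSearch.transform_unigram(tt::SynonymExpansion, tok)
        syn = get(tt.synonyms, tok, nothing)
        syn === nothing ? tok : vcat(tok, syn)
    end

    tc = TextConfig(nlist=[1], tt=SynonymExpansion(Dict("car" => ["auto"])))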
@inline ndocs(model::VectorModel) = ndocs(model.voc) @inline token(model::VectorModel) = token(model.voc) -function totable(model::VectorModel, TableConstructor) +function table(model::VectorModel, TableConstructor) TableConstructor(; token=token(model), ndocs=ndocs(model), occs=occs(model), weight=weight(model)) end +Base.getindex(model::VectorModel, token::AbstractString) = model[get(model.voc.token2id, token, 0)] + function Base.getindex(model::VectorModel, tokenID::Integer) id = convert(UInt32, tokenID) voc = model.voc diff --git a/src/voc.jl b/src/voc.jl index b13ac10..edf1aae 100644 --- a/src/voc.jl +++ b/src/voc.jl @@ -1,7 +1,7 @@ # This file is a part of TextSearch.jl export Vocabulary, occs, ndocs, token, vocsize, trainsize, filter_tokens, tokenize_and_append!, merge_voc, update_voc!, vocabulary_from_thesaurus, token2id, - encode, decode, totable + encode, decode, table struct Vocabulary @@ -15,15 +15,9 @@ end token2id(voc::Vocabulary, tok::AbstractString) = get(voc.token2id, tok, zero(UInt32)) -function Vocabulary(voc::Vocabulary) - Vocabulary( - voc.deepcopy(voc.textconfig), - voc.copy(voc.token), - voc.copy(voc.occs), - voc.copy(voc.ndocs), - voc.copy(voc.token2id), - voc.corpuslen - ) +function Vocabulary(voc::Vocabulary; textconfig=voc.textconfig, token=voc.token, occs=voc.occs, ndocs=voc.ndocs, token2id=voc.token2id, corpuslen=voc.corpuslen) + Vocabulary(textconfig, token, occs, ndocs, token2id, corpuslen) + # Vocabulary(voc.deepcopy(voc.textconfig), voc.copy(voc.token), voc.copy(voc.occs), voc.copy(voc.ndocs), voc.copy(voc.token2id), corpuslen) end function decode(voc::Vocabulary, bow::Dict) @@ -34,7 +28,7 @@ function encode(voc::Vocabulary, bow::Dict) Dict(token2id(voc, k) => v for (k, v) in bow) end -function totable(voc::Vocabulary, TableConstructor) +function table(voc::Vocabulary, TableConstructor) TableConstructor(; voc.token, voc.ndocs, voc.occs) end @@ -70,12 +64,40 @@ end Computes a vocabulary from a corpus using the TextConfig `textconfig`. 
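`table` (renamed from `totable`) hands the vocabulary or model columns to any keyword-based table constructor; a sketch assuming DataFrames.jl is loaded (it is not a dependency of this package, and `voc`, `model`, the token "apple", and corpus.txt are placeholders):

    using DataFrames
    table(voc, DataFrame)     # one row per token: token, ndocs, occs
    table(model, DataFrame)   # same columns plus the trained weight
    model["apple"]            # new: index a VectorModel directly by token string

Just below, `Vocabulary` also gains a buffered constructor that consumes any document iterator, e.g. `Vocabulary(textconfig, eachline("corpus.txt"))`, tokenizing `buffsize` documents per block.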
""" -function Vocabulary(textconfig::TextConfig, corpus::AbstractVector; minbatch=0) +function vocab_from_small_collection(textconfig::TextConfig, corpus::AbstractVector; minbatch=0) voc = Vocabulary(textconfig, length(corpus)) tokenize_and_append!(voc, corpus; minbatch) voc end +function Vocabulary(textconfig::TextConfig, corpusgenerator::Union{Base.EachLine,Base.Generator,AbstractVector}; minbatch=0, buffsize=2^16, verbose=true) + if corpusgenerator isa AbstractVector && length(corpusgenerator) <= buffsize + return vocab_from_small_collection(textconfig, corpusgenerator; minbatch) + end + + voc = Vocabulary(textconfig, 0) + len = 0 + corpus = String[] + sizehint!(corpus, buffsize) + for doc in corpusgenerator + push!(corpus, doc) + + if length(corpus) == buffsize + verbose && (@info "computing vocabulary -- advance: $len - buffsize: $buffsize") + len += buffsize + tokenize_and_append!(voc, corpus; minbatch) + empty!(corpus) + end + end + + if length(corpus) > 0 + len += length(corpus) + tokenize_and_append!(voc, corpus; minbatch) + end + + Vocabulary(voc; corpuslen=len) +end + function locked_tokenize_and_push(voc, doc, buff, l) empty!(buff) @@ -112,7 +134,6 @@ function tokenize_and_append!(voc::Vocabulary, corpus; minbatch=0) Threads.@threads for i in 1:n # @batch per=thread minbatch=minbatch for i in 1:n doc = corpus[i] - buff = take!(TEXT_SEARCH_CACHES) try @@ -181,6 +202,8 @@ function Base.getindex(voc::Vocabulary, idlist) [voc[i] for i in itertokenid(idlist)] end +Base.getindex(voc::Vocabulary, token::AbstractString) = voc[get(voc.token2id, token, 0)] + function Base.getindex(voc::Vocabulary, tokenID::Integer) id = convert(UInt32, tokenID) From 299988f4cc951518e4de73bdf5315551fb206c09 Mon Sep 17 00:00:00 2001 From: "Eric S. Tellez" Date: Fri, 15 Sep 2023 10:23:27 -0500 Subject: [PATCH 3/6] removes output --- src/tokenize.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index 6d6e3aa..9ba6a68 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -228,9 +228,7 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) c = buff.normtext[i] p = buff.normtext[i-1] - ## @show i, p, c if ispunct2(c) && !ispunct2(p) && p !== BLANK - ## @show :a flush_unigram!(buff, tt) write(buff.io, c) elseif ispunct2(p) @@ -238,7 +236,6 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) flush_unigram!(buff, tt) write(buff.io, c) elseif !ispunct2(c) && !(p in ('#', '@', '_')) - ## @show :b flush_unigram!(buff, tt) c !== BLANK && write(buff.io, c) else From 76eae3daf662b21be636d3f3433ea71416ffff16 Mon Sep 17 00:00:00 2001 From: "Eric S. 
Tellez" Date: Fri, 15 Sep 2023 15:39:21 -0500 Subject: [PATCH 4/6] unigrams function rewritten; bug fixed that joined words and punctuactions; increases the number of buffers in the cache to allow semvoc work --- src/TextSearch.jl | 3 ++- src/semvocbow.jl | 12 +++++++++++- src/tokenize.jl | 24 ++++++++++++------------ src/vmodel.jl | 2 +- test/runtests.jl | 3 ++- test/tok.jl | 2 +- 6 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/TextSearch.jl b/src/TextSearch.jl index 721d4e3..f7592b5 100644 --- a/src/TextSearch.jl +++ b/src/TextSearch.jl @@ -50,11 +50,12 @@ function Base.empty!(buff::TextSearchBuffer) end function __init__() - for _ in 1:Threads.nthreads() + for _ in 1:2*Threads.nthreads()+4 put!(TEXT_SEARCH_CACHES, TextSearchBuffer()) end end + @inline function textbuffer(f) buff = take!(TEXT_SEARCH_CACHES) try diff --git a/src/semvocbow.jl b/src/semvocbow.jl index 347f71a..e13a189 100644 --- a/src/semvocbow.jl +++ b/src/semvocbow.jl @@ -4,9 +4,13 @@ function vectorize_knns!(D::Dict, model::SemanticVocabulary, tok) klex = model.sel.klex ksem = min(model.sel.ksem, size(model.knns, 1)) + @info "AAAAAA" res = getknnresult(klex) + @info "AAAAAA 1" search(model, tok, res) + @info "AAAAAA 2" sizehint!(D, length(res) * (1 + ksem)) + @info "BBBBBB" for p in res D[p.id] = get(D, p.id, 0f0) + 1f0 @@ -15,7 +19,8 @@ function vectorize_knns!(D::Dict, model::SemanticVocabulary, tok) D[i] = get(D, i, 0f0) + 1f0 end end - + + @info "CCC" D end @@ -31,15 +36,20 @@ function token2id(model::SemanticVocabulary, tok::AbstractString)::UInt32 search(model, tok, res) id = argmin(res)::UInt32 else + @info :first length(TEXT_SEARCH_CACHES.data) buff = take!(TEXT_SEARCH_CACHES) + @info :second length(TEXT_SEARCH_CACHES.data) try empty!(buff.vec) D = buff.vec + @info :third length(TEXT_SEARCH_CACHES.data) vectorize_knns!(D, model, tok) + @info :fourth length(TEXT_SEARCH_CACHES.data) id = length(D) == 0 ? zero(UInt32) : argmax(D) finally put!(TEXT_SEARCH_CACHES, buff) end + @info :finish length(TEXT_SEARCH_CACHES.data) end end diff --git a/src/tokenize.jl b/src/tokenize.jl index 9ba6a68..453fb0a 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -228,27 +228,27 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) c = buff.normtext[i] p = buff.normtext[i-1] - if ispunct2(c) && !ispunct2(p) && p !== BLANK + if c == BLANK + flush_unigram!(buff, tt) + elseif isemoji(c) + # emoji flush_unigram!(buff, tt) write(buff.io, c) + flush_unigram!(buff, tt) elseif ispunct2(p) - if ispunct2(c) && buff.io.size > 2 - flush_unigram!(buff, tt) + # previous char is punct + if ispunct2(c) + # a punctuaction string + buff.io.size >= 3 && flush_unigram!(buff, tt) # a bit large, so we flush and restart the punc string (3 is for most emojis and ...) 
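+                # either way c is written to the buffer next: it extends the punctuation run, or starts a new one right after the flush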
write(buff.io, c) - elseif !ispunct2(c) && !(p in ('#', '@', '_')) - flush_unigram!(buff, tt) - c !== BLANK && write(buff.io, c) else + !(p in ('#', '@', '_')) && flush_unigram!(buff, tt) # current is not punctuaction so we flush if not a meta word write(buff.io, c) end - elseif isemoji(c) + elseif ispunct2(c) && p !== BLANK + ## single punctuaction alone flush_unigram!(buff, tt) write(buff.io, c) - flush_unigram!(buff, tt) - elseif c == BLANK - if p !== BLANK - flush_unigram!(buff, tt) - end else write(buff.io, c) end diff --git a/src/vmodel.jl b/src/vmodel.jl index c20fa1d..d57f8ec 100644 --- a/src/vmodel.jl +++ b/src/vmodel.jl @@ -141,7 +141,7 @@ function table(model::VectorModel, TableConstructor) TableConstructor(; token=token(model), ndocs=ndocs(model), occs=occs(model), weight=weight(model)) end -Base.getindex(model::VectorModel, token::AbstractString) = model[get(model.voc.token2id, token, 0)] +Base.getindex(model::VectorModel, token::AbstractString) = model[token2id(model.voc, token)] function Base.getindex(model::VectorModel, tokenID::Integer) id = convert(UInt32, tokenID) diff --git a/test/runtests.jl b/test/runtests.jl index 304b680..88ca19a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -39,8 +39,9 @@ end include("tok.jl") include("voc.jl") include("vec.jl") -include("search.jl") +include("search.jl") +# experimental: include("semvoc.jl") @info "FINISH" diff --git a/test/tok.jl b/test/tok.jl index 6d60512..8c879e5 100644 --- a/test/tok.jl +++ b/test/tok.jl @@ -48,7 +48,7 @@ function test_equals(a, b) @info :intersection => intersect(a, b) @info :evaluated => a @info :correct => b - error("diff") + error("a difference was found") end @test a == b From 3d352f45d44cce47df44c36abe96dcaae5c22b1a Mon Sep 17 00:00:00 2001 From: "Eric S. Tellez" Date: Thu, 21 Sep 2023 12:32:59 -0500 Subject: [PATCH 5/6] update version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f798d02..bd8f93a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TextSearch" uuid = "7f6f6c8a-3b03-11e9-223d-e7d88259bd6c" authors = ["Eric S. Tellez "] -version = "0.16.2" +version = "0.17.0" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" From 2fe486c031b0e720809674fed7fce79eb8023d53 Mon Sep 17 00:00:00 2001 From: "Eric S. 
Tellez" Date: Fri, 22 Sep 2023 11:10:53 -0500 Subject: [PATCH 6/6] removes semvoc in favor of approxvoc; a lexical approach; semantic approaches should be added outside of this package to kept it simpple --- src/TextSearch.jl | 4 +- src/approxvoc.jl | 50 ++++++++++++++++++ src/io.jl | 17 ------- src/semvoc.jl | 82 ------------------------------ src/semvocbow.jl | 127 ---------------------------------------------- src/textconfig.jl | 49 +----------------- src/tokentrans.jl | 67 ++++++++++++++++++++++++ src/updatevoc.jl | 4 +- src/voc.jl | 29 ++++++----- test/runtests.jl | 8 +-- test/semvoc.jl | 16 ------ test/voc.jl | 10 ++++ 12 files changed, 151 insertions(+), 312 deletions(-) create mode 100644 src/approxvoc.jl delete mode 100644 src/semvoc.jl delete mode 100644 src/semvocbow.jl create mode 100644 src/tokentrans.jl delete mode 100644 test/semvoc.jl diff --git a/src/TextSearch.jl b/src/TextSearch.jl index f7592b5..77394aa 100644 --- a/src/TextSearch.jl +++ b/src/TextSearch.jl @@ -65,6 +65,7 @@ end end end +include("tokentrans.jl") include("textconfig.jl") include("normalize.jl") include("tokenize.jl") @@ -78,8 +79,7 @@ include("emodel.jl") include("bm25.jl") include("bm25invfile.jl") include("bm25invfilesearch.jl") -include("semvoc.jl") -include("semvocbow.jl") +include("approxvoc.jl") include("io.jl") end diff --git a/src/approxvoc.jl b/src/approxvoc.jl new file mode 100644 index 0000000..20fd1de --- /dev/null +++ b/src/approxvoc.jl @@ -0,0 +1,50 @@ +# This file is a part of TextSearch.jl + +export QgramsLookup + +struct QgramsLookup <: AbstractTokenLookup + voc::Vocabulary{TokenLookup} + idx::BinaryInvertedFile + maxdist::Float32 +end + +""" + QgramsLookup( + voc::Vocabulary, + dist::SemiMetric=JaccardDistance(); + maxdist::Real = 0.7, + textconfig=TextConfig(qlist=[3]), + doc_min_freq::Integer=1, # any hard vocabulary pruning are expected to be made in `voc` + doc_max_ratio::AbstractFloat=0.4 # popular tokens are likely to be thrash + ) +""" +function QgramsLookup( + voc::Vocabulary, + dist::SemiMetric=JaccardDistance(); + maxdist::Real = 0.7, + textconfig=TextConfig(qlist=[3]), + doc_min_freq::Integer=1, # any hard vocabulary pruning are expected to be made in `voc` + doc_max_ratio::AbstractFloat=0.4 # popular tokens are likely to be thrash + ) + + voc_ = Vocabulary(textconfig, token(voc)) + voc_ = filter_tokens(voc_) do t + doc_min_freq <= t.ndocs <= doc_max_ratio * vocsize(voc_) + end + + invfile = BinaryInvertedFile(vocsize(voc_), dist) + append_items!(invfile, VectorDatabase(bagofwords_corpus(voc_, token(voc)))) + QgramsLookup(voc_, invfile, maxdist) +end + +function token2id(voc::Vocabulary{QgramsLookup}, tok)::UInt32 + lookup = voc.lookup + i = get(voc.token2id, tok, zero(UInt32)) + i > 0 && return i + tok == "" && return 0 + res = KnnResult(1) + search(lookup.idx, bagofwords(lookup.voc, tok), res) + p = res[1] + p.weight > lookup.maxdist ? 
0 : p.id +end + diff --git a/src/io.jl b/src/io.jl index cd0f6ea..a723efd 100644 --- a/src/io.jl +++ b/src/io.jl @@ -27,13 +27,6 @@ function savemodel(filename::AbstractString, model; meta=nothing, parent="/") end end -function savemodel(file::JLDFile, model::SemanticVocabulary; meta=nothing, parent="/") - file[joinpath(parent, "meta")] = meta - file[joinpath(parent, "voc")] = model.voc - file[joinpath(parent, "knns")] = model.knns - file[joinpath(parent, "sel")] = model.sel - saveindex(file, model.lexidx; parent=joinpath(parent, "lexidx")) -end function loadmodel(t::Type, filename::AbstractString; parent="/", staticgraph=false) jldopen(filename) do f @@ -41,13 +34,3 @@ function loadmodel(t::Type, filename::AbstractString; parent="/", staticgraph=fa end end -function loadmodel(::Type{SemanticVocabulary}, file::JLDFile; parent="/", staticgraph=false) - meta = file[joinpath(parent, "meta")] - voc = file[joinpath(parent, "voc")] - knns = file[joinpath(parent, "knns")] - sel = file[joinpath(parent, "sel")] - lexidx, _ = loadindex(file; parent=joinpath(parent, "lexidx"), staticgraph) - - SemanticVocabulary(voc, lexidx, knns, sel), meta -end - diff --git a/src/semvoc.jl b/src/semvoc.jl deleted file mode 100644 index 79be7b8..0000000 --- a/src/semvoc.jl +++ /dev/null @@ -1,82 +0,0 @@ -# This file is a part of TextSearch.jl - -export SemanticVocabulary, AbstractTokenSelection, SelectCentralToken, SelectAllTokens, subvoc, decode, encode - -struct SemanticVocabulary{SelType} - voc::Vocabulary - lexidx::BM25InvertedFile - knns::Matrix{Int32} - sel::SelType -end - -abstract type AbstractTokenSelection end - -struct SelectCentralToken <: AbstractTokenSelection - klex::Int32 - ksem::Int32 -end - -struct SelectAllTokens <: AbstractTokenSelection - klex::Int32 - ksem::Int32 -end - -SemanticVocabulary(C::SemanticVocabulary; - voc=C.voc, lexidx=C.lexidx, knns=C.knns, sel=C.sel) = - SemanticVocabulary(voc, lexidx, knns, sel) - -vocsize(model::SemanticVocabulary) = vocsize(model.voc) - -function SemanticVocabulary(voc::Vocabulary, sel::AbstractTokenSelection=SelectCentralToken(16, 8); - textconfig::TextConfig=TextConfig(nlist=[1], qlist=[4]), - list_min_length_for_checking::Int=32, - list_max_allowed_length::Int=128, - doc_min_freq::Int=1, # any hard vocabulary pruning are expected to be made in `voc` - doc_max_ratio::AbstractFloat=0.3 # popular tokens are likely to be thrash - ) - doc_max_freq = ceil(Int, doc_max_ratio * vocsize(voc)) - C = tokenize_corpus(textconfig, voc.token) - lexidx = BM25InvertedFile(textconfig, C) do t - doc_min_freq <= t.ndocs <= doc_max_freq - end - - @info "append_items" - @time append_items!(lexidx, C; sort=false) - - #doc_max_freq = ceil(Int, vocsize(voc) * doc_max_ratio) - @info "filter lists!" - @time filter_lists!(lexidx; - list_min_length_for_checking, - list_max_allowed_length, - doc_min_freq, - doc_max_freq, - always_sort=true # we need this since we call append_items! 
without sorting - ) - @info "searchbatch" - @time knns, _ = searchbatch(lexidx, VectorDatabase(C), sel.ksem) - SemanticVocabulary(voc, lexidx, knns, sel) -end - -enrich_bow!(v::Dict, l::Nothing) = v -function enrich_bow!(v::Dict, l) - for (k, w) in l - v[k] = w - end - - v -end - -function search(model::SemanticVocabulary, text, res::KnnResult) - search(model.lexidx, text, res) -end - -function decode(model::SemanticVocabulary, idlist) - [model.voc.token[i] for i in itertokenid(idlist) if i > 0] -end - -Base.getindex(model::SemanticVocabulary, i::Integer) = model.voc[i] - -function subvoc(model::SemanticVocabulary, idlist, tc=model.lexidx.voc.textconfig) - corpus = [model.voc.token[i] for i in itertokenid(idlist)] - Vocabulary(tc, corpus) -end diff --git a/src/semvocbow.jl b/src/semvocbow.jl deleted file mode 100644 index e13a189..0000000 --- a/src/semvocbow.jl +++ /dev/null @@ -1,127 +0,0 @@ -# This file is a part of TextSearch.jl - -function vectorize_knns!(D::Dict, model::SemanticVocabulary, tok) - klex = model.sel.klex - ksem = min(model.sel.ksem, size(model.knns, 1)) - - @info "AAAAAA" - res = getknnresult(klex) - @info "AAAAAA 1" - search(model, tok, res) - @info "AAAAAA 2" - sizehint!(D, length(res) * (1 + ksem)) - @info "BBBBBB" - - for p in res - D[p.id] = get(D, p.id, 0f0) + 1f0 - for i in view(model.knns, 1:ksem, p.id) - i == 0 && break - D[i] = get(D, i, 0f0) + 1f0 - end - end - - @info "CCC" - D -end - -function token2id(model::SemanticVocabulary, tok::AbstractString)::UInt32 - id = token2id(model.voc, tok)::UInt32 - - if id == 0 - klex = model.sel.klex - ksem = min(model.sel.ksem, size(model.knns, 1)) - - if ksem == 0 - res = getknnresult(klex) - search(model, tok, res) - id = argmin(res)::UInt32 - else - @info :first length(TEXT_SEARCH_CACHES.data) - buff = take!(TEXT_SEARCH_CACHES) - @info :second length(TEXT_SEARCH_CACHES.data) - try - empty!(buff.vec) - D = buff.vec - @info :third length(TEXT_SEARCH_CACHES.data) - vectorize_knns!(D, model, tok) - @info :fourth length(TEXT_SEARCH_CACHES.data) - id = length(D) == 0 ? 
zero(UInt32) : argmax(D) - finally - put!(TEXT_SEARCH_CACHES, buff) - end - @info :finish length(TEXT_SEARCH_CACHES.data) - end - end - - id -end - -function tokenize!(model::SemanticVocabulary, tokens::TokenizedText) - for i in eachindex(tokens) - t = tokens[i] - id = token2id(model, t) - tokens[i] = token(model.voc, id) - end - - tokens -end - -function tokenize(model::SemanticVocabulary, text) - tokenize!(model, tokenize(model.voc.textconfig, text)) - -end - -function tokenize_corpus(model::SemanticVocabulary, corpus) - n = length(corpus) - arr = Vector{TokenizedText}(undef, n) - Threads.@threads for i in 1:n - arr[i] = tokenize!(model, tokenize(model.voc.textconfig, corpus[i])) - end - - arr -end - -function bagofwords!(bow::BOW, model::SemanticVocabulary{SelectCentralToken}, tokens::TokenizedText) - for t in tokens - id = token2id(model, t) - bow[id] = get(bow, id, 0) + 1 - end - - bow -end - -function bagofwords(model::SemanticVocabulary, text) - tokens = tokenize(model.voc.textconfig, text) - bow = BOW() - sizehint!(bow, length(tokens)) - bagofwords!(bow, model, tokens) -end - - -function vectorize(model::SemanticVocabulary, text; normalize=true) - klex = model.sel.klex - ksem = min(model.sel.ksem, size(model.knns, 1)) - - res = getknnresult(klex) - search(model, text, res) - D = DVEC{UInt32,Float32}() - sizehint!(D, length(res) * (1 + ksem)) - if ksem == 0 - for p in res - D[p.id] = abs(p.weight) - end - else - for p in res - D[p.id] = get(D, p.id, 0f0) + abs(p.weight) - for i in view(model.knns, 1:ksem, p.id) - i == 0 && break - D[i] = get(D, i, 0f0) + 1f0 - end - end - end - - normalize && normalize!(D) - - D -end - diff --git a/src/textconfig.jl b/src/textconfig.jl index 2bc149b..4f5d817 100644 --- a/src/textconfig.jl +++ b/src/textconfig.jl @@ -1,53 +1,6 @@ # This file is a part of TextSearch.jl -export TextConfig, Skipgram, AbstractTokenTransformation, IdentityTokenTransformation - -abstract type AbstractTokenTransformation end -struct IdentityTokenTransformation <: AbstractTokenTransformation end - -""" - transform_unigram(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -For instance, it can be used to apply stemming or any other kind of normalization. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). -""" -transform_unigram(::AbstractTokenTransformation, tok) = tok - -""" - transform_nword(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -For instance, it can be used to apply stemming or any other kind of normalization. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). -""" -transform_nword(::AbstractTokenTransformation, tok) = tok - -""" - transform_qgram(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -For instance, it can be used to apply stemming or any other kind of normalization. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). -""" -transform_qgram(::AbstractTokenTransformation, tok) = tok - -""" - transform_collocation(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). 
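These hook definitions are removed here and recreated, essentially unchanged, in the new src/tokentrans.jl, which also ships two concrete transformations (`IgnoreStopwords` and `ChainTransformation`). A usage sketch of `IgnoreStopwords` (the stop-word list is illustrative):

    sw = IgnoreStopwords(Set(["the", "a", "of"]))
    tc = TextConfig(nlist=[1], tt=sw)
    tokenize(tc, "the art of search")   # stop words are dropped from the unigram stream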
-""" -transform_collocation(::AbstractTokenTransformation, tok) = tok - -""" - transform_skipgram(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -For instance, it can be used to apply stemming or any other kind of normalization. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). -""" -transform_skipgram(::AbstractTokenTransformation, tok) = tok +export TextConfig, Skipgram """ Skipgram(qsize, skip) diff --git a/src/tokentrans.jl b/src/tokentrans.jl new file mode 100644 index 0000000..057658e --- /dev/null +++ b/src/tokentrans.jl @@ -0,0 +1,67 @@ +# This file is a part of TextSearch.jl + +export TextConfig, Skipgram, AbstractTokenTransformation, IdentityTokenTransformation +export IgnoreStopwords, ChainTransformation + +abstract type AbstractTokenTransformation end +struct IdentityTokenTransformation <: AbstractTokenTransformation end + +""" + transform_unigram(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +For instance, it can be used to apply stemming or any other kind of normalization. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_unigram(::AbstractTokenTransformation, tok) = tok + +""" + transform_nword(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +For instance, it can be used to apply stemming or any other kind of normalization. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_nword(::AbstractTokenTransformation, tok) = tok + +""" + transform_qgram(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +For instance, it can be used to apply stemming or any other kind of normalization. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_qgram(::AbstractTokenTransformation, tok) = tok + +""" + transform_collocation(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_collocation(::AbstractTokenTransformation, tok) = tok + +""" + transform_skipgram(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +For instance, it can be used to apply stemming or any other kind of normalization. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_skipgram(::AbstractTokenTransformation, tok) = tok + + +### some transformations + + +struct IgnoreStopwords <: AbstractTokenTransformation + stopwords::Set{String} +end + +function TextSearch.transform_unigram(tt::IgnoreStopwords, tok) + tok in tt.stopwords ? nothing : tok +end + +struct ChainTransformation <: AbstractTokenTransformation + list::AbstractVector{<:AbstractTokenTransformation} +end diff --git a/src/updatevoc.jl b/src/updatevoc.jl index cba4b58..e905749 100644 --- a/src/updatevoc.jl +++ b/src/updatevoc.jl @@ -73,7 +73,7 @@ function merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary, voclist.. 
end sort!(L, by=vocsize, rev=true) - voc = Vocabulary(voc1.textconfig, sum(v.corpuslen for v in L)) + voc = Vocabulary(voc1.lookup, voc1.textconfig, sum(v.corpuslen for v in L)) for v in L update_voc!(pred, voc, v) @@ -88,7 +88,7 @@ end Returns a copy of reduced vocabulary based on evaluating `pred` function for each entry in `voc` """ function filter_tokens(pred::Function, voc::Vocabulary) - V = Vocabulary(voc.textconfig, voc.corpuslen) + V = Vocabulary(voc.lookup, voc.textconfig, voc.corpuslen) for i in eachindex(voc) v = voc[i] diff --git a/src/voc.jl b/src/voc.jl index edf1aae..572ae22 100644 --- a/src/voc.jl +++ b/src/voc.jl @@ -1,10 +1,12 @@ # This file is a part of TextSearch.jl -export Vocabulary, occs, ndocs, token, vocsize, trainsize, filter_tokens, tokenize_and_append!, merge_voc, update_voc!, vocabulary_from_thesaurus, token2id, +export Vocabulary, AbstractTokenLookup, TokenLookup, occs, ndocs, token, vocsize, trainsize, filter_tokens, tokenize_and_append!, merge_voc, update_voc!, vocabulary_from_thesaurus, token2id, encode, decode, table +abstract type AbstractTokenLookup end -struct Vocabulary +struct Vocabulary{TokenLookup<:AbstractTokenLookup} + lookup::TokenLookup textconfig::TextConfig token::Vector{String} occs::Vector{Int32} @@ -13,11 +15,13 @@ struct Vocabulary corpuslen::Int end -token2id(voc::Vocabulary, tok::AbstractString) = get(voc.token2id, tok, zero(UInt32)) +struct TokenLookup <: AbstractTokenLookup +end + +token2id(voc::Vocabulary{TokenLookup}, tok::AbstractString) = get(voc.token2id, tok, zero(UInt32)) -function Vocabulary(voc::Vocabulary; textconfig=voc.textconfig, token=voc.token, occs=voc.occs, ndocs=voc.ndocs, token2id=voc.token2id, corpuslen=voc.corpuslen) - Vocabulary(textconfig, token, occs, ndocs, token2id, corpuslen) - # Vocabulary(voc.deepcopy(voc.textconfig), voc.copy(voc.token), voc.copy(voc.occs), voc.copy(voc.ndocs), voc.copy(voc.token2id), corpuslen) +function Vocabulary(voc::Vocabulary; lookup=voc.lookup, textconfig=voc.textconfig, token=voc.token, occs=voc.occs, ndocs=voc.ndocs, token2id=voc.token2id, corpuslen=voc.corpuslen) + Vocabulary(lookup, textconfig, token, occs, ndocs, token2id, corpuslen) end function decode(voc::Vocabulary, bow::Dict) @@ -34,7 +38,6 @@ end function vocabulary_from_thesaurus(textconfig::TextConfig, tokens::AbstractVector) n = length(tokens) - token2id = Dict{String,UInt32} voc = Vocabulary(textconfig, n) for t in tokens push_token!(voc, t, 1, 1) @@ -48,9 +51,9 @@ end Creates a `Vocabulary` struct """ -function Vocabulary(textconfig::TextConfig, n::Integer) +function Vocabulary(lookup::AbstractTokenLookup, textconfig::TextConfig, n::Integer) # n == 0 means unknown - voc = Vocabulary(textconfig, String[], Int32[], Int32[], Dict{String,UInt32}(), n) + voc = Vocabulary(lookup, textconfig, String[], Int32[], Int32[], Dict{String,UInt32}(), n) vocsize = ceil(Int, n^0.6) # approx based on Heaps law sizehint!(voc.token, vocsize) sizehint!(voc.occs, vocsize) @@ -59,13 +62,15 @@ function Vocabulary(textconfig::TextConfig, n::Integer) voc end +Vocabulary(textconfig::TextConfig, n::Integer) = Vocabulary(TokenLookup(), textconfig, n) + """ Vocabulary(textconfig, corpus; minbatch=0) Computes a vocabulary from a corpus using the TextConfig `textconfig`. 
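The new `lookup` field makes token-id resolution pluggable: the default `TokenLookup` keeps the exact dictionary behaviour, while `QgramsLookup` (exercised by the new test at the end of this patch) falls back to character 3-gram search for out-of-vocabulary tokens. A usage sketch (corpus and query strings are illustrative):

    textconfig = TextConfig(nlist=[1])
    voc = Vocabulary(textconfig, ["la manzana roja", "la manzana verde esta rica"])
    aprox = Vocabulary(voc; lookup=QgramsLookup(voc; maxdist=0.7))
    token2id(aprox, "manzana")    # exact hit: returns the stored id
    token2id(aprox, "mansanas")   # typo: resolved by 3-gram similarity, or 0 if nothing is within maxdist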
""" function vocab_from_small_collection(textconfig::TextConfig, corpus::AbstractVector; minbatch=0) - voc = Vocabulary(textconfig, length(corpus)) + voc = Vocabulary(TokenLookup(), textconfig, length(corpus)) tokenize_and_append!(voc, corpus; minbatch) voc end @@ -75,7 +80,7 @@ function Vocabulary(textconfig::TextConfig, corpusgenerator::Union{Base.EachLine return vocab_from_small_collection(textconfig, corpusgenerator; minbatch) end - voc = Vocabulary(textconfig, 0) + voc = Vocabulary(TokenLookup(), textconfig, 0) len = 0 corpus = String[] sizehint!(corpus, buffsize) @@ -95,7 +100,7 @@ function Vocabulary(textconfig::TextConfig, corpusgenerator::Union{Base.EachLine tokenize_and_append!(voc, corpus; minbatch) end - Vocabulary(voc; corpuslen=len) + Vocabulary(TokenLookup(), voc; corpuslen=len) end function locked_tokenize_and_push(voc, doc, buff, l) diff --git a/test/runtests.jl b/test/runtests.jl index 88ca19a..648722d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -38,10 +38,6 @@ end include("tok.jl") include("voc.jl") -include("vec.jl") - -include("search.jl") -# experimental: -include("semvoc.jl") - +#include("vec.jl") +#include("search.jl") @info "FINISH" diff --git a/test/semvoc.jl b/test/semvoc.jl deleted file mode 100644 index 7b10d06..0000000 --- a/test/semvoc.jl +++ /dev/null @@ -1,16 +0,0 @@ - -@testset "semvoc" begin - textconfig = TextConfig(nlist=[2]) - voc = vocabulary_from_thesaurus(textconfig, _corpus) - semvoc = SemanticVocabulary(voc; textconfig=TextConfig(qlist=[4])) - q = "laz mansanas" - d = vectorize(semvoc, q) - for (k, v) in d - @info voc[k] => v - end - - @test voc[token2id(semvoc, q)].token in ("la manzana verde esta rica", "la manzana roja") - @info bagofwords(semvoc, "la manzana roja es rica") - @info tokenize(semvoc, "la manzana roja es rica, pero la pera es ") -end - diff --git a/test/voc.jl b/test/voc.jl index b0582ff..3dffb66 100644 --- a/test/voc.jl +++ b/test/voc.jl @@ -18,3 +18,13 @@ end @test decode.(Ref(voc), B) == decode.(Ref(voc), C) end +@testset "Approximate vocabulary" begin + textconfig = TextConfig(nlist=[1]) + voc = Vocabulary(textconfig, corpus) + @info corpus + V = Vocabulary(voc; lookup=QgramsLookup(voc)) + @info "===================" + @info V.lookup + #@test decode.(Ref(voc), B) == decode.(Ref(voc), C) +end +