From de751754eda878136ea604da3a3e030e6d30555b Mon Sep 17 00:00:00 2001 From: "Eric S. Tellez" Date: Mon, 11 Sep 2023 12:10:55 -0600 Subject: [PATCH 1/6] adds collocations --- src/TextSearch.jl | 1 + src/textconfig.jl | 29 +++++++++---- src/tokenize.jl | 46 ++++++++++++++++++-- src/updatevoc.jl | 101 ++++++++++++++++++++++++++++++++++++++++++++ src/vmodel.jl | 5 +++ src/voc.jl | 105 ++++++---------------------------------------- 6 files changed, 184 insertions(+), 103 deletions(-) create mode 100644 src/updatevoc.jl diff --git a/src/TextSearch.jl b/src/TextSearch.jl index 86a0420..721d4e3 100644 --- a/src/TextSearch.jl +++ b/src/TextSearch.jl @@ -68,6 +68,7 @@ include("textconfig.jl") include("normalize.jl") include("tokenize.jl") include("voc.jl") +include("updatevoc.jl") include("tokcorpus.jl") include("bow.jl") include("sparseconversions.jl") diff --git a/src/textconfig.jl b/src/textconfig.jl index 1defbd0..2bc149b 100644 --- a/src/textconfig.jl +++ b/src/textconfig.jl @@ -32,6 +32,14 @@ Return `nothing` to ignore the `tok` occurence (e.g., stop words). """ transform_qgram(::AbstractTokenTransformation, tok) = tok +""" + transform_collocation(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_collocation(::AbstractTokenTransformation, tok) = tok + """ transform_skipgram(::AbstractTokenTransformation, tok) @@ -64,6 +72,7 @@ Base.isequal(a::Skipgram, b::Skipgram) = a.qsize == b.qsize && a.skip == b.skip group_usr::Bool=false, group_emo::Bool=false, lc::Bool=true, + collocations::Int8=0, qlist::Vector=Int8[], nlist::Vector=Int8[], slist::Vector{Skipgram}=Skipgram[], @@ -81,6 +90,11 @@ Defines a preprocessing and tokenization pipeline - `group_usr`: indicates if users (@usr) should be grouped as _usr - `group_emo`: indicates if emojis should be grouped as _emo - `lc`: indicates if the text should be normalized to lower case +- `collocations`: window to expand collocations as tokens, please take into account that: + - 0 => disables collocations + - 1 => will compute words (ignored in favor of use typical unigrams) + - 2 => will compute bigrams (don't use this, but not disabled) + - 3 <= typical values - `qlist`: a list of character q-grams to use - `nlist`: a list of words n-grams to use - `slist`: a list of skip-grams tokenizers to use @@ -98,13 +112,15 @@ Base.@kwdef struct TextConfig group_usr::Bool = false group_emo::Bool = false lc::Bool = true + collocations::Int8 = 0 + mark_token_type::Bool = true qlist::Vector{Int8} = Int8[] nlist::Vector{Int8} = Int8[] slist::Vector{Skipgram} = Skipgram[] - mark_token_type::Bool = true tt::AbstractTokenTransformation = IdentityTokenTransformation() - function TextConfig(del_diac, del_dup, del_punc, group_num, group_url, group_usr, group_emo, lc, qlist, nlist, slist, mark_token_type, tt) + function TextConfig(del_diac::Bool, del_dup::Bool, del_punc::Bool, group_num::Bool, group_url::Bool, group_usr::Bool, group_emo::Bool, lc::Bool, collocations::Integer, + mark_token_type::Bool, qlist::AbstractVector, nlist::AbstractVector, slist::AbstractVector, tt) if length(qlist) == length(nlist) == length(slist) == 0 nlist = [1] end @@ -112,7 +128,7 @@ Base.@kwdef struct TextConfig nlist = sort!(Vector{Int8}(nlist)) slist = sort!(Vector{Skipgram}(slist)) - new(del_diac, del_dup, del_punc, group_num, group_url, group_usr, group_emo, lc, qlist, nlist, slist, mark_token_type, tt) + new(del_diac, del_dup, del_punc, 
group_num, group_url, group_usr, group_emo, lc, collocations, mark_token_type, qlist, nlist, slist, tt) end end @@ -125,16 +141,15 @@ function TextConfig(c::TextConfig; group_usr::Bool=c.group_usr, group_emo::Bool=c.group_emo, lc::Bool=c.lc, + collocations=c.collocations, + mark_token_type=c.mark_token_type, qlist=c.qlist, nlist=c.nlist, slist=c.slist, - mark_token_type=c.mark_token_type, tt::AbstractTokenTransformation=c.tt ) - TextConfig(del_diac, del_dup, del_punc, - group_num, group_url, group_usr, group_emo, - lc, qlist, nlist, slist, mark_token_type, tt) + TextConfig(del_diac, del_dup, del_punc, group_num, group_url, group_usr, group_emo, lc, collocations, mark_token_type, qlist, nlist, slist, tt) end Base.broadcastable(c::TextConfig) = (c,) diff --git a/src/tokenize.jl b/src/tokenize.jl index a3b0093..39b7c53 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -98,7 +98,7 @@ function tokenize_(config::TextConfig, buff::TextSearchBuffer) qgrams(q, buff, config.tt, config.mark_token_type) end - if length(config.nlist) > 0 || length(config.slist) > 0 + if length(config.nlist) > 0 || length(config.slist) > 0 || config.collocations n1 = length(buff.tokens) unigrams(buff, config.tt) # unigrams are always activated if any |nlist| > 0 or |slist| > 0 @@ -113,6 +113,10 @@ function tokenize_(config::TextConfig, buff::TextSearchBuffer) for q in config.slist skipgrams(q, buff, config.tt, config.mark_token_type) end + + if config.collocations > 1 + collocations(config.collocations, buff, config.tt, config.mark_token_type) + end end buff.tokens @@ -173,6 +177,21 @@ function flush_skipgram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation s end +""" + flush_collocations!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) + +Pushes a collocation inside the buffer to the token list; it discards empty strings. 
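A minimal usage sketch of the new `collocations` option (illustrative only; the exact token strings depend on `mark_token_type` and on the `transform_collocation` hook):

    config = TextConfig(collocations=3)   # window of 3; unigram tokenization is enabled as well
    tokenize(config, "the red apple is ripe")
    # yields the usual unigrams plus window-limited word pairs such as
    # "the red", "the apple", "red apple", ...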
+""" +function flush_collocation!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) + buff.io.size == 0 && return nothing + mark_token_type && write(buff.io, '\t', 'c') + s = transform_collocation(tt, String(take!(buff.io))) + s === nothing && return nothing + push!(buff.tokens, s) + s +end + + """ qgrams(q::Integer, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) @@ -233,11 +252,9 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) elseif c == BLANK if p !== BLANK s = flush_unigram!(buff, tt) - #write(buff.io, c) s !== nothing && push!(buff.unigrams, s) end else - ## @show :d write(buff.io, c) end end @@ -267,6 +284,29 @@ function nwords(q::Integer, buff::TextSearchBuffer, tt::AbstractTokenTransformat buff.tokens end + +""" + collocations(q, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) + +Computes a kind of collocations of the given text +""" +function collocations(q::Integer, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) + tokens = buff.unigrams + n = length(tokens) + + for i in 1:n-1 # the upper limit is an implementation detail to discard some entries + for j in i+1:min(i+1+q, n) + write(buff.io, buff.unigrams[i]) + write(buff.io, BLANK) + write(buff.io, buff.unigrams[j]) + flush_collocation!(buff, tt, mark_token_type) + end + end + + buff.tokens +end + + """ skipgrams(q::Skipgram, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type) diff --git a/src/updatevoc.jl b/src/updatevoc.jl new file mode 100644 index 0000000..cba4b58 --- /dev/null +++ b/src/updatevoc.jl @@ -0,0 +1,101 @@ +""" + update_voc!(voc::Vocabulary, another::Vocabulary) + update_voc!(pred::Function, voc::Vocabulary, another::Vocabulary) + +Update `voc` vocabulary using another vocabulary. Optionally a predicate can be given to filter vocabularies. + +Note 1: `corpuslen` remains unchanged (the structure is immutable and a new `Vocabulary` should be created to update this field). +Note 2: Both `voc` and `another` vocabularies should had been created with a _compatible_ [`Textconfig`](@ref) to be able to work on them. +""" +update_voc!(voc::Vocabulary, another::Vocabulary) = update_voc!(t->true, voc, another) + +function update_voc!(pred::Function, voc::Vocabulary, another::Vocabulary) + for i in eachindex(another) + v = another[i] + if pred(v) + push_token!(voc, v.token, v.occs, v.ndocs) + end + end + + voc +end + + +# filtering functions +""" + filter_tokens!(voc::Vocabulary, text::TokenizedText) + +Removes tokens from a given tokenized text based using the valid vocabulary +""" +function filter_tokens!(voc::Vocabulary, text::TokenizedText) + j = 0 + for i in eachindex(text.tokens) + t = text.tokens[i] + if haskey(voc.token2id, t) + j += 1 + text.tokens[j] = t + end + end + + resize!(text.tokens, j) + text +end + +""" + filter_tokens!(voc::Vocabulary, text::TokenizedText) + +Removes tokens from text array +""" +function filter_tokens!(voc::Vocabulary, arr::AbstractVector{TokenizedText}) + for t in arr + filter_tokens!(voc, t) + end + + arr +end + +""" + merge_voc(voc1::Vocabulary, voc2::Vocabulary[, ...]) + merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary[, ...]) + +Merges two or more vocabularies into a new one. A predicate function can be used to filter token entries. + +Note: All vocabularies should had been created with a _compatible_ [`Textconfig`](@ref) to be able to work on them. +""" +merge_voc(voc1::Vocabulary, voc2::Vocabulary, voclist...) 
= merge_voc(x->true, voc1, voc2, voclist...) + +function merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary, voclist...) + #all(v -> v isa Vocabulary, voclist) || throw(ArgumentError("arguments should be of type `Vocabulary`")) + + L = [voc1, voc2] + for v in voclist + push!(L, v) + end + + sort!(L, by=vocsize, rev=true) + voc = Vocabulary(voc1.textconfig, sum(v.corpuslen for v in L)) + + for v in L + update_voc!(pred, voc, v) + end + + voc +end + +""" + filter_tokens(pred::Function, voc::Vocabulary) + +Returns a copy of reduced vocabulary based on evaluating `pred` function for each entry in `voc` +""" +function filter_tokens(pred::Function, voc::Vocabulary) + V = Vocabulary(voc.textconfig, voc.corpuslen) + + for i in eachindex(voc) + v = voc[i] + if pred(v) + push_token!(V, v.token, v.occs, v.ndocs) + end + end + + V +end diff --git a/src/vmodel.jl b/src/vmodel.jl index f0b5212..079dec6 100644 --- a/src/vmodel.jl +++ b/src/vmodel.jl @@ -137,6 +137,10 @@ Base.copy(e::VectorModel; kwargs...) = VectorModel(e::VectorModel; kwargs...) @inline ndocs(model::VectorModel) = ndocs(model.voc) @inline token(model::VectorModel) = token(model.voc) +function totable(model::VectorModel, TableConstructor) + TableConstructor(; token=token(model), ndocs=ndocs(model), occs=occs(model), weight=weight(model)) +end + function Base.getindex(model::VectorModel, tokenID::Integer) id = convert(UInt32, tokenID) voc = model.voc @@ -171,6 +175,7 @@ function filter_tokens(pred::Function, model::VectorModel) VectorModel(model; voc=V, weight=W) end + function vectorize_!(buff::TextSearchBuffer, model::VectorModel{G_,L_}, bow::BOW; normalize=true, minweight=1e-9) where {G_,L_} vec = buff.vec numtokens::Int = 0 diff --git a/src/voc.jl b/src/voc.jl index d192a6c..b13ac10 100644 --- a/src/voc.jl +++ b/src/voc.jl @@ -15,6 +15,17 @@ end token2id(voc::Vocabulary, tok::AbstractString) = get(voc.token2id, tok, zero(UInt32)) +function Vocabulary(voc::Vocabulary) + Vocabulary( + voc.deepcopy(voc.textconfig), + voc.copy(voc.token), + voc.copy(voc.occs), + voc.copy(voc.ndocs), + voc.copy(voc.token2id), + voc.corpuslen + ) +end + function decode(voc::Vocabulary, bow::Dict) Dict(voc.token[k] => v for (k, v) in bow) end @@ -98,8 +109,7 @@ function tokenize_and_append!(voc::Vocabulary, corpus; minbatch=0) l = Threads.SpinLock() n = length(corpus) minbatch = getminbatch(minbatch, n) - - + Threads.@threads for i in 1:n # @batch per=thread minbatch=minbatch for i in 1:n doc = corpus[i] @@ -121,79 +131,6 @@ function tokenize_and_append!(voc::Vocabulary, corpus; minbatch=0) voc end -function filter_tokens!(voc::Vocabulary, text::TokenizedText) - j = 0 - for i in eachindex(text.tokens) - t = text.tokens[i] - if haskey(voc.token2id, t) - j += 1 - text.tokens[j] = t - end - end - - resize!(text.tokens, j) - text -end - -function filter_tokens!(voc::Vocabulary, arr::AbstractVector{TokenizedText}) - for t in arr - filter_tokens!(voc, t) - end - - arr -end - -""" - update_voc!(voc::Vocabulary, another::Vocabulary) - update_voc!(pred::Function, voc::Vocabulary, another::Vocabulary) - -Update `voc` vocabulary using another one. -Optionally a predicate can be given to filter vocabularies. - -Note 1: `corpuslen` remains unchanged (the structure is immutable and a new `Vocabulary` should be created to update this field). -Note 2: Both `voc` and `another` vocabularies should had been created with a _compatible_ [`Textconfig`](@ref) to be able to work on them. 
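These helpers now live in src/updatevoc.jl; a small sketch of the intended usage (the corpus literals are placeholders):

    tc = TextConfig(nlist=[1])
    v1 = Vocabulary(tc, ["a red apple", "a ripe pear"])
    v2 = Vocabulary(tc, ["a green apple", "an apple pie"])
    v = merge_voc(v1, v2)                          # occs/ndocs of shared tokens are accumulated
    frequent = filter_tokens(t -> t.occs >= 2, v)  # copy keeping tokens that occur at least twice
    update_voc!(v1, v2)                            # in-place variant; corpuslen stays unchanged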
-""" -update_voc!(voc::Vocabulary, another::Vocabulary) = update_voc!(t->true, voc, another) - -function update_voc!(pred::Function, voc::Vocabulary, another::Vocabulary) - for i in eachindex(another) - v = another[i] - if pred(v) - push_token!(voc, v.token, v.occs, v.ndocs) - end - end - - voc -end - -""" - merge_voc(voc1::Vocabulary, voc2::Vocabulary[, ...]) - merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary[, ...]) - -Merges two or more vocabularies into a new one. A predicate function can be used to filter token entries. - -Note: All vocabularies should had been created with a _compatible_ [`Textconfig`](@ref) to be able to work on them. -""" -merge_voc(voc1::Vocabulary, voc2::Vocabulary, voclist...) = merge_voc(x->true, voc1, voc2, voclist...) - -function merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary, voclist...) - #all(v -> v isa Vocabulary, voclist) || throw(ArgumentError("arguments should be of type `Vocabulary`")) - - L = [voc1, voc2] - for v in voclist - push!(L, v) - end - - sort!(L, by=vocsize, rev=true) - voc = Vocabulary(voc1.textconfig, sum(v.corpuslen for v in L)) - - for v in L - update_voc!(pred, voc, v) - end - - voc -end - Base.length(voc::Vocabulary) = length(voc.occs) Base.eachindex(voc::Vocabulary) = eachindex(voc.occs) vocsize(voc::Vocabulary) = length(voc) @@ -254,21 +191,3 @@ function Base.getindex(voc::Vocabulary, tokenID::Integer) end end - -""" - filter_tokens(pred::Function, voc::Vocabulary) - -Returns a copy of reduced vocabulary based on evaluating `pred` function for each entry in `voc` -""" -function filter_tokens(pred::Function, voc::Vocabulary) - V = Vocabulary(voc.textconfig, voc.corpuslen) - - for i in eachindex(voc) - v = voc[i] - if pred(v) - push_token!(V, v.token, v.occs, v.ndocs) - end - end - - V -end From 9d3d76edd4d3c96249c1dfe668c567c314744ad4 Mon Sep 17 00:00:00 2001 From: "Eric S. Tellez" Date: Thu, 14 Sep 2023 09:18:25 -0500 Subject: [PATCH 2/6] collocations --- src/tokenize.jl | 63 ++++++++++++++++++++++++++----------------------- src/vmodel.jl | 4 +++- src/voc.jl | 49 ++++++++++++++++++++++++++++---------- 3 files changed, 72 insertions(+), 44 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index 39b7c53..6d6e3aa 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -98,7 +98,7 @@ function tokenize_(config::TextConfig, buff::TextSearchBuffer) qgrams(q, buff, config.tt, config.mark_token_type) end - if length(config.nlist) > 0 || length(config.slist) > 0 || config.collocations + if length(config.nlist) > 0 || length(config.slist) > 0 || config.collocations > 1 n1 = length(buff.tokens) unigrams(buff, config.tt) # unigrams are always activated if any |nlist| > 0 or |slist| > 0 @@ -122,6 +122,19 @@ function tokenize_(config::TextConfig, buff::TextSearchBuffer) buff.tokens end +function push_token_from_transform!(tokens, s::Nothing) +end + +function push_token_from_transform!(tokens, s::AbstractString) + push!(tokens, s) +end + +function push_token_from_transform!(tokens, slist::AbstractVector) + for s in slist + push!(tokens, s) + end +end + """ flush_unigram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation) @@ -130,9 +143,7 @@ Pushes the word inside the buffer to the token list; it discards empty strings. 
function flush_unigram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation) buff.io.size == 0 && return nothing s = transform_unigram(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end """ @@ -144,9 +155,7 @@ function flush_nword!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, m buff.io.size == 0 && return nothing mark_token_type && write(buff.io, '\t', 'n') s = transform_nword(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end """ @@ -158,9 +167,7 @@ function flush_qgram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, m buff.io.size == 0 && return nothing mark_token_type && write(buff.io, '\t', 'q') s = transform_qgram(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end """ @@ -172,9 +179,7 @@ function flush_skipgram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation buff.io.size == 0 && return nothing mark_token_type && write(buff.io, '\t', 's') s = transform_skipgram(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end """ @@ -186,9 +191,7 @@ function flush_collocation!(buff::TextSearchBuffer, tt::AbstractTokenTransformat buff.io.size == 0 && return nothing mark_token_type && write(buff.io, '\t', 'c') s = transform_collocation(tt, String(take!(buff.io))) - s === nothing && return nothing - push!(buff.tokens, s) - s + push_token_from_transform!(buff.tokens, s) end @@ -219,6 +222,7 @@ Performs the word tokenization """ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) n = length(buff.normtext) + mfirst = length(buff.tokens) + 1 # @info buff.normtext @inbounds for i in 2:n # normtext[1] is BLANK c = buff.normtext[i] @@ -227,40 +231,39 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) ## @show i, p, c if ispunct2(c) && !ispunct2(p) && p !== BLANK ## @show :a - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) write(buff.io, c) elseif ispunct2(p) if ispunct2(c) && buff.io.size > 2 - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) write(buff.io, c) elseif !ispunct2(c) && !(p in ('#', '@', '_')) ## @show :b - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) c !== BLANK && write(buff.io, c) else write(buff.io, c) end elseif isemoji(c) - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) write(buff.io, c) - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) elseif c == BLANK if p !== BLANK - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) end else write(buff.io, c) end end - s = flush_unigram!(buff, tt) - s !== nothing && push!(buff.unigrams, s) + flush_unigram!(buff, tt) + mlast = length(buff.tokens) + + for i in mfirst:mlast + push!(buff.unigrams, buff.tokens[i]) + end + buff.tokens end diff --git a/src/vmodel.jl b/src/vmodel.jl index 079dec6..c20fa1d 100644 --- a/src/vmodel.jl +++ b/src/vmodel.jl @@ -137,10 +137,12 @@ Base.copy(e::VectorModel; kwargs...) = VectorModel(e::VectorModel; kwargs...) 
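Worth noting about the `flush_*` rewrite above: through `push_token_from_transform!`, a transformation hook may now return `nothing` (drop the token), a single string, or a vector of strings (emit several tokens). A sketch with a made-up `SynonymExpansion` transformation, not part of this patch:

    struct SynonymExpansion <: AbstractTokenTransformation
        synonyms::Dict{String,Vector{String}}
    end

    # emit the original word together with its synonyms; unknown words pass through unchanged
    function TextSearch.transform_unigram(tt::SynonymExpansion, tok)
        syn = get(tt.synonyms, tok, nothing)
        syn === nothing ? tok : vcat(tok, syn)
    end

    tc = TextConfig(nlist=[1], tt=SynonymExpansion(Dict("car" => ["auto"])))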
@inline ndocs(model::VectorModel) = ndocs(model.voc) @inline token(model::VectorModel) = token(model.voc) -function totable(model::VectorModel, TableConstructor) +function table(model::VectorModel, TableConstructor) TableConstructor(; token=token(model), ndocs=ndocs(model), occs=occs(model), weight=weight(model)) end +Base.getindex(model::VectorModel, token::AbstractString) = model[get(model.voc.token2id, token, 0)] + function Base.getindex(model::VectorModel, tokenID::Integer) id = convert(UInt32, tokenID) voc = model.voc diff --git a/src/voc.jl b/src/voc.jl index b13ac10..edf1aae 100644 --- a/src/voc.jl +++ b/src/voc.jl @@ -1,7 +1,7 @@ # This file is a part of TextSearch.jl export Vocabulary, occs, ndocs, token, vocsize, trainsize, filter_tokens, tokenize_and_append!, merge_voc, update_voc!, vocabulary_from_thesaurus, token2id, - encode, decode, totable + encode, decode, table struct Vocabulary @@ -15,15 +15,9 @@ end token2id(voc::Vocabulary, tok::AbstractString) = get(voc.token2id, tok, zero(UInt32)) -function Vocabulary(voc::Vocabulary) - Vocabulary( - voc.deepcopy(voc.textconfig), - voc.copy(voc.token), - voc.copy(voc.occs), - voc.copy(voc.ndocs), - voc.copy(voc.token2id), - voc.corpuslen - ) +function Vocabulary(voc::Vocabulary; textconfig=voc.textconfig, token=voc.token, occs=voc.occs, ndocs=voc.ndocs, token2id=voc.token2id, corpuslen=voc.corpuslen) + Vocabulary(textconfig, token, occs, ndocs, token2id, corpuslen) + # Vocabulary(voc.deepcopy(voc.textconfig), voc.copy(voc.token), voc.copy(voc.occs), voc.copy(voc.ndocs), voc.copy(voc.token2id), corpuslen) end function decode(voc::Vocabulary, bow::Dict) @@ -34,7 +28,7 @@ function encode(voc::Vocabulary, bow::Dict) Dict(token2id(voc, k) => v for (k, v) in bow) end -function totable(voc::Vocabulary, TableConstructor) +function table(voc::Vocabulary, TableConstructor) TableConstructor(; voc.token, voc.ndocs, voc.occs) end @@ -70,12 +64,40 @@ end Computes a vocabulary from a corpus using the TextConfig `textconfig`. 
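`table` (renamed from `totable`) hands the vocabulary or model columns to any keyword-based table constructor; a sketch assuming DataFrames.jl is loaded (it is not a dependency of this package, and `voc`, `model`, the token "apple", and corpus.txt are placeholders):

    using DataFrames
    table(voc, DataFrame)     # one row per token: token, ndocs, occs
    table(model, DataFrame)   # same columns plus the trained weight
    model["apple"]            # new: index a VectorModel directly by token string

Just below, `Vocabulary` also gains a buffered constructor that consumes any document iterator, e.g. `Vocabulary(textconfig, eachline("corpus.txt"))`, tokenizing `buffsize` documents per block.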
""" -function Vocabulary(textconfig::TextConfig, corpus::AbstractVector; minbatch=0) +function vocab_from_small_collection(textconfig::TextConfig, corpus::AbstractVector; minbatch=0) voc = Vocabulary(textconfig, length(corpus)) tokenize_and_append!(voc, corpus; minbatch) voc end +function Vocabulary(textconfig::TextConfig, corpusgenerator::Union{Base.EachLine,Base.Generator,AbstractVector}; minbatch=0, buffsize=2^16, verbose=true) + if corpusgenerator isa AbstractVector && length(corpusgenerator) <= buffsize + return vocab_from_small_collection(textconfig, corpusgenerator; minbatch) + end + + voc = Vocabulary(textconfig, 0) + len = 0 + corpus = String[] + sizehint!(corpus, buffsize) + for doc in corpusgenerator + push!(corpus, doc) + + if length(corpus) == buffsize + verbose && (@info "computing vocabulary -- advance: $len - buffsize: $buffsize") + len += buffsize + tokenize_and_append!(voc, corpus; minbatch) + empty!(corpus) + end + end + + if length(corpus) > 0 + len += length(corpus) + tokenize_and_append!(voc, corpus; minbatch) + end + + Vocabulary(voc; corpuslen=len) +end + function locked_tokenize_and_push(voc, doc, buff, l) empty!(buff) @@ -112,7 +134,6 @@ function tokenize_and_append!(voc::Vocabulary, corpus; minbatch=0) Threads.@threads for i in 1:n # @batch per=thread minbatch=minbatch for i in 1:n doc = corpus[i] - buff = take!(TEXT_SEARCH_CACHES) try @@ -181,6 +202,8 @@ function Base.getindex(voc::Vocabulary, idlist) [voc[i] for i in itertokenid(idlist)] end +Base.getindex(voc::Vocabulary, token::AbstractString) = voc[get(voc.token2id, token, 0)] + function Base.getindex(voc::Vocabulary, tokenID::Integer) id = convert(UInt32, tokenID) From 299988f4cc951518e4de73bdf5315551fb206c09 Mon Sep 17 00:00:00 2001 From: "Eric S. Tellez" Date: Fri, 15 Sep 2023 10:23:27 -0500 Subject: [PATCH 3/6] removes output --- src/tokenize.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index 6d6e3aa..9ba6a68 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -228,9 +228,7 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) c = buff.normtext[i] p = buff.normtext[i-1] - ## @show i, p, c if ispunct2(c) && !ispunct2(p) && p !== BLANK - ## @show :a flush_unigram!(buff, tt) write(buff.io, c) elseif ispunct2(p) @@ -238,7 +236,6 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) flush_unigram!(buff, tt) write(buff.io, c) elseif !ispunct2(c) && !(p in ('#', '@', '_')) - ## @show :b flush_unigram!(buff, tt) c !== BLANK && write(buff.io, c) else From 76eae3daf662b21be636d3f3433ea71416ffff16 Mon Sep 17 00:00:00 2001 From: "Eric S. 
Tellez" Date: Fri, 15 Sep 2023 15:39:21 -0500 Subject: [PATCH 4/6] unigrams function rewritten; bug fixed that joined words and punctuactions; increases the number of buffers in the cache to allow semvoc work --- src/TextSearch.jl | 3 ++- src/semvocbow.jl | 12 +++++++++++- src/tokenize.jl | 24 ++++++++++++------------ src/vmodel.jl | 2 +- test/runtests.jl | 3 ++- test/tok.jl | 2 +- 6 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/TextSearch.jl b/src/TextSearch.jl index 721d4e3..f7592b5 100644 --- a/src/TextSearch.jl +++ b/src/TextSearch.jl @@ -50,11 +50,12 @@ function Base.empty!(buff::TextSearchBuffer) end function __init__() - for _ in 1:Threads.nthreads() + for _ in 1:2*Threads.nthreads()+4 put!(TEXT_SEARCH_CACHES, TextSearchBuffer()) end end + @inline function textbuffer(f) buff = take!(TEXT_SEARCH_CACHES) try diff --git a/src/semvocbow.jl b/src/semvocbow.jl index 347f71a..e13a189 100644 --- a/src/semvocbow.jl +++ b/src/semvocbow.jl @@ -4,9 +4,13 @@ function vectorize_knns!(D::Dict, model::SemanticVocabulary, tok) klex = model.sel.klex ksem = min(model.sel.ksem, size(model.knns, 1)) + @info "AAAAAA" res = getknnresult(klex) + @info "AAAAAA 1" search(model, tok, res) + @info "AAAAAA 2" sizehint!(D, length(res) * (1 + ksem)) + @info "BBBBBB" for p in res D[p.id] = get(D, p.id, 0f0) + 1f0 @@ -15,7 +19,8 @@ function vectorize_knns!(D::Dict, model::SemanticVocabulary, tok) D[i] = get(D, i, 0f0) + 1f0 end end - + + @info "CCC" D end @@ -31,15 +36,20 @@ function token2id(model::SemanticVocabulary, tok::AbstractString)::UInt32 search(model, tok, res) id = argmin(res)::UInt32 else + @info :first length(TEXT_SEARCH_CACHES.data) buff = take!(TEXT_SEARCH_CACHES) + @info :second length(TEXT_SEARCH_CACHES.data) try empty!(buff.vec) D = buff.vec + @info :third length(TEXT_SEARCH_CACHES.data) vectorize_knns!(D, model, tok) + @info :fourth length(TEXT_SEARCH_CACHES.data) id = length(D) == 0 ? zero(UInt32) : argmax(D) finally put!(TEXT_SEARCH_CACHES, buff) end + @info :finish length(TEXT_SEARCH_CACHES.data) end end diff --git a/src/tokenize.jl b/src/tokenize.jl index 9ba6a68..453fb0a 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -228,27 +228,27 @@ function unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation) c = buff.normtext[i] p = buff.normtext[i-1] - if ispunct2(c) && !ispunct2(p) && p !== BLANK + if c == BLANK + flush_unigram!(buff, tt) + elseif isemoji(c) + # emoji flush_unigram!(buff, tt) write(buff.io, c) + flush_unigram!(buff, tt) elseif ispunct2(p) - if ispunct2(c) && buff.io.size > 2 - flush_unigram!(buff, tt) + # previous char is punct + if ispunct2(c) + # a punctuaction string + buff.io.size >= 3 && flush_unigram!(buff, tt) # a bit large, so we flush and restart the punc string (3 is for most emojis and ...) 
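+                # either way c is written to the buffer next: it extends the punctuation run, or starts a new one right after the flush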
write(buff.io, c) - elseif !ispunct2(c) && !(p in ('#', '@', '_')) - flush_unigram!(buff, tt) - c !== BLANK && write(buff.io, c) else + !(p in ('#', '@', '_')) && flush_unigram!(buff, tt) # current is not punctuaction so we flush if not a meta word write(buff.io, c) end - elseif isemoji(c) + elseif ispunct2(c) && p !== BLANK + ## single punctuaction alone flush_unigram!(buff, tt) write(buff.io, c) - flush_unigram!(buff, tt) - elseif c == BLANK - if p !== BLANK - flush_unigram!(buff, tt) - end else write(buff.io, c) end diff --git a/src/vmodel.jl b/src/vmodel.jl index c20fa1d..d57f8ec 100644 --- a/src/vmodel.jl +++ b/src/vmodel.jl @@ -141,7 +141,7 @@ function table(model::VectorModel, TableConstructor) TableConstructor(; token=token(model), ndocs=ndocs(model), occs=occs(model), weight=weight(model)) end -Base.getindex(model::VectorModel, token::AbstractString) = model[get(model.voc.token2id, token, 0)] +Base.getindex(model::VectorModel, token::AbstractString) = model[token2id(model.voc, token)] function Base.getindex(model::VectorModel, tokenID::Integer) id = convert(UInt32, tokenID) diff --git a/test/runtests.jl b/test/runtests.jl index 304b680..88ca19a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -39,8 +39,9 @@ end include("tok.jl") include("voc.jl") include("vec.jl") -include("search.jl") +include("search.jl") +# experimental: include("semvoc.jl") @info "FINISH" diff --git a/test/tok.jl b/test/tok.jl index 6d60512..8c879e5 100644 --- a/test/tok.jl +++ b/test/tok.jl @@ -48,7 +48,7 @@ function test_equals(a, b) @info :intersection => intersect(a, b) @info :evaluated => a @info :correct => b - error("diff") + error("a difference was found") end @test a == b From 3d352f45d44cce47df44c36abe96dcaae5c22b1a Mon Sep 17 00:00:00 2001 From: "Eric S. Tellez" Date: Thu, 21 Sep 2023 12:32:59 -0500 Subject: [PATCH 5/6] update version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f798d02..bd8f93a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TextSearch" uuid = "7f6f6c8a-3b03-11e9-223d-e7d88259bd6c" authors = ["Eric S. Tellez "] -version = "0.16.2" +version = "0.17.0" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" From 2fe486c031b0e720809674fed7fce79eb8023d53 Mon Sep 17 00:00:00 2001 From: "Eric S. 
Tellez" Date: Fri, 22 Sep 2023 11:10:53 -0500 Subject: [PATCH 6/6] removes semvoc in favor of approxvoc; a lexical approach; semantic approaches should be added outside of this package to kept it simpple --- src/TextSearch.jl | 4 +- src/approxvoc.jl | 50 ++++++++++++++++++ src/io.jl | 17 ------- src/semvoc.jl | 82 ------------------------------ src/semvocbow.jl | 127 ---------------------------------------------- src/textconfig.jl | 49 +----------------- src/tokentrans.jl | 67 ++++++++++++++++++++++++ src/updatevoc.jl | 4 +- src/voc.jl | 29 ++++++----- test/runtests.jl | 8 +-- test/semvoc.jl | 16 ------ test/voc.jl | 10 ++++ 12 files changed, 151 insertions(+), 312 deletions(-) create mode 100644 src/approxvoc.jl delete mode 100644 src/semvoc.jl delete mode 100644 src/semvocbow.jl create mode 100644 src/tokentrans.jl delete mode 100644 test/semvoc.jl diff --git a/src/TextSearch.jl b/src/TextSearch.jl index f7592b5..77394aa 100644 --- a/src/TextSearch.jl +++ b/src/TextSearch.jl @@ -65,6 +65,7 @@ end end end +include("tokentrans.jl") include("textconfig.jl") include("normalize.jl") include("tokenize.jl") @@ -78,8 +79,7 @@ include("emodel.jl") include("bm25.jl") include("bm25invfile.jl") include("bm25invfilesearch.jl") -include("semvoc.jl") -include("semvocbow.jl") +include("approxvoc.jl") include("io.jl") end diff --git a/src/approxvoc.jl b/src/approxvoc.jl new file mode 100644 index 0000000..20fd1de --- /dev/null +++ b/src/approxvoc.jl @@ -0,0 +1,50 @@ +# This file is a part of TextSearch.jl + +export QgramsLookup + +struct QgramsLookup <: AbstractTokenLookup + voc::Vocabulary{TokenLookup} + idx::BinaryInvertedFile + maxdist::Float32 +end + +""" + QgramsLookup( + voc::Vocabulary, + dist::SemiMetric=JaccardDistance(); + maxdist::Real = 0.7, + textconfig=TextConfig(qlist=[3]), + doc_min_freq::Integer=1, # any hard vocabulary pruning are expected to be made in `voc` + doc_max_ratio::AbstractFloat=0.4 # popular tokens are likely to be thrash + ) +""" +function QgramsLookup( + voc::Vocabulary, + dist::SemiMetric=JaccardDistance(); + maxdist::Real = 0.7, + textconfig=TextConfig(qlist=[3]), + doc_min_freq::Integer=1, # any hard vocabulary pruning are expected to be made in `voc` + doc_max_ratio::AbstractFloat=0.4 # popular tokens are likely to be thrash + ) + + voc_ = Vocabulary(textconfig, token(voc)) + voc_ = filter_tokens(voc_) do t + doc_min_freq <= t.ndocs <= doc_max_ratio * vocsize(voc_) + end + + invfile = BinaryInvertedFile(vocsize(voc_), dist) + append_items!(invfile, VectorDatabase(bagofwords_corpus(voc_, token(voc)))) + QgramsLookup(voc_, invfile, maxdist) +end + +function token2id(voc::Vocabulary{QgramsLookup}, tok)::UInt32 + lookup = voc.lookup + i = get(voc.token2id, tok, zero(UInt32)) + i > 0 && return i + tok == "" && return 0 + res = KnnResult(1) + search(lookup.idx, bagofwords(lookup.voc, tok), res) + p = res[1] + p.weight > lookup.maxdist ? 
0 : p.id +end + diff --git a/src/io.jl b/src/io.jl index cd0f6ea..a723efd 100644 --- a/src/io.jl +++ b/src/io.jl @@ -27,13 +27,6 @@ function savemodel(filename::AbstractString, model; meta=nothing, parent="/") end end -function savemodel(file::JLDFile, model::SemanticVocabulary; meta=nothing, parent="/") - file[joinpath(parent, "meta")] = meta - file[joinpath(parent, "voc")] = model.voc - file[joinpath(parent, "knns")] = model.knns - file[joinpath(parent, "sel")] = model.sel - saveindex(file, model.lexidx; parent=joinpath(parent, "lexidx")) -end function loadmodel(t::Type, filename::AbstractString; parent="/", staticgraph=false) jldopen(filename) do f @@ -41,13 +34,3 @@ function loadmodel(t::Type, filename::AbstractString; parent="/", staticgraph=fa end end -function loadmodel(::Type{SemanticVocabulary}, file::JLDFile; parent="/", staticgraph=false) - meta = file[joinpath(parent, "meta")] - voc = file[joinpath(parent, "voc")] - knns = file[joinpath(parent, "knns")] - sel = file[joinpath(parent, "sel")] - lexidx, _ = loadindex(file; parent=joinpath(parent, "lexidx"), staticgraph) - - SemanticVocabulary(voc, lexidx, knns, sel), meta -end - diff --git a/src/semvoc.jl b/src/semvoc.jl deleted file mode 100644 index 79be7b8..0000000 --- a/src/semvoc.jl +++ /dev/null @@ -1,82 +0,0 @@ -# This file is a part of TextSearch.jl - -export SemanticVocabulary, AbstractTokenSelection, SelectCentralToken, SelectAllTokens, subvoc, decode, encode - -struct SemanticVocabulary{SelType} - voc::Vocabulary - lexidx::BM25InvertedFile - knns::Matrix{Int32} - sel::SelType -end - -abstract type AbstractTokenSelection end - -struct SelectCentralToken <: AbstractTokenSelection - klex::Int32 - ksem::Int32 -end - -struct SelectAllTokens <: AbstractTokenSelection - klex::Int32 - ksem::Int32 -end - -SemanticVocabulary(C::SemanticVocabulary; - voc=C.voc, lexidx=C.lexidx, knns=C.knns, sel=C.sel) = - SemanticVocabulary(voc, lexidx, knns, sel) - -vocsize(model::SemanticVocabulary) = vocsize(model.voc) - -function SemanticVocabulary(voc::Vocabulary, sel::AbstractTokenSelection=SelectCentralToken(16, 8); - textconfig::TextConfig=TextConfig(nlist=[1], qlist=[4]), - list_min_length_for_checking::Int=32, - list_max_allowed_length::Int=128, - doc_min_freq::Int=1, # any hard vocabulary pruning are expected to be made in `voc` - doc_max_ratio::AbstractFloat=0.3 # popular tokens are likely to be thrash - ) - doc_max_freq = ceil(Int, doc_max_ratio * vocsize(voc)) - C = tokenize_corpus(textconfig, voc.token) - lexidx = BM25InvertedFile(textconfig, C) do t - doc_min_freq <= t.ndocs <= doc_max_freq - end - - @info "append_items" - @time append_items!(lexidx, C; sort=false) - - #doc_max_freq = ceil(Int, vocsize(voc) * doc_max_ratio) - @info "filter lists!" - @time filter_lists!(lexidx; - list_min_length_for_checking, - list_max_allowed_length, - doc_min_freq, - doc_max_freq, - always_sort=true # we need this since we call append_items! 
without sorting - ) - @info "searchbatch" - @time knns, _ = searchbatch(lexidx, VectorDatabase(C), sel.ksem) - SemanticVocabulary(voc, lexidx, knns, sel) -end - -enrich_bow!(v::Dict, l::Nothing) = v -function enrich_bow!(v::Dict, l) - for (k, w) in l - v[k] = w - end - - v -end - -function search(model::SemanticVocabulary, text, res::KnnResult) - search(model.lexidx, text, res) -end - -function decode(model::SemanticVocabulary, idlist) - [model.voc.token[i] for i in itertokenid(idlist) if i > 0] -end - -Base.getindex(model::SemanticVocabulary, i::Integer) = model.voc[i] - -function subvoc(model::SemanticVocabulary, idlist, tc=model.lexidx.voc.textconfig) - corpus = [model.voc.token[i] for i in itertokenid(idlist)] - Vocabulary(tc, corpus) -end diff --git a/src/semvocbow.jl b/src/semvocbow.jl deleted file mode 100644 index e13a189..0000000 --- a/src/semvocbow.jl +++ /dev/null @@ -1,127 +0,0 @@ -# This file is a part of TextSearch.jl - -function vectorize_knns!(D::Dict, model::SemanticVocabulary, tok) - klex = model.sel.klex - ksem = min(model.sel.ksem, size(model.knns, 1)) - - @info "AAAAAA" - res = getknnresult(klex) - @info "AAAAAA 1" - search(model, tok, res) - @info "AAAAAA 2" - sizehint!(D, length(res) * (1 + ksem)) - @info "BBBBBB" - - for p in res - D[p.id] = get(D, p.id, 0f0) + 1f0 - for i in view(model.knns, 1:ksem, p.id) - i == 0 && break - D[i] = get(D, i, 0f0) + 1f0 - end - end - - @info "CCC" - D -end - -function token2id(model::SemanticVocabulary, tok::AbstractString)::UInt32 - id = token2id(model.voc, tok)::UInt32 - - if id == 0 - klex = model.sel.klex - ksem = min(model.sel.ksem, size(model.knns, 1)) - - if ksem == 0 - res = getknnresult(klex) - search(model, tok, res) - id = argmin(res)::UInt32 - else - @info :first length(TEXT_SEARCH_CACHES.data) - buff = take!(TEXT_SEARCH_CACHES) - @info :second length(TEXT_SEARCH_CACHES.data) - try - empty!(buff.vec) - D = buff.vec - @info :third length(TEXT_SEARCH_CACHES.data) - vectorize_knns!(D, model, tok) - @info :fourth length(TEXT_SEARCH_CACHES.data) - id = length(D) == 0 ? 
zero(UInt32) : argmax(D) - finally - put!(TEXT_SEARCH_CACHES, buff) - end - @info :finish length(TEXT_SEARCH_CACHES.data) - end - end - - id -end - -function tokenize!(model::SemanticVocabulary, tokens::TokenizedText) - for i in eachindex(tokens) - t = tokens[i] - id = token2id(model, t) - tokens[i] = token(model.voc, id) - end - - tokens -end - -function tokenize(model::SemanticVocabulary, text) - tokenize!(model, tokenize(model.voc.textconfig, text)) - -end - -function tokenize_corpus(model::SemanticVocabulary, corpus) - n = length(corpus) - arr = Vector{TokenizedText}(undef, n) - Threads.@threads for i in 1:n - arr[i] = tokenize!(model, tokenize(model.voc.textconfig, corpus[i])) - end - - arr -end - -function bagofwords!(bow::BOW, model::SemanticVocabulary{SelectCentralToken}, tokens::TokenizedText) - for t in tokens - id = token2id(model, t) - bow[id] = get(bow, id, 0) + 1 - end - - bow -end - -function bagofwords(model::SemanticVocabulary, text) - tokens = tokenize(model.voc.textconfig, text) - bow = BOW() - sizehint!(bow, length(tokens)) - bagofwords!(bow, model, tokens) -end - - -function vectorize(model::SemanticVocabulary, text; normalize=true) - klex = model.sel.klex - ksem = min(model.sel.ksem, size(model.knns, 1)) - - res = getknnresult(klex) - search(model, text, res) - D = DVEC{UInt32,Float32}() - sizehint!(D, length(res) * (1 + ksem)) - if ksem == 0 - for p in res - D[p.id] = abs(p.weight) - end - else - for p in res - D[p.id] = get(D, p.id, 0f0) + abs(p.weight) - for i in view(model.knns, 1:ksem, p.id) - i == 0 && break - D[i] = get(D, i, 0f0) + 1f0 - end - end - end - - normalize && normalize!(D) - - D -end - diff --git a/src/textconfig.jl b/src/textconfig.jl index 2bc149b..4f5d817 100644 --- a/src/textconfig.jl +++ b/src/textconfig.jl @@ -1,53 +1,6 @@ # This file is a part of TextSearch.jl -export TextConfig, Skipgram, AbstractTokenTransformation, IdentityTokenTransformation - -abstract type AbstractTokenTransformation end -struct IdentityTokenTransformation <: AbstractTokenTransformation end - -""" - transform_unigram(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -For instance, it can be used to apply stemming or any other kind of normalization. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). -""" -transform_unigram(::AbstractTokenTransformation, tok) = tok - -""" - transform_nword(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -For instance, it can be used to apply stemming or any other kind of normalization. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). -""" -transform_nword(::AbstractTokenTransformation, tok) = tok - -""" - transform_qgram(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -For instance, it can be used to apply stemming or any other kind of normalization. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). -""" -transform_qgram(::AbstractTokenTransformation, tok) = tok - -""" - transform_collocation(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). 
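These hook definitions are removed here and recreated, essentially unchanged, in the new src/tokentrans.jl, which also ships two concrete transformations (`IgnoreStopwords` and `ChainTransformation`). A usage sketch of `IgnoreStopwords` (the stop-word list is illustrative):

    sw = IgnoreStopwords(Set(["the", "a", "of"]))
    tc = TextConfig(nlist=[1], tt=sw)
    tokenize(tc, "the art of search")   # stop words are dropped from the unigram stream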
-""" -transform_collocation(::AbstractTokenTransformation, tok) = tok - -""" - transform_skipgram(::AbstractTokenTransformation, tok) - -Hook applied in the tokenization stage to change the input token `tok` if needed. -For instance, it can be used to apply stemming or any other kind of normalization. -Return `nothing` to ignore the `tok` occurence (e.g., stop words). -""" -transform_skipgram(::AbstractTokenTransformation, tok) = tok +export TextConfig, Skipgram """ Skipgram(qsize, skip) diff --git a/src/tokentrans.jl b/src/tokentrans.jl new file mode 100644 index 0000000..057658e --- /dev/null +++ b/src/tokentrans.jl @@ -0,0 +1,67 @@ +# This file is a part of TextSearch.jl + +export TextConfig, Skipgram, AbstractTokenTransformation, IdentityTokenTransformation +export IgnoreStopwords, ChainTransformation + +abstract type AbstractTokenTransformation end +struct IdentityTokenTransformation <: AbstractTokenTransformation end + +""" + transform_unigram(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +For instance, it can be used to apply stemming or any other kind of normalization. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_unigram(::AbstractTokenTransformation, tok) = tok + +""" + transform_nword(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +For instance, it can be used to apply stemming or any other kind of normalization. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_nword(::AbstractTokenTransformation, tok) = tok + +""" + transform_qgram(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +For instance, it can be used to apply stemming or any other kind of normalization. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_qgram(::AbstractTokenTransformation, tok) = tok + +""" + transform_collocation(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_collocation(::AbstractTokenTransformation, tok) = tok + +""" + transform_skipgram(::AbstractTokenTransformation, tok) + +Hook applied in the tokenization stage to change the input token `tok` if needed. +For instance, it can be used to apply stemming or any other kind of normalization. +Return `nothing` to ignore the `tok` occurence (e.g., stop words). +""" +transform_skipgram(::AbstractTokenTransformation, tok) = tok + + +### some transformations + + +struct IgnoreStopwords <: AbstractTokenTransformation + stopwords::Set{String} +end + +function TextSearch.transform_unigram(tt::IgnoreStopwords, tok) + tok in tt.stopwords ? nothing : tok +end + +struct ChainTransformation <: AbstractTokenTransformation + list::AbstractVector{<:AbstractTokenTransformation} +end diff --git a/src/updatevoc.jl b/src/updatevoc.jl index cba4b58..e905749 100644 --- a/src/updatevoc.jl +++ b/src/updatevoc.jl @@ -73,7 +73,7 @@ function merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary, voclist.. 
end sort!(L, by=vocsize, rev=true) - voc = Vocabulary(voc1.textconfig, sum(v.corpuslen for v in L)) + voc = Vocabulary(voc1.lookup, voc1.textconfig, sum(v.corpuslen for v in L)) for v in L update_voc!(pred, voc, v) @@ -88,7 +88,7 @@ end Returns a copy of reduced vocabulary based on evaluating `pred` function for each entry in `voc` """ function filter_tokens(pred::Function, voc::Vocabulary) - V = Vocabulary(voc.textconfig, voc.corpuslen) + V = Vocabulary(voc.lookup, voc.textconfig, voc.corpuslen) for i in eachindex(voc) v = voc[i] diff --git a/src/voc.jl b/src/voc.jl index edf1aae..572ae22 100644 --- a/src/voc.jl +++ b/src/voc.jl @@ -1,10 +1,12 @@ # This file is a part of TextSearch.jl -export Vocabulary, occs, ndocs, token, vocsize, trainsize, filter_tokens, tokenize_and_append!, merge_voc, update_voc!, vocabulary_from_thesaurus, token2id, +export Vocabulary, AbstractTokenLookup, TokenLookup, occs, ndocs, token, vocsize, trainsize, filter_tokens, tokenize_and_append!, merge_voc, update_voc!, vocabulary_from_thesaurus, token2id, encode, decode, table +abstract type AbstractTokenLookup end -struct Vocabulary +struct Vocabulary{TokenLookup<:AbstractTokenLookup} + lookup::TokenLookup textconfig::TextConfig token::Vector{String} occs::Vector{Int32} @@ -13,11 +15,13 @@ struct Vocabulary corpuslen::Int end -token2id(voc::Vocabulary, tok::AbstractString) = get(voc.token2id, tok, zero(UInt32)) +struct TokenLookup <: AbstractTokenLookup +end + +token2id(voc::Vocabulary{TokenLookup}, tok::AbstractString) = get(voc.token2id, tok, zero(UInt32)) -function Vocabulary(voc::Vocabulary; textconfig=voc.textconfig, token=voc.token, occs=voc.occs, ndocs=voc.ndocs, token2id=voc.token2id, corpuslen=voc.corpuslen) - Vocabulary(textconfig, token, occs, ndocs, token2id, corpuslen) - # Vocabulary(voc.deepcopy(voc.textconfig), voc.copy(voc.token), voc.copy(voc.occs), voc.copy(voc.ndocs), voc.copy(voc.token2id), corpuslen) +function Vocabulary(voc::Vocabulary; lookup=voc.lookup, textconfig=voc.textconfig, token=voc.token, occs=voc.occs, ndocs=voc.ndocs, token2id=voc.token2id, corpuslen=voc.corpuslen) + Vocabulary(lookup, textconfig, token, occs, ndocs, token2id, corpuslen) end function decode(voc::Vocabulary, bow::Dict) @@ -34,7 +38,6 @@ end function vocabulary_from_thesaurus(textconfig::TextConfig, tokens::AbstractVector) n = length(tokens) - token2id = Dict{String,UInt32} voc = Vocabulary(textconfig, n) for t in tokens push_token!(voc, t, 1, 1) @@ -48,9 +51,9 @@ end Creates a `Vocabulary` struct """ -function Vocabulary(textconfig::TextConfig, n::Integer) +function Vocabulary(lookup::AbstractTokenLookup, textconfig::TextConfig, n::Integer) # n == 0 means unknown - voc = Vocabulary(textconfig, String[], Int32[], Int32[], Dict{String,UInt32}(), n) + voc = Vocabulary(lookup, textconfig, String[], Int32[], Int32[], Dict{String,UInt32}(), n) vocsize = ceil(Int, n^0.6) # approx based on Heaps law sizehint!(voc.token, vocsize) sizehint!(voc.occs, vocsize) @@ -59,13 +62,15 @@ function Vocabulary(textconfig::TextConfig, n::Integer) voc end +Vocabulary(textconfig::TextConfig, n::Integer) = Vocabulary(TokenLookup(), textconfig, n) + """ Vocabulary(textconfig, corpus; minbatch=0) Computes a vocabulary from a corpus using the TextConfig `textconfig`. 
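The new `lookup` field makes token-id resolution pluggable: the default `TokenLookup` keeps the exact dictionary behaviour, while `QgramsLookup` (exercised by the new test at the end of this patch) falls back to character 3-gram search for out-of-vocabulary tokens. A usage sketch (corpus and query strings are illustrative):

    textconfig = TextConfig(nlist=[1])
    voc = Vocabulary(textconfig, ["la manzana roja", "la manzana verde esta rica"])
    aprox = Vocabulary(voc; lookup=QgramsLookup(voc; maxdist=0.7))
    token2id(aprox, "manzana")    # exact hit: returns the stored id
    token2id(aprox, "mansanas")   # typo: resolved by 3-gram similarity, or 0 if nothing is within maxdist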
""" function vocab_from_small_collection(textconfig::TextConfig, corpus::AbstractVector; minbatch=0) - voc = Vocabulary(textconfig, length(corpus)) + voc = Vocabulary(TokenLookup(), textconfig, length(corpus)) tokenize_and_append!(voc, corpus; minbatch) voc end @@ -75,7 +80,7 @@ function Vocabulary(textconfig::TextConfig, corpusgenerator::Union{Base.EachLine return vocab_from_small_collection(textconfig, corpusgenerator; minbatch) end - voc = Vocabulary(textconfig, 0) + voc = Vocabulary(TokenLookup(), textconfig, 0) len = 0 corpus = String[] sizehint!(corpus, buffsize) @@ -95,7 +100,7 @@ function Vocabulary(textconfig::TextConfig, corpusgenerator::Union{Base.EachLine tokenize_and_append!(voc, corpus; minbatch) end - Vocabulary(voc; corpuslen=len) + Vocabulary(TokenLookup(), voc; corpuslen=len) end function locked_tokenize_and_push(voc, doc, buff, l) diff --git a/test/runtests.jl b/test/runtests.jl index 88ca19a..648722d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -38,10 +38,6 @@ end include("tok.jl") include("voc.jl") -include("vec.jl") - -include("search.jl") -# experimental: -include("semvoc.jl") - +#include("vec.jl") +#include("search.jl") @info "FINISH" diff --git a/test/semvoc.jl b/test/semvoc.jl deleted file mode 100644 index 7b10d06..0000000 --- a/test/semvoc.jl +++ /dev/null @@ -1,16 +0,0 @@ - -@testset "semvoc" begin - textconfig = TextConfig(nlist=[2]) - voc = vocabulary_from_thesaurus(textconfig, _corpus) - semvoc = SemanticVocabulary(voc; textconfig=TextConfig(qlist=[4])) - q = "laz mansanas" - d = vectorize(semvoc, q) - for (k, v) in d - @info voc[k] => v - end - - @test voc[token2id(semvoc, q)].token in ("la manzana verde esta rica", "la manzana roja") - @info bagofwords(semvoc, "la manzana roja es rica") - @info tokenize(semvoc, "la manzana roja es rica, pero la pera es ") -end - diff --git a/test/voc.jl b/test/voc.jl index b0582ff..3dffb66 100644 --- a/test/voc.jl +++ b/test/voc.jl @@ -18,3 +18,13 @@ end @test decode.(Ref(voc), B) == decode.(Ref(voc), C) end +@testset "Approximate vocabulary" begin + textconfig = TextConfig(nlist=[1]) + voc = Vocabulary(textconfig, corpus) + @info corpus + V = Vocabulary(voc; lookup=QgramsLookup(voc)) + @info "===================" + @info V.lookup + #@test decode.(Ref(voc), B) == decode.(Ref(voc), C) +end +