In [1]:
using Pkg
Pkg.activate(".")

[32m[1m  Activating[22m[39m project at `~/JuliaSurvey2023`


In [2]:
#Pkg.instantiate()

In [3]:
#Pkg.resolve()

In [13]:
using SimilaritySearch, TextSearch, Printf

In [5]:
"""
    load_survey(filename)

Read `filename` line by line and group its lines into blocks.

A new block starts at every line that begins with one or more digits followed
by a dot (for example `12.`); that line becomes the first entry of the new
block. Every other line is appended, unmodified, to the current block.

Lines found before the first numbered line are gathered in the first block, so
the first block is empty when the file starts with a numbered line (or is
empty).

Returns a vector of blocks, each block being a vector of the lines it contains.
"""
function load_survey(filename)
    blocks = Vector{String}[String[]]
    for row in eachline(filename)
        # A numbered heading opens a fresh block; anything else extends the current one.
        occursin(r"^\d+\.", row) && push!(blocks, String[])
        push!(last(blocks), row)
    end

    return blocks
end

"""
    preprocess(line)

Remove table/checkbox markup from `line` and trim surrounding whitespace.

In a single `replace` pass the following are deleted: every `|`, every run of
two or more `-`, and every literal `- [ ]`. The result is then passed through
`strip`.
"""
function preprocess(line)
    cleaned = replace(line, "|" => "", r"--+" => "", "- [ ]" => "")
    return strip(cleaned)
end

preprocess (generic function with 1 method)

In [6]:
# Load both surveys as lists of blocks (one block per numbered question, plus
# a leading block for whatever precedes the first numbered line).
L22 = load_survey("survey2022.txt")
L23 = load_survey("JuliaSurvey.en_US.md")

# Cleaned copy of the 2023 blocks: every line goes through `preprocess`.
# The raw `L23` is kept as well because it is written out verbatim later.
L23_ = map(block -> map(preprocess, block), L23)

(length(L22), length(L23))

(42, 44)

In [7]:
# Build the vocabulary over both surveys (2022 blocks followed by the cleaned
# 2023 blocks) with `nlist=[2, 3]`, then keep only the tokens that occur in at
# least 2 documents and in at most 80% of the number of 2023 blocks.
voc = let corpus = vcat(L22, L23_), maxdocs = 0.8 * length(L23_)
    full = Vocabulary(TextConfig(; nlist=[2, 3]), corpus)
    filter_tokens(t -> 2 <= t.ndocs <= maxdocs, full)
end

Vocabulary(TextConfig(true, false, false, true, true, false, false, true, Int8[], Int8[2, 3], Skipgram[], true, IdentityTokenTransformation()), ["not complete\tn", "complete the\tn", "the survey\tn", "survey at\tn", "at this\tn", "this time\tn", "do not complete\tn", "not complete the\tn", "complete the survey\tn", "the survey at\tn"  …  ", pharmaceuticals\tn", "like juliahub\tn", "juliahub to\tn", "to contact\tn", "contact you\tn", "would like juliahub\tn", "like juliahub to\tn", "juliahub to contact\tn", "to contact you\tn", "contact you to\tn"], Int32[2, 2, 9, 2, 2, 2, 2, 2, 2, 2  …  2, 3, 3, 2, 2, 3, 3, 2, 2, 2], Int32[2, 2, 8, 2, 2, 2, 2, 2, 2, 2  …  2, 3, 3, 2, 2, 3, 3, 2, 2, 2], Dict{String, UInt32}("or have not\tn" => 0x00000161, "( released\tn" => 0x000005e8, "classes at\tn" => 0x00000471, "that the julia\tn" => 0x00000387, "vi /\tn" => 0x00000508, "0 ( boston\tn" => 0x0000057d, "saint vincent\tn" => 0x0000072a, "0 questions and\tn" => 0x0000008c, "with the julia\tn" => 0x0000

In [8]:
# Vector model over the filtered vocabulary, using binary weighting both
# globally and locally.
vmodel = let globalw = BinaryGlobalWeighting(), localw = BinaryLocalWeighting()
    VectorModel(globalw, localw, voc)
end

{VectorModel
    global_weighting: BinaryGlobalWeighting()
    local_weighting: BinaryLocalWeighting()
    vocsize: 2276
    trainsize=86
    maxoccs=35                                    
}

In [9]:
# Turn every block into one vector with the model: the 2022 blocks as loaded,
# the 2023 blocks in their cleaned form.
V22 = [vectorize(vmodel, block) for block in L22]
V23 = [vectorize(vmodel, block) for block in L23_]
(length(V22), length(V23))

(42, 44)

In [10]:
# Queries are the 2023 vectors; the searched database holds the 2022 vectors.
Q = VectorDatabase(V23)
E = ExhaustiveSearch(; db=VectorDatabase(V22), dist=CosineDistance())

# For each query fetch its 3 nearest 2022 blocks. Both results are matrices
# with one column per query: neighbour identifiers and their distances.
(knns, dists) = searchbatch(E, Q, 3)

(Int32[1 2 … 40 41; 42 25 … 39 42; 41 8 … 36 1], Float32[0.08699839 0.45003694 … -5.503118f-7 0.18834649; 0.8829829 0.81632644 … 0.59999985 0.29104424; 0.8930936 0.81699634 … 0.7303925 0.8951378])

In [17]:
# For every 2023 block (one column of `knns`/`dists`) create `survey/NN/` and
# write three files into it:
#   scores.txt     - each neighbour id with its distance rounded to 2 digits
#   23.txt         - the raw (unprocessed) 2023 block
#   nearest-22.txt - the closest 2022 block, with empty lines skipped
# `NN` is the zero-padded, two-digit value of `i - 1`.
# NOTE(review): presumably the offset makes NN equal the question number, since
# block 1 holds whatever precedes the first numbered line - confirm.
for (i, (neighbors, distances)) in enumerate(zip(eachcol(knns), eachcol(dists)))
    outdir = joinpath("survey", @sprintf("%02d", i - 1))
    mkpath(outdir)

    open(joinpath(outdir, "scores.txt"), "w") do io
        for (id, d) in zip(neighbors, distances)
            println(io, id, "  ", round(d; digits=2))
        end
    end

    open(joinpath(outdir, "23.txt"), "w") do io
        foreach(row -> println(io, row), L23[i])
    end

    open(joinpath(outdir, "nearest-22.txt"), "w") do io
        closest = L22[first(neighbors)]
        for row in Iterators.filter(!isempty, closest)
            println(io, row)
        end
    end
end
    

In [12]:
# Debug aid: log every token that the vocabulary's text configuration produces
# for the fourth cleaned 2023 block.
let config = voc.textconfig, block = L23_[4]
    for token in tokenize(config, block)
        @info token
    end
end

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m0 how	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mhow much	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mmuch do	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mdo you	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39myou like	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mlike each	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39meach of	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mof the	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mthe following	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mfollowing languages	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mlanguages ?	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m0 how much	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mhow much do	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mmuch do you	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mdo you like	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39myou like each	n
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mlike each of	n
