adds Polyester; support for SimilaritySearch 0.9

sadit · Jun 17, 2022 · 8d60bc8 · 8d60bc8 · sadit · Jun 18, 2022
1 parent b0f086c
commit 8d60bc8
Show file tree

Hide file tree

Showing 5 changed files with 23 additions and 20 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,14 +1,15 @@
 name = "KCenters"
 uuid = "5d8de97f-65f8-4dd6-a15b-0f89c36a43ce"
 authors = ["Eric S. Tellez <donsadit@gmail.com>"]
-version = "0.6.1"
+version = "0.7.0"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLDataUtils = "cc2ba9b6-d476-5e6d-8eaf-a92d5412d41d"
+Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SimilaritySearch = "053f045d-5466-53fd-b400-a066f88fe02a"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
@@ -18,6 +19,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 CategoricalArrays = "0.8, 0.9, 0.10"
 Distances = "0.10"
 MLDataUtils = "0.5"
-SimilaritySearch = "0.8.15"
+SimilaritySearch = "0.9"
 StatsBase = "0.32, 0.33"
+Polyester = "0.6"
 julia = "1.6"
diff --git a/src/KCenters.jl b/src/KCenters.jl
@@ -1,7 +1,8 @@
 # This file is a part of KCenters.jl
 
 module KCenters
-using SimilaritySearch
+using SimilaritySearch, Polyester
+using SimilaritySearch: getminbatch, getknnresult, getpools
 
 include("criterions.jl")
 include("centerselection.jl")

diff --git a/src/clustering.jl b/src/clustering.jl
@@ -132,8 +132,7 @@ function kcenters_(dist::SemiMetric, X::AbstractDatabase, C::AbstractDatabase; s
         end
 
         verbose && println(stderr, "*** computing centroids ***")
-
-        Threads.@threads for i in 1:length(clusters)
+        @batch minbatch=getminbatch(0, length(clusters)) for i in 1:length(clusters)
             plist = clusters[i]
             # CC[i] can be empty because we could be using approximate search
             if length(plist) > 0
@@ -153,11 +152,12 @@ function kcenters_(dist::SemiMetric, X::AbstractDatabase, C::AbstractDatabase; s
     ClusteringData(CC, freqs, compute_dmax(numcenters, codes, distances), codes, distances, err)
 end
 
-function associate_centroids_and_compute_error!(X, index::AbstractSearchContext, codes, distances, counters)
-    pools = SimilaritySearch.getpools(index)
-    Threads.@threads for objID in 1:length(X)
-        res = SimilaritySearch.getknnresult(1, pools)
-        search(index, X[objID], res)
+function associate_centroids_and_compute_error!(X, index::AbstractSearchIndex, codes, distances, counters)
+    pools = getpools(index)
+
+    @batch minbatch=getminbatch(0, length(X)) for objID in 1:length(X)
+        res = getknnresult(1, pools)
+        search(index, X[objID], res; pools)
         codes[objID] = argmin(res)
         distances[objID] = maximum(res)
     end

diff --git a/src/enet.jl b/src/enet.jl
@@ -25,7 +25,7 @@ function enet(dist::SemiMetric, X::AbstractDatabase, stop::Function; verbose=tru
         verbose && println(stderr, "computing fartest point $(length(imaxlist)), dmax: $dmax, imax: $imax, n: $(length(X))")
 
         @inbounds pivot = X[imax]
-        Threads.@threads for i in 1:N
+        @batch minbatch=getminbatch(0, N) for i in 1:N
             @inbounds d = evaluate(dist, X[i], pivot)
             @inbounds nndist[i] = min(nndist[i], d)
         end

diff --git a/src/utils.jl b/src/utils.jl
@@ -3,7 +3,7 @@
 export partition, knr, sequence, invindex
 
 """
-    partition(callback::Function, objects::AbstractVector{T}, refs::AbstractSearchContext; k::Int=1) where T
+    partition(callback::Function, objects::AbstractVector{T}, refs::AbstractSearchIndex; k::Int=1) where T
 
 Groups items in `objects` using a nearest neighbor rule over `refs`.
 The output is controlled using a callback function. The call is performed in `objects` order.
@@ -19,7 +19,7 @@ The output is controlled using a callback function. The call is performed in `ob
 Please note that each object can be related to more than one group when ``k > 1`` (default ``k=1``).
 
 """
-function partition(callback::Function, objects::AbstractVector{T}, refs::AbstractSearchContext; k::Int=1) where T
+function partition(callback::Function, objects::AbstractVector{T}, refs::AbstractSearchIndex; k::Int=1) where T
     res = KnnResult(k)
     for i in 1:length(objects)
         empty!(res)
@@ -28,15 +28,15 @@ function partition(callback::Function, objects::AbstractVector{T}, refs::Abstrac
 end
 
 """
-    invindex(objects::AbstractVector{T}, refs::AbstractSearchContext; k::Int=1) where T
+    invindex(objects::AbstractVector{T}, refs::AbstractSearchIndex; k::Int=1) where T
 
 Creates an inverted index from references to objects.
 So, an object ``u`` is in ``r``'s posting list iff ``r``
 is among the ``k`` nearest references of ``u``.
 
 """
-function invindex(objects::AbstractVector{T}, refs::AbstractSearchContext; k::Int=1) where T
-    π = [Vector{Int}() for i in 1:length(refs.db)]
+function invindex(objects::AbstractVector{T}, refs::AbstractSearchIndex; k::Int=1) where T
+    π = [Vector{Int}() for _ in 1:length(refs.db)]
     # partition((i, p) -> push!(π[p.id], i), dist, objects, refs, k=k)
     partition(objects, refs, k=k) do i, res
         for p in res
@@ -47,11 +47,11 @@ function invindex(objects::AbstractVector{T}, refs::AbstractSearchContext; k::In
 end
 
 """
-    sequence(objects::AbstractVector{T}, refs::AbstractSearchContext) where T
+    sequence(objects::AbstractVector{T}, refs::AbstractSearchIndex) where T
 
 Computes the nearest reference of each item in the dataset and returns them as a sequence of identifiers.
 """
-function sequence(objects::AbstractVector{T}, refs::AbstractSearchContext) where T
+function sequence(objects::AbstractVector{T}, refs::AbstractSearchIndex) where T
     s = Vector{Int}(undef, length(objects))
     partition(objects, refs) do i, res
         s[i] = first(res).id
@@ -60,11 +60,11 @@ function sequence(objects::AbstractVector{T}, refs::AbstractSearchContext) where
 end
 
 """
-    knr(objects::AbstractVector{T}, refs::AbstractSearchContext) where T
+    knr(objects::AbstractVector{T}, refs::AbstractSearchIndex) where T
 
 Computes an array of k-nearest neighbors for `objects`
 """
-function knr(objects::AbstractVector{T}, refs::AbstractSearchContext) where T
+function knr(objects::AbstractVector{T}, refs::AbstractSearchIndex) where T
     s = Vector{Vector{Int}}(undef, length(objects))
     partition(objects, refs) do i, res
         s[i] = [p.id for p in res]