diff --git a/build.sh b/build.sh
index 678df7b35..a283bcd07 100755
--- a/build.sh
+++ b/build.sh
@@ -18,8 +18,8 @@ ARGS=$*
 # scripts, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
-VALIDARGS="clean libcuvs python rust docs tests examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
-HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--build-metrics=<filename>]
+VALIDARGS="clean libcuvs python rust docs tests bench-ann examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
+HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--limit-bench-ann=<targets>] [--build-metrics=<filename>]
  where <target> is:
    clean            - remove all existing build artifacts and configuration (start over)
    libcuvs          - build the cuvs C++ code only. Also builds the C-wrapper library
@@ -28,6 +28,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<t
+   bench-ann        - build end-to-end ann benchmarks
  where <flag> is:
@@ -37,6 +38,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<t
+   --limit-bench-ann=<targets> - semicolon-separated list of ann benchmark targets to compile
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -41,6 +41,7 @@ dependencies:
 - numpy>=1.23,<2.0a0
 - numpydoc
 - nvcc_linux-aarch64=11.8
+- openblas
 - pre-commit
 - pydata-sphinx-theme
 - pylibraft==24.8.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index a0581ff45..e3181d69c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -41,6 +41,7 @@ dependencies:
 - numpy>=1.23,<2.0a0
 - numpydoc
 - nvcc_linux-64=11.8
+- openblas
 - pre-commit
 - pydata-sphinx-theme
 - pylibraft==24.8.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-122_arch-aarch64.yaml b/conda/environments/all_cuda-122_arch-aarch64.yaml
index 9b151e4b7..e22d4388e 100644
--- a/conda/environments/all_cuda-122_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-122_arch-aarch64.yaml
@@ -37,6 +37,7 @@ dependencies:
 - ninja
 - numpy>=1.23,<2.0a0
 - numpydoc
+- openblas
 - pre-commit
 - pydata-sphinx-theme
 - pylibraft==24.8.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index d4b04daf0..08d2f23bb 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -37,6 +37,7 @@ dependencies:
 - ninja
 - numpy>=1.23,<2.0a0
 - numpydoc
+- openblas
 - pre-commit
 - pydata-sphinx-theme
 - pylibraft==24.8.*,>=0.0.0a0
diff --git a/conda/recipes/libcuvs/build_libcuvs_tests.sh b/conda/recipes/libcuvs/build_libcuvs_tests.sh
index fb116a692..5d77ae2d1 100644
--- a/conda/recipes/libcuvs/build_libcuvs_tests.sh
+++ b/conda/recipes/libcuvs/build_libcuvs_tests.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
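+# Also build the new bench-ann target here, so the ANN benchmark binaries are produced
+# alongside the tests (openblas is added to meta.yaml below for the CPU algos they use).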
-./build.sh tests --allgpuarch --no-nvtx --build-metrics=tests_bench --incl-cache-stats
+./build.sh tests bench-ann --allgpuarch --no-nvtx --build-metrics=tests_bench --incl-cache-stats
 cmake --install cpp/build --component testing
diff --git a/conda/recipes/libcuvs/meta.yaml b/conda/recipes/libcuvs/meta.yaml
index d1e8506c8..4ffdc91e4 100644
--- a/conda/recipes/libcuvs/meta.yaml
+++ b/conda/recipes/libcuvs/meta.yaml
@@ -198,6 +198,7 @@ outputs:
         - libraft ={{ minor_version }}
         - {{ pin_subpackage('libcuvs', exact=True) }}
         - cuda-version ={{ cuda_version }}
+        - openblas  # required by some CPU algos in benchmarks
         {% if cuda_major == "11" %}
         - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }}
         - libcublas {{ cuda11_libcublas_host_version }}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index abe6fa7b3..c7b61b92f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -55,6 +55,7 @@ option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON)
 option(BUILD_TESTS "Build cuvs unit-tests" ON)
 option(BUILD_C_LIBRARY "Build raft C API library" OFF)
 option(BUILD_C_TESTS "Build raft C API tests" OFF)
+option(BUILD_ANN_BENCH "Build cuVS ann benchmarks" OFF)
 option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF)
 option(CUDA_ENABLE_LINEINFO
   "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF
@@ -92,6 +93,7 @@ include(CMakeDependentOption)
 message(VERBOSE "cuVS: Build cuVS unit-tests: ${BUILD_TESTS}")
 message(VERBOSE "cuVS: Build CPU only components: ${BUILD_CPU_ONLY}")
+message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_ANN_BENCH}")
 message(VERBOSE "cuVS: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
 message(VERBOSE "cuVS: Disable deprecation warnings " ${DISABLE_DEPRECATION_WARNINGS})
 message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}")
@@ -184,6 +186,11 @@ endif()
 
 include(cmake/thirdparty/get_cutlass.cmake)
 
+if(BUILD_ANN_BENCH)
+  include(${rapids-cmake-dir}/cpm/gbench.cmake)
+  rapids_cpm_gbench(BUILD_STATIC)
+endif()
+
 # ##################################################################################################
 # * cuvs ---------------------------------------------------------------------
@@ -663,3 +670,10 @@ if(BUILD_TESTS OR BUILD_C_TESTS)
   include(internal/CMakeLists.txt)
   include(test/CMakeLists.txt)
 endif()
+
+# ##################################################################################################
+# * build ann benchmark executable -----------------------------------------------
+
+if(BUILD_ANN_BENCH)
+  include(bench/ann/CMakeLists.txt)
+endif()
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
new file mode 100644
index 000000000..6e9e66fad
--- /dev/null
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -0,0 +1,367 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# ##################################################################################################
+# * benchmark options ------------------------------------------------------------------------------
+
+option(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark"
+       ON
+)
+option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT "Include cuVS ivf flat algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ "Include cuVS ivf pq algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_CAGRA "Include cuVS CAGRA in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE "Include cuVS brute force knn in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB "Include cuVS CAGRA with HNSW search in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF)
+option(CUVS_ANN_BENCH_SINGLE_EXE
+       "Make a single executable with benchmark as shared library modules" OFF
+)
+
+# ##################################################################################################
+# * Process options ----------------------------------------------------------
+
+find_package(Threads REQUIRED)
+
+if(BUILD_CPU_ONLY)
+  set(CUVS_FAISS_ENABLE_GPU OFF)
+  set(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT OFF)
+  set(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF)
+  set(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_CAGRA OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB OFF)
+  set(CUVS_ANN_BENCH_USE_GGNN OFF)
+else()
+  set(CUVS_FAISS_ENABLE_GPU ON)
+endif()
+
+set(CUVS_ANN_BENCH_USE_FAISS OFF)
+if(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT
+   OR CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ
+   OR CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT
+   OR CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT
+   OR CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ
+   OR CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT
+)
+  set(CUVS_ANN_BENCH_USE_FAISS ON)
+  set(CUVS_USE_FAISS_STATIC ON)
+endif()
+
+set(CUVS_ANN_BENCH_USE_CUVS OFF)
+if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ
+   OR CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE
+   OR CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT
+   OR CUVS_ANN_BENCH_USE_CUVS_CAGRA
+   OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB
+)
+  set(CUVS_ANN_BENCH_USE_CUVS ON)
+endif()
+
+# ##################################################################################################
+# * Fetch requirements -------------------------------------------------------------
+
+if(CUVS_ANN_BENCH_USE_HNSWLIB OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
+  include(cmake/thirdparty/get_hnswlib.cmake)
+endif()
+
+include(cmake/thirdparty/get_nlohmann_json.cmake)
+
+if(CUVS_ANN_BENCH_USE_GGNN)
+  include(cmake/thirdparty/get_ggnn.cmake)
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS)
+  # We need to ensure that faiss has all the conda information. So we currently use the very ugly
+  # hammer of `link_libraries` to ensure that all targets in this directory and the faiss directory
+  # will have the conda includes/link dirs
+  link_libraries($<TARGET_NAME_IF_EXISTS:conda_env>)
+  include(cmake/thirdparty/get_faiss.cmake)
+endif()
+
+# ##################################################################################################
+# * Enable NVTX if available
+
+# Note: ANN_BENCH wrappers have extra NVTX code not related to raft::nvtx. They track gbench
+# benchmark cases and iterations. This is to make limited NVTX available to all algos, not just
+# raft/cuVS.
+if(TARGET CUDA::nvtx3)
+  set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES})
+  get_target_property(CMAKE_REQUIRED_INCLUDES CUDA::nvtx3 INTERFACE_INCLUDE_DIRECTORIES)
+  unset(NVTX3_HEADERS_FOUND CACHE)
+  # Check the headers explicitly to make sure the cpu-only build succeeds
+  CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND)
+  set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG})
+endif()
+
+# ##################################################################################################
+# * Target function -------------------------------------------------------------
+
+function(ConfigureAnnBench)
+
+  set(oneValueArgs NAME)
+  set(multiValueArgs PATH LINKS CXXFLAGS)
+
+  if(NOT BUILD_CPU_ONLY)
+    set(GPU_BUILD ON)
+  endif()
+
+  cmake_parse_arguments(
+    ConfigureAnnBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}
+  )
+
+  set(BENCH_NAME ${ConfigureAnnBench_NAME}_ANN_BENCH)
+
+  if(CUVS_ANN_BENCH_SINGLE_EXE)
+    add_library(${BENCH_NAME} SHARED ${ConfigureAnnBench_PATH})
+    string(TOLOWER ${BENCH_NAME} BENCH_LIB_NAME)
+    set_target_properties(${BENCH_NAME} PROPERTIES OUTPUT_NAME ${BENCH_LIB_NAME})
+    add_dependencies(${BENCH_NAME} ANN_BENCH)
+  else()
+    add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH})
+    target_compile_definitions(
+      ${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN
+                            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
+    )
+    target_link_libraries(
+      ${BENCH_NAME} PRIVATE benchmark::benchmark $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+    )
+  endif()
+
+  target_link_libraries(
+    ${BENCH_NAME}
+    PRIVATE ${ConfigureAnnBench_LINKS}
+            nlohmann_json::nlohmann_json
+            Threads::Threads
+            $<$<BOOL:${GPU_BUILD}>:CUDA::cudart_static>
+            $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+            $<TARGET_NAME_IF_EXISTS:conda_env>
+            -static-libgcc
+            -static-libstdc++
+  )
+
+  set_target_properties(
+    ${BENCH_NAME}
+    PROPERTIES # set target compile options
+               CXX_STANDARD 17
+               CXX_STANDARD_REQUIRED ON
+               CUDA_STANDARD 17
+               CUDA_STANDARD_REQUIRED ON
+               POSITION_INDEPENDENT_CODE ON
+               INTERFACE_POSITION_INDEPENDENT_CODE ON
+               BUILD_RPATH "\$ORIGIN"
+               INSTALL_RPATH "\$ORIGIN"
+  )
+
+  set(${ConfigureAnnBench_CXXFLAGS} ${CUVS_CXX_FLAGS} ${ConfigureAnnBench_CXXFLAGS})
+
+  target_compile_options(
+    ${BENCH_NAME} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ConfigureAnnBench_CXXFLAGS}>"
+                          "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
+  )
+
+  if(CUVS_ANN_BENCH_USE_${ConfigureAnnBench_NAME})
+    target_compile_definitions(
+      ${BENCH_NAME}
+      PUBLIC
+        CUVS_ANN_BENCH_USE_${ConfigureAnnBench_NAME}=CUVS_ANN_BENCH_USE_${ConfigureAnnBench_NAME}
+    )
+  endif()
+
+  target_include_directories(
+    ${BENCH_NAME}
+    PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+    PRIVATE ${ConfigureAnnBench_INCLUDES}
+  )
+
+  install(
+    TARGETS ${BENCH_NAME}
+    COMPONENT ann_bench
+    DESTINATION bin/ann
+  )
+
+  add_dependencies(CUVS_ANN_BENCH_ALL ${BENCH_NAME})
+endfunction()
+
+# ##################################################################################################
+# * Configure benchmark targets -------------------------------------------------------------
+
+if(NOT TARGET CUVS_ANN_BENCH_ALL)
+  add_custom_target(CUVS_ANN_BENCH_ALL)
+endif()
+
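+# A hypothetical example of wiring in a new wrapper (all names below are illustrative, not part
+# of this change): each call yields either a standalone <NAME>_ANN_BENCH executable or, with
+# CUVS_ANN_BENCH_SINGLE_EXE=ON, a lib<name>_ann_bench.so plugin loaded by the ANN_BENCH launcher:
+#
+#   ConfigureAnnBench(
+#     NAME MY_ALGO PATH bench/ann/src/my_algo/my_algo_benchmark.cpp LINKS my_algo::my_algo
+#   )
+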
+if(CUVS_ANN_BENCH_USE_HNSWLIB)
+  ConfigureAnnBench(
+    NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib
+  )
+
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ)
+  ConfigureAnnBench(
+    NAME CUVS_IVF_PQ PATH bench/ann/src/cuvs/cuvs_benchmark.cu
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_IVF_PQ}>:bench/ann/src/cuvs/cuvs_ivf_pq.cu> LINKS cuvs
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT)
+  ConfigureAnnBench(
+    NAME CUVS_IVF_FLAT PATH bench/ann/src/cuvs/cuvs_benchmark.cu
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT}>:bench/ann/src/cuvs/cuvs_ivf_flat.cu> LINKS cuvs
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE)
+  ConfigureAnnBench(NAME CUVS_BRUTE_FORCE PATH bench/ann/src/cuvs/cuvs_benchmark.cu LINKS cuvs)
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_CAGRA)
+  ConfigureAnnBench(
+    NAME
+    CUVS_CAGRA
+    PATH
+    bench/ann/src/cuvs/cuvs_benchmark.cu
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_CAGRA}>:bench/ann/src/cuvs/cuvs_cagra_float.cu>
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_CAGRA}>:bench/ann/src/cuvs/cuvs_cagra_half.cu>
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_CAGRA}>:bench/ann/src/cuvs/cuvs_cagra_int8_t.cu>
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_CAGRA}>:bench/ann/src/cuvs/cuvs_cagra_uint8_t.cu>
+    LINKS
+    cuvs
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
+  ConfigureAnnBench(
+    NAME CUVS_CAGRA_HNSWLIB PATH bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs
+    hnswlib::hnswlib
+  )
+endif()
+
+set(CUVS_FAISS_TARGETS faiss::faiss)
+if(TARGET faiss::faiss_avx2)
+  set(CUVS_FAISS_TARGETS faiss::faiss_avx2)
+endif()
+
+message("CUVS_FAISS_TARGETS: ${CUVS_FAISS_TARGETS}")
+message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}")
+if(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT)
+  ConfigureAnnBench(
+    NAME FAISS_CPU_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT)
+  ConfigureAnnBench(
+    NAME FAISS_CPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ)
+  ConfigureAnnBench(
+    NAME FAISS_CPU_IVF_PQ PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT)
+  ConfigureAnnBench(
+    NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ)
+  ConfigureAnnBench(
+    NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT)
+  ConfigureAnnBench(
+    NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_GGNN)
+  include(cmake/thirdparty/get_glog.cmake)
+  ConfigureAnnBench(
+    NAME GGNN PATH bench/ann/src/ggnn/ggnn_benchmark.cu LINKS glog::glog ggnn::ggnn CUDA::curand
+  )
+endif()
+
+# ##################################################################################################
+# * Dynamically-loading ANN_BENCH executable -------------------------------------------------------
+if(CUVS_ANN_BENCH_SINGLE_EXE)
+  add_executable(ANN_BENCH bench/ann/src/common/benchmark.cpp)
+
+  # Build and link static version of the GBench to keep ANN_BENCH self-contained.
+  get_target_property(TMP_PROP benchmark::benchmark SOURCES)
+  add_library(benchmark_static STATIC ${TMP_PROP})
+  get_target_property(TMP_PROP benchmark::benchmark INCLUDE_DIRECTORIES)
+  target_include_directories(benchmark_static PUBLIC ${TMP_PROP})
+  get_target_property(TMP_PROP benchmark::benchmark LINK_LIBRARIES)
+  target_link_libraries(benchmark_static PUBLIC ${TMP_PROP})
+
+  target_include_directories(ANN_BENCH PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+
+  target_link_libraries(
+    ANN_BENCH
+    PRIVATE raft::raft
+            nlohmann_json::nlohmann_json
+            benchmark_static
+            dl
+            -static-libgcc
+            fmt::fmt-header-only
+            spdlog::spdlog_header_only
+            -static-libstdc++
+            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+  )
+  set_target_properties(
+    ANN_BENCH
+    PROPERTIES # set target compile options
+               CXX_STANDARD 17
+               CXX_STANDARD_REQUIRED ON
+               CUDA_STANDARD 17
+               CUDA_STANDARD_REQUIRED ON
+               POSITION_INDEPENDENT_CODE ON
+               INTERFACE_POSITION_INDEPENDENT_CODE ON
+               BUILD_RPATH "\$ORIGIN"
+               INSTALL_RPATH "\$ORIGIN"
+  )
+  target_compile_definitions(
+    ANN_BENCH
+    PRIVATE
+      $<$<BOOL:${CUDAToolkit_FOUND}>:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}">
+      $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
+  )
+
+  target_link_options(ANN_BENCH PRIVATE -export-dynamic)
+
+  install(
+    TARGETS ANN_BENCH
+    COMPONENT ann_bench
+    DESTINATION bin/ann
+    EXCLUDE_FROM_ALL
+  )
+endif()
diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
new file mode 100644
index 000000000..4b17885c0
--- /dev/null
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cuda_stub.hpp"  // cudaStream_t
+
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace cuvs::bench {
+
+/** Benchmark mode: measuring latency vs throughput.
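+ *
+ * In latency mode, the times spent on individual batches are accumulated on a single thread;
+ * in throughput mode, several threads pipeline batches concurrently and the end-to-end time is
+ * measured (see the --mode flag handled in benchmark.hpp).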
+ */
+enum class Mode {
+  kThroughput,  // See how many vectors we can push through
+  kLatency      // See how fast we can push a vector through
+};
+
+enum class MemoryType {
+  kHost,
+  kHostMmap,
+  kDevice,
+};
+
+enum class Metric {
+  kInnerProduct,
+  kEuclidean,
+};
+
+inline auto parse_metric(const std::string& metric_str) -> Metric
+{
+  if (metric_str == "inner_product") {
+    return cuvs::bench::Metric::kInnerProduct;
+  } else if (metric_str == "euclidean") {
+    return cuvs::bench::Metric::kEuclidean;
+  } else {
+    throw std::runtime_error("invalid metric: '" + metric_str + "'");
+  }
+}
+
+inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
+{
+  if (memory_type == "host") {
+    return MemoryType::kHost;
+  } else if (memory_type == "mmap") {
+    return MemoryType::kHostMmap;
+  } else if (memory_type == "device") {
+    return MemoryType::kDevice;
+  } else {
+    throw std::runtime_error("invalid memory type: '" + memory_type + "'");
+  }
+}
+
+struct algo_property {
+  MemoryType dataset_memory_type;
+  // neighbors/distances should have same memory type as queries
+  MemoryType query_memory_type;
+};
+
+class algo_base {
+ public:
+  using index_type = int64_t;
+
+  inline algo_base(Metric metric, int dim) : metric_(metric), dim_(dim) {}
+  virtual ~algo_base() noexcept = default;
+
+ protected:
+  Metric metric_;
+  int dim_;
+};
+
+/**
+ * The GPU-based algorithms, which do not perform CPU synchronization at the end of their build or
+ * search methods, must implement this interface.
+ *
+ * The `cuda_timer` / `cuda_lap` from `util.hpp` uses this stream to record GPU times with events
+ * and, if necessary, also synchronize (via events) between iterations.
+ *
+ * If the algo does not implement this interface, GPU timings are disabled.
+ */
+class algo_gpu {
+ public:
+  /**
+   * Return the main cuda stream for this algorithm.
+   * If any work is done in multiple streams, they should synchronize with the main stream at the
+   * end.
+   */
+  [[nodiscard]] virtual auto get_sync_stream() const noexcept -> cudaStream_t = 0;
+  /**
+   * By default a GPU algorithm uses a fixed stream to order GPU operations.
+   * However, an algorithm may need to synchronize with the host at the end of its execution.
+   * In that case, also synchronizing with a benchmark event would put it at a disadvantage.
+   *
+   * We can disable event sync by passing `false` here
+   *   - ONLY IF THE ALGORITHM HAS PRODUCED ITS OUTPUT BY THE TIME IT SYNCHRONIZES WITH CPU.
+   */
+  [[nodiscard]] virtual auto uses_stream() const noexcept -> bool { return true; }
+  virtual ~algo_gpu() noexcept = default;
+};
+
+template <typename T>
+class algo : public algo_base {
+ public:
+  struct search_param {
+    virtual ~search_param() = default;
+    [[nodiscard]] virtual auto needs_dataset() const -> bool { return false; };
+  };
+
+  inline algo(Metric metric, int dim) : algo_base(metric, dim) {}
+  ~algo() noexcept override = default;
+
+  virtual void build(const T* dataset, size_t nrow) = 0;
+
+  virtual void set_search_param(const search_param& param) = 0;
+  // TODO(snanditale): this assumes that an algorithm can always return k results.
+  // This is not always possible.
+  virtual void search(const T* queries,
+                      int batch_size,
+                      int k,
+                      algo_base::index_type* neighbors,
+                      float* distances) const = 0;
+
+  virtual void save(const std::string& file) const = 0;
+  virtual void load(const std::string& file) = 0;
+
+  [[nodiscard]] virtual auto get_preference() const -> algo_property = 0;
+
+  // Some algorithms don't save the building dataset in their indices.
+  // So they should be given the access to that dataset during searching.
+  // The advantage of this way is that the index has a smaller size
+  // and many indices can share one dataset.
+  //
+  // search_param::needs_dataset() of such an algorithm should be true,
+  // and set_search_dataset() should save the passed-in pointer somewhere.
+  // The client code should call set_search_dataset() before searching,
+  // and should not release the dataset before searching is finished.
+  virtual void set_search_dataset(const T* /*dataset*/, size_t /*nrow*/){};
+
+  /**
+   * Make a shallow copy of the algo wrapper that shares the resources and ensures thread-safe
+   * access to them. */
+  virtual auto copy() -> std::unique_ptr<algo<T>> = 0;
+};
+
+}  // namespace cuvs::bench
+
+#define REGISTER_ALGO_INSTANCE(DataT)                                                     \
+  template auto cuvs::bench::create_algo<DataT>(                                          \
+    const std::string&, const std::string&, int, const nlohmann::json&)                   \
+    ->std::unique_ptr<cuvs::bench::algo<DataT>>;                                          \
+  template auto cuvs::bench::create_search_param<DataT>(const std::string&,               \
+                                                        const nlohmann::json&)            \
+    ->std::unique_ptr<typename cuvs::bench::algo<DataT>::search_param>;
diff --git a/cpp/bench/ann/src/common/benchmark.cpp b/cpp/bench/ann/src/common/benchmark.cpp
new file mode 100644
index 000000000..7ec8558a4
--- /dev/null
+++ b/cpp/bench/ann/src/common/benchmark.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+#include "cuda_stub.hpp"  // must go first
+// clang-format on
+
+#include "ann_types.hpp"
+
+#include <dlfcn.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+namespace cuvs::bench {
+
+struct lib_handle {
+  void* handle{nullptr};
+  explicit lib_handle(const std::string& name)
+  {
+    handle = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
+    if (handle == nullptr) {
+      auto error_msg = "Failed to load " + name;
+      auto err       = dlerror();
+      if (err != nullptr && err[0] != '\0') { error_msg += ": " + std::string(err); }
+      throw std::runtime_error(error_msg);
+    }
+  }
+  ~lib_handle() noexcept
+  {
+    if (handle != nullptr) { dlclose(handle); }
+  }
+};
+
+auto load_lib(const std::string& algo) -> void*
+{
+  static std::unordered_map<std::string, lib_handle> libs{};
+  auto found = libs.find(algo);
+
+  if (found != libs.end()) { return found->second.handle; }
+  auto lib_name = "lib" + algo + "_ann_bench.so";
+  return libs.emplace(algo, lib_name).first->second.handle;
+}
+
+/*
+  TODO(achirkin): remove this compatibility layer.
+  When reading old raft algo configs, we may encounter raft_xxx algorithms;
+  they have all been renamed to cuvs_xxx algorithms.
+  This compatibility layer helps using old configs with the new benchmark executable.
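+  For example, an old config entry with "algo": "raft_ivf_pq" is redirected to the
+  "cuvs_ivf_pq" plugin, i.e. libcuvs_ivf_pq_ann_bench.so is loaded instead.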
+ */
+auto load_lib_raft_compat(const std::string& algo) -> void*
+{
+  try {
+    return load_lib(algo);
+  } catch (std::runtime_error& e) {
+    if (algo.rfind("raft", 0) == 0) { return load_lib("cuvs" + algo.substr(4)); }
+    throw e;
+  }
+}
+
+auto get_fun_name(void* addr) -> std::string
+{
+  Dl_info dl_info;
+  if (dladdr(addr, &dl_info) != 0) {
+    if (dl_info.dli_sname != nullptr && dl_info.dli_sname[0] != '\0') {
+      return std::string{dl_info.dli_sname};
+    }
+  }
+  throw std::logic_error("Failed to find out name of the looked up function");
+}
+
+template <typename T>
+auto create_algo(const std::string& algo,
+                 const std::string& distance,
+                 int dim,
+                 const nlohmann::json& conf) -> std::unique_ptr<cuvs::bench::algo<T>>
+{
+  static auto fname = get_fun_name(reinterpret_cast<void*>(&create_algo<T>));
+  auto handle       = load_lib_raft_compat(algo);
+  auto fun_addr     = dlsym(handle, fname.c_str());
+  if (fun_addr == nullptr) {
+    throw std::runtime_error("Couldn't load the create_algo function (" + algo + ")");
+  }
+  auto fun = reinterpret_cast<decltype(&create_algo<T>)>(fun_addr);
+  return fun(algo, distance, dim, conf);
+}
+
+template <typename T>
+std::unique_ptr<typename cuvs::bench::algo<T>::search_param> create_search_param(
+  const std::string& algo, const nlohmann::json& conf)
+{
+  static auto fname = get_fun_name(reinterpret_cast<void*>(&create_search_param<T>));
+  auto handle       = load_lib_raft_compat(algo);
+  auto fun_addr     = dlsym(handle, fname.c_str());
+  if (fun_addr == nullptr) {
+    throw std::runtime_error("Couldn't load the create_search_param function (" + algo + ")");
+  }
+  auto fun = reinterpret_cast<decltype(&create_search_param<T>)>(fun_addr);
+  return fun(algo, conf);
+}
+
+};  // namespace cuvs::bench
+
+REGISTER_ALGO_INSTANCE(float);
+REGISTER_ALGO_INSTANCE(std::int8_t);
+REGISTER_ALGO_INSTANCE(std::uint8_t);
+
+#include "benchmark.hpp"
+
+auto main(int argc, char** argv) -> int { return cuvs::bench::run_main(argc, argv); }
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
new file mode 100644
index 000000000..e6462c157
--- /dev/null
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "ann_types.hpp"
+#include "conf.hpp"
+#include "dataset.hpp"
+#include "util.hpp"
+
+#include <benchmark/benchmark.h>
+#include <nlohmann/json.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <fstream>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <tuple>
+#include <vector>
+
+namespace cuvs::bench {
+
+static inline std::unique_ptr<algo_base> current_algo{nullptr};
+static inline std::unique_ptr<algo_property> current_algo_props{nullptr};
+
+using kv_series = std::vector<std::tuple<std::string, std::vector<nlohmann::json>>>;
+
+inline auto apply_overrides(const std::vector<nlohmann::json>& configs,
+                            const kv_series& overrides,
+                            std::size_t override_idx = 0) -> std::vector<nlohmann::json>
+{
+  std::vector<nlohmann::json> results{};
+  if (override_idx >= overrides.size()) {
+    auto n = configs.size();
+    for (size_t i = 0; i < n; i++) {
+      auto c               = configs[i];
+      c["override_suffix"] = n > 1 ? "/" + std::to_string(i) : "";
+      results.push_back(c);
+    }
+    return results;
+  }
+  auto rec_configs = apply_overrides(configs, overrides, override_idx + 1);
+  auto [key, vals] = overrides[override_idx];
+  auto n           = vals.size();
+  for (size_t i = 0; i < n; i++) {
+    const auto& val = vals[i];
+    for (auto rc : rec_configs) {
+      if (n > 1) {
+        rc["override_suffix"] =
+          static_cast<std::string>(rc["override_suffix"]) + "/" + std::to_string(i);
+      }
+      rc[key] = val;
+      results.push_back(rc);
+    }
+  }
+  return results;
+}
+
+inline auto apply_overrides(const nlohmann::json& config,
+                            const kv_series& overrides,
+                            std::size_t override_idx = 0)
+{
+  return apply_overrides(std::vector<nlohmann::json>{config}, overrides, 0);
+}
+
+inline void dump_parameters(::benchmark::State& state, nlohmann::json params)
+{
+  std::string label = "";
+  bool label_empty  = true;
+  for (auto& [key, val] : params.items()) {
+    if (val.is_number()) {
+      state.counters.insert({{key, val}});
+    } else if (val.is_boolean()) {
+      state.counters.insert({{key, val ? 1.0 : 0.0}});
+    } else {
+      auto kv = key + "=" + val.dump();
+      if (label_empty) {
+        label = kv;
+      } else {
+        label += "#" + kv;
+      }
+      label_empty = false;
+    }
+  }
+  if (!label_empty) { state.SetLabel(label); }
+}
+
+inline auto parse_algo_property(algo_property prop, const nlohmann::json& conf) -> algo_property
+{
+  if (conf.contains("dataset_memory_type")) {
+    prop.dataset_memory_type = parse_memory_type(conf.at("dataset_memory_type"));
+  }
+  if (conf.contains("query_memory_type")) {
+    prop.query_memory_type = parse_memory_type(conf.at("query_memory_type"));
+  }
+  return prop;
+};
+
+template <typename T>
+void bench_build(::benchmark::State& state,
+                 std::shared_ptr<const dataset<T>> dataset,
+                 configuration::index index,
+                 bool force_overwrite)
+{
+  // NB: these two thread-local vars can be used within algo wrappers
+  cuvs::bench::benchmark_thread_id = state.thread_index();
+  cuvs::bench::benchmark_n_threads = state.threads();
+  dump_parameters(state, index.build_param);
+  if (file_exists(index.file)) {
+    if (force_overwrite) {
+      log_info("Overwriting file: %s", index.file.c_str());
+    } else {
+      return state.SkipWithMessage(
+        "Index file already exists (use --force to overwrite the index).");
+    }
+  }
+
+  std::unique_ptr<algo<T>> algo;
+  try {
+    algo = create_algo<T>(index.algo, dataset->distance(), dataset->dim(), index.build_param);
+  } catch (const std::exception& e) {
+    return state.SkipWithError("Failed to create an algo: " + std::string(e.what()));
+  }
+
+  const auto algo_property = parse_algo_property(algo->get_preference(), index.build_param);
+
+  const T* base_set      = dataset->base_set(algo_property.dataset_memory_type);
+  std::size_t index_size = dataset->base_set_size();
+
+  cuda_timer gpu_timer{algo};
+  {
+    nvtx_case nvtx{state.name()};
+    for (auto _ : state) {
+      [[maybe_unused]] auto ntx_lap = nvtx.lap();
+      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+      try {
+        algo->build(base_set, index_size);
+      } catch (const std::exception& e) {
+        state.SkipWithError(std::string(e.what()));
+      }
+    }
+  }
+  if (gpu_timer.active()) {
+    state.counters.insert({"GPU", {gpu_timer.total_time(), benchmark::Counter::kAvgIterations}});
+  }
+  state.counters.insert({{"index_size", index_size}});
+
+  if (state.skipped()) { return; }
+  make_sure_parent_dir_exists(index.file);
+  algo->save(index.file);
+}
+
+template <typename T>
+void bench_search(::benchmark::State& state,
+                  configuration::index index,
+                  std::size_t search_param_ix,
+                  std::shared_ptr<const dataset<T>> dataset)
+{
+  // NB: these two thread-local vars can be used within algo wrappers
+  cuvs::bench::benchmark_thread_id = state.thread_index();
+  cuvs::bench::benchmark_n_threads = state.threads();
+  std::size_t queries_processed = 0;
+
+  const auto& sp_json = index.search_params[search_param_ix];
+
+  if (state.thread_index() == 0) { dump_parameters(state, sp_json); }
+
+  // NB: `k` and `n_queries` are guaranteed to be populated in conf.cpp
+  const std::uint32_t k = sp_json["k"];
+  // Amount of data processed in one go
+  const std::size_t n_queries = sp_json["n_queries"];
+  // Round down the query data to a multiple of the batch size to loop over full batches of data
+  const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries;
+
+  if (dataset->query_set_size() < n_queries) {
+    std::stringstream msg;
+    msg << "Not enough queries in benchmark set. Expected " << n_queries << ", actual "
+        << dataset->query_set_size();
+    state.SkipWithError(msg.str());
+    return;
+  }
+
+  // Each thread starts from a different offset, so that the queries that they process do not
+  // overlap.
+  std::ptrdiff_t batch_offset   = (state.thread_index() * n_queries) % query_set_size;
+  std::ptrdiff_t queries_stride = state.threads() * n_queries;
+  // Output is saved into a contiguous buffer (separate buffers for each thread).
+  std::ptrdiff_t out_offset = 0;
+
+  const T* query_set = nullptr;
+
+  if (!file_exists(index.file)) {
+    state.SkipWithError("Index file is missing. Run the benchmark in the build mode first.");
+    return;
+  }
+
+  /**
+   * Make sure the first thread loads the algo and dataset
+   */
+  progress_barrier load_barrier{};
+  if (load_barrier.arrive(1) == 0) {
+    // algo is static to cache it between close search runs to save time on index loading
+    static std::string index_file = "";
+    if (index.file != index_file) {
+      current_algo.reset();
+      index_file = index.file;
+    }
+
+    std::unique_ptr<typename algo<T>::search_param> search_param;
+    algo<T>* a;
+    try {
+      if (!current_algo || (a = dynamic_cast<algo<T>*>(current_algo.get())) == nullptr) {
+        auto ualgo =
+          create_algo<T>(index.algo, dataset->distance(), dataset->dim(), index.build_param);
+        a = ualgo.get();
+        a->load(index_file);
+        current_algo = std::move(ualgo);
+      }
+      search_param = create_search_param<T>(index.algo, sp_json);
+    } catch (const std::exception& e) {
+      state.SkipWithError("Failed to create an algo: " + std::string(e.what()));
+      return;
+    }
+
+    current_algo_props = std::make_unique<algo_property>(
+      std::move(parse_algo_property(a->get_preference(), sp_json)));
+
+    if (search_param->needs_dataset()) {
+      try {
+        a->set_search_dataset(dataset->base_set(current_algo_props->dataset_memory_type),
+                              dataset->base_set_size());
+      } catch (const std::exception& ex) {
+        state.SkipWithError("The algorithm '" + index.name +
+                            "' requires the base set, but it's not available. " +
+                            "Exception: " + std::string(ex.what()));
+        return;
+      }
+    }
+    try {
+      a->set_search_param(*search_param);
+    } catch (const std::exception& ex) {
+      state.SkipWithError("An error occurred setting search parameters: " +
+                          std::string(ex.what()));
+      return;
+    }
+
+    query_set = dataset->query_set(current_algo_props->query_memory_type);
+    load_barrier.arrive(state.threads());
+  } else {
+    // All other threads will wait for the first thread to initialize the algo.
+    load_barrier.wait(state.threads() * 2);
+    // gbench ensures that all threads are synchronized at the start of the benchmark loop.
+    // We are accessing shared variables (like current_algo, current_algo_props) before the
+    // benchmark loop, therefore the synchronization here is necessary.
+  }
+  query_set = dataset->query_set(current_algo_props->query_memory_type);
+
+  /**
+   * Each thread will manage its own outputs
+   */
+  using index_type             = algo_base::index_type;
+  constexpr size_t kAlignResultBuf = 64;
+  size_t result_elem_count = k * query_set_size;
+  result_elem_count =
+    ((result_elem_count + kAlignResultBuf - 1) / kAlignResultBuf) * kAlignResultBuf;
+  auto& result_buf =
+    get_result_buffer_from_global_pool(result_elem_count * (sizeof(float) + sizeof(index_type)));
+  auto* neighbors_ptr =
+    reinterpret_cast<index_type*>(result_buf.data(current_algo_props->query_memory_type));
+  auto* distances_ptr = reinterpret_cast<float*>(neighbors_ptr + result_elem_count);
+
+  {
+    nvtx_case nvtx{state.name()};
+
+    std::unique_ptr<algo<T>> a{nullptr};
+    try {
+      dynamic_cast<algo<T>*>(current_algo.get())->copy().swap(a);
+    } catch (const std::exception& e) {
+      state.SkipWithError("Algo::copy: " + std::string(e.what()));
+      return;
+    }
+    // Initialize with algo, so that the timer.lap() object can sync with algo::get_sync_stream()
+    cuda_timer gpu_timer{a};
+    auto start = std::chrono::high_resolution_clock::now();
+    for (auto _ : state) {
+      [[maybe_unused]] auto ntx_lap = nvtx.lap();
+      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+      try {
+        a->search(query_set + batch_offset * dataset->dim(),
+                  n_queries,
+                  k,
+                  neighbors_ptr + out_offset * k,
+                  distances_ptr + out_offset * k);
+      } catch (const std::exception& e) {
+        state.SkipWithError("Benchmark loop: " + std::string(e.what()));
+        break;
+      }
+
+      // advance to the next batch
+      batch_offset = (batch_offset + queries_stride) % query_set_size;
+      out_offset   = (out_offset + n_queries) % query_set_size;
+
+      queries_processed += n_queries;
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+      std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
+    if (state.thread_index() == 0) { state.counters.insert({{"end_to_end", duration}}); }
+    state.counters.insert({"Latency", {duration, benchmark::Counter::kAvgIterations}});
+
+    if (gpu_timer.active()) {
+      state.counters.insert({"GPU", {gpu_timer.total_time(), benchmark::Counter::kAvgIterations}});
+    }
+  }
+
+  state.SetItemsProcessed(queries_processed);
+
+  // This will be the total number of queries across all threads
+  state.counters.insert({{"total_queries", queries_processed}});
+
+  if (state.skipped()) { return; }
+
+  // Each thread calculates recall on their partition of queries.
+  // evaluate recall
+  if (dataset->max_k() >= k) {
+    const std::int32_t* gt    = dataset->gt_set();
+    const std::uint32_t max_k = dataset->max_k();
+    result_buf.transfer_data(MemoryType::kHost, current_algo_props->query_memory_type);
+    auto* neighbors_host = reinterpret_cast<index_type*>(result_buf.data(MemoryType::kHost));
+    std::size_t rows        = std::min(queries_processed, query_set_size);
+    std::size_t match_count = 0;
+    std::size_t total_count = rows * static_cast<size_t>(k);
+
+    // We go through the groundtruth with same stride as the benchmark loop.
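+    // Here, i_orig_idx walks the ground-truth rows in the original query order, while i_out_idx
+    // walks this thread's contiguous output buffer; a result that appears anywhere among the
+    // first k ground-truth neighbors counts as a match.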
+    size_t out_offset   = 0;
+    size_t batch_offset = (state.thread_index() * n_queries) % query_set_size;
+    while (out_offset < rows) {
+      for (std::size_t i = 0; i < n_queries; i++) {
+        size_t i_orig_idx = batch_offset + i;
+        size_t i_out_idx  = out_offset + i;
+        if (i_out_idx < rows) {
+          for (std::uint32_t j = 0; j < k; j++) {
+            auto act_idx = static_cast<std::int32_t>(neighbors_host[i_out_idx * k + j]);
+            for (std::uint32_t l = 0; l < k; l++) {
+              auto exp_idx = gt[i_orig_idx * max_k + l];
+              if (act_idx == exp_idx) {
+                match_count++;
+                break;
+              }
+            }
+          }
+        }
+      }
+      out_offset += n_queries;
+      batch_offset = (batch_offset + queries_stride) % query_set_size;
+    }
+    double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
+    state.counters.insert({"Recall", {actual_recall, benchmark::Counter::kAvgThreads}});
+  }
+}
+
+inline void printf_usage()
+{
+  ::benchmark::PrintDefaultHelp();
+  fprintf(stdout,
+          "          [--build|--search] \n"
+          "          [--force]\n"
+          "          [--data_prefix=<prefix>]\n"
+          "          [--index_prefix=<prefix>]\n"
+          "          [--override_kv=<key:value1:value2:...:valueN>]\n"
+          "          [--mode=<latency|throughput>]\n"
+          "          [--threads=min[:max]]\n"
+          "          <conf>.json\n"
+          "\n"
+          "Note the non-standard benchmark parameters:\n"
+          "  --build: build mode, will build index\n"
+          "  --search: search mode, will search using the built index\n"
+          "           one and only one of --build and --search should be specified\n"
+          "  --force: force overwriting existing index files\n"
+          "  --data_prefix=<prefix>:"
+          " prepend <prefix> to dataset file paths specified in the <conf>.json (default = "
+          "'data/').\n"
+          "  --index_prefix=<prefix>:"
+          " prepend <prefix> to index file paths specified in the <conf>.json (default = "
+          "'index/').\n"
+          "  --override_kv=<key:value1:value2:...:valueN>:"
+          " override a build/search key one or more times multiplying the number of configurations;"
+          " you can use this parameter multiple times to get the Cartesian product of benchmark"
+          " configs.\n"
+          "  --mode=<latency|throughput>"
+          " run the benchmarks in latency (accumulate times spent in each batch) or "
+          " throughput (pipeline batches and measure end-to-end) mode\n"
+          "  --threads=min[:max] specify the number of threads to use for throughput benchmark."
+          " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified,"
+          " then a single test is run with 'min' threads. By default min=1, max=<num. of CPU "
+          "cores>.\n");
+}
+
+template <typename T>
+void register_build(std::shared_ptr<const dataset<T>> dataset,
+                    std::vector<configuration::index> indices,
+                    bool force_overwrite)
+{
+  for (auto index : indices) {
+    auto suf      = static_cast<std::string>(index.build_param["override_suffix"]);
+    auto file_suf = suf;
+    index.build_param.erase("override_suffix");
+    std::replace(file_suf.begin(), file_suf.end(), '/', '-');
+    index.file += file_suf;
+    auto* b = ::benchmark::RegisterBenchmark(
+      index.name + suf, bench_build<T>, dataset, index, force_overwrite);
+    b->Unit(benchmark::kSecond);
+    b->MeasureProcessCPUTime();
+    b->UseRealTime();
+  }
+}
+
+template <typename T>
+void register_search(std::shared_ptr<const dataset<T>> dataset,
+                     std::vector<configuration::index> indices,
+                     Mode metric_objective,
+                     const std::vector<int>& threads)
+{
+  for (auto index : indices) {
+    for (std::size_t i = 0; i < index.search_params.size(); i++) {
+      auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
+      index.search_params[i].erase("override_suffix");
+
+      auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search<T>, index, i, dataset)
+                  ->Unit(benchmark::kMillisecond)
+                  /**
+                   * The following are important for getting accurate QPS measurements on both CPU
+                   * and GPU. These make sure that
+                   *   - `end_to_end` ~ (`Time` * `Iterations`)
+                   *   - `items_per_second` ~ (`total_queries` / `end_to_end`)
+                   *   - Throughput = `items_per_second`
+                   */
+                  ->MeasureProcessCPUTime()
+                  ->UseRealTime();
+
+      if (metric_objective == Mode::kThroughput) { b->ThreadRange(threads[0], threads[1]); }
+    }
+  }
+}
+
+template <typename T>
+void dispatch_benchmark(const configuration& conf,
+                        bool force_overwrite,
+                        bool build_mode,
+                        bool search_mode,
+                        std::string data_prefix,
+                        std::string index_prefix,
+                        kv_series override_kv,
+                        Mode metric_objective,
+                        const std::vector<int>& threads)
+{
+  if (cudart.found()) {
+    for (auto [key, value] : cuda_info()) {
+      ::benchmark::AddCustomContext(key, value);
+    }
+  }
+  const auto dataset_conf = conf.get_dataset_conf();
+  auto base_file          = combine_path(data_prefix, dataset_conf.base_file);
+  auto query_file         = combine_path(data_prefix, dataset_conf.query_file);
+  auto gt_file            = dataset_conf.groundtruth_neighbors_file;
+  if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); }
+  auto dataset = std::make_shared<bench::dataset<T>>(dataset_conf.name,
+                                                     base_file,
+                                                     dataset_conf.subset_first_row,
+                                                     dataset_conf.subset_size,
+                                                     query_file,
+                                                     dataset_conf.distance,
+                                                     gt_file);
+  ::benchmark::AddCustomContext("dataset", dataset_conf.name);
+  ::benchmark::AddCustomContext("distance", dataset_conf.distance);
+  std::vector<configuration::index> indices = conf.get_indices();
+  if (build_mode) {
+    if (file_exists(base_file)) {
+      log_info("Using the dataset file '%s'", base_file.c_str());
+      ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size()));
+      ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
+    } else {
+      log_warn("dataset file '%s' does not exist; benchmarking index building is impossible.",
+               base_file.c_str());
+    }
+    std::vector<configuration::index> more_indices{};
+    for (auto& index : indices) {
+      for (auto param : apply_overrides(index.build_param, override_kv)) {
+        auto modified_index        = index;
+        modified_index.build_param = param;
+        modified_index.file        = combine_path(index_prefix, modified_index.file);
+        more_indices.push_back(modified_index);
+      }
+    }
+    register_build<T>(dataset, more_indices, force_overwrite);
+  } else if (search_mode) {
+    if (file_exists(query_file)) {
+      log_info("Using the query file '%s'", query_file.c_str());
+      ::benchmark::AddCustomContext("max_n_queries", std::to_string(dataset->query_set_size()));
+      ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
+      if (gt_file.has_value()) {
+        if (file_exists(*gt_file)) {
+          log_info("Using the ground truth file '%s'", gt_file->c_str());
+          ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k()));
+        } else {
+          log_warn("Ground truth file '%s' does not exist; the recall won't be reported.",
+                   gt_file->c_str());
+        }
+      } else {
+        log_warn(
+          "Ground truth file is not provided; the recall won't be reported. NB: use "
+          "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the "
+          "path to "
+          "the ground truth in your conf.json.");
+      }
+    } else {
+      log_warn("Query file '%s' does not exist; benchmarking search is impossible.",
+               query_file.c_str());
+    }
+    for (auto& index : indices) {
+      index.search_params = apply_overrides(index.search_params, override_kv);
+      index.file          = combine_path(index_prefix, index.file);
+    }
+    register_search<T>(dataset, indices, metric_objective, threads);
+  }
+}
+
+inline auto parse_bool_flag(const char* arg, const char* pat, bool& result) -> bool
+{
+  if (strcmp(arg, pat) == 0) {
+    result = true;
+    return true;
+  }
+  return false;
+}
+
+inline auto parse_string_flag(const char* arg, const char* pat, std::string& result) -> bool
+{
+  auto n = strlen(pat);
+  if (strncmp(pat, arg, strlen(pat)) == 0) {
+    result = arg + n + 1;
+    return true;
+  }
+  return false;
+}
+
+inline auto run_main(int argc, char** argv) -> int
+{
+  bool force_overwrite        = false;
+  bool build_mode             = false;
+  bool search_mode            = false;
+  std::string data_prefix     = "data";
+  std::string index_prefix    = "index";
+  std::string new_override_kv = "";
+  std::string mode            = "latency";
+  std::string threads_arg_txt = "";
+  std::vector<int> threads    = {1, -1};  // min_thread, max_thread
+  std::string log_level_str   = "";
+  [[maybe_unused]] int raft_log_level = 0;  // raft::logger::get(RAFT_NAME).get_level();
+  kv_series override_kv{};
+
+  char arg0_default[] = "benchmark";  // NOLINT
+  char* args_default  = arg0_default;
+  if (!argv) {
+    argc = 1;
+    argv = &args_default;
+  }
+  if (argc == 1) {
+    printf_usage();
+    return -1;
+  }
+
+  char* conf_path = argv[--argc];
+  std::ifstream conf_stream(conf_path);
+
+  for (int i = 1; i < argc; i++) {
+    if (parse_bool_flag(argv[i], "--force", force_overwrite) ||
+        parse_bool_flag(argv[i], "--build", build_mode) ||
+        parse_bool_flag(argv[i], "--search", search_mode) ||
+        parse_string_flag(argv[i], "--data_prefix", data_prefix) ||
+        parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
+        parse_string_flag(argv[i], "--mode", mode) ||
+        parse_string_flag(argv[i], "--override_kv", new_override_kv) ||
+        parse_string_flag(argv[i], "--threads", threads_arg_txt) ||
+        parse_string_flag(argv[i], "--raft_log_level", log_level_str)) {
+      if (!log_level_str.empty()) {
+        raft_log_level = std::stoi(log_level_str);
+        log_level_str  = "";
+      }
+      if (!threads_arg_txt.empty()) {
+        auto threads_arg = split(threads_arg_txt, ':');
+        threads[0]       = std::stoi(threads_arg[0]);
+        if (threads_arg.size() > 1) {
+          threads[1] = std::stoi(threads_arg[1]);
+        } else {
+          threads[1] = threads[0];
+        }
+        threads_arg_txt = "";
+      }
+      if (!new_override_kv.empty()) {
+        auto kvv = split(new_override_kv, ':');
+        auto key = kvv[0];
+        std::vector<nlohmann::json> vals{};
+        for (std::size_t j = 1; j < kvv.size(); j++) {
+          vals.push_back(nlohmann::json::parse(kvv[j]));
+        }
+        override_kv.emplace_back(key, vals);
+        new_override_kv = "";
+      }
+      for (int j = i; j < argc - 1; j++) {
+        argv[j] = argv[j + 1];
+      }
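+      // The consumed flag has been shifted out of argv above; shrink argc and stay at the
+      // same index so that the argument now at position i is examined in the next iteration.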
argc--; + i--; + } + } + + // raft::logger::get(RAFT_NAME).set_level(raft_log_level); + + Mode metric_objective = Mode::kLatency; + if (mode == "throughput") { metric_objective = Mode::kThroughput; } + + int max_threads = + (metric_objective == Mode::kThroughput) ? std::thread::hardware_concurrency() : 1; + if (threads[1] == -1) threads[1] = max_threads; + + if (metric_objective == Mode::kLatency) { + if (threads[0] != 1 || threads[1] != 1) { + log_warn("Latency mode enabled. Overriding threads arg, running with single thread."); + threads = {1, 1}; + } + } + + if (build_mode == search_mode) { + log_error("One and only one of --build and --search should be specified"); + printf_usage(); + return -1; + } + + if (!conf_stream) { + log_error("Can't open configuration file: %s", conf_path); + return -1; + } + + if (cudart.needed() && !cudart.found()) { + log_warn("cudart library is not found, GPU-based indices won't work."); + } + + configuration conf(conf_stream); + std::string dtype = conf.get_dataset_conf().dtype; + + if (dtype == "float") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); + // } else if (dtype == "half") { + // dispatch_benchmark(conf, + // force_overwrite, + // build_mode, + // search_mode, + // data_prefix, + // index_prefix, + // override_kv, + // metric_objective, + // threads); + } else if (dtype == "uint8") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); + } else if (dtype == "int8") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); + } else { + log_error("datatype '%s' is not supported", dtype.c_str()); + return -1; + } + + ::benchmark::Initialize(&argc, argv, printf_usage); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1; + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + // Release a possibly cached algo object, so that it cannot be alive longer than the handle + // to a shared library it depends on (dynamic benchmark executable). + current_algo.reset(); + current_algo_props.reset(); + reset_global_device_resources(); + return 0; +} +}; // namespace cuvs::bench diff --git a/cpp/bench/ann/src/common/conf.hpp b/cpp/bench/ann/src/common/conf.hpp new file mode 100644 index 000000000..1fc7327cb --- /dev/null +++ b/cpp/bench/ann/src/common/conf.hpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "util.hpp" + +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +class configuration { + public: + struct index { + std::string name; + std::string algo; + nlohmann::json build_param; + std::string file; + + int batch_size; + int k; + std::vector search_params; + }; + + struct dataset_conf { + std::string name; + std::string base_file; + // use only a subset of base_file, + // the range of rows is [subset_first_row, subset_first_row + subset_size) + // however, subset_size = 0 means using all rows after subset_first_row + // that is, the subset is [subset_first_row, #rows in base_file) + size_t subset_first_row{0}; + size_t subset_size{0}; + std::string query_file; + std::string distance; + std::optional groundtruth_neighbors_file{std::nullopt}; + + // data type of input dataset, possible values ["float", "int8", "uint8"] + std::string dtype; + }; + + explicit inline configuration(std::istream& conf_stream) + { + // to enable comments in json + auto conf = nlohmann::json::parse(conf_stream, nullptr, true, true); + + parse_dataset(conf.at("dataset")); + parse_index(conf.at("index"), conf.at("search_basic_param")); + } + + [[nodiscard]] inline auto get_dataset_conf() const -> dataset_conf { return dataset_conf_; } + [[nodiscard]] inline auto get_indices() const -> std::vector { return indices_; }; + + private: + inline void parse_dataset(const nlohmann::json& conf) + { + dataset_conf_.name = conf.at("name"); + dataset_conf_.base_file = conf.at("base_file"); + dataset_conf_.query_file = conf.at("query_file"); + dataset_conf_.distance = conf.at("distance"); + + if (conf.contains("groundtruth_neighbors_file")) { + dataset_conf_.groundtruth_neighbors_file = conf.at("groundtruth_neighbors_file"); + } + if (conf.contains("subset_first_row")) { + dataset_conf_.subset_first_row = conf.at("subset_first_row"); + } + if (conf.contains("subset_size")) { dataset_conf_.subset_size = conf.at("subset_size"); } + + if (conf.contains("dtype")) { + dataset_conf_.dtype = conf.at("dtype"); + } else { + auto filename = dataset_conf_.base_file; + if (filename.size() > 6 && filename.compare(filename.size() - 6, 6, "f16bin") == 0) { + dataset_conf_.dtype = "half"; + } else if (filename.size() > 9 && + filename.compare(filename.size() - 9, 9, "fp16.fbin") == 0) { + dataset_conf_.dtype = "half"; + } else if (filename.size() > 4 && filename.compare(filename.size() - 4, 4, "fbin") == 0) { + dataset_conf_.dtype = "float"; + } else if (filename.size() > 5 && filename.compare(filename.size() - 5, 5, "u8bin") == 0) { + dataset_conf_.dtype = "uint8"; + } else if (filename.size() > 5 && filename.compare(filename.size() - 5, 5, "i8bin") == 0) { + dataset_conf_.dtype = "int8"; + } else { + log_error("Could not determine data type of the dataset %s", filename.c_str()); + } + } + } + inline void parse_index(const nlohmann::json& index_conf, const nlohmann::json& search_basic_conf) + { + const int batch_size = search_basic_conf.at("batch_size"); + const int k = search_basic_conf.at("k"); + + for (const auto& conf : index_conf) { + index index; + index.name = conf.at("name"); + index.algo = conf.at("algo"); + index.build_param = conf.at("build_param"); + index.file = conf.at("file"); + index.batch_size = batch_size; + index.k = k; + + for (auto param : conf.at("search_params")) { + /* ### Special parameters for backward compatibility ### + + - Local values of `k` and `n_queries` take priority. + - The legacy "batch_size" renamed to `n_queries`. 
+ - Basic search params are used otherwise. + */ + if (!param.contains("k")) { param["k"] = k; } + if (!param.contains("n_queries")) { + if (param.contains("batch_size")) { + param["n_queries"] = param["batch_size"]; + param.erase("batch_size"); + } else { + param["n_queries"] = batch_size; + } + } + index.search_params.push_back(param); + } + + indices_.push_back(index); + } + } + + dataset_conf dataset_conf_; + std::vector indices_; +}; + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp new file mode 100644 index 000000000..a4d967dd1 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace raft::mr { +/** + * @brief `device_memory_resource` derived class that uses mmap to allocate memory. + * This class enables memory allocation using huge pages. + * It is assumed that the allocated memory is directly accessible on device. This currently only + * works on GH systems. + * + * TODO(tfeher): consider improving or removing this helper once we made progress with + * https://github.com/rapidsai/raft/issues/1819 + */ +class cuda_huge_page_resource final : public rmm::mr::device_memory_resource { + public: + cuda_huge_page_resource() = default; + ~cuda_huge_page_resource() override = default; + cuda_huge_page_resource(cuda_huge_page_resource const&) = default; + cuda_huge_page_resource(cuda_huge_page_resource&&) = default; + auto operator=(cuda_huge_page_resource const&) -> cuda_huge_page_resource& = default; + auto operator=(cuda_huge_page_resource&&) -> cuda_huge_page_resource& = default; + + private: + /** + * @brief Allocates memory of size at least `bytes` using cudaMalloc. + * + * The returned pointer has at least 256B alignment. + * + * @note Stream argument is ignored + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * + * @param bytes The size, in bytes, of the allocation + * @return void* Pointer to the newly allocated memory + */ + auto do_allocate(std::size_t bytes, rmm::cuda_stream_view) -> void* override + { + void* addr{nullptr}; + addr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { RAFT_FAIL("huge_page_resource::MAP FAILED"); } + if (madvise(addr, bytes, MADV_HUGEPAGE) == -1) { + munmap(addr, bytes); + RAFT_FAIL("huge_page_resource::madvise MADV_HUGEPAGE"); + } + memset(addr, 0, bytes); + return addr; + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * @note Stream argument is ignored. + * + * @throws Nothing. 
+   *
+   * @param p Pointer to be deallocated
+   */
+  void do_deallocate(void* ptr, std::size_t size, rmm::cuda_stream_view) override
+  {
+    if (munmap(ptr, size) == -1) { RAFT_FAIL("huge_page_resource::munmap"); }
+  }
+
+  /**
+   * @brief Compare this resource to another.
+   *
+   * Two cuda_huge_page_resources always compare equal, because they can each
+   * deallocate memory allocated by the other.
+   *
+   * @throws Nothing.
+   *
+   * @param other The other resource to compare to
+   * @return true If the two resources are equivalent
+   * @return false If the two resources are not equal
+   */
+  [[nodiscard]] auto do_is_equal(device_memory_resource const& other) const noexcept
+    -> bool override
+  {
+    return dynamic_cast<cuda_huge_page_resource const*>(&other) != nullptr;
+  }
+};
+}  // namespace raft::mr
diff --git a/cpp/bench/ann/src/common/cuda_pinned_resource.hpp b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp
new file mode 100644
index 000000000..fb977cd34
--- /dev/null
+++ b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/error.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <cuda_runtime_api.h>
+
+namespace raft::mr {
+/**
+ * @brief `device_memory_resource` derived class that uses cudaMallocHost/Free for
+ * allocation/deallocation.
+ *
+ * This is almost the same as rmm::mr::host::pinned_memory_resource, but it has
+ * device_memory_resource as base class. Pinned memory can be accessed from device,
+ * and using this allocator we can create device_mdarray backed by pinned allocator.
+ *
+ * TODO(tfeher): it would be preferable to just rely on the existing allocator from rmm
+ * (pinned_memory_resource), but that is incompatible with the container_policy class
+ * for device matrix, because the latter expects a device_memory_resource. We shall
+ * revise this once we progress with Issue https://github.com/rapidsai/raft/issues/1819
+ */
+class cuda_pinned_resource final : public rmm::mr::device_memory_resource {
+ public:
+  cuda_pinned_resource() = default;
+  ~cuda_pinned_resource() override = default;
+  cuda_pinned_resource(cuda_pinned_resource const&) = default;
+  cuda_pinned_resource(cuda_pinned_resource&&) = default;
+  auto operator=(cuda_pinned_resource const&) -> cuda_pinned_resource& = default;
+  auto operator=(cuda_pinned_resource&&) -> cuda_pinned_resource& = default;
+
+ private:
+  /**
+   * @brief Allocates pinned host memory of size at least `bytes` using cudaMallocHost.
+   *
+   * The returned pointer has at least 256B alignment.
+   *
+   * @note Stream argument is ignored
+   *
+   * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled
+   *
+   * @param bytes The size, in bytes, of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  auto do_allocate(std::size_t bytes, rmm::cuda_stream_view) -> void* override
+  {
+    void* ptr{nullptr};
+    RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, bytes));
+    return ptr;
+  }
+
+  /**
+   * @brief Deallocate memory pointed to by \p p.
+ * + * @note Stream argument is ignored. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + */ + void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view) override + { + RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); + } + + /** + * @brief Compare this resource to another. + * + * Two cuda_pinned_resources always compare equal, because they can each + * deallocate memory allocated by the other. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] auto do_is_equal(device_memory_resource const& other) const noexcept + -> bool override + { + return dynamic_cast(&other) != nullptr; + } +}; +} // namespace raft::mr diff --git a/cpp/bench/ann/src/common/cuda_stub.hpp b/cpp/bench/ann/src/common/cuda_stub.hpp new file mode 100644 index 000000000..0ee087a11 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_stub.hpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +The content of this header is governed by two preprocessor definitions: + + - BUILD_CPU_ONLY - whether none of the CUDA functions are used. + - ANN_BENCH_LINK_CUDART - dynamically link against this string if defined. 
+
+___________________________________________________________________________________
+|BUILD_CPU_ONLY | ANN_BENCH_LINK_CUDART |       cudart        | cuda_runtime_api.h |
+|               |                       |  found   |  needed  |      included      |
+|---------------|-----------------------|----------|----------|--------------------|
+|      ON       |       <not set>       |  false   |  false   |         NO         |
+|      ON       |   "cudart.so.xx.xx"   |  false   |  false   |         NO         |
+|      OFF      |       <not set>       |  true    |  true    |        YES         |
+|      OFF      |   "cudart.so.xx.xx"   |<runtime> |  true    |        YES         |
+------------------------------------------------------------------------------------
+*/
+
+#pragma once
+
+#ifndef BUILD_CPU_ONLY
+#include <cuda_fp16.h>
+#include <cuda_runtime_api.h>
+#ifdef ANN_BENCH_LINK_CUDART
+#include <cstring>
+
+#include <dlfcn.h>
+#endif
+#else
+#include <cstdint>
+
+typedef void* cudaStream_t;
+typedef void* cudaEvent_t;
+typedef uint16_t half;
+#endif
+
+namespace cuvs::bench {
+
+struct cuda_lib_handle {
+  void* handle{nullptr};
+  explicit cuda_lib_handle()
+  {
+#ifdef ANN_BENCH_LINK_CUDART
+    constexpr int kFlags = RTLD_NOW | RTLD_GLOBAL | RTLD_DEEPBIND | RTLD_NODELETE;
+    // The full name of the linked cudart library 'cudart.so.MAJOR.MINOR.PATCH'.
+    // Try progressively less specific names, e.g.
+    //   libcudart.so.11.8.89 -> libcudart.so.11.8 -> libcudart.so.11.0
+    //   -> libcudart.so.11 -> libcudart.so
+    char libname[] = ANN_BENCH_LINK_CUDART;  // NOLINT
+    handle = dlopen(ANN_BENCH_LINK_CUDART, kFlags);
+    if (handle != nullptr) { return; }
+    // try to strip the PATCH
+    auto p = strrchr(libname, '.');
+    p[0] = 0;
+    handle = dlopen(libname, kFlags);
+    if (handle != nullptr) { return; }
+    // try to set the MINOR version to 0
+    p = strrchr(libname, '.');
+    p[1] = '0';
+    p[2] = 0;
+    handle = dlopen(libname, kFlags);
+    if (handle != nullptr) { return; }
+    // try to strip the MINOR
+    p[0] = 0;
+    handle = dlopen(libname, kFlags);
+    if (handle != nullptr) { return; }
+    // try to strip the MAJOR
+    p = strrchr(libname, '.');
+    p[0] = 0;
+    handle = dlopen(libname, kFlags);
+#endif
+  }
+  ~cuda_lib_handle() noexcept
+  {
+#ifdef ANN_BENCH_LINK_CUDART
+    if (handle != nullptr) { dlclose(handle); }
+#endif
+  }
+
+  template <typename Symbol>
+  auto sym(const char* name) -> Symbol
+  {
+#ifdef ANN_BENCH_LINK_CUDART
+    return reinterpret_cast<Symbol>(dlsym(handle, name));
+#else
+    return nullptr;
+#endif
+  }
+
+  /** Whether this is NOT a cpu-only package. */
+  [[nodiscard]] constexpr inline auto needed() const -> bool
+  {
+#if defined(BUILD_CPU_ONLY)
+    return false;
+#else
+    return true;
+#endif
+  }
+
+  /** CUDA found, either at compile time or at runtime.
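+   * With ANN_BENCH_LINK_CUDART defined, this is decided at runtime (whether the
+   * dlopen above succeeded); otherwise it is fixed at compile time.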
*/ + [[nodiscard]] inline auto found() const -> bool + { +#if defined(BUILD_CPU_ONLY) + return false; +#elif defined(ANN_BENCH_LINK_CUDART) + return handle != nullptr; +#else + return true; +#endif + } +}; + +static inline cuda_lib_handle cudart{}; + +#ifdef ANN_BENCH_LINK_CUDART +namespace stub { + +[[gnu::weak, gnu::noinline]] auto cuda_memcpy(void* dst, + const void* src, + size_t count, + enum cudaMemcpyKind kind) -> cudaError_t +{ + return cudaSuccess; +} + +[[gnu::weak, gnu::noinline]] auto cuda_malloc(void** ptr, size_t size) -> cudaError_t +{ + *ptr = nullptr; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_memset(void* devPtr, int value, size_t count) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_free(void* devPtr) -> cudaError_t { return cudaSuccess; } +[[gnu::weak, gnu::noinline]] auto cuda_stream_create(cudaStream_t* pStream) -> cudaError_t +{ + *pStream = nullptr; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_stream_create_with_flags(cudaStream_t* pStream, + unsigned int flags) -> cudaError_t +{ + *pStream = nullptr; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_stream_destroy(cudaStream_t pStream) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_device_synchronize() -> cudaError_t { return cudaSuccess; } + +[[gnu::weak, gnu::noinline]] auto cuda_stream_synchronize(cudaStream_t pStream) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_create(cudaEvent_t* event) -> cudaError_t +{ + *event = nullptr; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_record(cudaEvent_t event, cudaStream_t stream) + -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_synchronize(cudaEvent_t event) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_elapsed_time(float* ms, + cudaEvent_t start, + cudaEvent_t end) -> cudaError_t +{ + *ms = 0; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_destroy(cudaEvent_t event) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_get_device(int* device) -> cudaError_t +{ + *device = 0; + return cudaSuccess; +}; +[[gnu::weak, gnu::noinline]] auto cuda_driver_get_version(int* driver) -> cudaError_t +{ + *driver = 0; + return cudaSuccess; +}; +[[gnu::weak, gnu::noinline]] auto cuda_runtime_get_version(int* runtime) -> cudaError_t +{ + *runtime = 0; + return cudaSuccess; +}; +[[gnu::weak, gnu::noinline]] cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, + int device) +{ + *prop = cudaDeviceProp{}; + return cudaSuccess; +} + +} // namespace stub + +#define RAFT_DECLARE_CUDART(fun) \ + static inline decltype(&stub::fun) fun = \ + cudart.found() ? 
cudart.sym(#fun) : &stub::fun + +RAFT_DECLARE_CUDART(cuda_memcpy); +RAFT_DECLARE_CUDART(cuda_malloc); +RAFT_DECLARE_CUDART(cuda_memset); +RAFT_DECLARE_CUDART(cuda_free); +RAFT_DECLARE_CUDART(cuda_stream_create); +RAFT_DECLARE_CUDART(cuda_stream_create_with_flags); +RAFT_DECLARE_CUDART(cuda_stream_destroy); +RAFT_DECLARE_CUDART(cuda_device_synchronize); +RAFT_DECLARE_CUDART(cuda_stream_synchronize); +RAFT_DECLARE_CUDART(cuda_event_create); +RAFT_DECLARE_CUDART(cuda_event_record); +RAFT_DECLARE_CUDART(cuda_event_synchronize); +RAFT_DECLARE_CUDART(cuda_event_elapsed_time); +RAFT_DECLARE_CUDART(cuda_event_destroy); +RAFT_DECLARE_CUDART(cuda_get_device); +RAFT_DECLARE_CUDART(cuda_driver_get_version); +RAFT_DECLARE_CUDART(cuda_runtime_get_version); +RAFT_DECLARE_CUDART(cudaGetDeviceProperties); + +#undef RAFT_DECLARE_CUDART +#endif + +}; // namespace cuvs::bench diff --git a/cpp/bench/ann/src/common/dataset.hpp b/cpp/bench/ann/src/common/dataset.hpp new file mode 100644 index 000000000..95f1a82a2 --- /dev/null +++ b/cpp/bench/ann/src/common/dataset.hpp @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "ann_types.hpp" +#include "util.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +// http://big-algo-benchmarks.com/index.html: +// binary format that starts with 8 bytes of data consisting of num_points(uint32_t) +// num_dimensions(uint32) followed by num_pts x num_dimensions x sizeof(type) bytes of +// data stored one vector after another. +// Data files will have suffixes .fbin, .u8bin, and .i8bin to represent float32, uint8 +// and int8 type data. +// As extensions for this benchmark, half and int data files will have suffixes .f16bin +// and .ibin, respectively. 
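+//
+// For illustration (values assumed here, not taken from any shipped dataset): a base
+// file "base.fbin" holding 3 float32 vectors of dimension 2 occupies
+// 2 * 4 + 3 * 2 * 4 = 32 bytes:
+//
+//   uint32_t nrows = 3;    // bytes 0..3
+//   uint32_t ndims = 2;    // bytes 4..7
+//   float    data[3 * 2];  // bytes 8..31, row-major, one vector after another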
+template +class bin_file { + public: + bin_file(std::string file, + const std::string& mode, + uint32_t subset_first_row = 0, + uint32_t subset_size = 0); + ~bin_file() + { + if (mapped_ptr_ != nullptr) { unmap(); } + if (fp_ != nullptr) { fclose(fp_); } + } + bin_file(const bin_file&) = delete; + auto operator=(const bin_file&) -> bin_file& = delete; + + void get_shape(size_t* nrows, int* ndims) const + { + assert(read_mode_); + if (!fp_) { open_file(); } + *nrows = nrows_; + *ndims = ndims_; + } + + void read(T* data) const + { + assert(read_mode_); + if (!fp_) { open_file(); } + size_t total = static_cast(nrows_) * ndims_; + if (fread(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error{"fread() bin_file " + file_ + " failed"}; + } + } + + void write(const T* data, uint32_t nrows, uint32_t ndims) + { + assert(!read_mode_); + if (!fp_) { open_file(); } + if (fwrite(&nrows, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error{"fwrite() bin_file " + file_ + " failed"}; + } + if (fwrite(&ndims, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error{"fwrite() bin_file " + file_ + " failed"}; + } + + size_t total = static_cast(nrows) * ndims; + if (fwrite(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error{"fwrite() bin_file " + file_ + " failed"}; + } + } + + auto map() const -> T* + { + assert(read_mode_); + if (!fp_) { open_file(); } + int fid = fileno(fp_); + mapped_ptr_ = mmap(nullptr, file_size_, PROT_READ, MAP_PRIVATE, fid, 0); + if (mapped_ptr_ == MAP_FAILED) { + mapped_ptr_ = nullptr; + throw std::runtime_error{"mmap error: Value of errno " + std::to_string(errno) + ", " + + std::string(strerror(errno))}; + } + return reinterpret_cast(reinterpret_cast(mapped_ptr_) + 2 * sizeof(uint32_t) + + subset_first_row_ * ndims_ * sizeof(T)); + } + + void unmap() const + { + if (munmap(mapped_ptr_, file_size_) == -1) { + throw std::runtime_error{"munmap error: " + std::string(strerror(errno))}; + } + } + + private: + void check_suffix(); + void open_file() const; + + std::string file_; + bool read_mode_; + uint32_t subset_first_row_; + uint32_t subset_size_; + + mutable FILE* fp_{nullptr}; + mutable uint32_t nrows_; + mutable uint32_t ndims_; + mutable size_t file_size_; + mutable void* mapped_ptr_{nullptr}; +}; + +template +bin_file::bin_file(std::string file, + const std::string& mode, + uint32_t subset_first_row, + uint32_t subset_size) + : file_(std::move(file)), + read_mode_(mode == "r"), + subset_first_row_(subset_first_row), + subset_size_(subset_size) + +{ + check_suffix(); + + if (!read_mode_) { + if (mode == "w") { + if (subset_first_row != 0) { + throw std::runtime_error{"subset_first_row should be zero for write mode"}; + } + if (subset_size != 0) { + throw std::runtime_error{"subset_size should be zero for write mode"}; + } + } else { + throw std::runtime_error{"bin_file's mode must be either 'r' or 'w': " + file_}; + } + } +} + +template +void bin_file::open_file() const +{ + fp_ = fopen(file_.c_str(), read_mode_ ? 
"r" : "w"); + if (!fp_) { throw std::runtime_error{"open bin_file failed: " + file_}; } + + if (read_mode_) { + struct stat statbuf; + if (stat(file_.c_str(), &statbuf) != 0) { throw std::runtime_error{"stat() failed: " + file_}; } + file_size_ = statbuf.st_size; + + uint32_t header[2]; + if (fread(header, sizeof(uint32_t), 2, fp_) != 2) { + throw std::runtime_error{"read header of bin_file failed: " + file_}; + } + nrows_ = header[0]; + ndims_ = header[1]; + + size_t expected_file_size = + 2 * sizeof(uint32_t) + static_cast(nrows_) * ndims_ * sizeof(T); + if (file_size_ != expected_file_size) { + throw std::runtime_error{"expected file size of " + file_ + " is " + + std::to_string(expected_file_size) + ", however, actual size is " + + std::to_string(file_size_)}; + } + + if (subset_first_row_ >= nrows_) { + throw std::runtime_error{file_ + ": subset_first_row (" + std::to_string(subset_first_row_) + + ") >= nrows (" + std::to_string(nrows_) + ")"}; + } + if (subset_first_row_ + subset_size_ > nrows_) { + throw std::runtime_error{file_ + ": subset_first_row (" + std::to_string(subset_first_row_) + + ") + subset_size (" + std::to_string(subset_size_) + ") > nrows (" + + std::to_string(nrows_) + ")"}; + } + + if (subset_first_row_) { + static_assert(sizeof(long) == 8, "fseek() don't support 64-bit offset"); + if (fseek(fp_, sizeof(T) * subset_first_row_ * ndims_, SEEK_CUR) == -1) { + throw std::runtime_error{file_ + ": fseek failed"}; + } + nrows_ -= subset_first_row_; + } + if (subset_size_) { nrows_ = subset_size_; } + } +} + +template +void bin_file::check_suffix() +{ + auto pos = file_.rfind('.'); + if (pos == std::string::npos) { + throw std::runtime_error{"name of bin_file doesn't have a suffix: " + file_}; + } + std::string suffix = file_.substr(pos + 1); + + if constexpr (std::is_same_v) { + if (suffix != "fbin") { + throw std::runtime_error{"bin_file should has .fbin suffix: " + file_}; + } + } else if constexpr (std::is_same_v) { + if (suffix != "f16bin" && suffix != "fbin") { + throw std::runtime_error{"bin_file should has .f16bin suffix: " + file_}; + } + } else if constexpr (std::is_same_v) { + if (suffix != "ibin") { + throw std::runtime_error{"bin_file should has .ibin suffix: " + file_}; + } + } else if constexpr (std::is_same_v) { + if (suffix != "u8bin") { + throw std::runtime_error{"bin_file should has .u8bin suffix: " + file_}; + } + } else if constexpr (std::is_same_v) { + if (suffix != "i8bin") { + throw std::runtime_error{"bin_file should has .i8bin suffix: " + file_}; + } + } else { + throw std::runtime_error( + "T of bin_file should be one of float, half, int, uint8_t, or int8_t"); + } +} + +template +class dataset { + public: + explicit dataset(std::string name) : name_(std::move(name)) {} + dataset(std::string name, std::string distance) + : name_(std::move(name)), distance_(std::move(distance)) + { + } + dataset(const dataset&) = delete; + auto operator=(const dataset&) -> dataset& = delete; + virtual ~dataset(); + + auto name() const -> std::string { return name_; } + auto distance() const -> std::string { return distance_; } + virtual auto dim() const -> int = 0; + virtual auto max_k() const -> uint32_t = 0; + virtual auto base_set_size() const -> size_t = 0; + virtual auto query_set_size() const -> size_t = 0; + + // load data lazily, so don't pay the overhead of reading unneeded set + // e.g. 
don't load base set when searching + auto base_set() const -> const T* + { + if (!base_set_) { load_base_set(); } + return base_set_; + } + + auto query_set() const -> const T* + { + if (!query_set_) { load_query_set(); } + return query_set_; + } + + auto gt_set() const -> const int32_t* + { + if (!gt_set_) { load_gt_set(); } + return gt_set_; + } + + auto base_set_on_gpu() const -> const T*; + auto query_set_on_gpu() const -> const T*; + auto mapped_base_set() const -> const T*; + + auto query_set(MemoryType memory_type) const -> const T* + { + switch (memory_type) { + case MemoryType::kDevice: return query_set_on_gpu(); + default: return query_set(); + } + } + + auto base_set(MemoryType memory_type) const -> const T* + { + switch (memory_type) { + case MemoryType::kDevice: return base_set_on_gpu(); + case MemoryType::kHost: return base_set(); + case MemoryType::kHostMmap: return mapped_base_set(); + default: return nullptr; + } + } + + protected: + virtual void load_base_set() const = 0; + virtual void load_gt_set() const = 0; + virtual void load_query_set() const = 0; + virtual void map_base_set() const = 0; + + std::string name_; + std::string distance_; + + mutable T* base_set_ = nullptr; + mutable T* query_set_ = nullptr; + mutable T* d_base_set_ = nullptr; + mutable T* d_query_set_ = nullptr; + mutable T* mapped_base_set_ = nullptr; + mutable int32_t* gt_set_ = nullptr; +}; + +template +dataset::~dataset() +{ + delete[] base_set_; + delete[] query_set_; + delete[] gt_set_; +#ifndef BUILD_CPU_ONLY + if (d_base_set_) { cudaFree(d_base_set_); } + if (d_query_set_) { cudaFree(d_query_set_); } +#endif +} + +template +auto dataset::base_set_on_gpu() const -> const T* +{ +#ifndef BUILD_CPU_ONLY + if (!d_base_set_) { + base_set(); + cudaMalloc(reinterpret_cast(&d_base_set_), base_set_size() * dim() * sizeof(T)); + cudaMemcpy(d_base_set_, base_set_, base_set_size() * dim() * sizeof(T), cudaMemcpyHostToDevice); + } +#endif + return d_base_set_; +} + +template +auto dataset::query_set_on_gpu() const -> const T* +{ +#ifndef BUILD_CPU_ONLY + if (!d_query_set_) { + query_set(); + cudaMalloc(reinterpret_cast(&d_query_set_), query_set_size() * dim() * sizeof(T)); + cudaMemcpy( + d_query_set_, query_set_, query_set_size() * dim() * sizeof(T), cudaMemcpyHostToDevice); + } +#endif + return d_query_set_; +} + +template +auto dataset::mapped_base_set() const -> const T* +{ + if (!mapped_base_set_) { map_base_set(); } + return mapped_base_set_; +} + +template +class bin_dataset : public dataset { + public: + bin_dataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& query_file, + const std::string& distance, + const std::optional& groundtruth_neighbors_file); + + auto dim() const -> int override; + auto max_k() const -> uint32_t override; + auto base_set_size() const -> size_t override; + auto query_set_size() const -> size_t override; + + private: + void load_base_set() const; + void load_query_set() const; + void load_gt_set() const; + void map_base_set() const; + + mutable int dim_ = 0; + mutable uint32_t max_k_ = 0; + mutable size_t base_set_size_ = 0; + mutable size_t query_set_size_ = 0; + + bin_file base_file_; + bin_file query_file_; + std::optional> gt_file_{std::nullopt}; +}; + +template +bin_dataset::bin_dataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& query_file, + const std::string& distance, + const std::optional& 
groundtruth_neighbors_file) + : dataset(name, distance), + base_file_(base_file, "r", subset_first_row, subset_size), + query_file_(query_file, "r") +{ + if (groundtruth_neighbors_file.has_value()) { + gt_file_.emplace(groundtruth_neighbors_file.value(), "r"); + } +} + +template +auto bin_dataset::dim() const -> int +{ + if (dim_ > 0) { return dim_; } + if (base_set_size() > 0) { return dim_; } + if (query_set_size() > 0) { return dim_; } + return dim_; +} + +template +auto bin_dataset::max_k() const -> uint32_t +{ + if (!this->gt_set_) { load_gt_set(); } + return max_k_; +} + +template +auto bin_dataset::query_set_size() const -> size_t +{ + if (query_set_size_ > 0) { return query_set_size_; } + int dim; + query_file_.get_shape(&query_set_size_, &dim); + if (query_set_size_ == 0) { throw std::runtime_error{"Zero query set size"}; } + if (dim == 0) { throw std::runtime_error{"Zero query set dim"}; } + if (dim_ == 0) { + dim_ = dim; + } else if (dim_ != dim) { + throw std::runtime_error{"base set dim (" + std::to_string(dim_) + ") != query set dim (" + + std::to_string(dim)}; + } + return query_set_size_; +} + +template +auto bin_dataset::base_set_size() const -> size_t +{ + if (base_set_size_ > 0) { return base_set_size_; } + int dim; + base_file_.get_shape(&base_set_size_, &dim); + if (base_set_size_ == 0) { throw std::runtime_error{"Zero base set size"}; } + if (dim == 0) { throw std::runtime_error{"Zero base set dim"}; } + if (dim_ == 0) { + dim_ = dim; + } else if (dim_ != dim) { + throw std::runtime_error{"base set dim (" + std::to_string(dim) + ") != query set dim (" + + std::to_string(dim_)}; + } + return base_set_size_; +} + +template +void bin_dataset::load_base_set() const +{ + this->base_set_ = new T[base_set_size() * dim()]; + base_file_.read(this->base_set_); +} + +template +void bin_dataset::load_query_set() const +{ + this->query_set_ = new T[query_set_size() * dim()]; + query_file_.read(this->query_set_); +} + +template +void bin_dataset::load_gt_set() const +{ + if (gt_file_.has_value()) { + size_t queries; + int k; + gt_file_->get_shape(&queries, &k); + this->gt_set_ = new std::int32_t[queries * k]; + gt_file_->read(this->gt_set_); + max_k_ = k; + } +} + +template +void bin_dataset::map_base_set() const +{ + this->mapped_base_set_ = base_file_.map(); +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp new file mode 100644 index 000000000..9d0218606 --- /dev/null +++ b/cpp/bench/ann/src/common/thread_pool.hpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <future>
+#include <mutex>
+#include <stdexcept>
+#include <thread>
+#include <vector>
+
+class fixed_thread_pool {
+ public:
+  explicit fixed_thread_pool(int num_threads)
+  {
+    if (num_threads < 1) {
+      throw std::runtime_error("num_threads must be >= 1");
+    } else if (num_threads == 1) {
+      return;
+    }
+
+    tasks_ = new task[num_threads];
+
+    threads_.reserve(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      threads_.emplace_back([&, i] {
+        auto& task = tasks_[i];
+        while (true) {
+          std::unique_lock<std::mutex> lock(task.mtx);
+          task.cv.wait(lock,
+                       [&] { return task.has_task || finished_.load(std::memory_order_relaxed); });
+          if (finished_.load(std::memory_order_relaxed)) { break; }
+
+          task.task();
+          task.has_task = false;
+        }
+      });
+    }
+  }
+
+  ~fixed_thread_pool()
+  {
+    if (threads_.empty()) { return; }
+
+    finished_.store(true, std::memory_order_relaxed);
+    for (unsigned i = 0; i < threads_.size(); ++i) {
+      auto& task = tasks_[i];
+      // Lock and immediately unlock: this synchronizes with the worker's wait, so the
+      // notification below cannot be lost between the predicate check and the wait.
+      { std::lock_guard<std::mutex> lock(task.mtx); }
+
+      task.cv.notify_one();
+      threads_[i].join();
+    }
+
+    delete[] tasks_;
+  }
+
+  template <typename Func, typename IdxT>
+  void submit(Func f, IdxT len)
+  {
+    // Run the functions in the main thread if the thread pool has no workers
+    if (threads_.empty()) {
+      for (IdxT i = 0; i < len; ++i) {
+        f(i);
+      }
+      return;
+    }
+
+    const int num_threads = static_cast<int>(threads_.size());
+    // Split the work into (num_threads + 1) parts: each thread gets one fixed part, and
+    // the extra part is claimed item-by-item via the atomic counter below, so faster
+    // threads pick up the remaining work.
+    const IdxT items_per_thread = len / (num_threads + 1);
+    std::atomic<IdxT> cnt(items_per_thread * num_threads);
+
+    // Wrap function
+    auto wrapped_f = [&](IdxT start, IdxT end) {
+      for (IdxT i = start; i < end; ++i) {
+        f(i);
+      }
+
+      while (true) {
+        IdxT i = cnt.fetch_add(1, std::memory_order_relaxed);
+        if (i >= len) { break; }
+        f(i);
+      }
+    };
+
+    std::vector<std::future<void>> futures;
+    futures.reserve(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      IdxT start = i * items_per_thread;
+      auto& task = tasks_[i];
+      {
+        std::lock_guard<std::mutex> lock(task.mtx);
+        (void)lock;  // stop nvcc warning
+        task.task = std::packaged_task<void()>([=] { wrapped_f(start, start + items_per_thread); });
+        futures.push_back(task.task.get_future());
+        task.has_task = true;
+      }
+      task.cv.notify_one();
+    }
+
+    for (auto& fut : futures) {
+      fut.wait();
+    }
+  }
+
+ private:
+  struct alignas(64) task {
+    std::mutex mtx;
+    std::condition_variable cv;
+    bool has_task = false;
+    std::packaged_task<void()> task;
+  };
+
+  task* tasks_;
+  std::vector<std::thread> threads_;
+  std::atomic<bool> finished_{false};
+};
diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp
new file mode 100644
index 000000000..490b0326e
--- /dev/null
+++ b/cpp/bench/ann/src/common/util.hpp
@@ -0,0 +1,561 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#pragma once + +#include "ann_types.hpp" +#include "cuda_stub.hpp" // cuda-related utils + +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +/** + * Current thread id as given by the benchmark State. + * It's populated on every call of a benchmark case. + * It's relevant in the 'throughput' mode of the search benchmarks, + * where some algorithms might want to coordinate allocation of the resources. + */ +inline thread_local int benchmark_thread_id = 0; +/** + * Total concurrent thread count as given by the benchmark State. + * It's populated on every call of a benchmark case. + * It's relevant in the 'throughput' mode of the search benchmarks, + * where some algorithms might want to coordinate allocation of the resources. + */ +inline thread_local int benchmark_n_threads = 1; + +struct cuda_timer { + private: + std::optional stream_; + cudaEvent_t start_{nullptr}; + cudaEvent_t stop_{nullptr}; + double total_time_{0}; + + template + static inline auto extract_stream(AnnT* algo) -> std::optional + { + auto gpu_ann = dynamic_cast(algo); + if (gpu_ann != nullptr && gpu_ann->uses_stream()) { + return std::make_optional(gpu_ann->get_sync_stream()); + } + return std::nullopt; + } + + public: + struct cuda_lap { + private: + cudaStream_t stream_; + cudaEvent_t start_; + cudaEvent_t stop_; + double& total_time_; + + public: + cuda_lap(cudaStream_t stream, cudaEvent_t start, cudaEvent_t stop, double& total_time) + : start_(start), stop_(stop), stream_(stream), total_time_(total_time) + { +#ifndef BUILD_CPU_ONLY + cudaEventRecord(start_, stream_); +#endif + } + cuda_lap() = delete; + + ~cuda_lap() noexcept + { +#ifndef BUILD_CPU_ONLY + cudaEventRecord(stop_, stream_); + cudaEventSynchronize(stop_); + float milliseconds = 0.0f; + cudaEventElapsedTime(&milliseconds, start_, stop_); + total_time_ += milliseconds / 1000.0; +#endif + } + }; + + explicit cuda_timer(std::optional stream) : stream_{stream} + { +#ifndef BUILD_CPU_ONLY + if (stream_.has_value()) { + cudaEventCreate(&stop_); + cudaEventCreate(&start_); + } +#endif + } + + template + explicit cuda_timer(const std::unique_ptr& algo) : cuda_timer{extract_stream(algo.get())} + { + } + + ~cuda_timer() noexcept + { +#ifndef BUILD_CPU_ONLY + if (stream_.has_value()) { + cudaStreamSynchronize(stream_.value()); + cudaEventDestroy(start_); + cudaEventDestroy(stop_); + } +#endif + } + + cuda_timer() = delete; + cuda_timer(cuda_timer const&) = delete; + cuda_timer(cuda_timer&&) = delete; + auto operator=(cuda_timer const&) -> cuda_timer& = delete; + auto operator=(cuda_timer&&) -> cuda_timer& = delete; + + [[nodiscard]] auto stream() const -> std::optional { return stream_; } + + [[nodiscard]] auto active() const -> bool { return stream_.has_value(); } + + [[nodiscard]] auto total_time() const -> double { return total_time_; } + + [[nodiscard]] auto lap(bool enabled = true) -> std::optional + { + return enabled && stream_.has_value() + ? std::make_optional(stream_.value(), start_, stop_, total_time_) + : std::nullopt; + } +}; + +#ifndef BUILD_CPU_ONLY +// ATM, rmm::stream does not support passing in flags; hence this helper type. 
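+// An illustrative use (a sketch; `dst`, `src`, and `n_bytes` are placeholders):
+//
+//   non_blocking_stream s;
+//   cudaMemcpyAsync(dst, src, n_bytes, cudaMemcpyDefault, s.view());
+//   cudaStreamSynchronize(s.view());
+//
+// Note that the move constructor swaps handles, so a moved-from object only ever
+// destroys a null stream.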
+struct non_blocking_stream { + non_blocking_stream() { cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking); } + ~non_blocking_stream() noexcept + { + if (stream_ != nullptr) { cudaStreamDestroy(stream_); } + } + non_blocking_stream(non_blocking_stream const&) = delete; + non_blocking_stream(non_blocking_stream&& other) noexcept { std::swap(stream_, other.stream_); } + auto operator=(non_blocking_stream const&) -> non_blocking_stream& = delete; + auto operator=(non_blocking_stream&&) -> non_blocking_stream& = delete; + [[nodiscard]] auto view() const noexcept -> cudaStream_t { return stream_; } + + private: + cudaStream_t stream_{nullptr}; +}; + +namespace detail { +inline std::vector global_stream_pool(0); +inline std::mutex gsp_mutex; +} // namespace detail +#endif + +/** + * Get a stream associated with the current benchmark thread. + * + * Note, the streams are reused between the benchmark cases. + * This makes it easier to profile and analyse multiple benchmark cases in one timeline using tools + * like nsys. + */ +inline auto get_stream_from_global_pool() -> cudaStream_t +{ +#ifndef BUILD_CPU_ONLY + std::lock_guard guard(detail::gsp_mutex); + if (static_cast(detail::global_stream_pool.size()) < benchmark_n_threads) { + detail::global_stream_pool.resize(benchmark_n_threads); + } + return detail::global_stream_pool[benchmark_thread_id].view(); +#else + return nullptr; +#endif +} + +struct result_buffer { + explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream} + { + if (size_ == 0) { return; } + data_host_ = malloc(size_); +#ifndef BUILD_CPU_ONLY + cudaMallocAsync(&data_device_, size_, stream_); + cudaStreamSynchronize(stream_); +#endif + } + result_buffer() = delete; + result_buffer(result_buffer&&) = delete; + auto operator=(result_buffer&&) -> result_buffer& = delete; + result_buffer(const result_buffer&) = delete; + auto operator=(const result_buffer&) -> result_buffer& = delete; + ~result_buffer() noexcept + { + if (size_ == 0) { return; } +#ifndef BUILD_CPU_ONLY + cudaFreeAsync(data_device_, stream_); + cudaStreamSynchronize(stream_); +#endif + free(data_host_); + } + + [[nodiscard]] auto size() const noexcept { return size_; } + [[nodiscard]] auto data(MemoryType loc) const noexcept + { + switch (loc) { + case MemoryType::kDevice: return data_device_; + default: return data_host_; + } + } + + void transfer_data(MemoryType dst, MemoryType src) + { + auto dst_ptr = data(dst); + auto src_ptr = data(src); + if (dst_ptr == src_ptr) { return; } +#ifndef BUILD_CPU_ONLY + cudaMemcpyAsync(dst_ptr, src_ptr, size_, cudaMemcpyDefault, stream_); + cudaStreamSynchronize(stream_); +#endif + } + + private: + size_t size_{0}; + cudaStream_t stream_ = nullptr; + void* data_host_ = nullptr; + void* data_device_ = nullptr; +}; + +namespace detail { +inline std::vector> global_result_buffer_pool(0); +inline std::mutex grp_mutex; +} // namespace detail + +/** + * Get a result buffer associated with the current benchmark thread. + * + * Note, the allocations are reused between the benchmark cases. + * This reduces the setup overhead and number of times the context is being blocked + * (this is relevant if there is a persistent kernel running across multiples benchmark cases). 
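+ *
+ * For example (sizes assumed for illustration), a search case producing `k` neighbor
+ * ids (int32_t) and distances (float) for `n` queries per batch could request
+ * `get_result_buffer_from_global_pool(n * k * (sizeof(int32_t) + sizeof(float)))`;
+ * later cases needing the same or less space reuse the existing allocation.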
+ */ +inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer& +{ + auto stream = get_stream_from_global_pool(); + auto& rb = [stream, size]() -> result_buffer& { + std::lock_guard guard(detail::grp_mutex); + if (static_cast(detail::global_result_buffer_pool.size()) < benchmark_n_threads) { + detail::global_result_buffer_pool.resize(benchmark_n_threads); + } + auto& rb = detail::global_result_buffer_pool[benchmark_thread_id]; + if (!rb || rb->size() < size) { rb = std::make_unique(size, stream); } + return *rb; + }(); + + memset(rb.data(MemoryType::kHost), 0, size); +#ifndef BUILD_CPU_ONLY + cudaMemsetAsync(rb.data(MemoryType::kDevice), 0, size, stream); + cudaStreamSynchronize(stream); +#endif + return rb; +} + +/** + * Delete all streams and memory allocations in the global pool. + * It's called at the end of the `main` function - before global/static variables and cuda context + * is destroyed - to make sure they are destroyed gracefully and correctly seen by analysis tools + * such as nsys. + */ +inline void reset_global_device_resources() +{ +#ifndef BUILD_CPU_ONLY + std::lock_guard guard(detail::gsp_mutex); + detail::global_result_buffer_pool.resize(0); + detail::global_stream_pool.resize(0); +#endif +} + +inline auto cuda_info() +{ + std::vector> props; +#ifndef BUILD_CPU_ONLY + int dev, driver = 0, runtime = 0; + cudaDriverGetVersion(&driver); + cudaRuntimeGetVersion(&runtime); + + cudaDeviceProp device_prop; + cudaGetDevice(&dev); + cudaGetDeviceProperties(&device_prop, dev); + props.emplace_back("gpu_name", std::string(device_prop.name)); + props.emplace_back("gpu_sm_count", std::to_string(device_prop.multiProcessorCount)); + props.emplace_back("gpu_sm_freq", std::to_string(device_prop.clockRate * 1e3)); + props.emplace_back("gpu_mem_freq", std::to_string(device_prop.memoryClockRate * 1e3)); + props.emplace_back("gpu_mem_bus_width", std::to_string(device_prop.memoryBusWidth)); + props.emplace_back("gpu_mem_global_size", std::to_string(device_prop.totalGlobalMem)); + props.emplace_back("gpu_mem_shared_size", std::to_string(device_prop.sharedMemPerMultiprocessor)); + props.emplace_back("gpu_driver_version", + std::to_string(driver / 1000) + "." + std::to_string((driver % 100) / 10)); + props.emplace_back("gpu_runtime_version", + std::to_string(runtime / 1000) + "." 
+ std::to_string((runtime % 100) / 10)); +#endif + return props; +} + +struct nvtx_case { +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + private: + std::string case_name_; + std::array iter_name_{0}; + nvtxDomainHandle_t domain_; + int64_t iteration_ = 0; + nvtxEventAttributes_t case_attrib_{0}; + nvtxEventAttributes_t iter_attrib_{0}; +#endif + + public: + struct nvtx_lap { +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + private: + nvtxDomainHandle_t domain_; + + public: + nvtx_lap(nvtxDomainHandle_t domain, nvtxEventAttributes_t* attr) : domain_(domain) + { + nvtxDomainRangePushEx(domain_, attr); + } + nvtx_lap() = delete; + ~nvtx_lap() noexcept { nvtxDomainRangePop(domain_); } +#endif + }; + +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + explicit nvtx_case(std::string case_name) + : case_name_(std::move(case_name)), domain_(nvtxDomainCreateA("algo benchmark")) + { + case_attrib_.version = NVTX_VERSION; + iter_attrib_.version = NVTX_VERSION; + case_attrib_.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + iter_attrib_.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + case_attrib_.colorType = NVTX_COLOR_ARGB; + iter_attrib_.colorType = NVTX_COLOR_ARGB; + case_attrib_.messageType = NVTX_MESSAGE_TYPE_ASCII; + iter_attrib_.messageType = NVTX_MESSAGE_TYPE_ASCII; + case_attrib_.message.ascii = case_name_.c_str(); + auto c = std::hash{}(case_name_); + case_attrib_.color = c | 0xA0A0A0; + nvtxDomainRangePushEx(domain_, &case_attrib_); + } + + ~nvtx_case() + { + nvtxDomainRangePop(domain_); + nvtxDomainDestroy(domain_); + } +#else + explicit nvtx_case(std::string) {} +#endif + + [[nodiscard]] auto lap() -> nvtx_case::nvtx_lap + { +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + auto i = iteration_++; + uint32_t c = (i % 5); + uint32_t r = 150 + c * 20; + uint32_t g = 200 + c * 10; + uint32_t b = 220 + c * 5; + std::snprintf(iter_name_.data(), iter_name_.size(), "Lap %zd", i); + iter_attrib_.message.ascii = iter_name_.data(); + iter_attrib_.color = (r << 16) + (g << 8) + b; + return nvtx_lap{domain_, &iter_attrib_}; +#else + return nvtx_lap{}; +#endif + } +}; + +/** + * A progress tracker that allows syncing threads multiple times and resets the global + * progress once the threads are done. + */ +struct progress_barrier { + progress_barrier() = default; + ~progress_barrier() noexcept + { + { + // Lock makes sure the notified threads see the updates to `done_`. + std::unique_lock lk(mutex_); + done_.store(true, std::memory_order_relaxed); + cv_.notify_all(); + } + // This is the only place where the order of the updates to thread_progress_ and done_ is + // important. They are not guarded by the mutex, and `done_` must not be reset to `true` by + // other threads after the `total_progress_` is zero. + // Hence the default memory order (std::memory_order_seq_cst). + auto rem = total_progress_.fetch_sub(thread_progress_); + if (rem == thread_progress_) { + // the last thread to exit clears the progress state. + done_.store(false); + } + } + + /** + * Advance the progress counter by `n` and return the previous `progress` value. + * + * This can be used to track which thread arrives on the call site first. + * + * @return the previous progress counter value (before incrementing it by `n`). + */ + auto arrive(int n) + { + thread_progress_ += n; + // Lock makes sure the notified threads see the updates to `total_progress_`. + std::unique_lock lk(mutex_); + auto prev = total_progress_.fetch_add(n, std::memory_order_relaxed); + cv_.notify_all(); + return prev; + } + + /** + * Wait till the progress counter reaches `n` or finishes abnormally. 
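+ *
+ * For example (an assumed usage pattern): n worker threads each call `arrive(1)` after
+ * finishing a phase and then `wait(n)`, so no thread starts the next phase until the
+ * last one has arrived or a peer has exited abnormally.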
+ * + * @return the latest observed value of the progress counter. + */ + auto wait(int limit) + { + int cur = total_progress_.load(std::memory_order_relaxed); + if (cur >= limit) { return cur; } + auto done = done_.load(std::memory_order_relaxed); + if (done) { return cur; } + std::unique_lock lk(mutex_); + while (cur < limit && !done) { + using namespace std::chrono_literals; + cv_.wait_for(lk, 10ms); + cur = total_progress_.load(std::memory_order_relaxed); + done = done_.load(std::memory_order_relaxed); + } + return cur; + } + + private: + static inline std::atomic total_progress_; + static inline std::atomic done_; + static inline std::mutex mutex_; + static inline std::condition_variable cv_; + int thread_progress_{0}; +}; + +inline auto split(const std::string& s, char delimiter) -> std::vector +{ + std::vector tokens; + std::string token; + std::istringstream iss(s); + while (getline(iss, token, delimiter)) { + if (!token.empty()) { tokens.push_back(token); } + } + return tokens; +} + +inline auto file_exists(const std::string& filename) -> bool +{ + struct stat statbuf; + if (stat(filename.c_str(), &statbuf) != 0) { return false; } + return S_ISREG(statbuf.st_mode); +} + +inline auto dir_exists(const std::string& dir) -> bool +{ + struct stat statbuf; + if (stat(dir.c_str(), &statbuf) != 0) { return false; } + return S_ISDIR(statbuf.st_mode); +} + +inline auto create_dir(const std::string& dir) -> bool +{ + const auto path = split(dir, '/'); + + std::string cwd; + if (!dir.empty() && dir[0] == '/') { cwd += '/'; } + + for (const auto& p : path) { + cwd += p + "/"; + if (!dir_exists(cwd)) { + int ret = mkdir(cwd.c_str(), S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + if (ret != 0) { return false; } + } + } + return true; +} + +inline void make_sure_parent_dir_exists(const std::string& file_path) +{ + const auto pos = file_path.rfind('/'); + if (pos != std::string::npos) { + auto dir = file_path.substr(0, pos); + if (!dir_exists(dir)) { create_dir(dir); } + } +} + +inline auto combine_path(const std::string& dir, const std::string& path) +{ + std::filesystem::path p_dir(dir); + std::filesystem::path p_suf(path); + return (p_dir / p_suf).string(); +} + +template +void log_with_level(const char* level, const Ts&... vs) +{ + char buf[20]; + auto now = std::chrono::system_clock::now(); + auto now_tt = std::chrono::system_clock::to_time_t(now); + size_t millis = + std::chrono::duration_cast(now.time_since_epoch()).count() % + 1000000ULL; + std::strftime(buf, sizeof(buf), "%H:%M:%S", std::localtime(&now_tt)); + printf("[%s] [%s.%06zu] ", level, buf, millis); + if constexpr (sizeof...(Ts) == 1) { + printf("%s", vs...); + } else { + printf(vs...); + } + printf("\n"); + fflush(stdout); +} + +template +void log_info(Ts&&... vs) +{ + log_with_level("I", std::forward(vs)...); +} + +template +void log_warn(Ts&&... vs) +{ + log_with_level("W", std::forward(vs)...); +} + +template +void log_error(Ts&&... vs) +{ + log_with_level("E", std::forward(vs)...); +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h new file mode 100644 index 000000000..67f8ed39d --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#undef WARP_SIZE +#ifdef CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE +#include "cuvs_wrapper.h" +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT +#include "cuvs_ivf_flat_wrapper.h" +extern template class cuvs::bench::cuvs_ivf_flat; +extern template class cuvs::bench::cuvs_ivf_flat; +extern template class cuvs::bench::cuvs_ivf_flat; +#endif +#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || \ + defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) +#include "cuvs_ivf_pq_wrapper.h" +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ +extern template class cuvs::bench::cuvs_ivf_pq; +extern template class cuvs::bench::cuvs_ivf_pq; +extern template class cuvs::bench::cuvs_ivf_pq; +#endif +#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) +#include "cuvs_cagra_wrapper.h" +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_CAGRA +extern template class cuvs::bench::cuvs_cagra; +extern template class cuvs::bench::cuvs_cagra; +extern template class cuvs::bench::cuvs_cagra; +extern template class cuvs::bench::cuvs_cagra; +#endif + +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_flat::build_param& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_flat::search_param& param) +{ + param.ivf_flat_params.n_probes = conf.at("nprobe"); +} +#endif + +#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || \ + defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_pq::build_param& param) +{ + if (conf.contains("nlist")) { param.n_lists = conf.at("nlist"); } + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } + if (conf.contains("codebook_kind")) { + std::string kind = conf.at("codebook_kind"); + if (kind == "cluster") { + param.codebook_kind = cuvs::neighbors::ivf_pq::codebook_gen::PER_CLUSTER; + } else if (kind == "subspace") { + param.codebook_kind = cuvs::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE; + } else { + throw std::runtime_error("codebook_kind: '" + kind + + "', should be either 'cluster' or 'subspace'"); + } + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_pq::search_param& param) +{ + if (conf.contains("nprobe")) { param.pq_param.n_probes = conf.at("nprobe"); } + if (conf.contains("internalDistanceDtype")) { + std::string type = conf.at("internalDistanceDtype"); + if (type == "float") { + 
param.pq_param.internal_distance_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } else { + throw std::runtime_error("internalDistanceDtype: '" + type + + "', should be either 'float' or 'half'"); + } + } else { + // set half as default type + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } + + if (conf.contains("smemLutDtype")) { + std::string type = conf.at("smemLutDtype"); + if (type == "float") { + param.pq_param.lut_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.lut_dtype = CUDA_R_16F; + } else if (type == "fp8") { + param.pq_param.lut_dtype = CUDA_R_8U; + } else { + throw std::runtime_error("smemLutDtype: '" + type + + "', should be either 'float', 'half' or 'fp8'"); + } + } else { + // set half as default + param.pq_param.lut_dtype = CUDA_R_16F; + } + if (conf.contains("refine_ratio")) { + param.refine_ratio = conf.at("refine_ratio"); + if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); } + } +} +#endif + +#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) +template +void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::nn_descent::index_params& param) +{ + if (conf.contains("graph_degree")) { param.graph_degree = conf.at("graph_degree"); } + if (conf.contains("intermediate_graph_degree")) { + param.intermediate_graph_degree = conf.at("intermediate_graph_degree"); + } + // we allow niter shorthand for max_iterations + if (conf.contains("niter")) { param.max_iterations = conf.at("niter"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("termination_threshold")) { + param.termination_threshold = conf.at("termination_threshold"); + } +} + +inline void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::vpq_params& param) +{ + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } + if (conf.contains("vq_n_centers")) { param.vq_n_centers = conf.at("vq_n_centers"); } + if (conf.contains("kmeans_n_iters")) { param.kmeans_n_iters = conf.at("kmeans_n_iters"); } + if (conf.contains("vq_kmeans_trainset_fraction")) { + param.vq_kmeans_trainset_fraction = conf.at("vq_kmeans_trainset_fraction"); + } + if (conf.contains("pq_kmeans_trainset_fraction")) { + param.pq_kmeans_trainset_fraction = conf.at("pq_kmeans_trainset_fraction"); + } +} + +nlohmann::json collect_conf_with_prefix(const nlohmann::json& conf, + const std::string& prefix, + bool remove_prefix = true) +{ + nlohmann::json out; + for (auto& i : conf.items()) { + if (i.key().compare(0, prefix.size(), prefix) == 0) { + auto new_key = remove_prefix ? 
i.key().substr(prefix.size()) : i.key(); + out[new_key] = i.value(); + } + } + return out; +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_cagra::build_param& param) +{ + if (conf.contains("graph_degree")) { + param.cagra_params.graph_degree = conf.at("graph_degree"); + param.cagra_params.intermediate_graph_degree = param.cagra_params.graph_degree * 2; + } + if (conf.contains("intermediate_graph_degree")) { + param.cagra_params.intermediate_graph_degree = conf.at("intermediate_graph_degree"); + } + if (conf.contains("graph_build_algo")) { + if (conf.at("graph_build_algo") == "IVF_PQ") { + param.algo = cuvs::bench::CagraBuildAlgo::kIvfPq; + } else if (conf.at("graph_build_algo") == "NN_DESCENT") { + param.algo = cuvs::bench::CagraBuildAlgo::kNnDescent; + } else { + param.algo = cuvs::bench::CagraBuildAlgo::kAuto; + } + } + nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_"); + if (!ivf_pq_build_conf.empty()) { + cuvs::neighbors::ivf_pq::index_params bparam; + parse_build_param(ivf_pq_build_conf, bparam); + param.ivf_pq_build_params = bparam; + } + nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); + if (!ivf_pq_search_conf.empty()) { + typename cuvs::bench::cuvs_ivf_pq::search_param sparam; + parse_search_param(ivf_pq_search_conf, sparam); + param.ivf_pq_search_params = sparam.pq_param; + param.ivf_pq_refine_rate = sparam.refine_ratio; + } + nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); + if (!nn_descent_conf.empty()) { + cuvs::neighbors::nn_descent::index_params nn_param; + nn_param.intermediate_graph_degree = 1.5 * param.cagra_params.intermediate_graph_degree; + parse_build_param(nn_descent_conf, nn_param); + if (nn_param.graph_degree != param.cagra_params.intermediate_graph_degree) { + nn_param.graph_degree = param.cagra_params.intermediate_graph_degree; + } + param.nn_descent_params = nn_param; + } + nlohmann::json comp_search_conf = collect_conf_with_prefix(conf, "compression_"); + if (!comp_search_conf.empty()) { + cuvs::neighbors::vpq_params vpq_pams; + parse_build_param(comp_search_conf, vpq_pams); + param.cagra_params.compression.emplace(vpq_pams); + } +} + +cuvs::bench::AllocatorType parse_allocator(std::string mem_type) +{ + if (mem_type == "device") { + return cuvs::bench::AllocatorType::kDevice; + } else if (mem_type == "host_pinned") { + return cuvs::bench::AllocatorType::kHostPinned; + } else if (mem_type == "host_huge_page") { + return cuvs::bench::AllocatorType::kHostHugePage; + } + THROW( + "Invalid value for memory type %s, must be one of [\"device\", \"host_pinned\", " + "\"host_huge_page\"", + mem_type.c_str()); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_cagra::search_param& param) +{ + if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); } + if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); } + if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); } + if (conf.contains("algo")) { + if (conf.at("algo") == "single_cta") { + param.p.algo = cuvs::neighbors::cagra::search_algo::SINGLE_CTA; + } else if (conf.at("algo") == "multi_cta") { + param.p.algo = cuvs::neighbors::cagra::search_algo::MULTI_CTA; + } else if (conf.at("algo") == "multi_kernel") { + param.p.algo = cuvs::neighbors::cagra::search_algo::MULTI_KERNEL; + } else if (conf.at("algo") == "auto") { + param.p.algo = 
cuvs::neighbors::cagra::search_algo::AUTO; + } else { + std::string tmp = conf.at("algo"); + THROW("Invalid value for algo: %s", tmp.c_str()); + } + } + if (conf.contains("graph_memory_type")) { + param.graph_mem = parse_allocator(conf.at("graph_memory_type")); + } + if (conf.contains("internal_dataset_memory_type")) { + param.dataset_mem = parse_allocator(conf.at("internal_dataset_memory_type")); + } + // Same ratio as in IVF-PQ + param.refine_ratio = conf.value("refine_ratio", 1.0f); +} +#endif diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h new file mode 100644 index 000000000..b92785943 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/util.hpp" +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +namespace cuvs::bench { + +inline auto parse_metric_type(cuvs::bench::Metric metric) -> cuvs::distance::DistanceType +{ + if (metric == cuvs::bench::Metric::kInnerProduct) { + return cuvs::distance::DistanceType::InnerProduct; + } else if (metric == cuvs::bench::Metric::kEuclidean) { + // Even for L2 expanded RAFT IVF Flat uses unexpanded formula + return cuvs::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} + +/** Report a more verbose error with a backtrace when OOM occurs on RMM side. */ +inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool +{ + auto cuda_status = cudaGetLastError(); + size_t free = 0; + size_t total = 0; + RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total)); + RAFT_FAIL( + "Failed to allocate %zu bytes using RMM memory resource. " + "NB: latest cuda status = %s, free memory = %zu, total memory = %zu.", + bytes, + cudaGetErrorName(cuda_status), + free, + total); +} + +/** + * This container keeps the part of raft state that should be shared among multiple copies of raft + * handles (in different CPU threads). + * An example of this is an RMM memory resource: if we had an RMM memory pool per thread, we'd + * quickly run out of memory. 
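+ * For instance (thread count assumed for illustration), with the 1 GiB initial pool
+ * configured below, 16 concurrent search threads each holding a private pool would
+ * reserve 16 GiB of device memory up front; sharing a single pool avoids this.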
+ */ +class shared_raft_resources { + public: + using pool_mr_type = rmm::mr::pool_memory_resource; + using mr_type = rmm::mr::failure_callback_resource_adaptor; + + shared_raft_resources() + try : orig_resource_{rmm::mr::get_current_device_resource()}, + pool_resource_(orig_resource_, 1024 * 1024 * 1024ull), + resource_(&pool_resource_, rmm_oom_callback, nullptr) { + rmm::mr::set_current_device_resource(&resource_); + } catch (const std::exception& e) { + auto cuda_status = cudaGetLastError(); + size_t free = 0; + size_t total = 0; + RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total)); + RAFT_FAIL( + "Failed to initialize shared raft resources (NB: latest cuda status = %s, free memory = %zu, " + "total memory = %zu): %s", + cudaGetErrorName(cuda_status), + free, + total, + e.what()); + } + + shared_raft_resources(shared_raft_resources&&) = delete; + auto operator=(shared_raft_resources&&) -> shared_raft_resources& = delete; + shared_raft_resources(const shared_raft_resources& res) = delete; + auto operator=(const shared_raft_resources& other) -> shared_raft_resources& = delete; + + ~shared_raft_resources() noexcept { rmm::mr::set_current_device_resource(orig_resource_); } + + private: + rmm::mr::device_memory_resource* orig_resource_; + pool_mr_type pool_resource_; + mr_type resource_; +}; + +/** + * This struct is used by multiple raft benchmark wrappers. It serves as a thread-safe keeper of + * shared and private GPU resources (see below). + * + * - Accessing the same `configured_raft_resources` from concurrent threads is not safe. + * - Accessing the copies of `configured_raft_resources` from concurrent threads is safe. + * - There must be at most one "original" `configured_raft_resources` at any time, but as many + * copies of it as needed (modifies the program static state). + */ +class configured_raft_resources { + public: + /** + * This constructor has the shared state passed unmodified but creates the local state anew. + * It's used by the copy constructor. + */ + explicit configured_raft_resources(const std::shared_ptr& shared_res) + : shared_res_{shared_res}, + res_{std::make_unique( + rmm::cuda_stream_view(get_stream_from_global_pool()))} + { + } + + /** Default constructor creates all resources anew. */ + configured_raft_resources() : configured_raft_resources{std::make_shared()} + { + } + + configured_raft_resources(configured_raft_resources&&); + auto operator=(configured_raft_resources&&) -> configured_raft_resources&; + ~configured_raft_resources() = default; + configured_raft_resources(const configured_raft_resources& res) + : configured_raft_resources{res.shared_res_} + { + } + auto operator=(const configured_raft_resources& other) -> configured_raft_resources& + { + this->shared_res_ = other.shared_res_; + return *this; + } + + operator raft::resources&() noexcept { return *res_; } // NOLINT + operator const raft::resources&() const noexcept { return *res_; } // NOLINT + + /** Get the main stream */ + [[nodiscard]] auto get_sync_stream() const noexcept + { + return raft::resource::get_cuda_stream(*res_); + } + + private: + /** The resources shared among multiple raft handles / threads. */ + std::shared_ptr shared_res_; + /** + * Until we make the use of copies of raft::resources thread-safe, each benchmark wrapper must + * have its own copy of it. 
+ */ + std::unique_ptr res_ = std::make_unique(); +}; + +inline configured_raft_resources::configured_raft_resources(configured_raft_resources&&) = default; +inline auto configured_raft_resources::operator=(configured_raft_resources&&) + -> configured_raft_resources& = default; + +/** A helper to refine the neighbors when the data is on device or on host. */ +template +void refine_helper(const raft::resources& res, + DatasetT dataset, + QueriesT queries, + CandidatesT candidates, + int k, + algo_base::index_type* neighbors, + float* distances, + cuvs::distance::DistanceType metric) +{ + using data_type = typename DatasetT::value_type; + using index_type = algo_base::index_type; + using extents_type = int64_t; // device-side refine requires this + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + extents_type batch_size = queries.extent(0); + extents_type dim = queries.extent(1); + extents_type k0 = candidates.extent(1); + + if (raft::get_device_for_address(dataset.data_handle()) >= 0) { + auto dataset_device = raft::make_device_matrix_view( + dataset.data_handle(), dataset.extent(0), dataset.extent(1)); + auto queries_device = raft::make_device_matrix_view( + queries.data_handle(), batch_size, dim); + auto candidates_device = raft::make_device_matrix_view( + candidates.data_handle(), batch_size, k0); + auto neighbors_device = + raft::make_device_matrix_view(neighbors, batch_size, k); + auto distances_device = + raft::make_device_matrix_view(distances, batch_size, k); + + cuvs::neighbors::refine(res, + dataset_device, + queries_device, + candidates_device, + neighbors_device, + distances_device, + metric); + } else { + auto dataset_host = raft::make_host_matrix_view( + dataset.data_handle(), dataset.extent(0), dataset.extent(1)); + auto queries_host = raft::make_host_matrix(batch_size, dim); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + + auto stream = raft::resource::get_cuda_stream(res); + raft::copy(queries_host.data_handle(), queries.data_handle(), queries_host.size(), stream); + raft::copy( + candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream); + + raft::resource::sync_stream(res); // wait for the queries and candidates + cuvs::neighbors::refine(res, + dataset_host, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + metric); + + raft::copy(neighbors, neighbors_host.data_handle(), neighbors_host.size(), stream); + raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); + } +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu new file mode 100644 index 000000000..a7495c23a --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../common/ann_types.hpp" +#include "cuvs_ann_bench_param_parser.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +auto create_algo(const std::string& algo_name, + const std::string& distance, + int dim, + const nlohmann::json& conf) -> std::unique_ptr> +{ + [[maybe_unused]] cuvs::bench::Metric metric = parse_metric(distance); + std::unique_ptr> a; + + if constexpr (std::is_same_v) { +#ifdef CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE + if (algo_name == "raft_brute_force" || algo_name == "cuvs_brute_force") { + a = std::make_unique>(metric, dim); + } +#endif + } + + if constexpr (std::is_same_v) {} + +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "raft_ivf_flat" || algo_name == "cuvs_ivf_flat") { + typename cuvs::bench::cuvs_ivf_flat::build_param param; + parse_build_param(conf, param); + a = std::make_unique>(metric, dim, param); + } + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ + if (algo_name == "raft_ivf_pq" || algo_name == "cuvs_ivf_pq") { + typename cuvs::bench::cuvs_ivf_pq::build_param param; + parse_build_param(conf, param); + a = std::make_unique>(metric, dim, param); + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_CAGRA + if (algo_name == "raft_cagra" || algo_name == "cuvs_cagra") { + typename cuvs::bench::cuvs_cagra::build_param param; + parse_build_param(conf, param); + a = std::make_unique>(metric, dim, param); + } +#endif + + if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); } + + return a; +} + +template +auto create_search_param(const std::string& algo_name, const nlohmann::json& conf) + -> std::unique_ptr::search_param> +{ +#ifdef CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE + if (algo_name == "raft_brute_force" || algo_name == "cuvs_brute_force") { + auto param = std::make_unique::search_param>(); + return param; + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "raft_ivf_flat" || algo_name == "cuvs_ivf_flat") { + auto param = + std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ + if (algo_name == "raft_ivf_pq" || algo_name == "cuvs_ivf_pq") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_CAGRA + if (algo_name == "raft_cagra" || algo_name == "cuvs_cagra") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } +#endif + + // else + throw std::runtime_error("invalid algo: '" + algo_name + "'"); +} + +}; // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); +// REGISTER_ALGO_INSTANCE(half); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_float.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_float.cu new file mode 100644 index 000000000..576f1d5bd --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_float.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_cagra_wrapper.h" + +namespace cuvs::bench { +template class cuvs_cagra; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_half.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_half.cu new file mode 100644 index 000000000..6768034a2 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_half.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_cagra_wrapper.h" + +namespace cuvs::bench { +// template class cuvs_cagra; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu new file mode 100644 index 000000000..558ba01e0 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "../common/ann_types.hpp"
+#include "cuvs_ann_bench_param_parser.h"
+#include "cuvs_cagra_hnswlib_wrapper.h"
+
+#include
+#include
+#include
+
+namespace cuvs::bench {
+
+template
+void parse_search_param(const nlohmann::json& conf,
+                        typename cuvs::bench::cuvs_cagra_hnswlib::search_param& param)
+{
+  param.ef = conf.at("ef");
+  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+}
+
+template
+auto create_algo(const std::string& algo_name,
+                 const std::string& distance,
+                 int dim,
+                 const nlohmann::json& conf) -> std::unique_ptr>
+{
+  [[maybe_unused]] cuvs::bench::Metric metric = parse_metric(distance);
+  std::unique_ptr> a;
+
+  if constexpr (std::is_same_v or std::is_same_v) {
+    if (algo_name == "raft_cagra_hnswlib" || algo_name == "cuvs_cagra_hnswlib") {
+      typename cuvs::bench::cuvs_cagra_hnswlib::build_param param;
+      parse_build_param(conf, param);
+      a = std::make_unique>(metric, dim, param);
+    }
+  }
+
+  if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); }
+
+  return a;
+}
+
+template
+auto create_search_param(const std::string& algo_name, const nlohmann::json& conf)
+  -> std::unique_ptr::search_param>
+{
+  if (algo_name == "raft_cagra_hnswlib" || algo_name == "cuvs_cagra_hnswlib") {
+    auto param =
+      std::make_unique::search_param>();
+    parse_search_param(conf, *param);
+    return param;
+  }
+
+  throw std::runtime_error("invalid algo: '" + algo_name + "'");
+}
+
+}  // namespace cuvs::bench
+
+REGISTER_ALGO_INSTANCE(float);
+REGISTER_ALGO_INSTANCE(std::int8_t);
+REGISTER_ALGO_INSTANCE(std::uint8_t);
+
+#ifdef ANN_BENCH_BUILD_MAIN
+#include "../common/benchmark.hpp"
+/*
+[NOTE] Dear developer,
+
+Please don't modify the content of the `main` function; doing so would make the behavior of
+the benchmark executable differ depending on the cmake flags and would complicate debugging.
+In particular, don't try to set up an RMM memory resource here; it would be overridden anyway
+by the memory resource set on a per-algorithm basis. For example, see
+`cuvs/cuvs_ann_bench_utils.h`.
+*/
+int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); }
+#endif
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
new file mode 100644
index 000000000..875fe0bba
--- /dev/null
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#pragma once + +#include "../hnswlib/hnswlib_wrapper.h" +#include "cuvs_cagra_wrapper.h" + +#include + +namespace cuvs::bench { + +template +class cuvs_cagra_hnswlib : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + using build_param = typename cuvs_cagra::build_param; + using search_param = typename hnsw_lib::search_param; + + cuvs_cagra_hnswlib(Metric metric, int dim, const build_param& param, int concurrent_searches = 1) + : algo(metric, dim), + cagra_build_{metric, dim, param, concurrent_searches}, + // hnsw_lib param values don't matter since we don't build with hnsw_lib + hnswlib_search_{metric, dim, typename hnsw_lib::build_param{50, 100}} + { + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return cagra_build_.get_sync_stream(); + } + + // to enable dataset access from GPU memory + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHostMmap; + property.query_memory_type = MemoryType::kHost; + return property; + } + + void save(const std::string& file) const override; + void load(const std::string&) override; + std::unique_ptr> copy() override + { + return std::make_unique>(*this); + } + + private: + cuvs_cagra cagra_build_; + hnsw_lib hnswlib_search_; +}; + +template +void cuvs_cagra_hnswlib::build(const T* dataset, size_t nrow) +{ + cagra_build_.build(dataset, nrow); +} + +template +void cuvs_cagra_hnswlib::set_search_param(const search_param_base& param_) +{ + hnswlib_search_.set_search_param(param_); +} + +template +void cuvs_cagra_hnswlib::save(const std::string& file) const +{ + cagra_build_.save_to_hnswlib(file); +} + +template +void cuvs_cagra_hnswlib::load(const std::string& file) +{ + hnswlib_search_.load(file); + hnswlib_search_.set_base_layer_only(); +} + +template +void cuvs_cagra_hnswlib::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + hnswlib_search_.search(queries, batch_size, k, neighbors, distances); +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_int8_t.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_int8_t.cu new file mode 100644 index 000000000..e2c0735f5 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_int8_t.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "cuvs_cagra_wrapper.h" + +namespace cuvs::bench { +template class cuvs_cagra; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_uint8_t.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_uint8_t.cu new file mode 100644 index 000000000..0f3849885 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_uint8_t.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_cagra_wrapper.h" + +namespace cuvs::bench { +template class cuvs_cagra; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h new file mode 100644 index 000000000..9ac25f53f --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "../../../../src/neighbors/detail/cagra/utils.hpp" +#include "../common/ann_types.hpp" +#include "../common/cuda_huge_page_resource.hpp" +#include "../common/cuda_pinned_resource.hpp" +#include "cuvs_ann_bench_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +// #include +// #include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +enum class AllocatorType { kHostPinned, kHostHugePage, kDevice }; +enum class CagraBuildAlgo { kAuto, kIvfPq, kNnDescent }; + +template +class cuvs_cagra : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + + struct search_param : public search_param_base { + cuvs::neighbors::cagra::search_params p; + float refine_ratio; + AllocatorType graph_mem = AllocatorType::kDevice; + AllocatorType dataset_mem = AllocatorType::kDevice; + [[nodiscard]] auto needs_dataset() const -> bool override { return true; } + }; + + struct build_param { + cuvs::neighbors::cagra::index_params cagra_params; + CagraBuildAlgo algo; + std::optional nn_descent_params = std::nullopt; + std::optional ivf_pq_refine_rate = std::nullopt; + std::optional ivf_pq_build_params = std::nullopt; + std::optional ivf_pq_search_params = std::nullopt; + }; + + cuvs_cagra(Metric metric, int dim, const build_param& param, int concurrent_searches = 1) + : algo(metric, dim), + index_params_(param), + dimension_(dim), + + dataset_(std::make_shared>( + std::move(raft::make_device_matrix(handle_, 0, 0)))), + graph_(std::make_shared>( + std::move(raft::make_device_matrix(handle_, 0, 0)))), + input_dataset_v_( + std::make_shared>( + nullptr, 0, 0)) + + { + index_params_.cagra_params.metric = parse_metric_type(metric); + index_params_.ivf_pq_build_params->metric = parse_metric_type(metric); + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + + void set_search_dataset(const T* dataset, size_t nrow) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + void search_base(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return handle_.get_sync_stream(); + } + + // to enable dataset access from GPU memory + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHostMmap; + property.query_memory_type = MemoryType::kDevice; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + void save_to_hnswlib(const std::string& file) const; + std::unique_ptr> copy() override; + + private: + // handle_ must go first to make sure it dies last and all memory allocated in pool + configured_raft_resources handle_{}; + raft::mr::cuda_pinned_resource mr_pinned_; + raft::mr::cuda_huge_page_resource mr_huge_page_; + AllocatorType graph_mem_{AllocatorType::kDevice}; + AllocatorType dataset_mem_{AllocatorType::kDevice}; + float refine_ratio_; + build_param index_params_; + bool need_dataset_update_{true}; + cuvs::neighbors::cagra::search_params search_params_; + std::shared_ptr> index_; + int dimension_; + std::shared_ptr> graph_; + std::shared_ptr> 
dataset_; + std::shared_ptr> input_dataset_v_; + + inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type) + { + switch (mem_type) { + case (AllocatorType::kHostPinned): return &mr_pinned_; + case (AllocatorType::kHostHugePage): return &mr_huge_page_; + default: return rmm::mr::get_current_device_resource(); + } + } +}; + +template +void cuvs_cagra::build(const T* dataset, size_t nrow) +{ + auto dataset_view = + raft::make_host_matrix_view(dataset, IdxT(nrow), dimension_); + + auto& params = index_params_.cagra_params; + + if (index_params_.algo == CagraBuildAlgo::kIvfPq) { + auto pq_params = cuvs::neighbors::cagra::graph_build_params::ivf_pq_params( + dataset_view.extents(), params.metric); + if (index_params_.ivf_pq_build_params) { + pq_params.build_params = *index_params_.ivf_pq_build_params; + } + if (index_params_.ivf_pq_search_params) { + pq_params.search_params = *index_params_.ivf_pq_search_params; + } + if (index_params_.ivf_pq_refine_rate) { + pq_params.refinement_rate = *index_params_.ivf_pq_refine_rate; + } + params.graph_build_params = pq_params; + } else if (index_params_.algo == CagraBuildAlgo::kNnDescent) { + auto nn_params = cuvs::neighbors::cagra::graph_build_params::nn_descent_params( + params.intermediate_graph_degree); + if (index_params_.nn_descent_params) { nn_params = *index_params_.nn_descent_params; } + params.graph_build_params = nn_params; + } + index_ = std::make_shared>( + std::move(cuvs::neighbors::cagra::build(handle_, params, dataset_view))); +} + +inline auto allocator_to_string(AllocatorType mem_type) -> std::string +{ + if (mem_type == AllocatorType::kDevice) { + return "device"; + } else if (mem_type == AllocatorType::kHostPinned) { + return "host_pinned"; + } else if (mem_type == AllocatorType::kHostHugePage) { + return "host_huge_page"; + } + return ""; +} + +template +void cuvs_cagra::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + search_params_ = sp.p; + refine_ratio_ = sp.refine_ratio; + if (sp.graph_mem != graph_mem_) { + // Move graph to correct memory space + graph_mem_ = sp.graph_mem; + RAFT_LOG_DEBUG("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str()); + // We create a new graph and copy to it from existing graph + auto mr = get_mr(graph_mem_); + auto new_graph = raft::make_device_mdarray( + handle_, mr, raft::make_extents(index_->graph().extent(0), index_->graph_degree())); + + raft::copy(new_graph.data_handle(), + index_->graph().data_handle(), + index_->graph().size(), + raft::resource::get_cuda_stream(handle_)); + + index_->update_graph(handle_, make_const_mdspan(new_graph.view())); + // update_graph() only stores a view in the index. We need to keep the graph object alive. + *graph_ = std::move(new_graph); + } + + if (sp.dataset_mem != dataset_mem_ || need_dataset_update_) { + dataset_mem_ = sp.dataset_mem; + + // First free up existing memory + *dataset_ = raft::make_device_matrix(handle_, 0, 0); + index_->update_dataset(handle_, make_const_mdspan(dataset_->view())); + + // Allocate space using the correct memory resource. 
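+    // Illustrative note (the sizes below are made up, not taken from the benchmark):
+    // copy_with_padding pads each dataset row out to an aligned pitch, and the strided
+    // view created below exposes only the logical `dim_` columns over that padded row,
+    // e.g.
+    //   logical dim_ = 96, padded extent(1) = 128
+    //   => element (i, j) lives at data_handle()[i * 128 + j]; columns 96..127 are padding.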
+ RAFT_LOG_DEBUG("moving dataset to new memory space: %s", + allocator_to_string(dataset_mem_).c_str()); + + auto mr = get_mr(dataset_mem_); + cuvs::neighbors::cagra::detail::copy_with_padding(handle_, *dataset_, *input_dataset_v_, mr); + + auto dataset_view = raft::make_device_strided_matrix_view( + dataset_->data_handle(), dataset_->extent(0), this->dim_, dataset_->extent(1)); + index_->update_dataset(handle_, dataset_view); + + need_dataset_update_ = false; + } +} + +template +void cuvs_cagra::set_search_dataset(const T* dataset, size_t nrow) +{ + using ds_idx_type = decltype(index_->data().n_rows()); + bool is_vpq = + dynamic_cast*>(&index_->data()) || + dynamic_cast*>(&index_->data()); + // It can happen that we are re-using a previous algo object which already has + // the dataset set. Check if we need update. + if (static_cast(input_dataset_v_->extent(0)) != nrow || + input_dataset_v_->data_handle() != dataset) { + *input_dataset_v_ = raft::make_device_matrix_view(dataset, nrow, this->dim_); + need_dataset_update_ = !is_vpq; // ignore update if this is a VPQ dataset. + } +} + +template +void cuvs_cagra::save(const std::string& file) const +{ + cuvs::neighbors::cagra::serialize(handle_, file, *index_); +} + +template +void cuvs_cagra::save_to_hnswlib(const std::string& file) const +{ + cuvs::neighbors::cagra::serialize_to_hnswlib(handle_, file, *index_); +} + +template +void cuvs_cagra::load(const std::string& file) +{ + index_ = std::make_shared>(handle_); + cuvs::neighbors::cagra::deserialize(handle_, file, index_.get()); +} + +template +std::unique_ptr> cuvs_cagra::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void cuvs_cagra::search_base( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + IdxT* neighbors_idx_t; + std::optional> neighbors_storage{std::nullopt}; + if constexpr (sizeof(IdxT) == sizeof(algo_base::index_type)) { + neighbors_idx_t = reinterpret_cast(neighbors); + } else { + neighbors_storage.emplace(batch_size * k, raft::resource::get_cuda_stream(handle_)); + neighbors_idx_t = neighbors_storage->data(); + } + + auto queries_view = + raft::make_device_matrix_view(queries, batch_size, dimension_); + auto neighbors_view = + raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); + auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); + + cuvs::neighbors::cagra::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + + if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { + raft::linalg::unaryOp(neighbors, + neighbors_idx_t, + batch_size * k, + raft::cast_op(), + raft::resource::get_cuda_stream(handle_)); + } +} + +template +void cuvs_cagra::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + auto k0 = static_cast(refine_ratio_ * k); + const bool disable_refinement = k0 <= static_cast(k); + const raft::resources& res = handle_; + + if (disable_refinement) { + search_base(queries, batch_size, k, neighbors, distances); + } else { + auto queries_v = raft::make_device_matrix_view( + queries, batch_size, dimension_); + auto candidate_ixs = + raft::make_device_matrix(res, batch_size, k0); + auto candidate_dists = + raft::make_device_matrix(res, batch_size, k0); + search_base( + queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle()); + 
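+    // Worked example of this two-stage search (numbers are illustrative): with
+    // refine_ratio = 2 and k = 10, search_base above returns k0 = 20 approximate
+    // candidates per query; refine_helper below recomputes exact distances for those
+    // 20 against the original dataset and keeps the best 10 in neighbors/distances.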
refine_helper( + res, *input_dataset_v_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric()); + } +} +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_flat.cu b/cpp/bench/ann/src/cuvs/cuvs_ivf_flat.cu new file mode 100644 index 000000000..f38e16c9f --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_flat.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_ivf_flat_wrapper.h" + +namespace cuvs::bench { +template class cuvs_ivf_flat; +template class cuvs_ivf_flat; +template class cuvs_ivf_flat; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_flat_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_flat_wrapper.h new file mode 100644 index 000000000..2aaa5a294 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_flat_wrapper.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "../common/ann_types.hpp"
+#include "cuvs_ann_bench_utils.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cuvs::bench {
+
+template
+class cuvs_ivf_flat : public algo, public algo_gpu {
+ public:
+  using search_param_base = typename algo::search_param;
+
+  struct search_param : public search_param_base {
+    cuvs::neighbors::ivf_flat::search_params ivf_flat_params;
+  };
+
+  using build_param = cuvs::neighbors::ivf_flat::index_params;
+
+  cuvs_ivf_flat(Metric metric, int dim, const build_param& param)
+    : algo(metric, dim), index_params_(param), dimension_(dim)
+  {
+    index_params_.metric                         = parse_metric_type(metric);
+    index_params_.conservative_memory_allocation = true;
+    RAFT_CUDA_TRY(cudaGetDevice(&device_));
+  }
+
+  void build(const T* dataset, size_t nrow) final;
+
+  void set_search_param(const search_param_base& param) override;
+
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              algo_base::index_type* neighbors,
+              float* distances) const override;
+
+  [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
+  {
+    return handle_.get_sync_stream();
+  }
+
+  // to enable dataset access from GPU memory
+  [[nodiscard]] auto get_preference() const -> algo_property override
+  {
+    algo_property property;
+    property.dataset_memory_type = MemoryType::kHostMmap;
+    property.query_memory_type   = MemoryType::kDevice;
+    return property;
+  }
+  void save(const std::string& file) const override;
+  void load(const std::string&) override;
+  std::unique_ptr> copy() override;
+
+ private:
+  // handle_ must go first to make sure it dies last and all memory allocated in pool
+  configured_raft_resources handle_{};
+  build_param index_params_;
+  cuvs::neighbors::ivf_flat::search_params search_params_;
+  std::shared_ptr> index_;
+  int device_;
+  int dimension_;
+};
+
+template
+void cuvs_ivf_flat::build(const T* dataset, size_t nrow)
+{
+  index_ = std::make_shared>(
+    std::move(cuvs::neighbors::ivf_flat::build(
+      handle_,
+      index_params_,
+      raft::make_host_matrix_view(dataset, nrow, dimension_))));
+  // Note: internally the IVF-Flat build works with simple pointers, and accepts both
+  // host and device pointers. Therefore, although we provide a host_mdspan here, this
+  // works with a device pointer too.
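+  // How such a dispatch can work (a sketch, not necessarily the actual cuVS
+  // implementation): the residency of a raw pointer is queryable at runtime, e.g.
+  //   cudaPointerAttributes attr;
+  //   cudaPointerGetAttributes(&attr, dataset);
+  //   bool on_device = (attr.type == cudaMemoryTypeDevice);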
+} + +template +void cuvs_ivf_flat::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + search_params_ = sp.ivf_flat_params; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void cuvs_ivf_flat::save(const std::string& file) const +{ + cuvs::neighbors::ivf_flat::serialize(handle_, file, *index_); + return; +} + +template +void cuvs_ivf_flat::load(const std::string& file) +{ + index_ = + std::make_shared>(handle_, index_params_, this->dim_); + + cuvs::neighbors::ivf_flat::deserialize(handle_, file, index_.get()); + return; +} + +template +std::unique_ptr> cuvs_ivf_flat::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void cuvs_ivf_flat::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + IdxT* neighbors_idx_t; + std::optional> neighbors_storage{std::nullopt}; + if constexpr (sizeof(IdxT) == sizeof(algo_base::index_type)) { + neighbors_idx_t = reinterpret_cast(neighbors); + } else { + neighbors_storage.emplace(batch_size * k, raft::resource::get_cuda_stream(handle_)); + neighbors_idx_t = neighbors_storage->data(); + } + cuvs::neighbors::ivf_flat::search( + handle_, + search_params_, + *index_, + raft::make_device_matrix_view(queries, batch_size, index_->dim()), + raft::make_device_matrix_view(neighbors_idx_t, batch_size, k), + raft::make_device_matrix_view(distances, batch_size, k)); + if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { + raft::linalg::unaryOp(neighbors, + neighbors_idx_t, + batch_size * k, + raft::cast_op(), + raft::resource::get_cuda_stream(handle_)); + } +} +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq.cu b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq.cu new file mode 100644 index 000000000..3ffdd4a25 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_ivf_pq_wrapper.h" + +namespace cuvs::bench { +template class cuvs_ivf_pq; +// template class cuvs_ivf_pq; +template class cuvs_ivf_pq; +template class cuvs_ivf_pq; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h new file mode 100644 index 000000000..a305f14ea --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "cuvs_ann_bench_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs::bench { + +template +class cuvs_ivf_pq : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + using algo::dim_; + + struct search_param : public search_param_base { + cuvs::neighbors::ivf_pq::search_params pq_param; + float refine_ratio = 1.0f; + [[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } + }; + + using build_param = cuvs::neighbors::ivf_pq::index_params; + + cuvs_ivf_pq(Metric metric, int dim, const build_param& param) + : algo(metric, dim), index_params_(param), dimension_(dim) + { + index_params_.metric = parse_metric_type(metric); + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + void set_search_dataset(const T* dataset, size_t nrow) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + void search_base(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return handle_.get_sync_stream(); + } + + // to enable dataset access from GPU memory + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kDevice; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + std::unique_ptr> copy() override; + + private: + // handle_ must go first to make sure it dies last and all memory allocated in pool + configured_raft_resources handle_{}; + build_param index_params_; + cuvs::neighbors::ivf_pq::search_params search_params_; + std::shared_ptr> index_; + int dimension_; + float refine_ratio_ = 1.0; + raft::device_matrix_view dataset_; +}; + +template +void cuvs_ivf_pq::save(const std::string& file) const +{ + cuvs::neighbors::ivf_pq::serialize(handle_, file, *index_); +} + +template +void cuvs_ivf_pq::load(const std::string& file) +{ + index_ = std::make_shared>(handle_, index_params_, dim_); + cuvs::neighbors::ivf_pq::deserialize(handle_, file, index_.get()); +} + +template +void cuvs_ivf_pq::build(const T* dataset, size_t nrow) +{ + auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), dim_); + std::make_shared>( + std::move(cuvs::neighbors::ivf_pq::build(handle_, index_params_, dataset_v))) + .swap(index_); +} + +template +std::unique_ptr> cuvs_ivf_pq::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void cuvs_ivf_pq::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + search_params_ = sp.pq_param; + refine_ratio_ = sp.refine_ratio; + 
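+  // refine_ratio > 1 enables a re-ranking pass. For example, with refine_ratio = 4 and
+  // k = 10, search() first fetches k0 = 40 PQ-scored candidates and then re-ranks them
+  // with exact distances against the dataset provided via set_search_dataset(). This is
+  // also why search_param::needs_dataset() returns true only when refine_ratio > 1.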
assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void cuvs_ivf_pq::set_search_dataset(const T* dataset, size_t nrow) +{ + dataset_ = raft::make_device_matrix_view(dataset, nrow, index_->dim()); +} + +template +void cuvs_ivf_pq::search_base( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + IdxT* neighbors_idx_t; + std::optional> neighbors_storage{std::nullopt}; + if constexpr (sizeof(IdxT) == sizeof(algo_base::index_type)) { + neighbors_idx_t = reinterpret_cast(neighbors); + } else { + neighbors_storage.emplace(batch_size * k, raft::resource::get_cuda_stream(handle_)); + neighbors_idx_t = neighbors_storage->data(); + } + + auto queries_view = + raft::make_device_matrix_view(queries, batch_size, dimension_); + auto neighbors_view = + raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); + auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); + + cuvs::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + + if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { + raft::linalg::unaryOp(neighbors, + neighbors_idx_t, + batch_size * k, + raft::cast_op(), + raft::resource::get_cuda_stream(handle_)); + } +} + +template +void cuvs_ivf_pq::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + auto k0 = static_cast(refine_ratio_ * k); + const bool disable_refinement = k0 <= static_cast(k); + const raft::resources& res = handle_; + + if (disable_refinement) { + search_base(queries, batch_size, k, neighbors, distances); + } else { + auto queries_v = raft::make_device_matrix_view( + queries, batch_size, dimension_); + auto candidate_ixs = + raft::make_device_matrix(res, batch_size, k0); + auto candidate_dists = + raft::make_device_matrix(res, batch_size, k0); + search_base( + queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle()); + refine_helper( + res, dataset_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric()); + } +} +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_wrapper.h new file mode 100644 index 000000000..0954e6051 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_wrapper.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "../common/ann_types.hpp"
+#include "cuvs_ann_bench_utils.h"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace raft_temp {
+
+inline auto parse_metric_type(cuvs::bench::Metric metric) -> cuvs::distance::DistanceType
+{
+  switch (metric) {
+    case cuvs::bench::Metric::kInnerProduct: return cuvs::distance::DistanceType::InnerProduct;
+    case cuvs::bench::Metric::kEuclidean: return cuvs::distance::DistanceType::L2Expanded;
+    default: throw std::runtime_error("raft supports only metric type of inner product and L2");
+  }
+}
+}  // namespace raft_temp
+
+namespace cuvs::bench {
+
+// brute force KNN - RAFT
+template
+class cuvs_gpu : public algo, public algo_gpu {
+ public:
+  using search_param_base = typename algo::search_param;
+
+  struct search_param : public search_param_base {
+    [[nodiscard]] auto needs_dataset() const -> bool override { return true; }
+  };
+
+  cuvs_gpu(Metric metric, int dim);
+
+  void build(const T*, size_t) final;
+
+  void set_search_param(const search_param_base& param) override;
+
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              algo_base::index_type* neighbors,
+              float* distances) const final;
+
+  // to enable dataset access from GPU memory
+  [[nodiscard]] auto get_preference() const -> algo_property override
+  {
+    algo_property property;
+    property.dataset_memory_type = MemoryType::kDevice;
+    property.query_memory_type   = MemoryType::kDevice;
+    return property;
+  }
+  [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
+  {
+    return handle_.get_sync_stream();
+  }
+  void set_search_dataset(const T* dataset, size_t nrow) override;
+  void save(const std::string& file) const override;
+  void load(const std::string&) override;
+  std::unique_ptr> copy() override;
+
+ protected:
+  // handle_ must go first to make sure it dies last and all memory allocated in pool
+  configured_raft_resources handle_{};
+  std::shared_ptr> index_;
+  cuvs::distance::DistanceType metric_type_;
+  int device_;
+  const T* dataset_;
+  size_t nrow_;
+};
+
+template
+cuvs_gpu::cuvs_gpu(Metric metric, int dim)
+  : algo(metric, dim), metric_type_(raft_temp::parse_metric_type(metric))
+{
+  static_assert(std::is_same_v || std::is_same_v,
+                "raft bfknn only supports float/double");
+  RAFT_CUDA_TRY(cudaGetDevice(&device_));
+}
+
+template
+void cuvs_gpu::build(const T* dataset, size_t nrow)
+{
+  auto dataset_view = raft::make_device_matrix_view(dataset, nrow, this->dim_);
+  index_ = std::make_shared>(
+    std::move(cuvs::neighbors::brute_force::build(handle_, dataset_view, metric_type_)));
+}
+
+template
+void cuvs_gpu::set_search_param(const search_param_base&)
+{
+  // Nothing to set here, as this is a brute-force implementation.
+}
+
+template
+void cuvs_gpu::set_search_dataset(const T* dataset, size_t nrow)
+{
+  dataset_ = dataset;
+  nrow_    = nrow;
+  // Wrap the dataset with an index.
+  auto dataset_view = raft::make_device_matrix_view(dataset, nrow, this->dim_);
+  index_ = std::make_shared>(
+    std::move(cuvs::neighbors::brute_force::build(handle_, dataset_view, metric_type_)));
+}
+
+template
+void cuvs_gpu::save(const std::string& file) const
+{
+  // The index is just the dataset with metadata (shape). The dataset already exists on
+  // disk, therefore we do not need to save it here.
+  // We create an empty file because the benchmark logic requires an index file to be created.
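+  // (Equivalent to `touch <file>`: the ofstream below creates the file if it is
+  // missing and truncates it otherwise; no index bytes are written.)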
+  std::ofstream of(file);
+  of.close();
+}
+
+template
+void cuvs_gpu::load(const std::string& file)
+{
+  // We do not have serialization of the brute force index. We simply wrap the
+  // dataset into a brute force index, as is done in set_search_dataset.
+}
+
+template
+void cuvs_gpu::search(
+  const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const
+{
+  auto queries_view =
+    raft::make_device_matrix_view(queries, batch_size, this->dim_);
+
+  auto neighbors_view =
+    raft::make_device_matrix_view(neighbors, batch_size, k);
+  auto distances_view = raft::make_device_matrix_view(distances, batch_size, k);
+
+  cuvs::neighbors::brute_force::search(
+    handle_, *index_, queries_view, neighbors_view, distances_view, std::nullopt);
+}
+
+template
+std::unique_ptr> cuvs_gpu::copy()
+{
+  return std::make_unique>(*this);  // use copy constructor
+}
+
+}  // namespace cuvs::bench
diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp
new file mode 100644
index 000000000..354a9b291
--- /dev/null
+++ b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "../common/ann_types.hpp" +#include "faiss_cpu_wrapper.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +void parse_base_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu::build_param& param) +{ + param.nlist = conf.at("nlist"); + if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu_ivf_flat::build_param& param) +{ + parse_base_build_param(conf, param); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu_ivfpq::build_param& param) +{ + parse_base_build_param(conf, param); + param.m = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.use_precomputed = conf.at("usePrecomputed"); + } else { + param.use_precomputed = false; + } + if (conf.contains("bitsPerCode")) { + param.bits_per_code = conf.at("bitsPerCode"); + } else { + param.bits_per_code = 8; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu_ivfsq::build_param& param) +{ + parse_base_build_param(conf, param); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu::search_param& param) +{ + param.nprobe = conf.at("nprobe"); + if (conf.contains("refine_ratio")) { param.refine_ratio = conf.at("refine_ratio"); } + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template class Algo> +auto make_algo(cuvs::bench::Metric metric, int dim, const nlohmann::json& conf) + -> std::unique_ptr> +{ + typename Algo::build_param param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template +auto create_algo(const std::string& algo_name, + const std::string& distance, + int dim, + const nlohmann::json& conf) -> std::unique_ptr> +{ + std::unique_ptr> a; + + if constexpr (std::is_same_v) { + cuvs::bench::Metric metric = parse_metric(distance); + if (algo_name == "faiss_cpu_ivf_flat") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_cpu_ivf_pq") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_cpu_ivf_sq") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_cpu_flat") { + a = std::make_unique>(metric, dim); + } + } + + if constexpr (std::is_same_v) {} + + if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); } + + return a; +} + +template +auto create_search_param(const std::string& algo_name, const nlohmann::json& conf) + -> std::unique_ptr::search_param> +{ + if (algo_name == "faiss_cpu_ivf_flat" || algo_name == "faiss_cpu_ivf_pq" || + algo_name == "faiss_cpu_ivf_sq") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } else if (algo_name == "faiss_cpu_flat") { + auto param = std::make_unique::search_param>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo_name + "'"); +} + +} // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h new file mode 100644 
index 000000000..0cc40de37 --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/thread_pool.hpp" +#include "../common/util.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +auto parse_metric_type(cuvs::bench::Metric metric) -> faiss::MetricType +{ + if (metric == cuvs::bench::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == cuvs::bench::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} +} // namespace + +namespace cuvs::bench { + +template +class faiss_cpu : public algo { + public: + using search_param_base = typename algo::search_param; + struct search_param : public search_param_base { + int nprobe; + float refine_ratio = 1.0; + int num_threads = omp_get_num_procs(); + }; + + struct build_param { + int nlist = 1; + int ratio = 2; + }; + + faiss_cpu(Metric metric, int dim, const build_param& param) + : algo(metric, dim), + metric_type_(parse_metric_type(metric)), + nlist_{param.nlist}, + training_sample_fraction_{1.0 / double(param.ratio)} + { + static_assert(std::is_same_v, "faiss support only float type"); + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + + void init_quantizer(int dim) + { + if (this->metric_type_ == faiss::MetricType::METRIC_L2) { + this->quantizer_ = std::make_shared(dim); + } else if (this->metric_type_ == faiss::MetricType::METRIC_INNER_PRODUCT) { + this->quantizer_ = std::make_shared(dim); + } + } + + // TODO(snanditale): if the number of results is less than k, the remaining elements of + // 'neighbors' will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const final; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + // to enable building big dataset which is larger than memory + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + protected: + template + void save_(const std::string& file) const; // NOLINT + + template + void load_(const std::string& file); // NOLINT + + std::shared_ptr index_; + std::shared_ptr quantizer_; + std::shared_ptr index_refine_; + faiss::MetricType metric_type_; + int nlist_; + double training_sample_fraction_; + + int num_threads_; + std::shared_ptr thread_pool_; +}; + +template +void faiss_cpu::build(const T* dataset, size_t nrow) +{ + auto index_ivf = dynamic_cast(index_.get()); + if (index_ivf != nullptr) { + // set the min/max training size for clustering to use the whole provided training 
set. + double trainset_size = training_sample_fraction_ * static_cast(nrow); + double points_per_centroid = trainset_size / static_cast(nlist_); + int max_ppc = std::ceil(points_per_centroid); + int min_ppc = std::floor(points_per_centroid); + if (min_ppc < index_ivf->cp.min_points_per_centroid) { + log_warn( + "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d " + "points per cluster (n_lists = %d). This is smaller than the FAISS default " + "min_points_per_centroid = %d.", + static_cast(trainset_size), + nrow, + training_sample_fraction_, + min_ppc, + nlist_, + index_ivf->cp.min_points_per_centroid); + } + index_ivf->cp.max_points_per_centroid = max_ppc; + index_ivf->cp.min_points_per_centroid = min_ppc; + } + index_->train(nrow, dataset); // faiss::IndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); + index_refine_ = std::make_shared(this->index_.get(), dataset); +} + +template +void faiss_cpu::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + int nprobe = sp.nprobe; + assert(nprobe <= nlist_); + dynamic_cast(index_.get())->nprobe = nprobe; + + if (sp.refine_ratio > 1.0) { this->index_refine_.get()->k_factor = sp.refine_ratio; } + + if (!thread_pool_ || num_threads_ != sp.num_threads) { + num_threads_ = sp.num_threads; + thread_pool_ = std::make_shared(num_threads_); + } +} + +template +void faiss_cpu::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::idx_t), + "sizes of size_t and faiss::idx_t are different"); + + thread_pool_->submit( + [&](int i) { + // Use thread pool for batch size = 1. FAISS multi-threads internally for batch size > 1. 
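      // (Clarifying note: the static_assert above is what makes the reinterpret_cast in
      // the call below safe; faiss::idx_t is int64_t, so FAISS writes its ids directly
      // into the benchmark's size_t-typed neighbor buffer with no conversion step.)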
+ index_->search(batch_size, queries, k, distances, reinterpret_cast(neighbors)); + }, + 1); +} + +template +template +void faiss_cpu::save_(const std::string& file) const +{ + faiss::write_index(index_.get(), file.c_str()); +} + +template +template +void faiss_cpu::load_(const std::string& file) +{ + index_ = std::shared_ptr(dynamic_cast(faiss::read_index(file.c_str()))); +} + +template +class faiss_cpu_ivf_flat : public faiss_cpu { + public: + using typename faiss_cpu::build_param; + + faiss_cpu_ivf_flat(Metric metric, int dim, const build_param& param) + : faiss_cpu(metric, dim, param) + { + this->init_quantizer(dim); + this->index_ = std::make_shared( + this->quantizer_.get(), dim, param.nlist, this->metric_type_); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } +}; + +template +class faiss_cpu_ivfpq : public faiss_cpu { + public: + struct build_param : public faiss_cpu::build_param { + int m; + int bits_per_code; + bool use_precomputed; + }; + + faiss_cpu_ivfpq(Metric metric, int dim, const build_param& param) + : faiss_cpu(metric, dim, param) + { + this->init_quantizer(dim); + this->index_ = std::make_shared( + this->quantizer_.get(), dim, param.nlist, param.m, param.bits_per_code, this->metric_type_); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } +}; + +// TODO(snanditale): Enable this in cmake +// ref: https://github.com/rapidsai/raft/issues/1876 +template +class faiss_cpu_ivfsq : public faiss_cpu { + public: + struct build_param : public faiss_cpu::build_param { + std::string quantizer_type; + }; + + faiss_cpu_ivfsq(Metric metric, int dim, const build_param& param) + : faiss_cpu(metric, dim, param) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if (param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + throw std::runtime_error("faiss_cpu_ivfsq supports only fp16 and int8 but got " + + param.quantizer_type); + } + + this->init_quantizer(dim); + this->index_ = std::make_shared( + this->quantizer_.get(), dim, param.nlist, qtype, this->metric_type_, true); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } +}; + +template +class faiss_cpu_flat : public faiss_cpu { + public: + faiss_cpu_flat(Metric metric, int dim) + : faiss_cpu(metric, dim, typename faiss_cpu::build_param{}) + { + this->index_ = std::make_shared(dim, this->metric_type_); + } + + // class faiss_cpu is more like a IVF class, so need special treating here + void set_search_param(const typename algo::search_param& param) override + { + auto search_param = dynamic_cast::search_param&>(param); + if (!this->thread_pool_ || this->num_threads_ != search_param.num_threads) { + this->num_threads_ = search_param.num_threads; + this->thread_pool_ = std::make_shared(this->num_threads_); + } + }; + + void save(const std::string& file) const override 
+ { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } +}; + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu new file mode 100644 index 000000000..2d9271639 --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../common/ann_types.hpp" + +#undef WARP_SIZE +#include "faiss_gpu_wrapper.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +void parse_base_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu::build_param& param) +{ + param.nlist = conf.at("nlist"); + if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu_ivf_flat::build_param& param) +{ + parse_base_build_param(conf, param); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu_ivfpq::build_param& param) +{ + parse_base_build_param(conf, param); + param.m = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.use_precomputed = conf.at("usePrecomputed"); + } else { + param.use_precomputed = false; + } + if (conf.contains("useFloat16")) { + param.use_float16 = conf.at("useFloat16"); + } else { + param.use_float16 = false; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu_ivfsq::build_param& param) +{ + parse_base_build_param(conf, param); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu::search_param& param) +{ + param.nprobe = conf.at("nprobe"); + if (conf.contains("refine_ratio")) { param.refine_ratio = conf.at("refine_ratio"); } +} + +template class Algo> +auto make_algo(cuvs::bench::Metric metric, int dim, const nlohmann::json& conf) + -> std::unique_ptr> +{ + typename Algo::build_param param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template +auto create_algo(const std::string& algo_name, + const std::string& distance, + int dim, + const nlohmann::json& conf) -> std::unique_ptr> +{ + std::unique_ptr> a; + + if constexpr (std::is_same_v) { + cuvs::bench::Metric metric = parse_metric(distance); + if (algo_name == "faiss_gpu_ivf_flat") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_gpu_ivf_pq") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_gpu_ivf_sq") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_gpu_flat") { + a = std::make_unique>(metric, dim); + } + } + + if (!a) { throw std::runtime_error("invalid algo: '" 
+ algo_name + "'"); } + + return a; +} + +template +auto create_search_param(const std::string& algo_name, const nlohmann::json& conf) + -> std::unique_ptr::search_param> +{ + if (algo_name == "faiss_gpu_ivf_flat" || algo_name == "faiss_gpu_ivf_pq" || + algo_name == "faiss_gpu_ivf_sq") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } else if (algo_name == "faiss_gpu_flat") { + auto param = std::make_unique::search_param>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo_name + "'"); +} + +} // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h new file mode 100644 index 000000000..f935e365f --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/util.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +auto parse_metric_type(cuvs::bench::Metric metric) -> faiss::MetricType +{ + if (metric == cuvs::bench::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == cuvs::bench::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} + +// note BLAS library can still use multi-threading, and +// setting environment variable like OPENBLAS_NUM_THREADS can control it +class omp_single_thread_scope { + public: + omp_single_thread_scope() + { + max_threads_ = omp_get_max_threads(); + omp_set_num_threads(1); + } + ~omp_single_thread_scope() + { + // the best we can do + omp_set_num_threads(max_threads_); + } + + private: + int max_threads_; +}; + +} // namespace + +namespace cuvs::bench { + +template +class faiss_gpu : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + struct search_param : public search_param_base { + int nprobe; + float refine_ratio = 1.0; + [[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } + }; + + struct build_param { + int nlist = 1; + int ratio = 2; + }; + + faiss_gpu(Metric metric, int dim, const build_param& param) + : algo(metric, dim), + gpu_resource_{std::make_shared()}, + metric_type_(parse_metric_type(metric)), + nlist_{param.nlist}, + training_sample_fraction_{1.0 / double(param.ratio)} + { + static_assert(std::is_same_v, "faiss support only float type"); + 
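    // (Note: the device current at construction time is recorded here; get_sync_stream()
    // below returns gpu_resource_->getDefaultStream(device_), so all benchmark-side
    // stream synchronization is tied to this same device for the index's lifetime.)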
cudaGetDevice(&device_); + } + + void build(const T* dataset, size_t nrow) final; + + virtual void set_search_param(const search_param_base& param) {} + + void set_search_dataset(const T* dataset, size_t nrow) override { dataset_ = dataset; } + + // TODO(snanditale): if the number of results is less than k, the remaining elements of + // 'neighbors' will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const final; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return gpu_resource_->getDefaultStream(device_); + } + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + // to enable building big dataset which is larger than GPU memory + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + protected: + template + void save_(const std::string& file) const; // NOLINT + + template + void load_(const std::string& file); // NOLINT + + /** [NOTE Multithreading] + * + * `gpu_resource_` is a shared resource: + * 1. It uses a shared_ptr under the hood, so the copies of it refer to the same + * resource implementation instance + * 2. GpuIndex is probably keeping a reference to it, as it's passed to the constructor + * + * To avoid copying the index (database) in each thread, we make both the index and + * the gpu_resource shared. + * This means faiss GPU streams are possibly shared among the CPU threads; + * the throughput search mode may be inaccurate. + * + * WARNING: we haven't investigated whether faiss::gpu::GpuIndex or + * faiss::gpu::StandardGpuResources are thread-safe. + * + */ + mutable std::shared_ptr gpu_resource_; + std::shared_ptr index_; + std::shared_ptr index_refine_{nullptr}; + faiss::MetricType metric_type_; + int nlist_; + int device_; + double training_sample_fraction_; + std::shared_ptr search_params_; + const T* dataset_; + float refine_ratio_ = 1.0; +}; + +template +void faiss_gpu::build(const T* dataset, size_t nrow) +{ + omp_single_thread_scope omp_single_thread; + auto index_ivf = dynamic_cast(index_.get()); + if (index_ivf != nullptr) { + // set the min/max training size for clustering to use the whole provided training set. + double trainset_size = training_sample_fraction_ * static_cast(nrow); + double points_per_centroid = trainset_size / static_cast(nlist_); + int max_ppc = std::ceil(points_per_centroid); + int min_ppc = std::floor(points_per_centroid); + if (min_ppc < index_ivf->cp.min_points_per_centroid) { + log_warn( + "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d " + "points per cluster (n_lists = %d). 
This is smaller than the FAISS default " + "min_points_per_centroid = %d.", + static_cast(trainset_size), + nrow, + training_sample_fraction_, + min_ppc, + nlist_, + index_ivf->cp.min_points_per_centroid); + } + index_ivf->cp.max_points_per_centroid = max_ppc; + index_ivf->cp.min_points_per_centroid = min_ppc; + } + index_->train(nrow, dataset); // faiss::gpu::GpuIndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); +} + +template +void faiss_gpu::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::idx_t), + "sizes of size_t and faiss::idx_t are different"); + + if (this->refine_ratio_ > 1.0) { + // TODO(snanditale): FAISS changed their search APIs to accept the search parameters as a struct + // object but their refine API doesn't allow the struct to be passed in. Once this is fixed, we + // need to re-enable refinement below + // index_refine_->search(batch_size, queries, k, distances, + // reinterpret_cast(neighbors), this->search_params_.get()); Related FAISS issue: + // https://github.com/facebookresearch/faiss/issues/3118 + throw std::runtime_error( + "FAISS doesn't support refinement in their new APIs so this feature is disabled in the " + "benchmarks for the time being."); + } else { + index_->search(batch_size, + queries, + k, + distances, + reinterpret_cast(neighbors), + this->search_params_.get()); + } +} + +template +template +void faiss_gpu::save_(const std::string& file) const +{ + omp_single_thread_scope omp_single_thread; + + auto cpu_index = std::make_unique(); + dynamic_cast(index_.get())->copyTo(cpu_index.get()); + faiss::write_index(cpu_index.get(), file.c_str()); +} + +template +template +void faiss_gpu::load_(const std::string& file) +{ + omp_single_thread_scope omp_single_thread; + + std::unique_ptr cpu_index(dynamic_cast(faiss::read_index(file.c_str()))); + assert(cpu_index); + + try { + dynamic_cast(index_.get())->copyFrom(cpu_index.get()); + + } catch (const std::exception& e) { + std::cout << "Error loading index file: " << std::string(e.what()) << std::endl; + } +} + +template +class faiss_gpu_ivf_flat : public faiss_gpu { + public: + using typename faiss_gpu::build_param; + using typename faiss_gpu::search_param_base; + + faiss_gpu_ivf_flat(Metric metric, int dim, const build_param& param) + : faiss_gpu(metric, dim, param) + { + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = this->device_; + this->index_ = std::make_shared( + this->gpu_resource_.get(), dim, param.nlist, this->metric_type_, config); + } + + void set_search_param(const search_param_base& param) override + { + auto sp = dynamic_cast::search_param&>(param); + int nprobe = sp.nprobe; + assert(nprobe <= this->nlist_); + + faiss::IVFSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + this->search_params_ = std::make_shared(faiss_search_params); + this->refine_ratio_ = sp.refine_ratio; + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } + std::unique_ptr> copy() override + { + return std::make_unique>(*this); + }; +}; + +template +class faiss_gpu_ivfpq : public faiss_gpu { + public: + struct build_param : public faiss_gpu::build_param { + int m; + bool use_float16; + bool use_precomputed; + }; + using typename faiss_gpu::search_param_base; + + faiss_gpu_ivfpq(Metric metric, int dim, const 
build_param& param) + : faiss_gpu(metric, dim, param) + { + faiss::gpu::GpuIndexIVFPQConfig config; + config.useFloat16LookupTables = param.use_float16; + config.usePrecomputedTables = param.use_precomputed; + config.device = this->device_; + + this->index_ = + std::make_shared(this->gpu_resource_.get(), + dim, + param.nlist, + param.m, + 8, // FAISS only supports bitsPerCode=8 + this->metric_type_, + config); + } + + void set_search_param(const search_param_base& param) override + { + auto sp = dynamic_cast::search_param&>(param); + int nprobe = sp.nprobe; + assert(nprobe <= this->nlist_); + this->refine_ratio_ = sp.refine_ratio; + faiss::IVFPQSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + + this->search_params_ = std::make_shared(faiss_search_params); + + if (sp.refine_ratio > 1.0) { + this->index_refine_ = + std::make_shared(this->index_.get(), this->dataset_); + this->index_refine_.get()->k_factor = sp.refine_ratio; + } + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; +}; + +// TODO(snanditale): Enable this in cmake +// ref: https://github.com/rapidsai/raft/issues/1876 +template +class faiss_gpu_ivfsq : public faiss_gpu { + public: + struct build_param : public faiss_gpu::build_param { + std::string quantizer_type; + }; + using typename faiss_gpu::search_param_base; + + faiss_gpu_ivfsq(Metric metric, int dim, const build_param& param) + : faiss_gpu(metric, dim, param) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if (param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + throw std::runtime_error("faiss_gpu_ivfsq supports only fp16 and int8 but got " + + param.quantizer_type); + } + + faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; + config.device = this->device_; + this->index_ = std::make_shared( + this->gpu_resource_.get(), dim, param.nlist, qtype, this->metric_type_, true, config); + } + + void set_search_param(const search_param_base& param) override + { + auto sp = dynamic_cast::search_param&>(param); + int nprobe = sp.nprobe; + assert(nprobe <= this->nlist_); + + faiss::IVFSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + + this->search_params_ = std::make_shared(faiss_search_params); + this->refine_ratio_ = sp.refine_ratio; + if (sp.refine_ratio > 1.0) { + this->index_refine_ = + std::make_shared(this->index_.get(), this->dataset_); + this->index_refine_.get()->k_factor = sp.refine_ratio; + } + } + + void save(const std::string& file) const override + { + this->template save_( + file); + } + void load(const std::string& file) override + { + this->template load_( + file); + } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; +}; + +template +class faiss_gpu_flat : public faiss_gpu { + public: + using typename faiss_gpu::search_param_base; + + faiss_gpu_flat(Metric metric, int dim) + : faiss_gpu(metric, dim, typename faiss_gpu::build_param{}) + { + faiss::gpu::GpuIndexFlatConfig config; + config.device = this->device_; + this->index_ = std::make_shared( + this->gpu_resource_.get(), dim, this->metric_type_, config); + } + void set_search_param(const search_param_base& param) override + { + auto sp = dynamic_cast::search_param&>(param); + int nprobe = sp.nprobe; + 
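    // (Note: nprobe is the number of coarse cells scanned per query and may not exceed
    // the nlist_ cells the index was built with, which the assert below enforces. When
    // refine_ratio > 1, the IndexRefineFlat configured below fetches refine_ratio * k
    // candidates and re-ranks them exactly, trading extra distance computations for recall.)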
assert(nprobe <= this->nlist_); + + this->search_params_ = std::make_shared(); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; +}; + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu new file mode 100644 index 000000000..6b1a1eaf7 --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../common/ann_types.hpp" +#include "ggnn_wrapper.cuh" + +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::ggnn::build_param& param) +{ + param.k = conf.at("k"); + + if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } + if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } + if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } + if (conf.contains("tau")) { param.tau = conf.at("tau"); } + if (conf.contains("refine_iterations")) { + param.refine_iterations = conf.at("refine_iterations"); + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::ggnn::search_param& param) +{ + param.tau = conf.at("tau"); + + if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } + if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } +} + +template class Algo> +auto make_algo(cuvs::bench::Metric metric, int dim, const nlohmann::json& conf) + -> std::unique_ptr> +{ + typename Algo::build_param param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template +auto create_algo(const std::string& algo_name, + const std::string& distance, + int dim, + const nlohmann::json& conf) -> std::unique_ptr> +{ + cuvs::bench::Metric metric = parse_metric(distance); + std::unique_ptr> a; + + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "ggnn") { a = make_algo(metric, dim, conf); } + } + if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); } + + return a; +} + +template +auto create_search_param(const std::string& algo_name, const nlohmann::json& conf) + -> std::unique_ptr::search_param> +{ + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "ggnn") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } + } + // else + throw std::runtime_error("invalid algo: '" + 
algo_name + "'"); +} + +} // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh new file mode 100644 index 000000000..e2ca18e22 --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/util.hpp" + +// #include + +#include + +#include +#include + +namespace cuvs::bench { + +template +class ggnn_impl; + +template +class ggnn : public algo, public algo_gpu { + public: + struct build_param { + int k_build{24}; // KBuild + int segment_size{32}; // S + int num_layers{4}; // L + float tau{0.5}; + int refine_iterations{2}; + int k; // GGNN requires to know k during building + }; + + using search_param_base = typename algo::search_param; + struct search_param : public search_param_base { + float tau; + int block_dim{32}; + int max_iterations{400}; + int cache_size{512}; + int sorted_size{256}; + [[nodiscard]] auto needs_dataset() const -> bool override { return true; } + }; + + ggnn(Metric metric, int dim, const build_param& param); + + void build(const T* dataset, size_t nrow) override { impl_->build(dataset, nrow); } + + void set_search_param(const search_param_base& param) override { impl_->set_search_param(param); } + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override + { + impl_->search(queries, batch_size, k, neighbors, distances); + } + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return dynamic_cast(impl_.get())->get_sync_stream(); + } + + void save(const std::string& file) const override { impl_->save(file); } + void load(const std::string& file) override { impl_->load(file); } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + return impl_->get_preference(); + } + + void set_search_dataset(const T* dataset, size_t nrow) override + { + impl_->set_search_dataset(dataset, nrow); + }; + + private: + std::shared_ptr> impl_; +}; + +template +ggnn::ggnn(Metric metric, int dim, const build_param& param) : algo(metric, dim) +{ + // ggnn/src/sift1m.cu + if (metric == Metric::kEuclidean && dim == 128 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = std::make_shared>(metric, dim, param); + } + // ggnn/src/deep1b_multi_gpu.cu, and adapt it deep1B + else if (metric == Metric::kEuclidean && dim == 96 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = std::make_shared>(metric, dim, param); + } else 
if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = std::make_shared>(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = std::make_shared>(metric, dim, param); + } + // ggnn/src/glove200.cu, adapt it to glove100 + else if (metric == Metric::kInnerProduct && dim == 100 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = std::make_shared>(metric, dim, param); + } else { + throw std::runtime_error( + "ggnn: not supported combination of metric, dim and build param; " + "see Ggnn's constructor in ggnn_wrapper.cuh for available combinations"); + } +} + +template +class ggnn_impl : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + + ggnn_impl(Metric metric, int dim, const typename ggnn::build_param& param); + + void build(const T* dataset, size_t nrow) override; + + void set_search_param(const search_param_base& param) override; + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { return stream_; } + + void save(const std::string& file) const override; + void load(const std::string& file) override; + std::unique_ptr> copy() override + { + auto r = std::make_unique>(*this); + // set the thread-local stream to the copied handle. + r->stream_ = cuvs::bench::get_stream_from_global_pool(); + return r; + }; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kDevice; + property.query_memory_type = MemoryType::kDevice; + return property; + } + + void set_search_dataset(const T* dataset, size_t nrow) override; + + private: + using algo::metric_; + using algo::dim_; + + using ggnngpu_instance = GGNNGPUInstance; + std::shared_ptr ggnn_; + typename ggnn::build_param build_param_; + typename ggnn::search_param search_param_; + cudaStream_t stream_; + const T* base_dataset_ = nullptr; + size_t base_n_rows_ = 0; + std::optional graph_file_ = std::nullopt; + + void load_impl() + { + if (base_dataset_ == nullptr) { return; } + if (base_n_rows_ == 0) { return; } + int device; + cudaGetDevice(&device); + ggnn_ = std::make_shared( + device, base_n_rows_, build_param_.num_layers, true, build_param_.tau); + ggnn_->set_base_data(base_dataset_); + ggnn_->set_stream(get_sync_stream()); + if (graph_file_.has_value()) { + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(get_sync_stream()); + + ggnn_host.load(graph_file_.value()); + ggnn_host.uploadAsync(ggnn_device); + cudaStreamSynchronize(ggnn_device.stream); + } + } +}; + +template +ggnn_impl::ggnn_impl(Metric metric, + int dim, + const typename ggnn::build_param& param) + : algo(metric, dim), build_param_(param), stream_(cuvs::bench::get_stream_from_global_pool()) +{ + if (metric_ == Metric::kInnerProduct) { + if (measure != Cosine) { throw std::runtime_error("mis-matched metric"); } + } else if (metric_ == Metric::kEuclidean) { + if (measure != Euclidean) { throw std::runtime_error("mis-matched metric"); } + } else { + throw std::runtime_error( + "ggnn supports only metric type of InnerProduct, Cosine and Euclidean"); + } + + if (dim != D) { throw std::runtime_error("mis-matched 
dim"); } +} + +template +void ggnn_impl::build(const T* dataset, size_t nrow) +{ + base_dataset_ = dataset; + base_n_rows_ = nrow; + graph_file_ = std::nullopt; + load_impl(); + ggnn_->build(0); + for (int i = 0; i < build_param_.refine_iterations; ++i) { + ggnn_->refine(); + } +} + +template +void ggnn_impl::set_search_dataset(const T* dataset, size_t nrow) +{ + if (base_dataset_ != dataset || base_n_rows_ != nrow) { + base_dataset_ = dataset; + base_n_rows_ = nrow; + load_impl(); + } +} + +template +void ggnn_impl::set_search_param(const search_param_base& param) +{ + search_param_ = dynamic_cast::search_param&>(param); +} + +template +void ggnn_impl::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "sizes of size_t and GGNN's KeyT are different"); + if (k != KQuery) { + throw std::runtime_error( + "k = " + std::to_string(k) + + ", but this GGNN instance only supports k = " + std::to_string(KQuery)); + } + + ggnn_->set_stream(get_sync_stream()); + cudaMemcpyToSymbol(c_tau_query, &search_param_.tau, sizeof(float)); + + const int block_dim = search_param_.block_dim; + const int max_iterations = search_param_.max_iterations; + const int cache_size = search_param_.cache_size; + const int sorted_size = search_param_.sorted_size; + // default value + if (block_dim == 32 && max_iterations == 400 && cache_size == 512 && sorted_size == 256) { + ggnn_->template queryLayer<32, 400, 512, 256, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 200 && cache_size == 256 && sorted_size == 64) { + ggnn_->template queryLayer<32, 200, 256, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 400 && cache_size == 448 && sorted_size == 64) { + ggnn_->template queryLayer<32, 400, 448, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/glove200.cu + else if (block_dim == 128 && max_iterations == 2000 && cache_size == 2048 && sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 2048, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // for glove100 + else if (block_dim == 64 && max_iterations == 400 && cache_size == 512 && sorted_size == 32) { + ggnn_->template queryLayer<64, 400, 512, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else if (block_dim == 128 && max_iterations == 2000 && cache_size == 1024 && + sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 1024, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else { + throw std::runtime_error("ggnn: not supported search param"); + } +} + +template +void ggnn_impl::save(const std::string& file) const +{ + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(get_sync_stream()); + + ggnn_host.downloadAsync(ggnn_device); + cudaStreamSynchronize(ggnn_device.stream); + ggnn_host.store(file); +} + +template +void ggnn_impl::load(const std::string& file) +{ + if (!graph_file_.has_value() || graph_file_.value() != file) { + graph_file_ = file; + load_impl(); + } +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp new file mode 100644 index 
000000000..755c7c8d6
--- /dev/null
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../common/ann_types.hpp"
+#include "hnswlib_wrapper.h"
+
+#include <nlohmann/json.hpp>
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+namespace cuvs::bench {
+
+template <typename T>
+void parse_build_param(const nlohmann::json& conf,
+                       typename cuvs::bench::hnsw_lib<T>::build_param& param)
+{
+  param.ef_construction = conf.at("efConstruction");
+  param.m = conf.at("M");
+  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+}
+
+template <typename T>
+void parse_search_param(const nlohmann::json& conf,
+                        typename cuvs::bench::hnsw_lib<T>::search_param& param)
+{
+  param.ef = conf.at("ef");
+  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+}
+
+template <typename T, template <typename> class Algo>
+auto make_algo(cuvs::bench::Metric metric, int dim, const nlohmann::json& conf)
+  -> std::unique_ptr<cuvs::bench::algo<T>>
+{
+  typename Algo<T>::build_param param;
+  parse_build_param<T>(conf, param);
+  return std::make_unique<Algo<T>>(metric, dim, param);
+}
+
+template <typename T>
+auto create_algo(const std::string& algo_name,
+                 const std::string& distance,
+                 int dim,
+                 const nlohmann::json& conf) -> std::unique_ptr<cuvs::bench::algo<T>>
+{
+  cuvs::bench::Metric metric = parse_metric(distance);
+  std::unique_ptr<cuvs::bench::algo<T>> a;
+
+  if constexpr (std::is_same_v<T, float>) {
+    if (algo_name == "hnswlib") { a = make_algo<T, cuvs::bench::hnsw_lib>(metric, dim, conf); }
+  }
+
+  if constexpr (std::is_same_v<T, std::uint8_t>) {
+    if (algo_name == "hnswlib") { a = make_algo<T, cuvs::bench::hnsw_lib>(metric, dim, conf); }
+  }
+
+  if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); }
+  return a;
+}
+
+template <typename T>
+auto create_search_param(const std::string& algo_name, const nlohmann::json& conf)
+  -> std::unique_ptr<typename cuvs::bench::algo<T>::search_param>
+{
+  if (algo_name == "hnswlib") {
+    auto param = std::make_unique<typename cuvs::bench::hnsw_lib<T>::search_param>();
+    parse_search_param<T>(conf, *param);
+    return param;
+  }
+  // else
+  throw std::runtime_error("invalid algo: '" + algo_name + "'");
+}
+
+};  // namespace cuvs::bench
+
+REGISTER_ALGO_INSTANCE(float);
+REGISTER_ALGO_INSTANCE(std::int8_t);
+REGISTER_ALGO_INSTANCE(std::uint8_t);
+
+#ifdef ANN_BENCH_BUILD_MAIN
+#include "../common/benchmark.hpp"
+int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); }
+#endif
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
new file mode 100644
index 000000000..9d643f12a
--- /dev/null
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/thread_pool.hpp" +#include "../common/util.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +struct hnsw_dist_t { + using type = void; +}; + +template <> +struct hnsw_dist_t { + using type = float; +}; + +template <> +struct hnsw_dist_t { + using type = int; +}; + +template <> +struct hnsw_dist_t { + using type = int; +}; + +template +class hnsw_lib : public algo { + public: + // https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md + struct build_param { + int m; + int ef_construction; + int num_threads = omp_get_num_procs(); + }; + + using search_param_base = typename algo::search_param; + struct search_param : public search_param_base { + int ef; + int num_threads = 1; + }; + + hnsw_lib(Metric metric, int dim, const build_param& param); + + void build(const T* dataset, size_t nrow) override; + + void set_search_param(const search_param_base& param) override; + void search(const T* query, + int batch_size, + int k, + algo_base::index_type* indices, + float* distances) const override; + + void save(const std::string& path_to_index) const override; + void load(const std::string& path_to_index) override; + auto copy() -> std::unique_ptr> override { return std::make_unique>(*this); }; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + void set_base_layer_only() { appr_alg_->base_layer_only = true; } + + private: + void get_search_knn_results(const T* query, + int k, + algo_base::index_type* indices, + float* distances) const; + + std::shared_ptr::type>> appr_alg_; + std::shared_ptr::type>> space_; + + using algo::metric_; + using algo::dim_; + int ef_construction_; + int m_; + int num_threads_; + std::shared_ptr thread_pool_; + Mode bench_mode_; +}; + +template +hnsw_lib::hnsw_lib(Metric metric, int dim, const build_param& param) : algo(metric, dim) +{ + assert(dim_ > 0); + static_assert(std::is_same_v || std::is_same_v); + if constexpr (std::is_same_v) { + if (metric_ != Metric::kEuclidean) { + throw std::runtime_error("hnswlib only supports Euclidean distance"); + } + } + + ef_construction_ = param.ef_construction; + m_ = param.m; + num_threads_ = param.num_threads; +} + +template +void hnsw_lib::build(const T* dataset, size_t nrow) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_shared(dim_); + } else { + space_ = std::make_shared(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_shared>(dim_); + } + + appr_alg_ = std::make_shared::type>>( + space_.get(), nrow, m_, ef_construction_); + + thread_pool_ = std::make_shared(num_threads_); + const size_t items_per_thread = nrow / (num_threads_ + 1); + + thread_pool_->submit( + [&](size_t i) { + if (i < items_per_thread && i % 10000 == 0) 
{ + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + printf("%s building %zu / %zu\n", buf, i, items_per_thread); + fflush(stdout); + } + + appr_alg_->addPoint(dataset + i * dim_, i); + }, + nrow); +} + +template +void hnsw_lib::set_search_param(const search_param_base& param_) +{ + auto param = dynamic_cast(param_); + appr_alg_->ef_ = param.ef; + num_threads_ = param.num_threads; + // bench_mode_ = param.metric_objective; + bench_mode_ = Mode::kLatency; // TODO(achirkin): pass the benchmark mode in the algo parameters + + // Create a pool if multiple query threads have been set and the pool hasn't been created already + bool create_pool = (bench_mode_ == Mode::kLatency && num_threads_ > 1 && !thread_pool_); + if (create_pool) { thread_pool_ = std::make_shared(num_threads_); } +} + +template +void hnsw_lib::search( + const T* query, int batch_size, int k, algo_base::index_type* indices, float* distances) const +{ + auto f = [&](int i) { + // hnsw can only handle a single vector at a time. + get_search_knn_results(query + i * dim_, k, indices + i * k, distances + i * k); + }; + if (bench_mode_ == Mode::kLatency && num_threads_ > 1) { + thread_pool_->submit(f, batch_size); + } else { + for (int i = 0; i < batch_size; i++) { + f(i); + } + } +} + +template +void hnsw_lib::save(const std::string& path_to_index) const +{ + appr_alg_->saveIndex(std::string(path_to_index)); +} + +template +void hnsw_lib::load(const std::string& path_to_index) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_shared(dim_); + } else { + space_ = std::make_shared(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_shared>(dim_); + } + + appr_alg_ = std::make_shared::type>>( + space_.get(), path_to_index); +} + +template +void hnsw_lib::get_search_knn_results(const T* query, + int k, + algo_base::index_type* indices, + float* distances) const +{ + auto result = appr_alg_->searchKnn(query, k); + assert(result.size() >= static_cast(k)); + + for (int i = k - 1; i >= 0; --i) { + indices[i] = result.top().second; + distances[i] = result.top().first; + result.pop(); + } +} + +}; // namespace cuvs::bench diff --git a/cpp/cmake/patches/ggnn.diff b/cpp/cmake/patches/ggnn.diff new file mode 100644 index 000000000..fc4529880 --- /dev/null +++ b/cpp/cmake/patches/ggnn.diff @@ -0,0 +1,230 @@ +--- a/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh ++++ b/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh +@@ -62,7 +62,7 @@ struct SimpleKNNSymCache { + const ValueT dist_half) + : dist_query(dist_query), dist_half(dist_half) {} + +- __device__ __forceinline__ DistQueryAndHalf() {} ++ DistQueryAndHalf() = default; + }; + + struct DistanceAndNorm { +@@ -98,8 +98,7 @@ struct SimpleKNNSymCache { + KeyT cache; + DistQueryAndHalf dist; + bool flag; +- +- __device__ __forceinline__ SyncTempStorage() {} ++ SyncTempStorage() = default; + }; + + public: +diff --git a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +index 8cbaf0d..6eb72ac 100644 +--- a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh ++++ b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +@@ -41,7 +41,6 @@ limitations under the License. 
+ #include "ggnn/sym/cuda_knn_sym_query_layer.cuh" + #include "ggnn/utils/cuda_knn_utils.cuh" + #include "ggnn/utils/cuda_knn_constants.cuh" +-#include "ggnn/utils/cuda_knn_dataset.cuh" + + template + __global__ void divide(ValueT* res, ValueT* input, ValueT N) { +@@ -98,9 +97,7 @@ struct GGNNGPUInstance { + typedef GGNNGraphDevice GGNNGraphDevice; + typedef GGNNGraphHost GGNNGraphHost; + +- const Dataset* dataset; + GGNNGraphBuffer* ggnn_buffer {nullptr}; +- GGNNQuery ggnn_query; + + // Graph Shards resident on the GPU + std::vector ggnn_shards; +@@ -117,13 +114,12 @@ struct GGNNGPUInstance { + // number of shards that need to be processed by this instance + const int num_parts; + +- GGNNGPUInstance(const int gpu_id, const Dataset* dataset, ++ GGNNGPUInstance(const int gpu_id, + const int N_shard, const int L, + const bool enable_construction, const float tau_build, + const int num_parts=1, const int num_cpu_buffers=1) : + N_shard{N_shard}, L{L}, tau_build{tau_build}, +- dataset{dataset}, gpu_id{gpu_id}, +- ggnn_query{dataset->N_query, D, KQuery, num_parts}, ++ gpu_id{gpu_id}, + num_parts{num_parts} + { + CHECK_LE(L, MAX_LAYER); +@@ -135,7 +131,6 @@ struct GGNNGPUInstance { + CHECK_EQ(current_gpu_id, gpu_id) << "cudaSetDevice() needs to be called in advance!"; + } + +- ggnn_query.loadQueriesAsync(dataset->h_query, 0); + + computeGraphParameters(); + +@@ -186,7 +181,7 @@ struct GGNNGPUInstance { + } + + GGNNGPUInstance(const GGNNGPUInstance& other) +- : dataset{nullptr}, ggnn_query{0, D, KQuery}, ++ : + gpu_id{0}, N_shard{0}, num_parts{0} { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. +@@ -305,6 +300,7 @@ struct GGNNGPUInstance { + + // io + ++ /* + void waitForDiskIO(const int shard_id) { + auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; + if (cpu_buffer.disk_io_thread.joinable()) +@@ -468,11 +464,12 @@ struct GGNNGPUInstance { + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } ++ */ + + // graph operations + + template +- void queryLayer(const int shard_id = 0) const { ++ void queryLayer(const BaseT* d_query, int batch_size, KeyT* d_query_result_ids, ValueT* d_query_result_dists, const int shard_id = 0) const { + CHECK_CUDA(cudaSetDevice(gpu_id)); + const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); + +@@ -482,21 +479,21 @@ struct GGNNGPUInstance { + + int* m_dist_statistics = nullptr; + if (DIST_STATS) +- cudaMallocManaged(&m_dist_statistics, dataset->N_query * sizeof(int)); ++ cudaMallocManaged(&m_dist_statistics, batch_size * sizeof(int)); + + QueryKernel query_kernel; + query_kernel.d_base = shard.d_base; +- query_kernel.d_query = ggnn_query.d_query; ++ query_kernel.d_query = d_query; + + query_kernel.d_graph = shard.d_graph; +- query_kernel.d_query_results = ggnn_query.d_query_result_ids; +- query_kernel.d_query_results_dists = ggnn_query.d_query_result_dists; ++ query_kernel.d_query_results = d_query_result_ids; ++ query_kernel.d_query_results_dists = d_query_result_dists; + + query_kernel.d_translation = shard.d_translation; + + query_kernel.d_nn1_stats = shard.d_nn1_stats; + +- query_kernel.N = dataset->N_query; ++ query_kernel.N = batch_size; + query_kernel.N_offset = 0; + + query_kernel.d_dist_stats = m_dist_statistics; +@@ -771,6 +768,16 @@ struct GGNNGPUInstance { + sym(layer, shard_id); + } + } ++ ++ void set_stream(cudaStream_t stream) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).stream = stream; ++ } ++ ++ void 
set_base_data(const BaseT* dataset) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).d_base = dataset; ++ } + }; + + #endif // INCLUDE_GGNN_CUDA_KNN_GGNN_GPU_INSTANCE_CUH_ +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +index c94a8f1..781226d 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +@@ -50,7 +50,7 @@ struct GGNNGraphDevice { + ValueT* d_nn1_stats; + + /// base data pointer for the shard. +- BaseT* d_base; ++ const BaseT* d_base; + + /// combined memory pool + char* d_memory; +@@ -69,7 +69,9 @@ struct GGNNGraphDevice { + const size_t selection_translation_size = align8(ST_all * sizeof(KeyT)); + const size_t nn1_stats_size = align8(2 * sizeof(ValueT)); + total_graph_size = graph_size + 2 * selection_translation_size + nn1_stats_size; +- base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ // base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ (void) N; ++ (void) D; + + const size_t total_size = base_size+total_graph_size; + +@@ -86,8 +88,7 @@ struct GGNNGraphDevice { + CHECK_CUDA(cudaMalloc(&d_memory, total_size)); + + size_t pos = 0; +- d_base = reinterpret_cast(d_memory+pos); +- pos += base_size; ++ d_base = nullptr; + d_graph = reinterpret_cast(d_memory+pos); + pos += graph_size; + d_translation = reinterpret_cast(d_memory+pos); +@@ -99,14 +100,14 @@ struct GGNNGraphDevice { + + CHECK_EQ(pos, total_size); + +- CHECK_CUDA(cudaStreamCreate(&stream)); ++ // CHECK_CUDA(cudaStreamCreate(&stream)); + + CHECK_CUDA(cudaPeekAtLastError()); + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphDevice(const GGNNGraphDevice& other) { ++ GGNNGraphDevice(const GGNNGraphDevice&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +@@ -116,7 +117,7 @@ struct GGNNGraphDevice { + ~GGNNGraphDevice() { + cudaFree(d_memory); + +- CHECK_CUDA(cudaStreamDestroy(stream)); ++ // CHECK_CUDA(cudaStreamDestroy(stream)); + } + }; + +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +index 2055f9e..ef5843a 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +@@ -92,7 +92,7 @@ struct GGNNGraphHost { + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphHost(const GGNNGraphHost& other) { ++ GGNNGraphHost(const GGNNGraphHost&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +diff --git a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +index 49d76a1..eef69e6 100644 +--- a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh ++++ b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +@@ -22,7 +22,6 @@ limitations under the License. 
+ #include + #include + +-#include + #include + + #include "ggnn/utils/cuda_knn_constants.cuh" +-- +2.43.0 + diff --git a/cpp/cmake/patches/hnswlib.diff b/cpp/cmake/patches/hnswlib.diff new file mode 100644 index 000000000..e7f89a8cc --- /dev/null +++ b/cpp/cmake/patches/hnswlib.diff @@ -0,0 +1,188 @@ +--- a/hnswlib/hnswalg.h ++++ b/hnswlib/hnswalg.h +@@ -3,6 +3,7 @@ + #include "visited_list_pool.h" + #include "hnswlib.h" + #include ++#include + #include + #include + #include +@@ -16,6 +17,8 @@ namespace hnswlib { + template + class HierarchicalNSW : public AlgorithmInterface { + public: ++ bool base_layer_only{false}; ++ int num_seeds=32; + static const tableint max_update_element_locks = 65536; + HierarchicalNSW(SpaceInterface *s) { + } +@@ -56,7 +59,7 @@ namespace hnswlib { + visited_list_pool_ = new VisitedListPool(1, max_elements); + + //initializations for special treatment of the first node +- enterpoint_node_ = -1; ++ enterpoint_node_ = std::numeric_limits::max(); + maxlevel_ = -1; + + linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); +@@ -527,7 +530,7 @@ namespace hnswlib { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +@@ -1067,7 +1070,7 @@ namespace hnswlib { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_); + if (d < curdist) { +@@ -1119,28 +1122,41 @@ namespace hnswlib { + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); + +- for (int level = maxlevel_; level > 0; level--) { +- bool changed = true; +- while (changed) { +- changed = false; +- unsigned int *data; ++ if (base_layer_only) { ++ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale ++ for (int i = 0; i < num_seeds; i++) { ++ tableint obj = i * (max_elements_ / num_seeds); ++ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); ++ if (dist < curdist) { ++ curdist = dist; ++ currObj = obj; ++ } ++ } ++ } ++ else{ ++ for (int level = maxlevel_; level > 0; level--) { ++ bool changed = true; ++ while (changed) { ++ changed = false; ++ unsigned int *data; + +- data = (unsigned int *) get_linklist(currObj, level); +- int size = getListCount(data); +- metric_hops++; +- metric_distance_computations+=size; ++ data = (unsigned int *) get_linklist(currObj, level); ++ int size = getListCount(data); ++ metric_hops++; ++ metric_distance_computations+=size; + +- tableint *datal = (tableint *) (data + 1); +- for (int i = 0; i < size; i++) { +- tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) +- throw std::runtime_error("cand error"); +- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); ++ tableint *datal = (tableint *) (data + 1); ++ for (int i = 0; i < size; i++) { ++ tableint cand = datal[i]; ++ if (cand > max_elements_) ++ throw std::runtime_error("cand error"); ++ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +- if (d < curdist) { +- curdist = d; +- currObj 
= cand; +- changed = true; ++ if (d < curdist) { ++ curdist = d; ++ currObj = cand; ++ changed = true; ++ } + } + } + } +diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h +index 4413537..c3240f3 100644 +--- a/hnswlib/space_l2.h ++++ b/hnswlib/space_l2.h +@@ -252,13 +252,14 @@ namespace hnswlib { + ~L2Space() {} + }; + ++ template + static int + L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { + + size_t qty = *((size_t *) qty_ptr); + int res = 0; +- unsigned char *a = (unsigned char *) pVect1; +- unsigned char *b = (unsigned char *) pVect2; ++ T *a = (T *) pVect1; ++ T *b = (T *) pVect2; + + qty = qty >> 2; + for (size_t i = 0; i < qty; i++) { +@@ -279,11 +280,12 @@ namespace hnswlib { + return (res); + } + ++ template + static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { + size_t qty = *((size_t*)qty_ptr); + int res = 0; +- unsigned char* a = (unsigned char*)pVect1; +- unsigned char* b = (unsigned char*)pVect2; ++ T* a = (T*)pVect1; ++ T* b = (T*)pVect2; + + for(size_t i = 0; i < qty; i++) + { +@@ -294,6 +296,7 @@ namespace hnswlib { + return (res); + } + ++ template + class L2SpaceI : public SpaceInterface { + + DISTFUNC fstdistfunc_; +@@ -302,10 +305,10 @@ namespace hnswlib { + public: + L2SpaceI(size_t dim) { + if(dim % 4 == 0) { +- fstdistfunc_ = L2SqrI4x; ++ fstdistfunc_ = L2SqrI4x; + } + else { +- fstdistfunc_ = L2SqrI; ++ fstdistfunc_ = L2SqrI; + } + dim_ = dim; + data_size_ = dim * sizeof(unsigned char); +diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h +index 5e1a4a5..4195ebd 100644 +--- a/hnswlib/visited_list_pool.h ++++ b/hnswlib/visited_list_pool.h +@@ -3,6 +3,7 @@ + #include + #include + #include ++#include + + namespace hnswlib { + typedef unsigned short int vl_type; +@@ -14,7 +15,7 @@ namespace hnswlib { + unsigned int numelements; + + VisitedList(int numelements1) { +- curV = -1; ++ curV = std::numeric_limits::max(); + numelements = numelements1; + mass = new vl_type[numelements]; + } +-- +2.43.0 + diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake new file mode 100644 index 000000000..89446332d --- /dev/null +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -0,0 +1,109 @@ +#============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================
+
+function(find_and_configure_faiss)
+  set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL ENABLE_GPU)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_find_generate_module(faiss
+                              HEADER_NAMES faiss/IndexFlat.h
+                              LIBRARY_NAMES faiss
+                              )
+
+  set(BUILD_SHARED_LIBS ON)
+  if (PKG_BUILD_STATIC_LIBS)
+    set(BUILD_SHARED_LIBS OFF)
+    set(CPM_DOWNLOAD_faiss ON)
+  endif()
+
+  include(cmake/modules/FindAVX.cmake)
+
+  # Link against the AVX CPU lib if it exists
+  set(CUVS_FAISS_GLOBAL_TARGETS faiss::faiss)
+  set(CUVS_FAISS_EXPORT_GLOBAL_TARGETS faiss)
+  set(CUVS_FAISS_OPT_LEVEL "generic")
+  if(CXX_AVX_FOUND)
+    set(CUVS_FAISS_OPT_LEVEL "avx2")
+    list(APPEND CUVS_FAISS_GLOBAL_TARGETS faiss::faiss_avx2)
+    list(APPEND CUVS_FAISS_EXPORT_GLOBAL_TARGETS faiss_avx2)
+  endif()
+
+  rapids_cpm_find(faiss ${PKG_VERSION}
+                  GLOBAL_TARGETS ${CUVS_FAISS_GLOBAL_TARGETS}
+                  CPM_ARGS
+                  GIT_REPOSITORY ${PKG_REPOSITORY}
+                  GIT_TAG ${PKG_PINNED_TAG}
+                  EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
+                  OPTIONS
+                  "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}"
+                  "FAISS_ENABLE_PYTHON OFF"
+                  "FAISS_OPT_LEVEL ${CUVS_FAISS_OPT_LEVEL}"
+                  "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}"
+                  "BUILD_TESTING OFF"
+                  "CMAKE_MESSAGE_LOG_LEVEL VERBOSE"
+                  )
+
+  if(TARGET faiss AND NOT TARGET faiss::faiss)
+    add_library(faiss::faiss ALIAS faiss)
+  endif()
+
+  if(CXX_AVX_FOUND)
+    if(TARGET faiss_avx2 AND NOT TARGET faiss::faiss_avx2)
+      add_library(faiss::faiss_avx2 ALIAS faiss_avx2)
+    endif()
+  endif()
+
+  if(faiss_ADDED)
+    rapids_export(BUILD faiss
+                  EXPORT_SET faiss-targets
+                  GLOBAL_TARGETS ${CUVS_FAISS_EXPORT_GLOBAL_TARGETS}
+                  NAMESPACE faiss::)
+  endif()
+
+  # We generate the faiss-config files when we build faiss locally, so always do `find_dependency`
+  rapids_export_package(BUILD OpenMP cuvs-ann-bench-exports) # faiss uses OpenMP but doesn't export a need for it
+  rapids_export_package(BUILD faiss cuvs-ann-bench-exports GLOBAL_TARGETS ${CUVS_FAISS_GLOBAL_TARGETS} ${CUVS_FAISS_EXPORT_GLOBAL_TARGETS})
+  rapids_export_package(INSTALL faiss cuvs-ann-bench-exports GLOBAL_TARGETS ${CUVS_FAISS_GLOBAL_TARGETS} ${CUVS_FAISS_EXPORT_GLOBAL_TARGETS})
+
+  # Tell cmake where it can find the generated faiss-config.cmake we wrote.
+ include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cuvs-ann-bench-exports) +endfunction() + +if(NOT CUVS_FAISS_GIT_TAG) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(CUVS_FAISS_GIT_TAG fea/statically-link-ctk) + # set(CUVS_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) +endif() + +if(NOT CUVS_FAISS_GIT_REPOSITORY) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(CUVS_FAISS_GIT_REPOSITORY https://github.com/cjnolet/faiss.git) + # set(CUVS_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) +endif() + +find_and_configure_faiss(VERSION 1.7.4 + REPOSITORY ${CUVS_FAISS_GIT_REPOSITORY} + PINNED_TAG ${CUVS_FAISS_GIT_TAG} + BUILD_STATIC_LIBS ${CUVS_USE_FAISS_STATIC} + EXCLUDE_FROM_ALL ${CUVS_EXCLUDE_FAISS_FROM_ALL} + ENABLE_GPU ${CUVS_FAISS_ENABLE_GPU}) diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake new file mode 100644 index 000000000..8137ef84e --- /dev/null +++ b/cpp/cmake/thirdparty/get_ggnn.cmake @@ -0,0 +1,58 @@ +#============================================================================= +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================
+
+function(find_and_configure_ggnn)
+  set(oneValueArgs VERSION REPOSITORY PINNED_TAG)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  set(patch_files_to_run "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/ggnn.diff")
+  set(patch_issues_to_ref "fix compile issues")
+  set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/ggnn/patch.cmake")
+  set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/ggnn/log")
+  string(TIMESTAMP current_year "%Y" UTC)
+  configure_file(${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}"
+                 @ONLY)
+
+  rapids_cpm_find(
+    ggnn ${PKG_VERSION}
+    GLOBAL_TARGETS ggnn::ggnn
+    CPM_ARGS
+    GIT_REPOSITORY ${PKG_REPOSITORY}
+    GIT_TAG ${PKG_PINNED_TAG}
+    GIT_SHALLOW TRUE
+    DOWNLOAD_ONLY ON
+    PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script}
+  )
+  if(NOT TARGET ggnn::ggnn)
+    add_library(ggnn INTERFACE)
+    target_include_directories(ggnn INTERFACE "$<BUILD_INTERFACE:${ggnn_SOURCE_DIR}/include>")
+    add_library(ggnn::ggnn ALIAS ggnn)
+  endif()
+
+endfunction()
+
+if(NOT RAFT_GGNN_GIT_TAG)
+  set(RAFT_GGNN_GIT_TAG release_0.5)
+endif()
+
+if(NOT RAFT_GGNN_GIT_REPOSITORY)
+  set(RAFT_GGNN_GIT_REPOSITORY https://github.com/cgtuebingen/ggnn.git)
+endif()
+find_and_configure_ggnn(VERSION 0.5
+                        REPOSITORY ${RAFT_GGNN_GIT_REPOSITORY}
+                        PINNED_TAG ${RAFT_GGNN_GIT_TAG}
+                        )
diff --git a/cpp/cmake/thirdparty/get_glog.cmake b/cpp/cmake/thirdparty/get_glog.cmake
new file mode 100644
index 000000000..5839ee3d5
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_glog.cmake
@@ -0,0 +1,42 @@
+#=============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_glog)
+  set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_cpm_find(glog ${PKG_VERSION}
+                  GLOBAL_TARGETS glog::glog
+                  CPM_ARGS
+                  GIT_REPOSITORY https://github.com/${PKG_FORK}/glog.git
+                  GIT_TAG ${PKG_PINNED_TAG}
+                  EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
+                  )
+
+  if(glog_ADDED)
+    message(VERBOSE "cuVS: Using glog located in ${glog_SOURCE_DIR}")
+  else()
+    message(VERBOSE "cuVS: Using glog located in ${glog_DIR}")
+  endif()
+
+endfunction()
+
+find_and_configure_glog(VERSION 0.6.0
+                        FORK google
+                        PINNED_TAG v0.6.0
+                        EXCLUDE_FROM_ALL ON
+                        )
diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake
new file mode 100644
index 000000000..054a12f1e
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_hnswlib.cmake
@@ -0,0 +1,92 @@
+#=============================================================================
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_hnswlib)
+  set(oneValueArgs VERSION REPOSITORY PINNED_TAG EXCLUDE_FROM_ALL)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  set(patch_files_to_run "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/hnswlib.diff")
+  set(patch_issues_to_ref "fix compile issues")
+  set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/hnswlib/patch.cmake")
+  set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/hnswlib/log")
+  string(TIMESTAMP current_year "%Y" UTC)
+  configure_file(${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}"
+                 @ONLY)
+
+  rapids_cpm_find(
+    hnswlib ${PKG_VERSION}
+    GLOBAL_TARGETS hnswlib::hnswlib
+    BUILD_EXPORT_SET raft-exports
+    INSTALL_EXPORT_SET raft-exports
+    CPM_ARGS
+    GIT_REPOSITORY ${PKG_REPOSITORY}
+    GIT_TAG ${PKG_PINNED_TAG}
+    GIT_SHALLOW TRUE
+    DOWNLOAD_ONLY ON
+    PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script}
+  )
+  if(NOT TARGET hnswlib::hnswlib)
+    add_library(hnswlib INTERFACE)
+    add_library(hnswlib::hnswlib ALIAS hnswlib)
+    target_include_directories(hnswlib INTERFACE
+      "$<BUILD_INTERFACE:${hnswlib_SOURCE_DIR}>"
+      "$<INSTALL_INTERFACE:include>")
+
+    if(NOT PKG_EXCLUDE_FROM_ALL)
+      install(TARGETS hnswlib EXPORT hnswlib-exports)
+      install(DIRECTORY "${hnswlib_SOURCE_DIR}/hnswlib/" DESTINATION include/hnswlib)
+
+      # write install export rules
+      rapids_export(
+        INSTALL hnswlib
+        VERSION ${PKG_VERSION}
+        EXPORT_SET hnswlib-exports
+        GLOBAL_TARGETS hnswlib
+        NAMESPACE hnswlib::)
+    endif()
+
+    # write build export rules
+    rapids_export(
+      BUILD hnswlib
+      VERSION ${PKG_VERSION}
+      EXPORT_SET hnswlib-exports
+      GLOBAL_TARGETS hnswlib
+      NAMESPACE hnswlib::)
+
+    include("${rapids-cmake-dir}/export/find_package_root.cmake")
+
+    # When using RAFT from the build dir, ensure hnswlib is also found in RAFT's build dir. This
+    # line adds `set(hnswlib_ROOT "${CMAKE_CURRENT_LIST_DIR}")` to build/raft-dependencies.cmake
+    rapids_export_find_package_root(
+      BUILD hnswlib [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET raft-exports
+    )
+  endif()
+endfunction()
+
+if(NOT CUVS_HNSWLIB_GIT_TAG)
+  set(CUVS_HNSWLIB_GIT_TAG v0.6.2)
+endif()
+
+if(NOT CUVS_HNSWLIB_GIT_REPOSITORY)
+  set(CUVS_HNSWLIB_GIT_REPOSITORY https://github.com/nmslib/hnswlib.git)
+endif()
+find_and_configure_hnswlib(VERSION 0.6.2
+                           REPOSITORY ${CUVS_HNSWLIB_GIT_REPOSITORY}
+                           PINNED_TAG ${CUVS_HNSWLIB_GIT_TAG}
+                           EXCLUDE_FROM_ALL OFF
+                           )
diff --git a/cpp/cmake/thirdparty/get_nlohmann_json.cmake b/cpp/cmake/thirdparty/get_nlohmann_json.cmake
new file mode 100644
index 000000000..8d0b671e4
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_nlohmann_json.cmake
@@ -0,0 +1,42 @@
+#=============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_nlohmann_json)
+  set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_cpm_find(nlohmann_json ${PKG_VERSION}
+                  GLOBAL_TARGETS nlohmann_json::nlohmann_json
+                  CPM_ARGS
+                  GIT_REPOSITORY https://github.com/${PKG_FORK}/json.git
+                  GIT_TAG ${PKG_PINNED_TAG}
+                  EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
+                  )
+
+  if(nlohmann_json_ADDED)
+    message(VERBOSE "cuVS: Using nlohmann_json located in ${nlohmann_json_SOURCE_DIR}")
+  else()
+    message(VERBOSE "cuVS: Using nlohmann_json located in ${nlohmann_json_DIR}")
+  endif()
+
+endfunction()
+
+find_and_configure_nlohmann_json(VERSION 3.11.2
+                                 FORK nlohmann
+                                 PINNED_TAG v3.11.2
+                                 EXCLUDE_FROM_ALL ON
+                                 )
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index b0668eeb0..141fba4ee 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -1043,6 +1043,166 @@ void serialize(raft::resources const& handle,
 void deserialize(raft::resources const& handle,
                  std::istream& is,
                  cuvs::neighbors::cagra::index<uint8_t, uint32_t>* index);
+
+/**
+ * Write the CAGRA built index as a base layer HNSW index to an output stream
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, os, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          std::ostream& os,
+                          const cuvs::neighbors::cagra::index<float, uint32_t>& index);
+
+/**
+ * Save a CAGRA built index in hnswlib base-layer-only serialized format
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, filename, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          const std::string& filename,
+                          const cuvs::neighbors::cagra::index<float, uint32_t>& index);
+
+/**
+ * Write the CAGRA built index as a base layer HNSW index to an output stream
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, os, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          std::ostream& os,
+                          const cuvs::neighbors::cagra::index<int8_t, uint32_t>& index);
+
+/**
+ * Save a CAGRA built index in hnswlib base-layer-only serialized format
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, filename, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          const std::string& filename,
+                          const cuvs::neighbors::cagra::index<int8_t, uint32_t>& index);
+
+/**
+ * Write the CAGRA built index as a base layer HNSW index to an output stream
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, os, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          std::ostream& os,
+                          const cuvs::neighbors::cagra::index<uint8_t, uint32_t>& index);
+
+/**
+ * Save a CAGRA built index in hnswlib base-layer-only serialized format
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, filename, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          const std::string& filename,
+                          const cuvs::neighbors::cagra::index<uint8_t, uint32_t>& index);
+
 /**
  * @}
  */
diff --git a/cpp/src/neighbors/cagra_serialize.cuh b/cpp/src/neighbors/cagra_serialize.cuh
index 03f128cb9..e193c0630 100644
--- a/cpp/src/neighbors/cagra_serialize.cuh
+++ b/cpp/src/neighbors/cagra_serialize.cuh
@@ -19,102 +19,52 @@
 #include "detail/cagra/cagra_serialize.cuh"
 
 namespace cuvs::neighbors::cagra {
-/**
- * Write the CAGRA built index as a base layer HNSW index to an output stream
- *
- * Experimental, both the API and the serialization format are subject to change.
- *
- * @code{.cpp}
- * #include <raft/core/resources.hpp>
- * #include <cuvs/neighbors/cagra.hpp>
- *
- * raft::resources handle;
- *
- * // create an output stream
- * std::ostream os(std::cout.rdbuf());
- * // create an index with `auto index = raft::cagra::build(...);`
- * raft::cagra::serialize_to_hnswlib(handle, os, index);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] handle the raft handle
- * @param[in] os output stream
- * @param[in] index CAGRA index
- *
- */
-template <typename T, typename IdxT>
-void serialize_to_hnswlib(raft::resources const& handle,
-                          std::ostream& os,
-                          const cuvs::neighbors::cagra::index<T, IdxT>& index)
-{
-  detail::serialize_to_hnswlib<T, IdxT>(handle, os, index);
-}
-
-/**
- * Save a CAGRA build index in hnswlib base-layer-only serialized format
- *
- * Experimental, both the API and the serialization format are subject to change.
- *
- * @code{.cpp}
- * #include <raft/core/resources.hpp>
- * #include <cuvs/neighbors/cagra.hpp>
- *
- * raft::resources handle;
- *
- * // create a string with a filepath
- * std::string filename("/path/to/index");
- * // create an index with `auto index = raft::cagra::build(...);`
- * raft::cagra::serialize_to_hnswlib(handle, filename, index);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] handle the raft handle
- * @param[in] filename the file name for saving the index
- * @param[in] index CAGRA index
- *
- */
-template <typename T, typename IdxT>
-void serialize_to_hnswlib(raft::resources const& handle,
-                          const std::string& filename,
-                          const cuvs::neighbors::cagra::index<T, IdxT>& index)
-{
-  detail::serialize_to_hnswlib<T, IdxT>(handle, filename, index);
-}
 
-#define CUVS_INST_CAGRA_SERIALIZE(DTYPE)                                                    \
-  void serialize(raft::resources const& handle,                                             \
-                 const std::string& filename,                                               \
-                 const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index,               \
-                 bool include_dataset)                                                      \
-  {                                                                                         \
-    cuvs::neighbors::cagra::detail::serialize<DTYPE, uint32_t>(                             \
-      handle, filename, index, include_dataset);                                            \
-  };                                                                                        \
-                                                                                            \
-  void deserialize(raft::resources const& handle,                                           \
-                   const std::string& filename,                                             \
-                   cuvs::neighbors::cagra::index<DTYPE, uint32_t>* index)                   \
-  {                                                                                         \
-    cuvs::neighbors::cagra::detail::deserialize<DTYPE, uint32_t>(handle, filename, index);  \
-  };                                                                                        \
-  void serialize(raft::resources const& handle,                                             \
-                 std::ostream& os,                                                          \
-                 const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index,               \
-                 bool include_dataset)                                                      \
-  {                                                                                         \
-    cuvs::neighbors::cagra::detail::serialize<DTYPE, uint32_t>(                             \
-      handle, os, index, include_dataset);                                                  \
-  }                                                                                         \
-                                                                                            \
-  void deserialize(raft::resources const& handle,                                           \
-                   std::istream& is,                                                        \
-                   cuvs::neighbors::cagra::index<DTYPE, uint32_t>* index)                   \
-  {                                                                                         \
-    cuvs::neighbors::cagra::detail::deserialize<DTYPE, uint32_t>(handle, is, index);        \
-  }
+#define CUVS_INST_CAGRA_SERIALIZE(DTYPE)                                                    \
+  void serialize(raft::resources const& handle,                                             \
+                 const std::string& filename,                                               \
+                 const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index,               \
+                 bool include_dataset)                                                      \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::serialize<DTYPE, uint32_t>(                             \
+      handle, filename, index, include_dataset);                                            \
+  };                                                                                        \
+                                                                                            \
+  void deserialize(raft::resources const& handle,                                           \
+                   const std::string& filename,                                             \
+                   cuvs::neighbors::cagra::index<DTYPE, uint32_t>* index)                   \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::deserialize<DTYPE, uint32_t>(handle, filename, index);  \
+  };                                                                                        \
+  void serialize(raft::resources const& handle,                                             \
+                 std::ostream& os,                                                          \
+                 const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index,               \
+                 bool include_dataset)                                                      \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::serialize<DTYPE, uint32_t>(                             \
+      handle, os, index, include_dataset);                                                  \
+  }                                                                                         \
+                                                                                            \
+  void deserialize(raft::resources const& handle,                                           \
+                   std::istream& is,                                                        \
+                   cuvs::neighbors::cagra::index<DTYPE, uint32_t>* index)                   \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::deserialize<DTYPE, uint32_t>(handle, is, index);        \
+  }                                                                                         \
+                                                                                            \
+  void serialize_to_hnswlib(raft::resources const& handle,                                  \
+                            std::ostream& os,                                               \
+                            const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index)    \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::serialize_to_hnswlib<DTYPE, uint32_t>(handle, os, index); \
+  }                                                                                         \
+                                                                                            \
+  void serialize_to_hnswlib(raft::resources const& handle,                                  \
+                            const std::string& filename,                                    \
+                            const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index)    \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::serialize_to_hnswlib<DTYPE, uint32_t>(                  \
+      handle, filename, index);                                                             \
+  }
 
 }  // namespace cuvs::neighbors::cagra
diff --git a/dependencies.yaml b/dependencies.yaml
index 22325d5df..689c98417 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -324,6 +324,7 @@ dependencies:
       - output_types: [conda]
         packages:
           - *cmake_ver
+          - openblas
   docs:
     common:
      - output_types: [conda]
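
----------------------------------------------------------------------
Reviewer note (not part of the patch): the pieces above compose into a
GPU-build / CPU-search workflow -- build a CAGRA index on the GPU, persist it
with the new serialize_to_hnswlib API, then load and search it on the CPU with
the patched hnswlib using its new base_layer_only mode. A minimal C++ sketch of
that flow follows; the file path, dataset shape, and helper name are
hypothetical, and the hnswlib constructor used is the stock load-from-file
overload.

    #include <cuvs/neighbors/cagra.hpp>
    #include <raft/core/device_mdspan.hpp>
    #include <raft/core/resources.hpp>

    #include <hnswlib/hnswlib.h>

    void build_then_search_on_cpu(
      raft::resources const& handle,
      raft::device_matrix_view<const float, int64_t> dataset)
    {
      // GPU: build the CAGRA graph.
      cuvs::neighbors::cagra::index_params params;
      auto index = cuvs::neighbors::cagra::build(handle, params, dataset);

      // Persist it as an hnswlib-compatible, base-layer-only index.
      cuvs::neighbors::cagra::serialize_to_hnswlib(handle, "/tmp/cagra_hnsw.bin", index);

      // CPU: load with the patched hnswlib. base_layer_only makes searchKnn
      // seed the search from num_seeds points spread evenly over the data
      // instead of descending the (absent) upper HNSW layers.
      hnswlib::L2Space space(static_cast<size_t>(dataset.extent(1)));
      hnswlib::HierarchicalNSW<float> hnsw(&space, "/tmp/cagra_hnsw.bin");
      hnsw.base_layer_only = true;
      hnsw.num_seeds = 32;  // the patch suggests 48 for 100M-scale datasets
    }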