diff --git a/build.sh b/build.sh
index 678df7b35..a283bcd07 100755
--- a/build.sh
+++ b/build.sh
@@ -18,8 +18,8 @@ ARGS=$*
 # scripts, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
-VALIDARGS="clean libcuvs python rust docs tests examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
-HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--build-metrics=<filename>]
+VALIDARGS="clean libcuvs python rust docs tests bench-ann examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
+HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--limit-bench-ann=<targets>] [--build-metrics=<filename>]
  where <target> is:
    clean            - remove all existing build artifacts and configuration (start over)
    libcuvs          - build the cuvs C++ code only. Also builds the C-wrapper library
@@ -28,6 +28,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<t
+   bench-ann        - build end-to-end ann benchmarks
  where <flag> is:
@@ -37,6 +38,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<t
+   --limit-bench-ann=<targets> - semicolon-separated list of ann benchmark targets to compile
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -41,6 +41,7 @@ dependencies:
 - numpy>=1.23,<2.0a0
 - numpydoc
 - nvcc_linux-aarch64=11.8
+- openblas
 - pre-commit
 - pydata-sphinx-theme
 - pylibraft==24.8.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index a0581ff45..e3181d69c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -41,6 +41,7 @@ dependencies:
 - numpy>=1.23,<2.0a0
 - numpydoc
 - nvcc_linux-64=11.8
+- openblas
 - pre-commit
 - pydata-sphinx-theme
 - pylibraft==24.8.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-122_arch-aarch64.yaml b/conda/environments/all_cuda-122_arch-aarch64.yaml
index 9b151e4b7..e22d4388e 100644
--- a/conda/environments/all_cuda-122_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-122_arch-aarch64.yaml
@@ -37,6 +37,7 @@ dependencies:
 - ninja
 - numpy>=1.23,<2.0a0
 - numpydoc
+- openblas
 - pre-commit
 - pydata-sphinx-theme
 - pylibraft==24.8.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index d4b04daf0..08d2f23bb 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -37,6 +37,7 @@ dependencies:
 - ninja
 - numpy>=1.23,<2.0a0
 - numpydoc
+- openblas
 - pre-commit
 - pydata-sphinx-theme
 - pylibraft==24.8.*,>=0.0.0a0
diff --git a/conda/recipes/libcuvs/build_libcuvs_tests.sh b/conda/recipes/libcuvs/build_libcuvs_tests.sh
index fb116a692..5d77ae2d1 100644
--- a/conda/recipes/libcuvs/build_libcuvs_tests.sh
+++ b/conda/recipes/libcuvs/build_libcuvs_tests.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
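+# Also build the new bench-ann target here, so the ANN benchmark binaries are produced
+# alongside the tests (openblas is added to meta.yaml below for the CPU algos they use).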
-./build.sh tests --allgpuarch --no-nvtx --build-metrics=tests_bench --incl-cache-stats
+./build.sh tests bench-ann --allgpuarch --no-nvtx --build-metrics=tests_bench --incl-cache-stats
 cmake --install cpp/build --component testing
diff --git a/conda/recipes/libcuvs/meta.yaml b/conda/recipes/libcuvs/meta.yaml
index d1e8506c8..4ffdc91e4 100644
--- a/conda/recipes/libcuvs/meta.yaml
+++ b/conda/recipes/libcuvs/meta.yaml
@@ -198,6 +198,7 @@ outputs:
         - libraft ={{ minor_version }}
         - {{ pin_subpackage('libcuvs', exact=True) }}
         - cuda-version ={{ cuda_version }}
+        - openblas  # required by some CPU algos in benchmarks
         {% if cuda_major == "11" %}
         - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }}
         - libcublas {{ cuda11_libcublas_host_version }}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index abe6fa7b3..c7b61b92f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -55,6 +55,7 @@ option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON)
 option(BUILD_TESTS "Build cuvs unit-tests" ON)
 option(BUILD_C_LIBRARY "Build raft C API library" OFF)
 option(BUILD_C_TESTS "Build raft C API tests" OFF)
+option(BUILD_ANN_BENCH "Build cuVS ann benchmarks" OFF)
 option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF)
 option(CUDA_ENABLE_LINEINFO
   "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF
@@ -92,6 +93,7 @@ include(CMakeDependentOption)
 message(VERBOSE "cuVS: Build cuVS unit-tests: ${BUILD_TESTS}")
 message(VERBOSE "cuVS: Build CPU only components: ${BUILD_CPU_ONLY}")
+message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_ANN_BENCH}")
 message(VERBOSE "cuVS: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
 message(VERBOSE "cuVS: Disable deprecation warnings " ${DISABLE_DEPRECATION_WARNINGS})
 message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}")
@@ -184,6 +186,11 @@ endif()
 
 include(cmake/thirdparty/get_cutlass.cmake)
 
+if(BUILD_ANN_BENCH)
+  include(${rapids-cmake-dir}/cpm/gbench.cmake)
+  rapids_cpm_gbench(BUILD_STATIC)
+endif()
+
 # ##################################################################################################
 # * cuvs ---------------------------------------------------------------------
@@ -663,3 +670,10 @@ if(BUILD_TESTS OR BUILD_C_TESTS)
   include(internal/CMakeLists.txt)
   include(test/CMakeLists.txt)
 endif()
+
+# ##################################################################################################
+# * build ann benchmark executable -----------------------------------------------
+
+if(BUILD_ANN_BENCH)
+  include(bench/ann/CMakeLists.txt)
+endif()
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
new file mode 100644
index 000000000..6e9e66fad
--- /dev/null
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -0,0 +1,367 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# ##################################################################################################
+# * benchmark options ------------------------------------------------------------------------------
+
+option(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark"
+       ON
+)
+option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT "Include cuVS ivf flat algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ "Include cuVS ivf pq algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_CAGRA "Include cuVS CAGRA in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE "Include cuVS brute force knn in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB "Include cuVS CAGRA with HNSW search in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF)
+option(CUVS_ANN_BENCH_SINGLE_EXE
+       "Make a single executable with benchmark as shared library modules" OFF
+)
+
+# ##################################################################################################
+# * Process options ----------------------------------------------------------
+
+find_package(Threads REQUIRED)
+
+if(BUILD_CPU_ONLY)
+  set(CUVS_FAISS_ENABLE_GPU OFF)
+  set(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT OFF)
+  set(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF)
+  set(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_CAGRA OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB OFF)
+  set(CUVS_ANN_BENCH_USE_GGNN OFF)
+else()
+  set(CUVS_FAISS_ENABLE_GPU ON)
+endif()
+
+set(CUVS_ANN_BENCH_USE_FAISS OFF)
+if(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT
+   OR CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ
+   OR CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT
+   OR CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT
+   OR CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ
+   OR CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT
+)
+  set(CUVS_ANN_BENCH_USE_FAISS ON)
+  set(CUVS_USE_FAISS_STATIC ON)
+endif()
+
+set(CUVS_ANN_BENCH_USE_CUVS OFF)
+if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ
+   OR CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE
+   OR CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT
+   OR CUVS_ANN_BENCH_USE_CUVS_CAGRA
+   OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB
+)
+  set(CUVS_ANN_BENCH_USE_CUVS ON)
+endif()
+
+# ##################################################################################################
+# * Fetch requirements -------------------------------------------------------------
+
+if(CUVS_ANN_BENCH_USE_HNSWLIB OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
+  include(cmake/thirdparty/get_hnswlib.cmake)
+endif()
+
+include(cmake/thirdparty/get_nlohmann_json.cmake)
+
+if(CUVS_ANN_BENCH_USE_GGNN)
+  include(cmake/thirdparty/get_ggnn.cmake)
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS)
+  # We need to ensure that faiss has all the conda information. So we currently use the very ugly
+  # hammer of `link_libraries` to ensure that all targets in this directory and the faiss directory
+  # will have the conda includes/link dirs
+  link_libraries($<TARGET_NAME_IF_EXISTS:conda_env>)
+  include(cmake/thirdparty/get_faiss.cmake)
+endif()
+
+# ##################################################################################################
+# * Enable NVTX if available
+
+# Note: ANN_BENCH wrappers have extra NVTX code not related to raft::nvtx. They track gbench
+# benchmark cases and iterations. This is to make limited NVTX available to all algos, not just
+# raft/cuVS.
+if(TARGET CUDA::nvtx3)
+  set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES})
+  get_target_property(CMAKE_REQUIRED_INCLUDES CUDA::nvtx3 INTERFACE_INCLUDE_DIRECTORIES)
+  unset(NVTX3_HEADERS_FOUND CACHE)
+  # Check the headers explicitly to make sure the cpu-only build succeeds
+  CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND)
+  set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG})
+endif()
+
+# ##################################################################################################
+# * Target function -------------------------------------------------------------
+
+function(ConfigureAnnBench)
+
+  set(oneValueArgs NAME)
+  set(multiValueArgs PATH LINKS CXXFLAGS)
+
+  if(NOT BUILD_CPU_ONLY)
+    set(GPU_BUILD ON)
+  endif()
+
+  cmake_parse_arguments(
+    ConfigureAnnBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}
+  )
+
+  set(BENCH_NAME ${ConfigureAnnBench_NAME}_ANN_BENCH)
+
+  if(CUVS_ANN_BENCH_SINGLE_EXE)
+    add_library(${BENCH_NAME} SHARED ${ConfigureAnnBench_PATH})
+    string(TOLOWER ${BENCH_NAME} BENCH_LIB_NAME)
+    set_target_properties(${BENCH_NAME} PROPERTIES OUTPUT_NAME ${BENCH_LIB_NAME})
+    add_dependencies(${BENCH_NAME} ANN_BENCH)
+  else()
+    add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH})
+    target_compile_definitions(
+      ${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN
+                            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
+    )
+    target_link_libraries(
+      ${BENCH_NAME} PRIVATE benchmark::benchmark $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+    )
+  endif()
+
+  target_link_libraries(
+    ${BENCH_NAME}
+    PRIVATE ${ConfigureAnnBench_LINKS}
+            nlohmann_json::nlohmann_json
+            Threads::Threads
+            $<$<BOOL:${GPU_BUILD}>:CUDA::cudart_static>
+            $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+            $<TARGET_NAME_IF_EXISTS:conda_env>
+            -static-libgcc
+            -static-libstdc++
+  )
+
+  set_target_properties(
+    ${BENCH_NAME}
+    PROPERTIES # set target compile options
+               CXX_STANDARD 17
+               CXX_STANDARD_REQUIRED ON
+               CUDA_STANDARD 17
+               CUDA_STANDARD_REQUIRED ON
+               POSITION_INDEPENDENT_CODE ON
+               INTERFACE_POSITION_INDEPENDENT_CODE ON
+               BUILD_RPATH "\$ORIGIN"
+               INSTALL_RPATH "\$ORIGIN"
+  )
+
+  set(${ConfigureAnnBench_CXXFLAGS} ${CUVS_CXX_FLAGS} ${ConfigureAnnBench_CXXFLAGS})
+
+  target_compile_options(
+    ${BENCH_NAME} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ConfigureAnnBench_CXXFLAGS}>"
+                          "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
+  )
+
+  if(CUVS_ANN_BENCH_USE_${ConfigureAnnBench_NAME})
+    target_compile_definitions(
+      ${BENCH_NAME}
+      PUBLIC
+        CUVS_ANN_BENCH_USE_${ConfigureAnnBench_NAME}=CUVS_ANN_BENCH_USE_${ConfigureAnnBench_NAME}
+    )
+  endif()
+
+  target_include_directories(
+    ${BENCH_NAME}
+    PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+    PRIVATE ${ConfigureAnnBench_INCLUDES}
+  )
+
+  install(
+    TARGETS ${BENCH_NAME}
+    COMPONENT ann_bench
+    DESTINATION bin/ann
+  )
+
+  add_dependencies(CUVS_ANN_BENCH_ALL ${BENCH_NAME})
+endfunction()
+
+# ##################################################################################################
+# * Configure benchmark targets -------------------------------------------------------------
+
+if(NOT TARGET CUVS_ANN_BENCH_ALL)
+  add_custom_target(CUVS_ANN_BENCH_ALL)
+endif()
+
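+# A hypothetical example of wiring in a new wrapper (all names below are illustrative, not part
+# of this change): each call yields either a standalone <NAME>_ANN_BENCH executable or, with
+# CUVS_ANN_BENCH_SINGLE_EXE=ON, a lib<name>_ann_bench.so plugin loaded by the ANN_BENCH launcher:
+#
+#   ConfigureAnnBench(
+#     NAME MY_ALGO PATH bench/ann/src/my_algo/my_algo_benchmark.cpp LINKS my_algo::my_algo
+#   )
+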
+if(CUVS_ANN_BENCH_USE_HNSWLIB)
+  ConfigureAnnBench(
+    NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib
+  )
+
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ)
+  ConfigureAnnBench(
+    NAME CUVS_IVF_PQ PATH bench/ann/src/cuvs/cuvs_benchmark.cu
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_IVF_PQ}>:bench/ann/src/cuvs/cuvs_ivf_pq.cu> LINKS cuvs
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT)
+  ConfigureAnnBench(
+    NAME CUVS_IVF_FLAT PATH bench/ann/src/cuvs/cuvs_benchmark.cu
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT}>:bench/ann/src/cuvs/cuvs_ivf_flat.cu> LINKS cuvs
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE)
+  ConfigureAnnBench(NAME CUVS_BRUTE_FORCE PATH bench/ann/src/cuvs/cuvs_benchmark.cu LINKS cuvs)
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_CAGRA)
+  ConfigureAnnBench(
+    NAME
+    CUVS_CAGRA
+    PATH
+    bench/ann/src/cuvs/cuvs_benchmark.cu
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_CAGRA}>:bench/ann/src/cuvs/cuvs_cagra_float.cu>
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_CAGRA}>:bench/ann/src/cuvs/cuvs_cagra_half.cu>
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_CAGRA}>:bench/ann/src/cuvs/cuvs_cagra_int8_t.cu>
+    $<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_CAGRA}>:bench/ann/src/cuvs/cuvs_cagra_uint8_t.cu>
+    LINKS
+    cuvs
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
+  ConfigureAnnBench(
+    NAME CUVS_CAGRA_HNSWLIB PATH bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs
+    hnswlib::hnswlib
+  )
+endif()
+
+set(CUVS_FAISS_TARGETS faiss::faiss)
+if(TARGET faiss::faiss_avx2)
+  set(CUVS_FAISS_TARGETS faiss::faiss_avx2)
+endif()
+
+message("CUVS_FAISS_TARGETS: ${CUVS_FAISS_TARGETS}")
+message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}")
+if(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT)
+  ConfigureAnnBench(
+    NAME FAISS_CPU_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT)
+  ConfigureAnnBench(
+    NAME FAISS_CPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ)
+  ConfigureAnnBench(
+    NAME FAISS_CPU_IVF_PQ PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT)
+  ConfigureAnnBench(
+    NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ)
+  ConfigureAnnBench(
+    NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS
+    ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT)
+  ConfigureAnnBench(
+    NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
+if(CUVS_ANN_BENCH_USE_GGNN)
+  include(cmake/thirdparty/get_glog.cmake)
+  ConfigureAnnBench(
+    NAME GGNN PATH bench/ann/src/ggnn/ggnn_benchmark.cu LINKS glog::glog ggnn::ggnn CUDA::curand
+  )
+endif()
+
+# ##################################################################################################
+# * Dynamically-loading ANN_BENCH executable -------------------------------------------------------
+if(CUVS_ANN_BENCH_SINGLE_EXE)
+  add_executable(ANN_BENCH bench/ann/src/common/benchmark.cpp)
+
+  # Build and link static version of the GBench to keep ANN_BENCH self-contained.
+  get_target_property(TMP_PROP benchmark::benchmark SOURCES)
+  add_library(benchmark_static STATIC ${TMP_PROP})
+  get_target_property(TMP_PROP benchmark::benchmark INCLUDE_DIRECTORIES)
+  target_include_directories(benchmark_static PUBLIC ${TMP_PROP})
+  get_target_property(TMP_PROP benchmark::benchmark LINK_LIBRARIES)
+  target_link_libraries(benchmark_static PUBLIC ${TMP_PROP})
+
+  target_include_directories(ANN_BENCH PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+
+  target_link_libraries(
+    ANN_BENCH
+    PRIVATE raft::raft
+            nlohmann_json::nlohmann_json
+            benchmark_static
+            dl
+            -static-libgcc
+            fmt::fmt-header-only
+            spdlog::spdlog_header_only
+            -static-libstdc++
+            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+  )
+  set_target_properties(
+    ANN_BENCH
+    PROPERTIES # set target compile options
+               CXX_STANDARD 17
+               CXX_STANDARD_REQUIRED ON
+               CUDA_STANDARD 17
+               CUDA_STANDARD_REQUIRED ON
+               POSITION_INDEPENDENT_CODE ON
+               INTERFACE_POSITION_INDEPENDENT_CODE ON
+               BUILD_RPATH "\$ORIGIN"
+               INSTALL_RPATH "\$ORIGIN"
+  )
+  target_compile_definitions(
+    ANN_BENCH
+    PRIVATE
+      $<$<BOOL:${CUDAToolkit_FOUND}>:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}">
+      $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
+  )
+
+  target_link_options(ANN_BENCH PRIVATE -export-dynamic)
+
+  install(
+    TARGETS ANN_BENCH
+    COMPONENT ann_bench
+    DESTINATION bin/ann
+    EXCLUDE_FROM_ALL
+  )
+endif()
diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
new file mode 100644
index 000000000..4b17885c0
--- /dev/null
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cuda_stub.hpp"  // cudaStream_t
+
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace cuvs::bench {
+
+/** Benchmark mode: measuring latency vs throughput.
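+ *
+ * In latency mode, the times spent on individual batches are accumulated on a single thread;
+ * in throughput mode, several threads pipeline batches concurrently and the end-to-end time is
+ * measured (see the --mode flag handled in benchmark.hpp).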
+ */
+enum class Mode {
+  kThroughput,  // See how many vectors we can push through
+  kLatency      // See how fast we can push a vector through
+};
+
+enum class MemoryType {
+  kHost,
+  kHostMmap,
+  kDevice,
+};
+
+enum class Metric {
+  kInnerProduct,
+  kEuclidean,
+};
+
+inline auto parse_metric(const std::string& metric_str) -> Metric
+{
+  if (metric_str == "inner_product") {
+    return cuvs::bench::Metric::kInnerProduct;
+  } else if (metric_str == "euclidean") {
+    return cuvs::bench::Metric::kEuclidean;
+  } else {
+    throw std::runtime_error("invalid metric: '" + metric_str + "'");
+  }
+}
+
+inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
+{
+  if (memory_type == "host") {
+    return MemoryType::kHost;
+  } else if (memory_type == "mmap") {
+    return MemoryType::kHostMmap;
+  } else if (memory_type == "device") {
+    return MemoryType::kDevice;
+  } else {
+    throw std::runtime_error("invalid memory type: '" + memory_type + "'");
+  }
+}
+
+struct algo_property {
+  MemoryType dataset_memory_type;
+  // neighbors/distances should have same memory type as queries
+  MemoryType query_memory_type;
+};
+
+class algo_base {
+ public:
+  using index_type = int64_t;
+
+  inline algo_base(Metric metric, int dim) : metric_(metric), dim_(dim) {}
+  virtual ~algo_base() noexcept = default;
+
+ protected:
+  Metric metric_;
+  int dim_;
+};
+
+/**
+ * The GPU-based algorithms, which do not perform CPU synchronization at the end of their build or
+ * search methods, must implement this interface.
+ *
+ * The `cuda_timer` / `cuda_lap` from `util.hpp` uses this stream to record GPU times with events
+ * and, if necessary, also synchronize (via events) between iterations.
+ *
+ * If the algo does not implement this interface, GPU timings are disabled.
+ */
+class algo_gpu {
+ public:
+  /**
+   * Return the main cuda stream for this algorithm.
+   * If any work is done in multiple streams, they should synchronize with the main stream at the
+   * end.
+   */
+  [[nodiscard]] virtual auto get_sync_stream() const noexcept -> cudaStream_t = 0;
+  /**
+   * By default a GPU algorithm uses a fixed stream to order GPU operations.
+   * However, an algorithm may need to synchronize with the host at the end of its execution.
+   * In that case, also synchronizing with a benchmark event would put it at a disadvantage.
+   *
+   * We can disable event sync by passing `false` here
+   *   - ONLY IF THE ALGORITHM HAS PRODUCED ITS OUTPUT BY THE TIME IT SYNCHRONIZES WITH CPU.
+   */
+  [[nodiscard]] virtual auto uses_stream() const noexcept -> bool { return true; }
+  virtual ~algo_gpu() noexcept = default;
+};
+
+template <typename T>
+class algo : public algo_base {
+ public:
+  struct search_param {
+    virtual ~search_param() = default;
+    [[nodiscard]] virtual auto needs_dataset() const -> bool { return false; };
+  };
+
+  inline algo(Metric metric, int dim) : algo_base(metric, dim) {}
+  ~algo() noexcept override = default;
+
+  virtual void build(const T* dataset, size_t nrow) = 0;
+
+  virtual void set_search_param(const search_param& param) = 0;
+  // TODO(snanditale): this assumes that an algorithm can always return k results.
+  // This is not always possible.
+  virtual void search(const T* queries,
+                      int batch_size,
+                      int k,
+                      algo_base::index_type* neighbors,
+                      float* distances) const = 0;
+
+  virtual void save(const std::string& file) const = 0;
+  virtual void load(const std::string& file) = 0;
+
+  [[nodiscard]] virtual auto get_preference() const -> algo_property = 0;
+
+  // Some algorithms don't save the building dataset in their indices.
+  // So they should be given the access to that dataset during searching.
+  // The advantage of this way is that the index has a smaller size
+  // and many indices can share one dataset.
+  //
+  // search_param::needs_dataset() of such an algorithm should be true,
+  // and set_search_dataset() should save the passed-in pointer somewhere.
+  // The client code should call set_search_dataset() before searching,
+  // and should not release the dataset before searching is finished.
+  virtual void set_search_dataset(const T* /*dataset*/, size_t /*nrow*/){};
+
+  /**
+   * Make a shallow copy of the algo wrapper that shares the resources and ensures thread-safe
+   * access to them. */
+  virtual auto copy() -> std::unique_ptr<algo<T>> = 0;
+};
+
+}  // namespace cuvs::bench
+
+#define REGISTER_ALGO_INSTANCE(DataT)                                                     \
+  template auto cuvs::bench::create_algo<DataT>(                                          \
+    const std::string&, const std::string&, int, const nlohmann::json&)                   \
+    ->std::unique_ptr<cuvs::bench::algo<DataT>>;                                          \
+  template auto cuvs::bench::create_search_param<DataT>(const std::string&,               \
+                                                        const nlohmann::json&)            \
+    ->std::unique_ptr<typename cuvs::bench::algo<DataT>::search_param>;
diff --git a/cpp/bench/ann/src/common/benchmark.cpp b/cpp/bench/ann/src/common/benchmark.cpp
new file mode 100644
index 000000000..7ec8558a4
--- /dev/null
+++ b/cpp/bench/ann/src/common/benchmark.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+#include "cuda_stub.hpp"  // must go first
+// clang-format on
+
+#include "ann_types.hpp"
+
+#include <dlfcn.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+namespace cuvs::bench {
+
+struct lib_handle {
+  void* handle{nullptr};
+  explicit lib_handle(const std::string& name)
+  {
+    handle = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
+    if (handle == nullptr) {
+      auto error_msg = "Failed to load " + name;
+      auto err       = dlerror();
+      if (err != nullptr && err[0] != '\0') { error_msg += ": " + std::string(err); }
+      throw std::runtime_error(error_msg);
+    }
+  }
+  ~lib_handle() noexcept
+  {
+    if (handle != nullptr) { dlclose(handle); }
+  }
+};
+
+auto load_lib(const std::string& algo) -> void*
+{
+  static std::unordered_map<std::string, lib_handle> libs{};
+  auto found = libs.find(algo);
+
+  if (found != libs.end()) { return found->second.handle; }
+  auto lib_name = "lib" + algo + "_ann_bench.so";
+  return libs.emplace(algo, lib_name).first->second.handle;
+}
+
+/*
+  TODO(achirkin): remove this compatibility layer.
+  When reading old raft algo configs, we may encounter raft_xxx algorithms;
+  they have all been renamed to cuvs_xxx algorithms.
+  This compatibility layer helps using old configs with the new benchmark executable.
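+  For example, an old config entry with "algo": "raft_ivf_pq" is redirected to the
+  "cuvs_ivf_pq" plugin, i.e. libcuvs_ivf_pq_ann_bench.so is loaded instead.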
+ */
+auto load_lib_raft_compat(const std::string& algo) -> void*
+{
+  try {
+    return load_lib(algo);
+  } catch (std::runtime_error& e) {
+    if (algo.rfind("raft", 0) == 0) { return load_lib("cuvs" + algo.substr(4)); }
+    throw e;
+  }
+}
+
+auto get_fun_name(void* addr) -> std::string
+{
+  Dl_info dl_info;
+  if (dladdr(addr, &dl_info) != 0) {
+    if (dl_info.dli_sname != nullptr && dl_info.dli_sname[0] != '\0') {
+      return std::string{dl_info.dli_sname};
+    }
+  }
+  throw std::logic_error("Failed to find out name of the looked up function");
+}
+
+template <typename T>
+auto create_algo(const std::string& algo,
+                 const std::string& distance,
+                 int dim,
+                 const nlohmann::json& conf) -> std::unique_ptr<cuvs::bench::algo<T>>
+{
+  static auto fname = get_fun_name(reinterpret_cast<void*>(&create_algo<T>));
+  auto handle       = load_lib_raft_compat(algo);
+  auto fun_addr     = dlsym(handle, fname.c_str());
+  if (fun_addr == nullptr) {
+    throw std::runtime_error("Couldn't load the create_algo function (" + algo + ")");
+  }
+  auto fun = reinterpret_cast<decltype(&create_algo<T>)>(fun_addr);
+  return fun(algo, distance, dim, conf);
+}
+
+template <typename T>
+std::unique_ptr<typename cuvs::bench::algo<T>::search_param> create_search_param(
+  const std::string& algo, const nlohmann::json& conf)
+{
+  static auto fname = get_fun_name(reinterpret_cast<void*>(&create_search_param<T>));
+  auto handle       = load_lib_raft_compat(algo);
+  auto fun_addr     = dlsym(handle, fname.c_str());
+  if (fun_addr == nullptr) {
+    throw std::runtime_error("Couldn't load the create_search_param function (" + algo + ")");
+  }
+  auto fun = reinterpret_cast<decltype(&create_search_param<T>)>(fun_addr);
+  return fun(algo, conf);
+}
+
+};  // namespace cuvs::bench
+
+REGISTER_ALGO_INSTANCE(float);
+REGISTER_ALGO_INSTANCE(std::int8_t);
+REGISTER_ALGO_INSTANCE(std::uint8_t);
+
+#include "benchmark.hpp"
+
+auto main(int argc, char** argv) -> int { return cuvs::bench::run_main(argc, argv); }
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
new file mode 100644
index 000000000..e6462c157
--- /dev/null
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "ann_types.hpp"
+#include "conf.hpp"
+#include "dataset.hpp"
+#include "util.hpp"
+
+#include <benchmark/benchmark.h>
+#include <nlohmann/json.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <fstream>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <tuple>
+#include <vector>
+
+namespace cuvs::bench {
+
+static inline std::unique_ptr<algo_base> current_algo{nullptr};
+static inline std::unique_ptr<algo_property> current_algo_props{nullptr};
+
+using kv_series = std::vector<std::tuple<std::string, std::vector<nlohmann::json>>>;
+
+inline auto apply_overrides(const std::vector<nlohmann::json>& configs,
+                            const kv_series& overrides,
+                            std::size_t override_idx = 0) -> std::vector<nlohmann::json>
+{
+  std::vector<nlohmann::json> results{};
+  if (override_idx >= overrides.size()) {
+    auto n = configs.size();
+    for (size_t i = 0; i < n; i++) {
+      auto c               = configs[i];
+      c["override_suffix"] = n > 1 ? "/" + std::to_string(i) : "";
+      results.push_back(c);
+    }
+    return results;
+  }
+  auto rec_configs = apply_overrides(configs, overrides, override_idx + 1);
+  auto [key, vals] = overrides[override_idx];
+  auto n           = vals.size();
+  for (size_t i = 0; i < n; i++) {
+    const auto& val = vals[i];
+    for (auto rc : rec_configs) {
+      if (n > 1) {
+        rc["override_suffix"] =
+          static_cast<std::string>(rc["override_suffix"]) + "/" + std::to_string(i);
+      }
+      rc[key] = val;
+      results.push_back(rc);
+    }
+  }
+  return results;
+}
+
+inline auto apply_overrides(const nlohmann::json& config,
+                            const kv_series& overrides,
+                            std::size_t override_idx = 0)
+{
+  return apply_overrides(std::vector<nlohmann::json>{config}, overrides, 0);
+}
+
+inline void dump_parameters(::benchmark::State& state, nlohmann::json params)
+{
+  std::string label = "";
+  bool label_empty  = true;
+  for (auto& [key, val] : params.items()) {
+    if (val.is_number()) {
+      state.counters.insert({{key, val}});
+    } else if (val.is_boolean()) {
+      state.counters.insert({{key, val ? 1.0 : 0.0}});
+    } else {
+      auto kv = key + "=" + val.dump();
+      if (label_empty) {
+        label = kv;
+      } else {
+        label += "#" + kv;
+      }
+      label_empty = false;
+    }
+  }
+  if (!label_empty) { state.SetLabel(label); }
+}
+
+inline auto parse_algo_property(algo_property prop, const nlohmann::json& conf) -> algo_property
+{
+  if (conf.contains("dataset_memory_type")) {
+    prop.dataset_memory_type = parse_memory_type(conf.at("dataset_memory_type"));
+  }
+  if (conf.contains("query_memory_type")) {
+    prop.query_memory_type = parse_memory_type(conf.at("query_memory_type"));
+  }
+  return prop;
+};
+
+template <typename T>
+void bench_build(::benchmark::State& state,
+                 std::shared_ptr<const dataset<T>> dataset,
+                 configuration::index index,
+                 bool force_overwrite)
+{
+  // NB: these two thread-local vars can be used within algo wrappers
+  cuvs::bench::benchmark_thread_id = state.thread_index();
+  cuvs::bench::benchmark_n_threads = state.threads();
+  dump_parameters(state, index.build_param);
+  if (file_exists(index.file)) {
+    if (force_overwrite) {
+      log_info("Overwriting file: %s", index.file.c_str());
+    } else {
+      return state.SkipWithMessage(
+        "Index file already exists (use --force to overwrite the index).");
+    }
+  }
+
+  std::unique_ptr<algo<T>> algo;
+  try {
+    algo = create_algo<T>(index.algo, dataset->distance(), dataset->dim(), index.build_param);
+  } catch (const std::exception& e) {
+    return state.SkipWithError("Failed to create an algo: " + std::string(e.what()));
+  }
+
+  const auto algo_property = parse_algo_property(algo->get_preference(), index.build_param);
+
+  const T* base_set      = dataset->base_set(algo_property.dataset_memory_type);
+  std::size_t index_size = dataset->base_set_size();
+
+  cuda_timer gpu_timer{algo};
+  {
+    nvtx_case nvtx{state.name()};
+    for (auto _ : state) {
+      [[maybe_unused]] auto ntx_lap = nvtx.lap();
+      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+      try {
+        algo->build(base_set, index_size);
+      } catch (const std::exception& e) {
+        state.SkipWithError(std::string(e.what()));
+      }
+    }
+  }
+  if (gpu_timer.active()) {
+    state.counters.insert({"GPU", {gpu_timer.total_time(), benchmark::Counter::kAvgIterations}});
+  }
+  state.counters.insert({{"index_size", index_size}});
+
+  if (state.skipped()) { return; }
+  make_sure_parent_dir_exists(index.file);
+  algo->save(index.file);
+}
+
+template <typename T>
+void bench_search(::benchmark::State& state,
+                  configuration::index index,
+                  std::size_t search_param_ix,
+                  std::shared_ptr<const dataset<T>> dataset)
+{
+  // NB: these two thread-local vars can be used within algo wrappers
+  cuvs::bench::benchmark_thread_id = state.thread_index();
+  cuvs::bench::benchmark_n_threads = state.threads();
+  std::size_t queries_processed = 0;
+
+  const auto& sp_json = index.search_params[search_param_ix];
+
+  if (state.thread_index() == 0) { dump_parameters(state, sp_json); }
+
+  // NB: `k` and `n_queries` are guaranteed to be populated in conf.cpp
+  const std::uint32_t k = sp_json["k"];
+  // Amount of data processed in one go
+  const std::size_t n_queries = sp_json["n_queries"];
+  // Round down the query data to a multiple of the batch size to loop over full batches of data
+  const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries;
+
+  if (dataset->query_set_size() < n_queries) {
+    std::stringstream msg;
+    msg << "Not enough queries in benchmark set. Expected " << n_queries << ", actual "
+        << dataset->query_set_size();
+    state.SkipWithError(msg.str());
+    return;
+  }
+
+  // Each thread starts from a different offset, so that the queries that they process do not
+  // overlap.
+  std::ptrdiff_t batch_offset   = (state.thread_index() * n_queries) % query_set_size;
+  std::ptrdiff_t queries_stride = state.threads() * n_queries;
+  // Output is saved into a contiguous buffer (separate buffers for each thread).
+  std::ptrdiff_t out_offset = 0;
+
+  const T* query_set = nullptr;
+
+  if (!file_exists(index.file)) {
+    state.SkipWithError("Index file is missing. Run the benchmark in the build mode first.");
+    return;
+  }
+
+  /**
+   * Make sure the first thread loads the algo and dataset
+   */
+  progress_barrier load_barrier{};
+  if (load_barrier.arrive(1) == 0) {
+    // algo is static to cache it between close search runs to save time on index loading
+    static std::string index_file = "";
+    if (index.file != index_file) {
+      current_algo.reset();
+      index_file = index.file;
+    }
+
+    std::unique_ptr<typename algo<T>::search_param> search_param;
+    algo<T>* a;
+    try {
+      if (!current_algo || (a = dynamic_cast<algo<T>*>(current_algo.get())) == nullptr) {
+        auto ualgo =
+          create_algo<T>(index.algo, dataset->distance(), dataset->dim(), index.build_param);
+        a = ualgo.get();
+        a->load(index_file);
+        current_algo = std::move(ualgo);
+      }
+      search_param = create_search_param<T>(index.algo, sp_json);
+    } catch (const std::exception& e) {
+      state.SkipWithError("Failed to create an algo: " + std::string(e.what()));
+      return;
+    }
+
+    current_algo_props = std::make_unique<algo_property>(
+      std::move(parse_algo_property(a->get_preference(), sp_json)));
+
+    if (search_param->needs_dataset()) {
+      try {
+        a->set_search_dataset(dataset->base_set(current_algo_props->dataset_memory_type),
+                              dataset->base_set_size());
+      } catch (const std::exception& ex) {
+        state.SkipWithError("The algorithm '" + index.name +
+                            "' requires the base set, but it's not available. " +
+                            "Exception: " + std::string(ex.what()));
+        return;
+      }
+    }
+    try {
+      a->set_search_param(*search_param);
+    } catch (const std::exception& ex) {
+      state.SkipWithError("An error occurred setting search parameters: " +
+                          std::string(ex.what()));
+      return;
+    }
+
+    query_set = dataset->query_set(current_algo_props->query_memory_type);
+    load_barrier.arrive(state.threads());
+  } else {
+    // All other threads will wait for the first thread to initialize the algo.
+    load_barrier.wait(state.threads() * 2);
+    // gbench ensures that all threads are synchronized at the start of the benchmark loop.
+    // We are accessing shared variables (like current_algo, current_algo_props) before the
+    // benchmark loop, therefore the synchronization here is necessary.
+  }
+  query_set = dataset->query_set(current_algo_props->query_memory_type);
+
+  /**
+   * Each thread will manage its own outputs
+   */
+  using index_type             = algo_base::index_type;
+  constexpr size_t kAlignResultBuf = 64;
+  size_t result_elem_count = k * query_set_size;
+  result_elem_count =
+    ((result_elem_count + kAlignResultBuf - 1) / kAlignResultBuf) * kAlignResultBuf;
+  auto& result_buf =
+    get_result_buffer_from_global_pool(result_elem_count * (sizeof(float) + sizeof(index_type)));
+  auto* neighbors_ptr =
+    reinterpret_cast<index_type*>(result_buf.data(current_algo_props->query_memory_type));
+  auto* distances_ptr = reinterpret_cast<float*>(neighbors_ptr + result_elem_count);
+
+  {
+    nvtx_case nvtx{state.name()};
+
+    std::unique_ptr<algo<T>> a{nullptr};
+    try {
+      dynamic_cast<algo<T>*>(current_algo.get())->copy().swap(a);
+    } catch (const std::exception& e) {
+      state.SkipWithError("Algo::copy: " + std::string(e.what()));
+      return;
+    }
+    // Initialize with algo, so that the timer.lap() object can sync with algo::get_sync_stream()
+    cuda_timer gpu_timer{a};
+    auto start = std::chrono::high_resolution_clock::now();
+    for (auto _ : state) {
+      [[maybe_unused]] auto ntx_lap = nvtx.lap();
+      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+      try {
+        a->search(query_set + batch_offset * dataset->dim(),
+                  n_queries,
+                  k,
+                  neighbors_ptr + out_offset * k,
+                  distances_ptr + out_offset * k);
+      } catch (const std::exception& e) {
+        state.SkipWithError("Benchmark loop: " + std::string(e.what()));
+        break;
+      }
+
+      // advance to the next batch
+      batch_offset = (batch_offset + queries_stride) % query_set_size;
+      out_offset   = (out_offset + n_queries) % query_set_size;
+
+      queries_processed += n_queries;
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+      std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
+    if (state.thread_index() == 0) { state.counters.insert({{"end_to_end", duration}}); }
+    state.counters.insert({"Latency", {duration, benchmark::Counter::kAvgIterations}});
+
+    if (gpu_timer.active()) {
+      state.counters.insert({"GPU", {gpu_timer.total_time(), benchmark::Counter::kAvgIterations}});
+    }
+  }
+
+  state.SetItemsProcessed(queries_processed);
+
+  // This will be the total number of queries across all threads
+  state.counters.insert({{"total_queries", queries_processed}});
+
+  if (state.skipped()) { return; }
+
+  // Each thread calculates recall on their partition of queries.
+  // evaluate recall
+  if (dataset->max_k() >= k) {
+    const std::int32_t* gt    = dataset->gt_set();
+    const std::uint32_t max_k = dataset->max_k();
+    result_buf.transfer_data(MemoryType::kHost, current_algo_props->query_memory_type);
+    auto* neighbors_host = reinterpret_cast<index_type*>(result_buf.data(MemoryType::kHost));
+    std::size_t rows        = std::min(queries_processed, query_set_size);
+    std::size_t match_count = 0;
+    std::size_t total_count = rows * static_cast<size_t>(k);
+
+    // We go through the groundtruth with same stride as the benchmark loop.
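+    // Here, i_orig_idx walks the ground-truth rows in the original query order, while i_out_idx
+    // walks this thread's contiguous output buffer; a result that appears anywhere among the
+    // first k ground-truth neighbors counts as a match.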
+    size_t out_offset   = 0;
+    size_t batch_offset = (state.thread_index() * n_queries) % query_set_size;
+    while (out_offset < rows) {
+      for (std::size_t i = 0; i < n_queries; i++) {
+        size_t i_orig_idx = batch_offset + i;
+        size_t i_out_idx  = out_offset + i;
+        if (i_out_idx < rows) {
+          for (std::uint32_t j = 0; j < k; j++) {
+            auto act_idx = static_cast<std::int32_t>(neighbors_host[i_out_idx * k + j]);
+            for (std::uint32_t l = 0; l < k; l++) {
+              auto exp_idx = gt[i_orig_idx * max_k + l];
+              if (act_idx == exp_idx) {
+                match_count++;
+                break;
+              }
+            }
+          }
+        }
+      }
+      out_offset += n_queries;
+      batch_offset = (batch_offset + queries_stride) % query_set_size;
+    }
+    double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
+    state.counters.insert({"Recall", {actual_recall, benchmark::Counter::kAvgThreads}});
+  }
+}
+
+inline void printf_usage()
+{
+  ::benchmark::PrintDefaultHelp();
+  fprintf(stdout,
+          "          [--build|--search] \n"
+          "          [--force]\n"
+          "          [--data_prefix=<prefix>]\n"
+          "          [--index_prefix=<prefix>]\n"
+          "          [--override_kv=<key:value1:value2:...:valueN>]\n"
+          "          [--mode=<latency|throughput>]\n"
+          "          [--threads=min[:max]]\n"
+          "          <conf>.json\n"
+          "\n"
+          "Note the non-standard benchmark parameters:\n"
+          "  --build: build mode, will build index\n"
+          "  --search: search mode, will search using the built index\n"
+          "           one and only one of --build and --search should be specified\n"
+          "  --force: force overwriting existing index files\n"
+          "  --data_prefix=<prefix>:"
+          " prepend <prefix> to dataset file paths specified in the <conf>.json (default = "
+          "'data/').\n"
+          "  --index_prefix=<prefix>:"
+          " prepend <prefix> to index file paths specified in the <conf>.json (default = "
+          "'index/').\n"
+          "  --override_kv=<key:value1:value2:...:valueN>:"
+          " override a build/search key one or more times multiplying the number of configurations;"
+          " you can use this parameter multiple times to get the Cartesian product of benchmark"
+          " configs.\n"
+          "  --mode=<latency|throughput>"
+          " run the benchmarks in latency (accumulate times spent in each batch) or "
+          " throughput (pipeline batches and measure end-to-end) mode\n"
+          "  --threads=min[:max] specify the number of threads to use for throughput benchmark."
+          " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified,"
+          " then a single test is run with 'min' threads. By default min=1, max=<num. of CPU "
+          "cores>.\n");
+}
+
+template <typename T>
+void register_build(std::shared_ptr<const dataset<T>> dataset,
+                    std::vector<configuration::index> indices,
+                    bool force_overwrite)
+{
+  for (auto index : indices) {
+    auto suf      = static_cast<std::string>(index.build_param["override_suffix"]);
+    auto file_suf = suf;
+    index.build_param.erase("override_suffix");
+    std::replace(file_suf.begin(), file_suf.end(), '/', '-');
+    index.file += file_suf;
+    auto* b = ::benchmark::RegisterBenchmark(
+      index.name + suf, bench_build<T>, dataset, index, force_overwrite);
+    b->Unit(benchmark::kSecond);
+    b->MeasureProcessCPUTime();
+    b->UseRealTime();
+  }
+}
+
+template <typename T>
+void register_search(std::shared_ptr<const dataset<T>> dataset,
+                     std::vector<configuration::index> indices,
+                     Mode metric_objective,
+                     const std::vector<int>& threads)
+{
+  for (auto index : indices) {
+    for (std::size_t i = 0; i < index.search_params.size(); i++) {
+      auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
+      index.search_params[i].erase("override_suffix");
+
+      auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search<T>, index, i, dataset)
+                  ->Unit(benchmark::kMillisecond)
+                  /**
+                   * The following are important for getting accurate QPS measurements on both CPU
+                   * and GPU. These make sure that
+                   *   - `end_to_end` ~ (`Time` * `Iterations`)
+                   *   - `items_per_second` ~ (`total_queries` / `end_to_end`)
+                   *   - Throughput = `items_per_second`
+                   */
+                  ->MeasureProcessCPUTime()
+                  ->UseRealTime();
+
+      if (metric_objective == Mode::kThroughput) { b->ThreadRange(threads[0], threads[1]); }
+    }
+  }
+}
+
+template <typename T>
+void dispatch_benchmark(const configuration& conf,
+                        bool force_overwrite,
+                        bool build_mode,
+                        bool search_mode,
+                        std::string data_prefix,
+                        std::string index_prefix,
+                        kv_series override_kv,
+                        Mode metric_objective,
+                        const std::vector<int>& threads)
+{
+  if (cudart.found()) {
+    for (auto [key, value] : cuda_info()) {
+      ::benchmark::AddCustomContext(key, value);
+    }
+  }
+  const auto dataset_conf = conf.get_dataset_conf();
+  auto base_file          = combine_path(data_prefix, dataset_conf.base_file);
+  auto query_file         = combine_path(data_prefix, dataset_conf.query_file);
+  auto gt_file            = dataset_conf.groundtruth_neighbors_file;
+  if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); }
+  auto dataset = std::make_shared<bench::dataset<T>>(dataset_conf.name,
+                                                     base_file,
+                                                     dataset_conf.subset_first_row,
+                                                     dataset_conf.subset_size,
+                                                     query_file,
+                                                     dataset_conf.distance,
+                                                     gt_file);
+  ::benchmark::AddCustomContext("dataset", dataset_conf.name);
+  ::benchmark::AddCustomContext("distance", dataset_conf.distance);
+  std::vector<configuration::index> indices = conf.get_indices();
+  if (build_mode) {
+    if (file_exists(base_file)) {
+      log_info("Using the dataset file '%s'", base_file.c_str());
+      ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size()));
+      ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
+    } else {
+      log_warn("dataset file '%s' does not exist; benchmarking index building is impossible.",
+               base_file.c_str());
+    }
+    std::vector<configuration::index> more_indices{};
+    for (auto& index : indices) {
+      for (auto param : apply_overrides(index.build_param, override_kv)) {
+        auto modified_index        = index;
+        modified_index.build_param = param;
+        modified_index.file        = combine_path(index_prefix, modified_index.file);
+        more_indices.push_back(modified_index);
+      }
+    }
+    register_build<T>(dataset, more_indices, force_overwrite);
+  } else if (search_mode) {
+    if (file_exists(query_file)) {
+      log_info("Using the query file '%s'", query_file.c_str());
+      ::benchmark::AddCustomContext("max_n_queries", std::to_string(dataset->query_set_size()));
+      ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
+      if (gt_file.has_value()) {
+        if (file_exists(*gt_file)) {
+          log_info("Using the ground truth file '%s'", gt_file->c_str());
+          ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k()));
+        } else {
+          log_warn("Ground truth file '%s' does not exist; the recall won't be reported.",
+                   gt_file->c_str());
+        }
+      } else {
+        log_warn(
+          "Ground truth file is not provided; the recall won't be reported. NB: use "
+          "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the "
+          "path to "
+          "the ground truth in your conf.json.");
+      }
+    } else {
+      log_warn("Query file '%s' does not exist; benchmarking search is impossible.",
+               query_file.c_str());
+    }
+    for (auto& index : indices) {
+      index.search_params = apply_overrides(index.search_params, override_kv);
+      index.file          = combine_path(index_prefix, index.file);
+    }
+    register_search<T>(dataset, indices, metric_objective, threads);
+  }
+}
+
+inline auto parse_bool_flag(const char* arg, const char* pat, bool& result) -> bool
+{
+  if (strcmp(arg, pat) == 0) {
+    result = true;
+    return true;
+  }
+  return false;
+}
+
+inline auto parse_string_flag(const char* arg, const char* pat, std::string& result) -> bool
+{
+  auto n = strlen(pat);
+  if (strncmp(pat, arg, strlen(pat)) == 0) {
+    result = arg + n + 1;
+    return true;
+  }
+  return false;
+}
+
+inline auto run_main(int argc, char** argv) -> int
+{
+  bool force_overwrite        = false;
+  bool build_mode             = false;
+  bool search_mode            = false;
+  std::string data_prefix     = "data";
+  std::string index_prefix    = "index";
+  std::string new_override_kv = "";
+  std::string mode            = "latency";
+  std::string threads_arg_txt = "";
+  std::vector<int> threads    = {1, -1};  // min_thread, max_thread
+  std::string log_level_str   = "";
+  [[maybe_unused]] int raft_log_level = 0;  // raft::logger::get(RAFT_NAME).get_level();
+  kv_series override_kv{};
+
+  char arg0_default[] = "benchmark";  // NOLINT
+  char* args_default  = arg0_default;
+  if (!argv) {
+    argc = 1;
+    argv = &args_default;
+  }
+  if (argc == 1) {
+    printf_usage();
+    return -1;
+  }
+
+  char* conf_path = argv[--argc];
+  std::ifstream conf_stream(conf_path);
+
+  for (int i = 1; i < argc; i++) {
+    if (parse_bool_flag(argv[i], "--force", force_overwrite) ||
+        parse_bool_flag(argv[i], "--build", build_mode) ||
+        parse_bool_flag(argv[i], "--search", search_mode) ||
+        parse_string_flag(argv[i], "--data_prefix", data_prefix) ||
+        parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
+        parse_string_flag(argv[i], "--mode", mode) ||
+        parse_string_flag(argv[i], "--override_kv", new_override_kv) ||
+        parse_string_flag(argv[i], "--threads", threads_arg_txt) ||
+        parse_string_flag(argv[i], "--raft_log_level", log_level_str)) {
+      if (!log_level_str.empty()) {
+        raft_log_level = std::stoi(log_level_str);
+        log_level_str  = "";
+      }
+      if (!threads_arg_txt.empty()) {
+        auto threads_arg = split(threads_arg_txt, ':');
+        threads[0]       = std::stoi(threads_arg[0]);
+        if (threads_arg.size() > 1) {
+          threads[1] = std::stoi(threads_arg[1]);
+        } else {
+          threads[1] = threads[0];
+        }
+        threads_arg_txt = "";
+      }
+      if (!new_override_kv.empty()) {
+        auto kvv = split(new_override_kv, ':');
+        auto key = kvv[0];
+        std::vector<nlohmann::json> vals{};
+        for (std::size_t j = 1; j < kvv.size(); j++) {
+          vals.push_back(nlohmann::json::parse(kvv[j]));
+        }
+        override_kv.emplace_back(key, vals);
+        new_override_kv = "";
+      }
+      for (int j = i; j < argc - 1; j++) {
+        argv[j] = argv[j + 1];
+      }
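+      // The consumed flag has been shifted out of argv above; shrink argc and stay at the
+      // same index so that the argument now at position i is examined in the next iteration.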
argc--; + i--; + } + } + + // raft::logger::get(RAFT_NAME).set_level(raft_log_level); + + Mode metric_objective = Mode::kLatency; + if (mode == "throughput") { metric_objective = Mode::kThroughput; } + + int max_threads = + (metric_objective == Mode::kThroughput) ? std::thread::hardware_concurrency() : 1; + if (threads[1] == -1) threads[1] = max_threads; + + if (metric_objective == Mode::kLatency) { + if (threads[0] != 1 || threads[1] != 1) { + log_warn("Latency mode enabled. Overriding threads arg, running with single thread."); + threads = {1, 1}; + } + } + + if (build_mode == search_mode) { + log_error("One and only one of --build and --search should be specified"); + printf_usage(); + return -1; + } + + if (!conf_stream) { + log_error("Can't open configuration file: %s", conf_path); + return -1; + } + + if (cudart.needed() && !cudart.found()) { + log_warn("cudart library is not found, GPU-based indices won't work."); + } + + configuration conf(conf_stream); + std::string dtype = conf.get_dataset_conf().dtype; + + if (dtype == "float") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); + // } else if (dtype == "half") { + // dispatch_benchmark(conf, + // force_overwrite, + // build_mode, + // search_mode, + // data_prefix, + // index_prefix, + // override_kv, + // metric_objective, + // threads); + } else if (dtype == "uint8") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); + } else if (dtype == "int8") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); + } else { + log_error("datatype '%s' is not supported", dtype.c_str()); + return -1; + } + + ::benchmark::Initialize(&argc, argv, printf_usage); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1; + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + // Release a possibly cached algo object, so that it cannot be alive longer than the handle + // to a shared library it depends on (dynamic benchmark executable). + current_algo.reset(); + current_algo_props.reset(); + reset_global_device_resources(); + return 0; +} +}; // namespace cuvs::bench diff --git a/cpp/bench/ann/src/common/conf.hpp b/cpp/bench/ann/src/common/conf.hpp new file mode 100644 index 000000000..1fc7327cb --- /dev/null +++ b/cpp/bench/ann/src/common/conf.hpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "util.hpp" + +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +class configuration { + public: + struct index { + std::string name; + std::string algo; + nlohmann::json build_param; + std::string file; + + int batch_size; + int k; + std::vector search_params; + }; + + struct dataset_conf { + std::string name; + std::string base_file; + // use only a subset of base_file, + // the range of rows is [subset_first_row, subset_first_row + subset_size) + // however, subset_size = 0 means using all rows after subset_first_row + // that is, the subset is [subset_first_row, #rows in base_file) + size_t subset_first_row{0}; + size_t subset_size{0}; + std::string query_file; + std::string distance; + std::optional groundtruth_neighbors_file{std::nullopt}; + + // data type of input dataset, possible values ["float", "int8", "uint8"] + std::string dtype; + }; + + explicit inline configuration(std::istream& conf_stream) + { + // to enable comments in json + auto conf = nlohmann::json::parse(conf_stream, nullptr, true, true); + + parse_dataset(conf.at("dataset")); + parse_index(conf.at("index"), conf.at("search_basic_param")); + } + + [[nodiscard]] inline auto get_dataset_conf() const -> dataset_conf { return dataset_conf_; } + [[nodiscard]] inline auto get_indices() const -> std::vector { return indices_; }; + + private: + inline void parse_dataset(const nlohmann::json& conf) + { + dataset_conf_.name = conf.at("name"); + dataset_conf_.base_file = conf.at("base_file"); + dataset_conf_.query_file = conf.at("query_file"); + dataset_conf_.distance = conf.at("distance"); + + if (conf.contains("groundtruth_neighbors_file")) { + dataset_conf_.groundtruth_neighbors_file = conf.at("groundtruth_neighbors_file"); + } + if (conf.contains("subset_first_row")) { + dataset_conf_.subset_first_row = conf.at("subset_first_row"); + } + if (conf.contains("subset_size")) { dataset_conf_.subset_size = conf.at("subset_size"); } + + if (conf.contains("dtype")) { + dataset_conf_.dtype = conf.at("dtype"); + } else { + auto filename = dataset_conf_.base_file; + if (filename.size() > 6 && filename.compare(filename.size() - 6, 6, "f16bin") == 0) { + dataset_conf_.dtype = "half"; + } else if (filename.size() > 9 && + filename.compare(filename.size() - 9, 9, "fp16.fbin") == 0) { + dataset_conf_.dtype = "half"; + } else if (filename.size() > 4 && filename.compare(filename.size() - 4, 4, "fbin") == 0) { + dataset_conf_.dtype = "float"; + } else if (filename.size() > 5 && filename.compare(filename.size() - 5, 5, "u8bin") == 0) { + dataset_conf_.dtype = "uint8"; + } else if (filename.size() > 5 && filename.compare(filename.size() - 5, 5, "i8bin") == 0) { + dataset_conf_.dtype = "int8"; + } else { + log_error("Could not determine data type of the dataset %s", filename.c_str()); + } + } + } + inline void parse_index(const nlohmann::json& index_conf, const nlohmann::json& search_basic_conf) + { + const int batch_size = search_basic_conf.at("batch_size"); + const int k = search_basic_conf.at("k"); + + for (const auto& conf : index_conf) { + index index; + index.name = conf.at("name"); + index.algo = conf.at("algo"); + index.build_param = conf.at("build_param"); + index.file = conf.at("file"); + index.batch_size = batch_size; + index.k = k; + + for (auto param : conf.at("search_params")) { + /* ### Special parameters for backward compatibility ### + + - Local values of `k` and `n_queries` take priority. + - The legacy "batch_size" renamed to `n_queries`. 
+ - Basic search params are used otherwise. + */ + if (!param.contains("k")) { param["k"] = k; } + if (!param.contains("n_queries")) { + if (param.contains("batch_size")) { + param["n_queries"] = param["batch_size"]; + param.erase("batch_size"); + } else { + param["n_queries"] = batch_size; + } + } + index.search_params.push_back(param); + } + + indices_.push_back(index); + } + } + + dataset_conf dataset_conf_; + std::vector indices_; +}; + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp new file mode 100644 index 000000000..a4d967dd1 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace raft::mr { +/** + * @brief `device_memory_resource` derived class that uses mmap to allocate memory. + * This class enables memory allocation using huge pages. + * It is assumed that the allocated memory is directly accessible on device. This currently only + * works on GH systems. + * + * TODO(tfeher): consider improving or removing this helper once we made progress with + * https://github.com/rapidsai/raft/issues/1819 + */ +class cuda_huge_page_resource final : public rmm::mr::device_memory_resource { + public: + cuda_huge_page_resource() = default; + ~cuda_huge_page_resource() override = default; + cuda_huge_page_resource(cuda_huge_page_resource const&) = default; + cuda_huge_page_resource(cuda_huge_page_resource&&) = default; + auto operator=(cuda_huge_page_resource const&) -> cuda_huge_page_resource& = default; + auto operator=(cuda_huge_page_resource&&) -> cuda_huge_page_resource& = default; + + private: + /** + * @brief Allocates memory of size at least `bytes` using cudaMalloc. + * + * The returned pointer has at least 256B alignment. + * + * @note Stream argument is ignored + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * + * @param bytes The size, in bytes, of the allocation + * @return void* Pointer to the newly allocated memory + */ + auto do_allocate(std::size_t bytes, rmm::cuda_stream_view) -> void* override + { + void* addr{nullptr}; + addr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { RAFT_FAIL("huge_page_resource::MAP FAILED"); } + if (madvise(addr, bytes, MADV_HUGEPAGE) == -1) { + munmap(addr, bytes); + RAFT_FAIL("huge_page_resource::madvise MADV_HUGEPAGE"); + } + memset(addr, 0, bytes); + return addr; + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * @note Stream argument is ignored. + * + * @throws Nothing. 
+   *
+   * @param p Pointer to be deallocated
+   */
+  void do_deallocate(void* ptr, std::size_t size, rmm::cuda_stream_view) override
+  {
+    if (munmap(ptr, size) == -1) { RAFT_FAIL("huge_page_resource::munmap"); }
+  }
+
+  /**
+   * @brief Compare this resource to another.
+   *
+   * Two cuda_huge_page_resources always compare equal, because they can each
+   * deallocate memory allocated by the other.
+   *
+   * @throws Nothing.
+   *
+   * @param other The other resource to compare to
+   * @return true If the two resources are equivalent
+   * @return false If the two resources are not equal
+   */
+  [[nodiscard]] auto do_is_equal(device_memory_resource const& other) const noexcept
+    -> bool override
+  {
+    return dynamic_cast<cuda_huge_page_resource const*>(&other) != nullptr;
+  }
+};
+}  // namespace raft::mr
diff --git a/cpp/bench/ann/src/common/cuda_pinned_resource.hpp b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp
new file mode 100644
index 000000000..fb977cd34
--- /dev/null
+++ b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/error.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <cuda_runtime_api.h>
+
+namespace raft::mr {
+/**
+ * @brief `device_memory_resource` derived class that uses cudaMallocHost/Free for
+ * allocation/deallocation.
+ *
+ * This is almost the same as rmm::mr::host::pinned_memory_resource, but it has
+ * device_memory_resource as base class. Pinned memory can be accessed from device,
+ * and using this allocator we can create device_mdarray backed by pinned allocator.
+ *
+ * TODO(tfeher): it would be preferable to just rely on the existing allocator from rmm
+ * (pinned_memory_resource), but that is incompatible with the container_policy class
+ * for device matrix, because the latter expects a device_memory_resource. We shall
+ * revise this once we progress with Issue https://github.com/rapidsai/raft/issues/1819
+ */
+class cuda_pinned_resource final : public rmm::mr::device_memory_resource {
+ public:
+  cuda_pinned_resource() = default;
+  ~cuda_pinned_resource() override = default;
+  cuda_pinned_resource(cuda_pinned_resource const&) = default;
+  cuda_pinned_resource(cuda_pinned_resource&&) = default;
+  auto operator=(cuda_pinned_resource const&) -> cuda_pinned_resource& = default;
+  auto operator=(cuda_pinned_resource&&) -> cuda_pinned_resource& = default;
+
+ private:
+  /**
+   * @brief Allocates pinned host memory of size at least `bytes` using cudaMallocHost.
+   *
+   * The returned pointer has at least 256B alignment.
+   *
+   * @note Stream argument is ignored
+   *
+   * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled
+   *
+   * @param bytes The size, in bytes, of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  auto do_allocate(std::size_t bytes, rmm::cuda_stream_view) -> void* override
+  {
+    void* ptr{nullptr};
+    RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, bytes));
+    return ptr;
+  }
+
+  /**
+   * @brief Deallocate memory pointed to by \p p.
+ * + * @note Stream argument is ignored. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + */ + void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view) override + { + RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); + } + + /** + * @brief Compare this resource to another. + * + * Two cuda_pinned_resources always compare equal, because they can each + * deallocate memory allocated by the other. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] auto do_is_equal(device_memory_resource const& other) const noexcept + -> bool override + { + return dynamic_cast(&other) != nullptr; + } +}; +} // namespace raft::mr diff --git a/cpp/bench/ann/src/common/cuda_stub.hpp b/cpp/bench/ann/src/common/cuda_stub.hpp new file mode 100644 index 000000000..0ee087a11 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_stub.hpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +The content of this header is governed by two preprocessor definitions: + + - BUILD_CPU_ONLY - whether none of the CUDA functions are used. + - ANN_BENCH_LINK_CUDART - dynamically link against this string if defined. 
+
+___________________________________________________________________________________
+|BUILD_CPU_ONLY | ANN_BENCH_LINK_CUDART |       cudart        | cuda_runtime_api.h |
+|               |                       |  found   |  needed  |      included      |
+|---------------|-----------------------|----------|----------|--------------------|
+|      ON       |       <not set>       |  false   |  false   |         NO         |
+|      ON       |   "cudart.so.xx.xx"   |  false   |  false   |         NO         |
+|      OFF      |       <not set>       |  true    |  true    |        YES         |
+|      OFF      |   "cudart.so.xx.xx"   |<runtime> |  true    |        YES         |
+------------------------------------------------------------------------------------
+*/
+
+#pragma once
+
+#ifndef BUILD_CPU_ONLY
+#include <cuda_fp16.h>
+#include <cuda_runtime_api.h>
+#ifdef ANN_BENCH_LINK_CUDART
+#include <cstring>
+
+#include <dlfcn.h>
+#endif
+#else
+#include <cstdint>
+
+typedef void* cudaStream_t;
+typedef void* cudaEvent_t;
+typedef uint16_t half;
+#endif
+
+namespace cuvs::bench {
+
+struct cuda_lib_handle {
+  void* handle{nullptr};
+  explicit cuda_lib_handle()
+  {
+#ifdef ANN_BENCH_LINK_CUDART
+    constexpr int kFlags = RTLD_NOW | RTLD_GLOBAL | RTLD_DEEPBIND | RTLD_NODELETE;
+    // The full name of the linked cudart library 'cudart.so.MAJOR.MINOR.PATCH'.
+    // Try progressively less specific names, e.g.
+    //   libcudart.so.11.8.89 -> libcudart.so.11.8 -> libcudart.so.11.0
+    //   -> libcudart.so.11 -> libcudart.so
+    char libname[] = ANN_BENCH_LINK_CUDART;  // NOLINT
+    handle = dlopen(ANN_BENCH_LINK_CUDART, kFlags);
+    if (handle != nullptr) { return; }
+    // try to strip the PATCH
+    auto p = strrchr(libname, '.');
+    p[0] = 0;
+    handle = dlopen(libname, kFlags);
+    if (handle != nullptr) { return; }
+    // try to set the MINOR version to 0
+    p = strrchr(libname, '.');
+    p[1] = '0';
+    p[2] = 0;
+    handle = dlopen(libname, kFlags);
+    if (handle != nullptr) { return; }
+    // try to strip the MINOR
+    p[0] = 0;
+    handle = dlopen(libname, kFlags);
+    if (handle != nullptr) { return; }
+    // try to strip the MAJOR
+    p = strrchr(libname, '.');
+    p[0] = 0;
+    handle = dlopen(libname, kFlags);
+#endif
+  }
+  ~cuda_lib_handle() noexcept
+  {
+#ifdef ANN_BENCH_LINK_CUDART
+    if (handle != nullptr) { dlclose(handle); }
+#endif
+  }
+
+  template <typename Symbol>
+  auto sym(const char* name) -> Symbol
+  {
+#ifdef ANN_BENCH_LINK_CUDART
+    return reinterpret_cast<Symbol>(dlsym(handle, name));
+#else
+    return nullptr;
+#endif
+  }
+
+  /** Whether this is NOT a cpu-only package. */
+  [[nodiscard]] constexpr inline auto needed() const -> bool
+  {
+#if defined(BUILD_CPU_ONLY)
+    return false;
+#else
+    return true;
+#endif
+  }
+
+  /** CUDA found, either at compile time or at runtime.
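+   * With ANN_BENCH_LINK_CUDART defined, this is decided at runtime (whether the
+   * dlopen above succeeded); otherwise it is fixed at compile time.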
*/ + [[nodiscard]] inline auto found() const -> bool + { +#if defined(BUILD_CPU_ONLY) + return false; +#elif defined(ANN_BENCH_LINK_CUDART) + return handle != nullptr; +#else + return true; +#endif + } +}; + +static inline cuda_lib_handle cudart{}; + +#ifdef ANN_BENCH_LINK_CUDART +namespace stub { + +[[gnu::weak, gnu::noinline]] auto cuda_memcpy(void* dst, + const void* src, + size_t count, + enum cudaMemcpyKind kind) -> cudaError_t +{ + return cudaSuccess; +} + +[[gnu::weak, gnu::noinline]] auto cuda_malloc(void** ptr, size_t size) -> cudaError_t +{ + *ptr = nullptr; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_memset(void* devPtr, int value, size_t count) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_free(void* devPtr) -> cudaError_t { return cudaSuccess; } +[[gnu::weak, gnu::noinline]] auto cuda_stream_create(cudaStream_t* pStream) -> cudaError_t +{ + *pStream = nullptr; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_stream_create_with_flags(cudaStream_t* pStream, + unsigned int flags) -> cudaError_t +{ + *pStream = nullptr; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_stream_destroy(cudaStream_t pStream) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_device_synchronize() -> cudaError_t { return cudaSuccess; } + +[[gnu::weak, gnu::noinline]] auto cuda_stream_synchronize(cudaStream_t pStream) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_create(cudaEvent_t* event) -> cudaError_t +{ + *event = nullptr; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_record(cudaEvent_t event, cudaStream_t stream) + -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_synchronize(cudaEvent_t event) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_elapsed_time(float* ms, + cudaEvent_t start, + cudaEvent_t end) -> cudaError_t +{ + *ms = 0; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_event_destroy(cudaEvent_t event) -> cudaError_t +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] auto cuda_get_device(int* device) -> cudaError_t +{ + *device = 0; + return cudaSuccess; +}; +[[gnu::weak, gnu::noinline]] auto cuda_driver_get_version(int* driver) -> cudaError_t +{ + *driver = 0; + return cudaSuccess; +}; +[[gnu::weak, gnu::noinline]] auto cuda_runtime_get_version(int* runtime) -> cudaError_t +{ + *runtime = 0; + return cudaSuccess; +}; +[[gnu::weak, gnu::noinline]] cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, + int device) +{ + *prop = cudaDeviceProp{}; + return cudaSuccess; +} + +} // namespace stub + +#define RAFT_DECLARE_CUDART(fun) \ + static inline decltype(&stub::fun) fun = \ + cudart.found() ? 
cudart.sym(#fun) : &stub::fun + +RAFT_DECLARE_CUDART(cuda_memcpy); +RAFT_DECLARE_CUDART(cuda_malloc); +RAFT_DECLARE_CUDART(cuda_memset); +RAFT_DECLARE_CUDART(cuda_free); +RAFT_DECLARE_CUDART(cuda_stream_create); +RAFT_DECLARE_CUDART(cuda_stream_create_with_flags); +RAFT_DECLARE_CUDART(cuda_stream_destroy); +RAFT_DECLARE_CUDART(cuda_device_synchronize); +RAFT_DECLARE_CUDART(cuda_stream_synchronize); +RAFT_DECLARE_CUDART(cuda_event_create); +RAFT_DECLARE_CUDART(cuda_event_record); +RAFT_DECLARE_CUDART(cuda_event_synchronize); +RAFT_DECLARE_CUDART(cuda_event_elapsed_time); +RAFT_DECLARE_CUDART(cuda_event_destroy); +RAFT_DECLARE_CUDART(cuda_get_device); +RAFT_DECLARE_CUDART(cuda_driver_get_version); +RAFT_DECLARE_CUDART(cuda_runtime_get_version); +RAFT_DECLARE_CUDART(cudaGetDeviceProperties); + +#undef RAFT_DECLARE_CUDART +#endif + +}; // namespace cuvs::bench diff --git a/cpp/bench/ann/src/common/dataset.hpp b/cpp/bench/ann/src/common/dataset.hpp new file mode 100644 index 000000000..95f1a82a2 --- /dev/null +++ b/cpp/bench/ann/src/common/dataset.hpp @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "ann_types.hpp" +#include "util.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +// http://big-algo-benchmarks.com/index.html: +// binary format that starts with 8 bytes of data consisting of num_points(uint32_t) +// num_dimensions(uint32) followed by num_pts x num_dimensions x sizeof(type) bytes of +// data stored one vector after another. +// Data files will have suffixes .fbin, .u8bin, and .i8bin to represent float32, uint8 +// and int8 type data. +// As extensions for this benchmark, half and int data files will have suffixes .f16bin +// and .ibin, respectively. 
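+//
+// For illustration (values assumed here, not taken from any shipped dataset): a base
+// file "base.fbin" holding 3 float32 vectors of dimension 2 occupies
+// 2 * 4 + 3 * 2 * 4 = 32 bytes:
+//
+//   uint32_t nrows = 3;    // bytes 0..3
+//   uint32_t ndims = 2;    // bytes 4..7
+//   float    data[3 * 2];  // bytes 8..31, row-major, one vector after another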
+template +class bin_file { + public: + bin_file(std::string file, + const std::string& mode, + uint32_t subset_first_row = 0, + uint32_t subset_size = 0); + ~bin_file() + { + if (mapped_ptr_ != nullptr) { unmap(); } + if (fp_ != nullptr) { fclose(fp_); } + } + bin_file(const bin_file&) = delete; + auto operator=(const bin_file&) -> bin_file& = delete; + + void get_shape(size_t* nrows, int* ndims) const + { + assert(read_mode_); + if (!fp_) { open_file(); } + *nrows = nrows_; + *ndims = ndims_; + } + + void read(T* data) const + { + assert(read_mode_); + if (!fp_) { open_file(); } + size_t total = static_cast(nrows_) * ndims_; + if (fread(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error{"fread() bin_file " + file_ + " failed"}; + } + } + + void write(const T* data, uint32_t nrows, uint32_t ndims) + { + assert(!read_mode_); + if (!fp_) { open_file(); } + if (fwrite(&nrows, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error{"fwrite() bin_file " + file_ + " failed"}; + } + if (fwrite(&ndims, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error{"fwrite() bin_file " + file_ + " failed"}; + } + + size_t total = static_cast(nrows) * ndims; + if (fwrite(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error{"fwrite() bin_file " + file_ + " failed"}; + } + } + + auto map() const -> T* + { + assert(read_mode_); + if (!fp_) { open_file(); } + int fid = fileno(fp_); + mapped_ptr_ = mmap(nullptr, file_size_, PROT_READ, MAP_PRIVATE, fid, 0); + if (mapped_ptr_ == MAP_FAILED) { + mapped_ptr_ = nullptr; + throw std::runtime_error{"mmap error: Value of errno " + std::to_string(errno) + ", " + + std::string(strerror(errno))}; + } + return reinterpret_cast(reinterpret_cast(mapped_ptr_) + 2 * sizeof(uint32_t) + + subset_first_row_ * ndims_ * sizeof(T)); + } + + void unmap() const + { + if (munmap(mapped_ptr_, file_size_) == -1) { + throw std::runtime_error{"munmap error: " + std::string(strerror(errno))}; + } + } + + private: + void check_suffix(); + void open_file() const; + + std::string file_; + bool read_mode_; + uint32_t subset_first_row_; + uint32_t subset_size_; + + mutable FILE* fp_{nullptr}; + mutable uint32_t nrows_; + mutable uint32_t ndims_; + mutable size_t file_size_; + mutable void* mapped_ptr_{nullptr}; +}; + +template +bin_file::bin_file(std::string file, + const std::string& mode, + uint32_t subset_first_row, + uint32_t subset_size) + : file_(std::move(file)), + read_mode_(mode == "r"), + subset_first_row_(subset_first_row), + subset_size_(subset_size) + +{ + check_suffix(); + + if (!read_mode_) { + if (mode == "w") { + if (subset_first_row != 0) { + throw std::runtime_error{"subset_first_row should be zero for write mode"}; + } + if (subset_size != 0) { + throw std::runtime_error{"subset_size should be zero for write mode"}; + } + } else { + throw std::runtime_error{"bin_file's mode must be either 'r' or 'w': " + file_}; + } + } +} + +template +void bin_file::open_file() const +{ + fp_ = fopen(file_.c_str(), read_mode_ ? 
"r" : "w"); + if (!fp_) { throw std::runtime_error{"open bin_file failed: " + file_}; } + + if (read_mode_) { + struct stat statbuf; + if (stat(file_.c_str(), &statbuf) != 0) { throw std::runtime_error{"stat() failed: " + file_}; } + file_size_ = statbuf.st_size; + + uint32_t header[2]; + if (fread(header, sizeof(uint32_t), 2, fp_) != 2) { + throw std::runtime_error{"read header of bin_file failed: " + file_}; + } + nrows_ = header[0]; + ndims_ = header[1]; + + size_t expected_file_size = + 2 * sizeof(uint32_t) + static_cast(nrows_) * ndims_ * sizeof(T); + if (file_size_ != expected_file_size) { + throw std::runtime_error{"expected file size of " + file_ + " is " + + std::to_string(expected_file_size) + ", however, actual size is " + + std::to_string(file_size_)}; + } + + if (subset_first_row_ >= nrows_) { + throw std::runtime_error{file_ + ": subset_first_row (" + std::to_string(subset_first_row_) + + ") >= nrows (" + std::to_string(nrows_) + ")"}; + } + if (subset_first_row_ + subset_size_ > nrows_) { + throw std::runtime_error{file_ + ": subset_first_row (" + std::to_string(subset_first_row_) + + ") + subset_size (" + std::to_string(subset_size_) + ") > nrows (" + + std::to_string(nrows_) + ")"}; + } + + if (subset_first_row_) { + static_assert(sizeof(long) == 8, "fseek() don't support 64-bit offset"); + if (fseek(fp_, sizeof(T) * subset_first_row_ * ndims_, SEEK_CUR) == -1) { + throw std::runtime_error{file_ + ": fseek failed"}; + } + nrows_ -= subset_first_row_; + } + if (subset_size_) { nrows_ = subset_size_; } + } +} + +template +void bin_file::check_suffix() +{ + auto pos = file_.rfind('.'); + if (pos == std::string::npos) { + throw std::runtime_error{"name of bin_file doesn't have a suffix: " + file_}; + } + std::string suffix = file_.substr(pos + 1); + + if constexpr (std::is_same_v) { + if (suffix != "fbin") { + throw std::runtime_error{"bin_file should has .fbin suffix: " + file_}; + } + } else if constexpr (std::is_same_v) { + if (suffix != "f16bin" && suffix != "fbin") { + throw std::runtime_error{"bin_file should has .f16bin suffix: " + file_}; + } + } else if constexpr (std::is_same_v) { + if (suffix != "ibin") { + throw std::runtime_error{"bin_file should has .ibin suffix: " + file_}; + } + } else if constexpr (std::is_same_v) { + if (suffix != "u8bin") { + throw std::runtime_error{"bin_file should has .u8bin suffix: " + file_}; + } + } else if constexpr (std::is_same_v) { + if (suffix != "i8bin") { + throw std::runtime_error{"bin_file should has .i8bin suffix: " + file_}; + } + } else { + throw std::runtime_error( + "T of bin_file should be one of float, half, int, uint8_t, or int8_t"); + } +} + +template +class dataset { + public: + explicit dataset(std::string name) : name_(std::move(name)) {} + dataset(std::string name, std::string distance) + : name_(std::move(name)), distance_(std::move(distance)) + { + } + dataset(const dataset&) = delete; + auto operator=(const dataset&) -> dataset& = delete; + virtual ~dataset(); + + auto name() const -> std::string { return name_; } + auto distance() const -> std::string { return distance_; } + virtual auto dim() const -> int = 0; + virtual auto max_k() const -> uint32_t = 0; + virtual auto base_set_size() const -> size_t = 0; + virtual auto query_set_size() const -> size_t = 0; + + // load data lazily, so don't pay the overhead of reading unneeded set + // e.g. 
don't load base set when searching + auto base_set() const -> const T* + { + if (!base_set_) { load_base_set(); } + return base_set_; + } + + auto query_set() const -> const T* + { + if (!query_set_) { load_query_set(); } + return query_set_; + } + + auto gt_set() const -> const int32_t* + { + if (!gt_set_) { load_gt_set(); } + return gt_set_; + } + + auto base_set_on_gpu() const -> const T*; + auto query_set_on_gpu() const -> const T*; + auto mapped_base_set() const -> const T*; + + auto query_set(MemoryType memory_type) const -> const T* + { + switch (memory_type) { + case MemoryType::kDevice: return query_set_on_gpu(); + default: return query_set(); + } + } + + auto base_set(MemoryType memory_type) const -> const T* + { + switch (memory_type) { + case MemoryType::kDevice: return base_set_on_gpu(); + case MemoryType::kHost: return base_set(); + case MemoryType::kHostMmap: return mapped_base_set(); + default: return nullptr; + } + } + + protected: + virtual void load_base_set() const = 0; + virtual void load_gt_set() const = 0; + virtual void load_query_set() const = 0; + virtual void map_base_set() const = 0; + + std::string name_; + std::string distance_; + + mutable T* base_set_ = nullptr; + mutable T* query_set_ = nullptr; + mutable T* d_base_set_ = nullptr; + mutable T* d_query_set_ = nullptr; + mutable T* mapped_base_set_ = nullptr; + mutable int32_t* gt_set_ = nullptr; +}; + +template +dataset::~dataset() +{ + delete[] base_set_; + delete[] query_set_; + delete[] gt_set_; +#ifndef BUILD_CPU_ONLY + if (d_base_set_) { cudaFree(d_base_set_); } + if (d_query_set_) { cudaFree(d_query_set_); } +#endif +} + +template +auto dataset::base_set_on_gpu() const -> const T* +{ +#ifndef BUILD_CPU_ONLY + if (!d_base_set_) { + base_set(); + cudaMalloc(reinterpret_cast(&d_base_set_), base_set_size() * dim() * sizeof(T)); + cudaMemcpy(d_base_set_, base_set_, base_set_size() * dim() * sizeof(T), cudaMemcpyHostToDevice); + } +#endif + return d_base_set_; +} + +template +auto dataset::query_set_on_gpu() const -> const T* +{ +#ifndef BUILD_CPU_ONLY + if (!d_query_set_) { + query_set(); + cudaMalloc(reinterpret_cast(&d_query_set_), query_set_size() * dim() * sizeof(T)); + cudaMemcpy( + d_query_set_, query_set_, query_set_size() * dim() * sizeof(T), cudaMemcpyHostToDevice); + } +#endif + return d_query_set_; +} + +template +auto dataset::mapped_base_set() const -> const T* +{ + if (!mapped_base_set_) { map_base_set(); } + return mapped_base_set_; +} + +template +class bin_dataset : public dataset { + public: + bin_dataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& query_file, + const std::string& distance, + const std::optional& groundtruth_neighbors_file); + + auto dim() const -> int override; + auto max_k() const -> uint32_t override; + auto base_set_size() const -> size_t override; + auto query_set_size() const -> size_t override; + + private: + void load_base_set() const; + void load_query_set() const; + void load_gt_set() const; + void map_base_set() const; + + mutable int dim_ = 0; + mutable uint32_t max_k_ = 0; + mutable size_t base_set_size_ = 0; + mutable size_t query_set_size_ = 0; + + bin_file base_file_; + bin_file query_file_; + std::optional> gt_file_{std::nullopt}; +}; + +template +bin_dataset::bin_dataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& query_file, + const std::string& distance, + const std::optional& 
groundtruth_neighbors_file) + : dataset(name, distance), + base_file_(base_file, "r", subset_first_row, subset_size), + query_file_(query_file, "r") +{ + if (groundtruth_neighbors_file.has_value()) { + gt_file_.emplace(groundtruth_neighbors_file.value(), "r"); + } +} + +template +auto bin_dataset::dim() const -> int +{ + if (dim_ > 0) { return dim_; } + if (base_set_size() > 0) { return dim_; } + if (query_set_size() > 0) { return dim_; } + return dim_; +} + +template +auto bin_dataset::max_k() const -> uint32_t +{ + if (!this->gt_set_) { load_gt_set(); } + return max_k_; +} + +template +auto bin_dataset::query_set_size() const -> size_t +{ + if (query_set_size_ > 0) { return query_set_size_; } + int dim; + query_file_.get_shape(&query_set_size_, &dim); + if (query_set_size_ == 0) { throw std::runtime_error{"Zero query set size"}; } + if (dim == 0) { throw std::runtime_error{"Zero query set dim"}; } + if (dim_ == 0) { + dim_ = dim; + } else if (dim_ != dim) { + throw std::runtime_error{"base set dim (" + std::to_string(dim_) + ") != query set dim (" + + std::to_string(dim)}; + } + return query_set_size_; +} + +template +auto bin_dataset::base_set_size() const -> size_t +{ + if (base_set_size_ > 0) { return base_set_size_; } + int dim; + base_file_.get_shape(&base_set_size_, &dim); + if (base_set_size_ == 0) { throw std::runtime_error{"Zero base set size"}; } + if (dim == 0) { throw std::runtime_error{"Zero base set dim"}; } + if (dim_ == 0) { + dim_ = dim; + } else if (dim_ != dim) { + throw std::runtime_error{"base set dim (" + std::to_string(dim) + ") != query set dim (" + + std::to_string(dim_)}; + } + return base_set_size_; +} + +template +void bin_dataset::load_base_set() const +{ + this->base_set_ = new T[base_set_size() * dim()]; + base_file_.read(this->base_set_); +} + +template +void bin_dataset::load_query_set() const +{ + this->query_set_ = new T[query_set_size() * dim()]; + query_file_.read(this->query_set_); +} + +template +void bin_dataset::load_gt_set() const +{ + if (gt_file_.has_value()) { + size_t queries; + int k; + gt_file_->get_shape(&queries, &k); + this->gt_set_ = new std::int32_t[queries * k]; + gt_file_->read(this->gt_set_); + max_k_ = k; + } +} + +template +void bin_dataset::map_base_set() const +{ + this->mapped_base_set_ = base_file_.map(); +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp new file mode 100644 index 000000000..9d0218606 --- /dev/null +++ b/cpp/bench/ann/src/common/thread_pool.hpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <future>
+#include <mutex>
+#include <stdexcept>
+#include <thread>
+#include <vector>
+
+class fixed_thread_pool {
+ public:
+  explicit fixed_thread_pool(int num_threads)
+  {
+    if (num_threads < 1) {
+      throw std::runtime_error("num_threads must be >= 1");
+    } else if (num_threads == 1) {
+      return;
+    }
+
+    tasks_ = new task[num_threads];
+
+    threads_.reserve(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      threads_.emplace_back([&, i] {
+        auto& task = tasks_[i];
+        while (true) {
+          std::unique_lock<std::mutex> lock(task.mtx);
+          task.cv.wait(lock,
+                       [&] { return task.has_task || finished_.load(std::memory_order_relaxed); });
+          if (finished_.load(std::memory_order_relaxed)) { break; }
+
+          task.task();
+          task.has_task = false;
+        }
+      });
+    }
+  }
+
+  ~fixed_thread_pool()
+  {
+    if (threads_.empty()) { return; }
+
+    finished_.store(true, std::memory_order_relaxed);
+    for (unsigned i = 0; i < threads_.size(); ++i) {
+      auto& task = tasks_[i];
+      // Lock and immediately unlock: this synchronizes with the worker's wait, so the
+      // notification below cannot be lost between the predicate check and the wait.
+      { std::lock_guard<std::mutex> lock(task.mtx); }
+
+      task.cv.notify_one();
+      threads_[i].join();
+    }
+
+    delete[] tasks_;
+  }
+
+  template <typename Func, typename IdxT>
+  void submit(Func f, IdxT len)
+  {
+    // Run the functions in the main thread if the thread pool has no workers
+    if (threads_.empty()) {
+      for (IdxT i = 0; i < len; ++i) {
+        f(i);
+      }
+      return;
+    }
+
+    const int num_threads = static_cast<int>(threads_.size());
+    // Split the work into (num_threads + 1) parts: each thread gets one fixed part, and
+    // the extra part is claimed item-by-item via the atomic counter below, so faster
+    // threads pick up the remaining work.
+    const IdxT items_per_thread = len / (num_threads + 1);
+    std::atomic<IdxT> cnt(items_per_thread * num_threads);
+
+    // Wrap function
+    auto wrapped_f = [&](IdxT start, IdxT end) {
+      for (IdxT i = start; i < end; ++i) {
+        f(i);
+      }
+
+      while (true) {
+        IdxT i = cnt.fetch_add(1, std::memory_order_relaxed);
+        if (i >= len) { break; }
+        f(i);
+      }
+    };
+
+    std::vector<std::future<void>> futures;
+    futures.reserve(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      IdxT start = i * items_per_thread;
+      auto& task = tasks_[i];
+      {
+        std::lock_guard<std::mutex> lock(task.mtx);
+        (void)lock;  // stop nvcc warning
+        task.task = std::packaged_task<void()>([=] { wrapped_f(start, start + items_per_thread); });
+        futures.push_back(task.task.get_future());
+        task.has_task = true;
+      }
+      task.cv.notify_one();
+    }
+
+    for (auto& fut : futures) {
+      fut.wait();
+    }
+  }
+
+ private:
+  struct alignas(64) task {
+    std::mutex mtx;
+    std::condition_variable cv;
+    bool has_task = false;
+    std::packaged_task<void()> task;
+  };
+
+  task* tasks_;
+  std::vector<std::thread> threads_;
+  std::atomic<bool> finished_{false};
+};
diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp
new file mode 100644
index 000000000..490b0326e
--- /dev/null
+++ b/cpp/bench/ann/src/common/util.hpp
@@ -0,0 +1,561 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#pragma once + +#include "ann_types.hpp" +#include "cuda_stub.hpp" // cuda-related utils + +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +/** + * Current thread id as given by the benchmark State. + * It's populated on every call of a benchmark case. + * It's relevant in the 'throughput' mode of the search benchmarks, + * where some algorithms might want to coordinate allocation of the resources. + */ +inline thread_local int benchmark_thread_id = 0; +/** + * Total concurrent thread count as given by the benchmark State. + * It's populated on every call of a benchmark case. + * It's relevant in the 'throughput' mode of the search benchmarks, + * where some algorithms might want to coordinate allocation of the resources. + */ +inline thread_local int benchmark_n_threads = 1; + +struct cuda_timer { + private: + std::optional stream_; + cudaEvent_t start_{nullptr}; + cudaEvent_t stop_{nullptr}; + double total_time_{0}; + + template + static inline auto extract_stream(AnnT* algo) -> std::optional + { + auto gpu_ann = dynamic_cast(algo); + if (gpu_ann != nullptr && gpu_ann->uses_stream()) { + return std::make_optional(gpu_ann->get_sync_stream()); + } + return std::nullopt; + } + + public: + struct cuda_lap { + private: + cudaStream_t stream_; + cudaEvent_t start_; + cudaEvent_t stop_; + double& total_time_; + + public: + cuda_lap(cudaStream_t stream, cudaEvent_t start, cudaEvent_t stop, double& total_time) + : start_(start), stop_(stop), stream_(stream), total_time_(total_time) + { +#ifndef BUILD_CPU_ONLY + cudaEventRecord(start_, stream_); +#endif + } + cuda_lap() = delete; + + ~cuda_lap() noexcept + { +#ifndef BUILD_CPU_ONLY + cudaEventRecord(stop_, stream_); + cudaEventSynchronize(stop_); + float milliseconds = 0.0f; + cudaEventElapsedTime(&milliseconds, start_, stop_); + total_time_ += milliseconds / 1000.0; +#endif + } + }; + + explicit cuda_timer(std::optional stream) : stream_{stream} + { +#ifndef BUILD_CPU_ONLY + if (stream_.has_value()) { + cudaEventCreate(&stop_); + cudaEventCreate(&start_); + } +#endif + } + + template + explicit cuda_timer(const std::unique_ptr& algo) : cuda_timer{extract_stream(algo.get())} + { + } + + ~cuda_timer() noexcept + { +#ifndef BUILD_CPU_ONLY + if (stream_.has_value()) { + cudaStreamSynchronize(stream_.value()); + cudaEventDestroy(start_); + cudaEventDestroy(stop_); + } +#endif + } + + cuda_timer() = delete; + cuda_timer(cuda_timer const&) = delete; + cuda_timer(cuda_timer&&) = delete; + auto operator=(cuda_timer const&) -> cuda_timer& = delete; + auto operator=(cuda_timer&&) -> cuda_timer& = delete; + + [[nodiscard]] auto stream() const -> std::optional { return stream_; } + + [[nodiscard]] auto active() const -> bool { return stream_.has_value(); } + + [[nodiscard]] auto total_time() const -> double { return total_time_; } + + [[nodiscard]] auto lap(bool enabled = true) -> std::optional + { + return enabled && stream_.has_value() + ? std::make_optional(stream_.value(), start_, stop_, total_time_) + : std::nullopt; + } +}; + +#ifndef BUILD_CPU_ONLY +// ATM, rmm::stream does not support passing in flags; hence this helper type. 
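+// An illustrative use (a sketch; `dst`, `src`, and `n_bytes` are placeholders):
+//
+//   non_blocking_stream s;
+//   cudaMemcpyAsync(dst, src, n_bytes, cudaMemcpyDefault, s.view());
+//   cudaStreamSynchronize(s.view());
+//
+// Note that the move constructor swaps handles, so a moved-from object only ever
+// destroys a null stream.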
+struct non_blocking_stream { + non_blocking_stream() { cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking); } + ~non_blocking_stream() noexcept + { + if (stream_ != nullptr) { cudaStreamDestroy(stream_); } + } + non_blocking_stream(non_blocking_stream const&) = delete; + non_blocking_stream(non_blocking_stream&& other) noexcept { std::swap(stream_, other.stream_); } + auto operator=(non_blocking_stream const&) -> non_blocking_stream& = delete; + auto operator=(non_blocking_stream&&) -> non_blocking_stream& = delete; + [[nodiscard]] auto view() const noexcept -> cudaStream_t { return stream_; } + + private: + cudaStream_t stream_{nullptr}; +}; + +namespace detail { +inline std::vector global_stream_pool(0); +inline std::mutex gsp_mutex; +} // namespace detail +#endif + +/** + * Get a stream associated with the current benchmark thread. + * + * Note, the streams are reused between the benchmark cases. + * This makes it easier to profile and analyse multiple benchmark cases in one timeline using tools + * like nsys. + */ +inline auto get_stream_from_global_pool() -> cudaStream_t +{ +#ifndef BUILD_CPU_ONLY + std::lock_guard guard(detail::gsp_mutex); + if (static_cast(detail::global_stream_pool.size()) < benchmark_n_threads) { + detail::global_stream_pool.resize(benchmark_n_threads); + } + return detail::global_stream_pool[benchmark_thread_id].view(); +#else + return nullptr; +#endif +} + +struct result_buffer { + explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream} + { + if (size_ == 0) { return; } + data_host_ = malloc(size_); +#ifndef BUILD_CPU_ONLY + cudaMallocAsync(&data_device_, size_, stream_); + cudaStreamSynchronize(stream_); +#endif + } + result_buffer() = delete; + result_buffer(result_buffer&&) = delete; + auto operator=(result_buffer&&) -> result_buffer& = delete; + result_buffer(const result_buffer&) = delete; + auto operator=(const result_buffer&) -> result_buffer& = delete; + ~result_buffer() noexcept + { + if (size_ == 0) { return; } +#ifndef BUILD_CPU_ONLY + cudaFreeAsync(data_device_, stream_); + cudaStreamSynchronize(stream_); +#endif + free(data_host_); + } + + [[nodiscard]] auto size() const noexcept { return size_; } + [[nodiscard]] auto data(MemoryType loc) const noexcept + { + switch (loc) { + case MemoryType::kDevice: return data_device_; + default: return data_host_; + } + } + + void transfer_data(MemoryType dst, MemoryType src) + { + auto dst_ptr = data(dst); + auto src_ptr = data(src); + if (dst_ptr == src_ptr) { return; } +#ifndef BUILD_CPU_ONLY + cudaMemcpyAsync(dst_ptr, src_ptr, size_, cudaMemcpyDefault, stream_); + cudaStreamSynchronize(stream_); +#endif + } + + private: + size_t size_{0}; + cudaStream_t stream_ = nullptr; + void* data_host_ = nullptr; + void* data_device_ = nullptr; +}; + +namespace detail { +inline std::vector> global_result_buffer_pool(0); +inline std::mutex grp_mutex; +} // namespace detail + +/** + * Get a result buffer associated with the current benchmark thread. + * + * Note, the allocations are reused between the benchmark cases. + * This reduces the setup overhead and number of times the context is being blocked + * (this is relevant if there is a persistent kernel running across multiples benchmark cases). 
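+ *
+ * For example (sizes assumed for illustration), a search case producing `k` neighbor
+ * ids (int32_t) and distances (float) for `n` queries per batch could request
+ * `get_result_buffer_from_global_pool(n * k * (sizeof(int32_t) + sizeof(float)))`;
+ * later cases needing the same or less space reuse the existing allocation.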
+ */ +inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer& +{ + auto stream = get_stream_from_global_pool(); + auto& rb = [stream, size]() -> result_buffer& { + std::lock_guard guard(detail::grp_mutex); + if (static_cast(detail::global_result_buffer_pool.size()) < benchmark_n_threads) { + detail::global_result_buffer_pool.resize(benchmark_n_threads); + } + auto& rb = detail::global_result_buffer_pool[benchmark_thread_id]; + if (!rb || rb->size() < size) { rb = std::make_unique(size, stream); } + return *rb; + }(); + + memset(rb.data(MemoryType::kHost), 0, size); +#ifndef BUILD_CPU_ONLY + cudaMemsetAsync(rb.data(MemoryType::kDevice), 0, size, stream); + cudaStreamSynchronize(stream); +#endif + return rb; +} + +/** + * Delete all streams and memory allocations in the global pool. + * It's called at the end of the `main` function - before global/static variables and cuda context + * is destroyed - to make sure they are destroyed gracefully and correctly seen by analysis tools + * such as nsys. + */ +inline void reset_global_device_resources() +{ +#ifndef BUILD_CPU_ONLY + std::lock_guard guard(detail::gsp_mutex); + detail::global_result_buffer_pool.resize(0); + detail::global_stream_pool.resize(0); +#endif +} + +inline auto cuda_info() +{ + std::vector> props; +#ifndef BUILD_CPU_ONLY + int dev, driver = 0, runtime = 0; + cudaDriverGetVersion(&driver); + cudaRuntimeGetVersion(&runtime); + + cudaDeviceProp device_prop; + cudaGetDevice(&dev); + cudaGetDeviceProperties(&device_prop, dev); + props.emplace_back("gpu_name", std::string(device_prop.name)); + props.emplace_back("gpu_sm_count", std::to_string(device_prop.multiProcessorCount)); + props.emplace_back("gpu_sm_freq", std::to_string(device_prop.clockRate * 1e3)); + props.emplace_back("gpu_mem_freq", std::to_string(device_prop.memoryClockRate * 1e3)); + props.emplace_back("gpu_mem_bus_width", std::to_string(device_prop.memoryBusWidth)); + props.emplace_back("gpu_mem_global_size", std::to_string(device_prop.totalGlobalMem)); + props.emplace_back("gpu_mem_shared_size", std::to_string(device_prop.sharedMemPerMultiprocessor)); + props.emplace_back("gpu_driver_version", + std::to_string(driver / 1000) + "." + std::to_string((driver % 100) / 10)); + props.emplace_back("gpu_runtime_version", + std::to_string(runtime / 1000) + "." 
+ std::to_string((runtime % 100) / 10)); +#endif + return props; +} + +struct nvtx_case { +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + private: + std::string case_name_; + std::array iter_name_{0}; + nvtxDomainHandle_t domain_; + int64_t iteration_ = 0; + nvtxEventAttributes_t case_attrib_{0}; + nvtxEventAttributes_t iter_attrib_{0}; +#endif + + public: + struct nvtx_lap { +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + private: + nvtxDomainHandle_t domain_; + + public: + nvtx_lap(nvtxDomainHandle_t domain, nvtxEventAttributes_t* attr) : domain_(domain) + { + nvtxDomainRangePushEx(domain_, attr); + } + nvtx_lap() = delete; + ~nvtx_lap() noexcept { nvtxDomainRangePop(domain_); } +#endif + }; + +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + explicit nvtx_case(std::string case_name) + : case_name_(std::move(case_name)), domain_(nvtxDomainCreateA("algo benchmark")) + { + case_attrib_.version = NVTX_VERSION; + iter_attrib_.version = NVTX_VERSION; + case_attrib_.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + iter_attrib_.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + case_attrib_.colorType = NVTX_COLOR_ARGB; + iter_attrib_.colorType = NVTX_COLOR_ARGB; + case_attrib_.messageType = NVTX_MESSAGE_TYPE_ASCII; + iter_attrib_.messageType = NVTX_MESSAGE_TYPE_ASCII; + case_attrib_.message.ascii = case_name_.c_str(); + auto c = std::hash{}(case_name_); + case_attrib_.color = c | 0xA0A0A0; + nvtxDomainRangePushEx(domain_, &case_attrib_); + } + + ~nvtx_case() + { + nvtxDomainRangePop(domain_); + nvtxDomainDestroy(domain_); + } +#else + explicit nvtx_case(std::string) {} +#endif + + [[nodiscard]] auto lap() -> nvtx_case::nvtx_lap + { +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + auto i = iteration_++; + uint32_t c = (i % 5); + uint32_t r = 150 + c * 20; + uint32_t g = 200 + c * 10; + uint32_t b = 220 + c * 5; + std::snprintf(iter_name_.data(), iter_name_.size(), "Lap %zd", i); + iter_attrib_.message.ascii = iter_name_.data(); + iter_attrib_.color = (r << 16) + (g << 8) + b; + return nvtx_lap{domain_, &iter_attrib_}; +#else + return nvtx_lap{}; +#endif + } +}; + +/** + * A progress tracker that allows syncing threads multiple times and resets the global + * progress once the threads are done. + */ +struct progress_barrier { + progress_barrier() = default; + ~progress_barrier() noexcept + { + { + // Lock makes sure the notified threads see the updates to `done_`. + std::unique_lock lk(mutex_); + done_.store(true, std::memory_order_relaxed); + cv_.notify_all(); + } + // This is the only place where the order of the updates to thread_progress_ and done_ is + // important. They are not guarded by the mutex, and `done_` must not be reset to `true` by + // other threads after the `total_progress_` is zero. + // Hence the default memory order (std::memory_order_seq_cst). + auto rem = total_progress_.fetch_sub(thread_progress_); + if (rem == thread_progress_) { + // the last thread to exit clears the progress state. + done_.store(false); + } + } + + /** + * Advance the progress counter by `n` and return the previous `progress` value. + * + * This can be used to track which thread arrives on the call site first. + * + * @return the previous progress counter value (before incrementing it by `n`). + */ + auto arrive(int n) + { + thread_progress_ += n; + // Lock makes sure the notified threads see the updates to `total_progress_`. + std::unique_lock lk(mutex_); + auto prev = total_progress_.fetch_add(n, std::memory_order_relaxed); + cv_.notify_all(); + return prev; + } + + /** + * Wait till the progress counter reaches `n` or finishes abnormally. 
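+ *
+ * For example (an assumed usage pattern): n worker threads each call `arrive(1)` after
+ * finishing a phase and then `wait(n)`, so no thread starts the next phase until the
+ * last one has arrived or a peer has exited abnormally.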
+ * + * @return the latest observed value of the progress counter. + */ + auto wait(int limit) + { + int cur = total_progress_.load(std::memory_order_relaxed); + if (cur >= limit) { return cur; } + auto done = done_.load(std::memory_order_relaxed); + if (done) { return cur; } + std::unique_lock lk(mutex_); + while (cur < limit && !done) { + using namespace std::chrono_literals; + cv_.wait_for(lk, 10ms); + cur = total_progress_.load(std::memory_order_relaxed); + done = done_.load(std::memory_order_relaxed); + } + return cur; + } + + private: + static inline std::atomic total_progress_; + static inline std::atomic done_; + static inline std::mutex mutex_; + static inline std::condition_variable cv_; + int thread_progress_{0}; +}; + +inline auto split(const std::string& s, char delimiter) -> std::vector +{ + std::vector tokens; + std::string token; + std::istringstream iss(s); + while (getline(iss, token, delimiter)) { + if (!token.empty()) { tokens.push_back(token); } + } + return tokens; +} + +inline auto file_exists(const std::string& filename) -> bool +{ + struct stat statbuf; + if (stat(filename.c_str(), &statbuf) != 0) { return false; } + return S_ISREG(statbuf.st_mode); +} + +inline auto dir_exists(const std::string& dir) -> bool +{ + struct stat statbuf; + if (stat(dir.c_str(), &statbuf) != 0) { return false; } + return S_ISDIR(statbuf.st_mode); +} + +inline auto create_dir(const std::string& dir) -> bool +{ + const auto path = split(dir, '/'); + + std::string cwd; + if (!dir.empty() && dir[0] == '/') { cwd += '/'; } + + for (const auto& p : path) { + cwd += p + "/"; + if (!dir_exists(cwd)) { + int ret = mkdir(cwd.c_str(), S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + if (ret != 0) { return false; } + } + } + return true; +} + +inline void make_sure_parent_dir_exists(const std::string& file_path) +{ + const auto pos = file_path.rfind('/'); + if (pos != std::string::npos) { + auto dir = file_path.substr(0, pos); + if (!dir_exists(dir)) { create_dir(dir); } + } +} + +inline auto combine_path(const std::string& dir, const std::string& path) +{ + std::filesystem::path p_dir(dir); + std::filesystem::path p_suf(path); + return (p_dir / p_suf).string(); +} + +template +void log_with_level(const char* level, const Ts&... vs) +{ + char buf[20]; + auto now = std::chrono::system_clock::now(); + auto now_tt = std::chrono::system_clock::to_time_t(now); + size_t millis = + std::chrono::duration_cast(now.time_since_epoch()).count() % + 1000000ULL; + std::strftime(buf, sizeof(buf), "%H:%M:%S", std::localtime(&now_tt)); + printf("[%s] [%s.%06zu] ", level, buf, millis); + if constexpr (sizeof...(Ts) == 1) { + printf("%s", vs...); + } else { + printf(vs...); + } + printf("\n"); + fflush(stdout); +} + +template +void log_info(Ts&&... vs) +{ + log_with_level("I", std::forward(vs)...); +} + +template +void log_warn(Ts&&... vs) +{ + log_with_level("W", std::forward(vs)...); +} + +template +void log_error(Ts&&... vs) +{ + log_with_level("E", std::forward(vs)...); +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h new file mode 100644 index 000000000..67f8ed39d --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#undef WARP_SIZE +#ifdef CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE +#include "cuvs_wrapper.h" +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT +#include "cuvs_ivf_flat_wrapper.h" +extern template class cuvs::bench::cuvs_ivf_flat; +extern template class cuvs::bench::cuvs_ivf_flat; +extern template class cuvs::bench::cuvs_ivf_flat; +#endif +#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || \ + defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) +#include "cuvs_ivf_pq_wrapper.h" +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ +extern template class cuvs::bench::cuvs_ivf_pq; +extern template class cuvs::bench::cuvs_ivf_pq; +extern template class cuvs::bench::cuvs_ivf_pq; +#endif +#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) +#include "cuvs_cagra_wrapper.h" +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_CAGRA +extern template class cuvs::bench::cuvs_cagra; +extern template class cuvs::bench::cuvs_cagra; +extern template class cuvs::bench::cuvs_cagra; +extern template class cuvs::bench::cuvs_cagra; +#endif + +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_flat::build_param& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_flat::search_param& param) +{ + param.ivf_flat_params.n_probes = conf.at("nprobe"); +} +#endif + +#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || \ + defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_pq::build_param& param) +{ + if (conf.contains("nlist")) { param.n_lists = conf.at("nlist"); } + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } + if (conf.contains("codebook_kind")) { + std::string kind = conf.at("codebook_kind"); + if (kind == "cluster") { + param.codebook_kind = cuvs::neighbors::ivf_pq::codebook_gen::PER_CLUSTER; + } else if (kind == "subspace") { + param.codebook_kind = cuvs::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE; + } else { + throw std::runtime_error("codebook_kind: '" + kind + + "', should be either 'cluster' or 'subspace'"); + } + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_pq::search_param& param) +{ + if (conf.contains("nprobe")) { param.pq_param.n_probes = conf.at("nprobe"); } + if (conf.contains("internalDistanceDtype")) { + std::string type = conf.at("internalDistanceDtype"); + if (type == "float") { + 
param.pq_param.internal_distance_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } else { + throw std::runtime_error("internalDistanceDtype: '" + type + + "', should be either 'float' or 'half'"); + } + } else { + // set half as default type + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } + + if (conf.contains("smemLutDtype")) { + std::string type = conf.at("smemLutDtype"); + if (type == "float") { + param.pq_param.lut_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.lut_dtype = CUDA_R_16F; + } else if (type == "fp8") { + param.pq_param.lut_dtype = CUDA_R_8U; + } else { + throw std::runtime_error("smemLutDtype: '" + type + + "', should be either 'float', 'half' or 'fp8'"); + } + } else { + // set half as default + param.pq_param.lut_dtype = CUDA_R_16F; + } + if (conf.contains("refine_ratio")) { + param.refine_ratio = conf.at("refine_ratio"); + if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); } + } +} +#endif + +#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) +template +void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::nn_descent::index_params& param) +{ + if (conf.contains("graph_degree")) { param.graph_degree = conf.at("graph_degree"); } + if (conf.contains("intermediate_graph_degree")) { + param.intermediate_graph_degree = conf.at("intermediate_graph_degree"); + } + // we allow niter shorthand for max_iterations + if (conf.contains("niter")) { param.max_iterations = conf.at("niter"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("termination_threshold")) { + param.termination_threshold = conf.at("termination_threshold"); + } +} + +inline void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::vpq_params& param) +{ + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } + if (conf.contains("vq_n_centers")) { param.vq_n_centers = conf.at("vq_n_centers"); } + if (conf.contains("kmeans_n_iters")) { param.kmeans_n_iters = conf.at("kmeans_n_iters"); } + if (conf.contains("vq_kmeans_trainset_fraction")) { + param.vq_kmeans_trainset_fraction = conf.at("vq_kmeans_trainset_fraction"); + } + if (conf.contains("pq_kmeans_trainset_fraction")) { + param.pq_kmeans_trainset_fraction = conf.at("pq_kmeans_trainset_fraction"); + } +} + +nlohmann::json collect_conf_with_prefix(const nlohmann::json& conf, + const std::string& prefix, + bool remove_prefix = true) +{ + nlohmann::json out; + for (auto& i : conf.items()) { + if (i.key().compare(0, prefix.size(), prefix) == 0) { + auto new_key = remove_prefix ? 
i.key().substr(prefix.size()) : i.key(); + out[new_key] = i.value(); + } + } + return out; +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_cagra::build_param& param) +{ + if (conf.contains("graph_degree")) { + param.cagra_params.graph_degree = conf.at("graph_degree"); + param.cagra_params.intermediate_graph_degree = param.cagra_params.graph_degree * 2; + } + if (conf.contains("intermediate_graph_degree")) { + param.cagra_params.intermediate_graph_degree = conf.at("intermediate_graph_degree"); + } + if (conf.contains("graph_build_algo")) { + if (conf.at("graph_build_algo") == "IVF_PQ") { + param.algo = cuvs::bench::CagraBuildAlgo::kIvfPq; + } else if (conf.at("graph_build_algo") == "NN_DESCENT") { + param.algo = cuvs::bench::CagraBuildAlgo::kNnDescent; + } else { + param.algo = cuvs::bench::CagraBuildAlgo::kAuto; + } + } + nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_"); + if (!ivf_pq_build_conf.empty()) { + cuvs::neighbors::ivf_pq::index_params bparam; + parse_build_param(ivf_pq_build_conf, bparam); + param.ivf_pq_build_params = bparam; + } + nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); + if (!ivf_pq_search_conf.empty()) { + typename cuvs::bench::cuvs_ivf_pq::search_param sparam; + parse_search_param(ivf_pq_search_conf, sparam); + param.ivf_pq_search_params = sparam.pq_param; + param.ivf_pq_refine_rate = sparam.refine_ratio; + } + nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); + if (!nn_descent_conf.empty()) { + cuvs::neighbors::nn_descent::index_params nn_param; + nn_param.intermediate_graph_degree = 1.5 * param.cagra_params.intermediate_graph_degree; + parse_build_param(nn_descent_conf, nn_param); + if (nn_param.graph_degree != param.cagra_params.intermediate_graph_degree) { + nn_param.graph_degree = param.cagra_params.intermediate_graph_degree; + } + param.nn_descent_params = nn_param; + } + nlohmann::json comp_search_conf = collect_conf_with_prefix(conf, "compression_"); + if (!comp_search_conf.empty()) { + cuvs::neighbors::vpq_params vpq_pams; + parse_build_param(comp_search_conf, vpq_pams); + param.cagra_params.compression.emplace(vpq_pams); + } +} + +cuvs::bench::AllocatorType parse_allocator(std::string mem_type) +{ + if (mem_type == "device") { + return cuvs::bench::AllocatorType::kDevice; + } else if (mem_type == "host_pinned") { + return cuvs::bench::AllocatorType::kHostPinned; + } else if (mem_type == "host_huge_page") { + return cuvs::bench::AllocatorType::kHostHugePage; + } + THROW( + "Invalid value for memory type %s, must be one of [\"device\", \"host_pinned\", " + "\"host_huge_page\"", + mem_type.c_str()); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_cagra::search_param& param) +{ + if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); } + if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); } + if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); } + if (conf.contains("algo")) { + if (conf.at("algo") == "single_cta") { + param.p.algo = cuvs::neighbors::cagra::search_algo::SINGLE_CTA; + } else if (conf.at("algo") == "multi_cta") { + param.p.algo = cuvs::neighbors::cagra::search_algo::MULTI_CTA; + } else if (conf.at("algo") == "multi_kernel") { + param.p.algo = cuvs::neighbors::cagra::search_algo::MULTI_KERNEL; + } else if (conf.at("algo") == "auto") { + param.p.algo = 
cuvs::neighbors::cagra::search_algo::AUTO; + } else { + std::string tmp = conf.at("algo"); + THROW("Invalid value for algo: %s", tmp.c_str()); + } + } + if (conf.contains("graph_memory_type")) { + param.graph_mem = parse_allocator(conf.at("graph_memory_type")); + } + if (conf.contains("internal_dataset_memory_type")) { + param.dataset_mem = parse_allocator(conf.at("internal_dataset_memory_type")); + } + // Same ratio as in IVF-PQ + param.refine_ratio = conf.value("refine_ratio", 1.0f); +} +#endif diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h new file mode 100644 index 000000000..b92785943 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/util.hpp" +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +namespace cuvs::bench { + +inline auto parse_metric_type(cuvs::bench::Metric metric) -> cuvs::distance::DistanceType +{ + if (metric == cuvs::bench::Metric::kInnerProduct) { + return cuvs::distance::DistanceType::InnerProduct; + } else if (metric == cuvs::bench::Metric::kEuclidean) { + // Even for L2 expanded RAFT IVF Flat uses unexpanded formula + return cuvs::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} + +/** Report a more verbose error with a backtrace when OOM occurs on RMM side. */ +inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool +{ + auto cuda_status = cudaGetLastError(); + size_t free = 0; + size_t total = 0; + RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total)); + RAFT_FAIL( + "Failed to allocate %zu bytes using RMM memory resource. " + "NB: latest cuda status = %s, free memory = %zu, total memory = %zu.", + bytes, + cudaGetErrorName(cuda_status), + free, + total); +} + +/** + * This container keeps the part of raft state that should be shared among multiple copies of raft + * handles (in different CPU threads). + * An example of this is an RMM memory resource: if we had an RMM memory pool per thread, we'd + * quickly run out of memory. 
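+ * For instance (thread count assumed for illustration), with the 1 GiB initial pool
+ * configured below, 16 concurrent search threads each holding a private pool would
+ * reserve 16 GiB of device memory up front; sharing a single pool avoids this.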
+ */ +class shared_raft_resources { + public: + using pool_mr_type = rmm::mr::pool_memory_resource; + using mr_type = rmm::mr::failure_callback_resource_adaptor; + + shared_raft_resources() + try : orig_resource_{rmm::mr::get_current_device_resource()}, + pool_resource_(orig_resource_, 1024 * 1024 * 1024ull), + resource_(&pool_resource_, rmm_oom_callback, nullptr) { + rmm::mr::set_current_device_resource(&resource_); + } catch (const std::exception& e) { + auto cuda_status = cudaGetLastError(); + size_t free = 0; + size_t total = 0; + RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total)); + RAFT_FAIL( + "Failed to initialize shared raft resources (NB: latest cuda status = %s, free memory = %zu, " + "total memory = %zu): %s", + cudaGetErrorName(cuda_status), + free, + total, + e.what()); + } + + shared_raft_resources(shared_raft_resources&&) = delete; + auto operator=(shared_raft_resources&&) -> shared_raft_resources& = delete; + shared_raft_resources(const shared_raft_resources& res) = delete; + auto operator=(const shared_raft_resources& other) -> shared_raft_resources& = delete; + + ~shared_raft_resources() noexcept { rmm::mr::set_current_device_resource(orig_resource_); } + + private: + rmm::mr::device_memory_resource* orig_resource_; + pool_mr_type pool_resource_; + mr_type resource_; +}; + +/** + * This struct is used by multiple raft benchmark wrappers. It serves as a thread-safe keeper of + * shared and private GPU resources (see below). + * + * - Accessing the same `configured_raft_resources` from concurrent threads is not safe. + * - Accessing the copies of `configured_raft_resources` from concurrent threads is safe. + * - There must be at most one "original" `configured_raft_resources` at any time, but as many + * copies of it as needed (modifies the program static state). + */ +class configured_raft_resources { + public: + /** + * This constructor has the shared state passed unmodified but creates the local state anew. + * It's used by the copy constructor. + */ + explicit configured_raft_resources(const std::shared_ptr& shared_res) + : shared_res_{shared_res}, + res_{std::make_unique( + rmm::cuda_stream_view(get_stream_from_global_pool()))} + { + } + + /** Default constructor creates all resources anew. */ + configured_raft_resources() : configured_raft_resources{std::make_shared()} + { + } + + configured_raft_resources(configured_raft_resources&&); + auto operator=(configured_raft_resources&&) -> configured_raft_resources&; + ~configured_raft_resources() = default; + configured_raft_resources(const configured_raft_resources& res) + : configured_raft_resources{res.shared_res_} + { + } + auto operator=(const configured_raft_resources& other) -> configured_raft_resources& + { + this->shared_res_ = other.shared_res_; + return *this; + } + + operator raft::resources&() noexcept { return *res_; } // NOLINT + operator const raft::resources&() const noexcept { return *res_; } // NOLINT + + /** Get the main stream */ + [[nodiscard]] auto get_sync_stream() const noexcept + { + return raft::resource::get_cuda_stream(*res_); + } + + private: + /** The resources shared among multiple raft handles / threads. */ + std::shared_ptr shared_res_; + /** + * Until we make the use of copies of raft::resources thread-safe, each benchmark wrapper must + * have its own copy of it. 
+ */ + std::unique_ptr res_ = std::make_unique(); +}; + +inline configured_raft_resources::configured_raft_resources(configured_raft_resources&&) = default; +inline auto configured_raft_resources::operator=(configured_raft_resources&&) + -> configured_raft_resources& = default; + +/** A helper to refine the neighbors when the data is on device or on host. */ +template +void refine_helper(const raft::resources& res, + DatasetT dataset, + QueriesT queries, + CandidatesT candidates, + int k, + algo_base::index_type* neighbors, + float* distances, + cuvs::distance::DistanceType metric) +{ + using data_type = typename DatasetT::value_type; + using index_type = algo_base::index_type; + using extents_type = int64_t; // device-side refine requires this + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + extents_type batch_size = queries.extent(0); + extents_type dim = queries.extent(1); + extents_type k0 = candidates.extent(1); + + if (raft::get_device_for_address(dataset.data_handle()) >= 0) { + auto dataset_device = raft::make_device_matrix_view( + dataset.data_handle(), dataset.extent(0), dataset.extent(1)); + auto queries_device = raft::make_device_matrix_view( + queries.data_handle(), batch_size, dim); + auto candidates_device = raft::make_device_matrix_view( + candidates.data_handle(), batch_size, k0); + auto neighbors_device = + raft::make_device_matrix_view(neighbors, batch_size, k); + auto distances_device = + raft::make_device_matrix_view(distances, batch_size, k); + + cuvs::neighbors::refine(res, + dataset_device, + queries_device, + candidates_device, + neighbors_device, + distances_device, + metric); + } else { + auto dataset_host = raft::make_host_matrix_view( + dataset.data_handle(), dataset.extent(0), dataset.extent(1)); + auto queries_host = raft::make_host_matrix(batch_size, dim); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + + auto stream = raft::resource::get_cuda_stream(res); + raft::copy(queries_host.data_handle(), queries.data_handle(), queries_host.size(), stream); + raft::copy( + candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream); + + raft::resource::sync_stream(res); // wait for the queries and candidates + cuvs::neighbors::refine(res, + dataset_host, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + metric); + + raft::copy(neighbors, neighbors_host.data_handle(), neighbors_host.size(), stream); + raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); + } +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu new file mode 100644 index 000000000..a7495c23a --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../common/ann_types.hpp" +#include "cuvs_ann_bench_param_parser.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +auto create_algo(const std::string& algo_name, + const std::string& distance, + int dim, + const nlohmann::json& conf) -> std::unique_ptr> +{ + [[maybe_unused]] cuvs::bench::Metric metric = parse_metric(distance); + std::unique_ptr> a; + + if constexpr (std::is_same_v) { +#ifdef CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE + if (algo_name == "raft_brute_force" || algo_name == "cuvs_brute_force") { + a = std::make_unique>(metric, dim); + } +#endif + } + + if constexpr (std::is_same_v) {} + +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "raft_ivf_flat" || algo_name == "cuvs_ivf_flat") { + typename cuvs::bench::cuvs_ivf_flat::build_param param; + parse_build_param(conf, param); + a = std::make_unique>(metric, dim, param); + } + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ + if (algo_name == "raft_ivf_pq" || algo_name == "cuvs_ivf_pq") { + typename cuvs::bench::cuvs_ivf_pq::build_param param; + parse_build_param(conf, param); + a = std::make_unique>(metric, dim, param); + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_CAGRA + if (algo_name == "raft_cagra" || algo_name == "cuvs_cagra") { + typename cuvs::bench::cuvs_cagra::build_param param; + parse_build_param(conf, param); + a = std::make_unique>(metric, dim, param); + } +#endif + + if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); } + + return a; +} + +template +auto create_search_param(const std::string& algo_name, const nlohmann::json& conf) + -> std::unique_ptr::search_param> +{ +#ifdef CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE + if (algo_name == "raft_brute_force" || algo_name == "cuvs_brute_force") { + auto param = std::make_unique::search_param>(); + return param; + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "raft_ivf_flat" || algo_name == "cuvs_ivf_flat") { + auto param = + std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ + if (algo_name == "raft_ivf_pq" || algo_name == "cuvs_ivf_pq") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_CAGRA + if (algo_name == "raft_cagra" || algo_name == "cuvs_cagra") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } +#endif + + // else + throw std::runtime_error("invalid algo: '" + algo_name + "'"); +} + +}; // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); +// REGISTER_ALGO_INSTANCE(half); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_float.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_float.cu new file mode 100644 index 000000000..576f1d5bd --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_float.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_cagra_wrapper.h" + +namespace cuvs::bench { +template class cuvs_cagra; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_half.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_half.cu new file mode 100644 index 000000000..6768034a2 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_half.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_cagra_wrapper.h" + +namespace cuvs::bench { +// template class cuvs_cagra; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu new file mode 100644 index 000000000..558ba01e0 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "../common/ann_types.hpp"
+#include "cuvs_ann_bench_param_parser.h"
+#include "cuvs_cagra_hnswlib_wrapper.h"
+
+#include
+#include
+#include
+
+namespace cuvs::bench {
+
+template
+void parse_search_param(const nlohmann::json& conf,
+                        typename cuvs::bench::cuvs_cagra_hnswlib::search_param& param)
+{
+  param.ef = conf.at("ef");
+  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+}
+
+template
+auto create_algo(const std::string& algo_name,
+                 const std::string& distance,
+                 int dim,
+                 const nlohmann::json& conf) -> std::unique_ptr>
+{
+  [[maybe_unused]] cuvs::bench::Metric metric = parse_metric(distance);
+  std::unique_ptr> a;
+
+  if constexpr (std::is_same_v or std::is_same_v) {
+    if (algo_name == "raft_cagra_hnswlib" || algo_name == "cuvs_cagra_hnswlib") {
+      typename cuvs::bench::cuvs_cagra_hnswlib::build_param param;
+      parse_build_param(conf, param);
+      a = std::make_unique>(metric, dim, param);
+    }
+  }
+
+  if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); }
+
+  return a;
+}
+
+template
+auto create_search_param(const std::string& algo_name, const nlohmann::json& conf)
+  -> std::unique_ptr::search_param>
+{
+  if (algo_name == "raft_cagra_hnswlib" || algo_name == "cuvs_cagra_hnswlib") {
+    auto param =
+      std::make_unique::search_param>();
+    parse_search_param(conf, *param);
+    return param;
+  }
+
+  throw std::runtime_error("invalid algo: '" + algo_name + "'");
+}
+
+}  // namespace cuvs::bench
+
+REGISTER_ALGO_INSTANCE(float);
+REGISTER_ALGO_INSTANCE(std::int8_t);
+REGISTER_ALGO_INSTANCE(std::uint8_t);
+
+#ifdef ANN_BENCH_BUILD_MAIN
+#include "../common/benchmark.hpp"
+/*
+[NOTE] Dear developer,
+
+Please don't modify the content of the `main` function; doing so would make the behavior of
+the benchmark executable differ depending on the cmake flags and would complicate debugging.
+In particular, don't try to set up an RMM memory resource here; it would be overridden anyway
+by the memory resource set on a per-algorithm basis. For example, see
+`cuvs/cuvs_ann_bench_utils.h`.
+*/
+int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); }
+#endif
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
new file mode 100644
index 000000000..875fe0bba
--- /dev/null
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#pragma once + +#include "../hnswlib/hnswlib_wrapper.h" +#include "cuvs_cagra_wrapper.h" + +#include + +namespace cuvs::bench { + +template +class cuvs_cagra_hnswlib : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + using build_param = typename cuvs_cagra::build_param; + using search_param = typename hnsw_lib::search_param; + + cuvs_cagra_hnswlib(Metric metric, int dim, const build_param& param, int concurrent_searches = 1) + : algo(metric, dim), + cagra_build_{metric, dim, param, concurrent_searches}, + // hnsw_lib param values don't matter since we don't build with hnsw_lib + hnswlib_search_{metric, dim, typename hnsw_lib::build_param{50, 100}} + { + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return cagra_build_.get_sync_stream(); + } + + // to enable dataset access from GPU memory + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHostMmap; + property.query_memory_type = MemoryType::kHost; + return property; + } + + void save(const std::string& file) const override; + void load(const std::string&) override; + std::unique_ptr> copy() override + { + return std::make_unique>(*this); + } + + private: + cuvs_cagra cagra_build_; + hnsw_lib hnswlib_search_; +}; + +template +void cuvs_cagra_hnswlib::build(const T* dataset, size_t nrow) +{ + cagra_build_.build(dataset, nrow); +} + +template +void cuvs_cagra_hnswlib::set_search_param(const search_param_base& param_) +{ + hnswlib_search_.set_search_param(param_); +} + +template +void cuvs_cagra_hnswlib::save(const std::string& file) const +{ + cagra_build_.save_to_hnswlib(file); +} + +template +void cuvs_cagra_hnswlib::load(const std::string& file) +{ + hnswlib_search_.load(file); + hnswlib_search_.set_base_layer_only(); +} + +template +void cuvs_cagra_hnswlib::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + hnswlib_search_.search(queries, batch_size, k, neighbors, distances); +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_int8_t.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_int8_t.cu new file mode 100644 index 000000000..e2c0735f5 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_int8_t.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "cuvs_cagra_wrapper.h" + +namespace cuvs::bench { +template class cuvs_cagra; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_uint8_t.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_uint8_t.cu new file mode 100644 index 000000000..0f3849885 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_uint8_t.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_cagra_wrapper.h" + +namespace cuvs::bench { +template class cuvs_cagra; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h new file mode 100644 index 000000000..9ac25f53f --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "../../../../src/neighbors/detail/cagra/utils.hpp" +#include "../common/ann_types.hpp" +#include "../common/cuda_huge_page_resource.hpp" +#include "../common/cuda_pinned_resource.hpp" +#include "cuvs_ann_bench_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +// #include +// #include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +enum class AllocatorType { kHostPinned, kHostHugePage, kDevice }; +enum class CagraBuildAlgo { kAuto, kIvfPq, kNnDescent }; + +template +class cuvs_cagra : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + + struct search_param : public search_param_base { + cuvs::neighbors::cagra::search_params p; + float refine_ratio; + AllocatorType graph_mem = AllocatorType::kDevice; + AllocatorType dataset_mem = AllocatorType::kDevice; + [[nodiscard]] auto needs_dataset() const -> bool override { return true; } + }; + + struct build_param { + cuvs::neighbors::cagra::index_params cagra_params; + CagraBuildAlgo algo; + std::optional nn_descent_params = std::nullopt; + std::optional ivf_pq_refine_rate = std::nullopt; + std::optional ivf_pq_build_params = std::nullopt; + std::optional ivf_pq_search_params = std::nullopt; + }; + + cuvs_cagra(Metric metric, int dim, const build_param& param, int concurrent_searches = 1) + : algo(metric, dim), + index_params_(param), + dimension_(dim), + + dataset_(std::make_shared>( + std::move(raft::make_device_matrix(handle_, 0, 0)))), + graph_(std::make_shared>( + std::move(raft::make_device_matrix(handle_, 0, 0)))), + input_dataset_v_( + std::make_shared>( + nullptr, 0, 0)) + + { + index_params_.cagra_params.metric = parse_metric_type(metric); + index_params_.ivf_pq_build_params->metric = parse_metric_type(metric); + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + + void set_search_dataset(const T* dataset, size_t nrow) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + void search_base(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return handle_.get_sync_stream(); + } + + // to enable dataset access from GPU memory + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHostMmap; + property.query_memory_type = MemoryType::kDevice; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + void save_to_hnswlib(const std::string& file) const; + std::unique_ptr> copy() override; + + private: + // handle_ must go first to make sure it dies last and all memory allocated in pool + configured_raft_resources handle_{}; + raft::mr::cuda_pinned_resource mr_pinned_; + raft::mr::cuda_huge_page_resource mr_huge_page_; + AllocatorType graph_mem_{AllocatorType::kDevice}; + AllocatorType dataset_mem_{AllocatorType::kDevice}; + float refine_ratio_; + build_param index_params_; + bool need_dataset_update_{true}; + cuvs::neighbors::cagra::search_params search_params_; + std::shared_ptr> index_; + int dimension_; + std::shared_ptr> graph_; + std::shared_ptr> 
dataset_; + std::shared_ptr> input_dataset_v_; + + inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type) + { + switch (mem_type) { + case (AllocatorType::kHostPinned): return &mr_pinned_; + case (AllocatorType::kHostHugePage): return &mr_huge_page_; + default: return rmm::mr::get_current_device_resource(); + } + } +}; + +template +void cuvs_cagra::build(const T* dataset, size_t nrow) +{ + auto dataset_view = + raft::make_host_matrix_view(dataset, IdxT(nrow), dimension_); + + auto& params = index_params_.cagra_params; + + if (index_params_.algo == CagraBuildAlgo::kIvfPq) { + auto pq_params = cuvs::neighbors::cagra::graph_build_params::ivf_pq_params( + dataset_view.extents(), params.metric); + if (index_params_.ivf_pq_build_params) { + pq_params.build_params = *index_params_.ivf_pq_build_params; + } + if (index_params_.ivf_pq_search_params) { + pq_params.search_params = *index_params_.ivf_pq_search_params; + } + if (index_params_.ivf_pq_refine_rate) { + pq_params.refinement_rate = *index_params_.ivf_pq_refine_rate; + } + params.graph_build_params = pq_params; + } else if (index_params_.algo == CagraBuildAlgo::kNnDescent) { + auto nn_params = cuvs::neighbors::cagra::graph_build_params::nn_descent_params( + params.intermediate_graph_degree); + if (index_params_.nn_descent_params) { nn_params = *index_params_.nn_descent_params; } + params.graph_build_params = nn_params; + } + index_ = std::make_shared>( + std::move(cuvs::neighbors::cagra::build(handle_, params, dataset_view))); +} + +inline auto allocator_to_string(AllocatorType mem_type) -> std::string +{ + if (mem_type == AllocatorType::kDevice) { + return "device"; + } else if (mem_type == AllocatorType::kHostPinned) { + return "host_pinned"; + } else if (mem_type == AllocatorType::kHostHugePage) { + return "host_huge_page"; + } + return ""; +} + +template +void cuvs_cagra::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + search_params_ = sp.p; + refine_ratio_ = sp.refine_ratio; + if (sp.graph_mem != graph_mem_) { + // Move graph to correct memory space + graph_mem_ = sp.graph_mem; + RAFT_LOG_DEBUG("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str()); + // We create a new graph and copy to it from existing graph + auto mr = get_mr(graph_mem_); + auto new_graph = raft::make_device_mdarray( + handle_, mr, raft::make_extents(index_->graph().extent(0), index_->graph_degree())); + + raft::copy(new_graph.data_handle(), + index_->graph().data_handle(), + index_->graph().size(), + raft::resource::get_cuda_stream(handle_)); + + index_->update_graph(handle_, make_const_mdspan(new_graph.view())); + // update_graph() only stores a view in the index. We need to keep the graph object alive. + *graph_ = std::move(new_graph); + } + + if (sp.dataset_mem != dataset_mem_ || need_dataset_update_) { + dataset_mem_ = sp.dataset_mem; + + // First free up existing memory + *dataset_ = raft::make_device_matrix(handle_, 0, 0); + index_->update_dataset(handle_, make_const_mdspan(dataset_->view())); + + // Allocate space using the correct memory resource. 
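+    // Illustrative note (the sizes below are made up, not taken from the benchmark):
+    // copy_with_padding pads each dataset row out to an aligned pitch, and the strided
+    // view created below exposes only the logical `dim_` columns over that padded row,
+    // e.g.
+    //   logical dim_ = 96, padded extent(1) = 128
+    //   => element (i, j) lives at data_handle()[i * 128 + j]; columns 96..127 are padding.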
+ RAFT_LOG_DEBUG("moving dataset to new memory space: %s", + allocator_to_string(dataset_mem_).c_str()); + + auto mr = get_mr(dataset_mem_); + cuvs::neighbors::cagra::detail::copy_with_padding(handle_, *dataset_, *input_dataset_v_, mr); + + auto dataset_view = raft::make_device_strided_matrix_view( + dataset_->data_handle(), dataset_->extent(0), this->dim_, dataset_->extent(1)); + index_->update_dataset(handle_, dataset_view); + + need_dataset_update_ = false; + } +} + +template +void cuvs_cagra::set_search_dataset(const T* dataset, size_t nrow) +{ + using ds_idx_type = decltype(index_->data().n_rows()); + bool is_vpq = + dynamic_cast*>(&index_->data()) || + dynamic_cast*>(&index_->data()); + // It can happen that we are re-using a previous algo object which already has + // the dataset set. Check if we need update. + if (static_cast(input_dataset_v_->extent(0)) != nrow || + input_dataset_v_->data_handle() != dataset) { + *input_dataset_v_ = raft::make_device_matrix_view(dataset, nrow, this->dim_); + need_dataset_update_ = !is_vpq; // ignore update if this is a VPQ dataset. + } +} + +template +void cuvs_cagra::save(const std::string& file) const +{ + cuvs::neighbors::cagra::serialize(handle_, file, *index_); +} + +template +void cuvs_cagra::save_to_hnswlib(const std::string& file) const +{ + cuvs::neighbors::cagra::serialize_to_hnswlib(handle_, file, *index_); +} + +template +void cuvs_cagra::load(const std::string& file) +{ + index_ = std::make_shared>(handle_); + cuvs::neighbors::cagra::deserialize(handle_, file, index_.get()); +} + +template +std::unique_ptr> cuvs_cagra::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void cuvs_cagra::search_base( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + IdxT* neighbors_idx_t; + std::optional> neighbors_storage{std::nullopt}; + if constexpr (sizeof(IdxT) == sizeof(algo_base::index_type)) { + neighbors_idx_t = reinterpret_cast(neighbors); + } else { + neighbors_storage.emplace(batch_size * k, raft::resource::get_cuda_stream(handle_)); + neighbors_idx_t = neighbors_storage->data(); + } + + auto queries_view = + raft::make_device_matrix_view(queries, batch_size, dimension_); + auto neighbors_view = + raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); + auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); + + cuvs::neighbors::cagra::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + + if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { + raft::linalg::unaryOp(neighbors, + neighbors_idx_t, + batch_size * k, + raft::cast_op(), + raft::resource::get_cuda_stream(handle_)); + } +} + +template +void cuvs_cagra::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + auto k0 = static_cast(refine_ratio_ * k); + const bool disable_refinement = k0 <= static_cast(k); + const raft::resources& res = handle_; + + if (disable_refinement) { + search_base(queries, batch_size, k, neighbors, distances); + } else { + auto queries_v = raft::make_device_matrix_view( + queries, batch_size, dimension_); + auto candidate_ixs = + raft::make_device_matrix(res, batch_size, k0); + auto candidate_dists = + raft::make_device_matrix(res, batch_size, k0); + search_base( + queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle()); + 
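+    // Worked example of this two-stage search (numbers are illustrative): with
+    // refine_ratio = 2 and k = 10, search_base above returns k0 = 20 approximate
+    // candidates per query; refine_helper below recomputes exact distances for those
+    // 20 against the original dataset and keeps the best 10 in neighbors/distances.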
refine_helper( + res, *input_dataset_v_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric()); + } +} +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_flat.cu b/cpp/bench/ann/src/cuvs/cuvs_ivf_flat.cu new file mode 100644 index 000000000..f38e16c9f --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_flat.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_ivf_flat_wrapper.h" + +namespace cuvs::bench { +template class cuvs_ivf_flat; +template class cuvs_ivf_flat; +template class cuvs_ivf_flat; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_flat_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_flat_wrapper.h new file mode 100644 index 000000000..2aaa5a294 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_flat_wrapper.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "../common/ann_types.hpp"
+#include "cuvs_ann_bench_utils.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cuvs::bench {
+
+template
+class cuvs_ivf_flat : public algo, public algo_gpu {
+ public:
+  using search_param_base = typename algo::search_param;
+
+  struct search_param : public search_param_base {
+    cuvs::neighbors::ivf_flat::search_params ivf_flat_params;
+  };
+
+  using build_param = cuvs::neighbors::ivf_flat::index_params;
+
+  cuvs_ivf_flat(Metric metric, int dim, const build_param& param)
+    : algo(metric, dim), index_params_(param), dimension_(dim)
+  {
+    index_params_.metric                         = parse_metric_type(metric);
+    index_params_.conservative_memory_allocation = true;
+    RAFT_CUDA_TRY(cudaGetDevice(&device_));
+  }
+
+  void build(const T* dataset, size_t nrow) final;
+
+  void set_search_param(const search_param_base& param) override;
+
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              algo_base::index_type* neighbors,
+              float* distances) const override;
+
+  [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
+  {
+    return handle_.get_sync_stream();
+  }
+
+  // to enable dataset access from GPU memory
+  [[nodiscard]] auto get_preference() const -> algo_property override
+  {
+    algo_property property;
+    property.dataset_memory_type = MemoryType::kHostMmap;
+    property.query_memory_type   = MemoryType::kDevice;
+    return property;
+  }
+  void save(const std::string& file) const override;
+  void load(const std::string&) override;
+  std::unique_ptr> copy() override;
+
+ private:
+  // handle_ must go first to make sure it dies last and all memory allocated in pool
+  configured_raft_resources handle_{};
+  build_param index_params_;
+  cuvs::neighbors::ivf_flat::search_params search_params_;
+  std::shared_ptr> index_;
+  int device_;
+  int dimension_;
+};
+
+template
+void cuvs_ivf_flat::build(const T* dataset, size_t nrow)
+{
+  index_ = std::make_shared>(
+    std::move(cuvs::neighbors::ivf_flat::build(
+      handle_,
+      index_params_,
+      raft::make_host_matrix_view(dataset, nrow, dimension_))));
+  // Note: internally the IVF-Flat build works with simple pointers, and accepts both
+  // host and device pointers. Therefore, although we provide a host_mdspan here, this
+  // works with a device pointer too.
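+  // How such a dispatch can work (a sketch, not necessarily the actual cuVS
+  // implementation): the residency of a raw pointer is queryable at runtime, e.g.
+  //   cudaPointerAttributes attr;
+  //   cudaPointerGetAttributes(&attr, dataset);
+  //   bool on_device = (attr.type == cudaMemoryTypeDevice);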
+} + +template +void cuvs_ivf_flat::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + search_params_ = sp.ivf_flat_params; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void cuvs_ivf_flat::save(const std::string& file) const +{ + cuvs::neighbors::ivf_flat::serialize(handle_, file, *index_); + return; +} + +template +void cuvs_ivf_flat::load(const std::string& file) +{ + index_ = + std::make_shared>(handle_, index_params_, this->dim_); + + cuvs::neighbors::ivf_flat::deserialize(handle_, file, index_.get()); + return; +} + +template +std::unique_ptr> cuvs_ivf_flat::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void cuvs_ivf_flat::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + IdxT* neighbors_idx_t; + std::optional> neighbors_storage{std::nullopt}; + if constexpr (sizeof(IdxT) == sizeof(algo_base::index_type)) { + neighbors_idx_t = reinterpret_cast(neighbors); + } else { + neighbors_storage.emplace(batch_size * k, raft::resource::get_cuda_stream(handle_)); + neighbors_idx_t = neighbors_storage->data(); + } + cuvs::neighbors::ivf_flat::search( + handle_, + search_params_, + *index_, + raft::make_device_matrix_view(queries, batch_size, index_->dim()), + raft::make_device_matrix_view(neighbors_idx_t, batch_size, k), + raft::make_device_matrix_view(distances, batch_size, k)); + if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { + raft::linalg::unaryOp(neighbors, + neighbors_idx_t, + batch_size * k, + raft::cast_op(), + raft::resource::get_cuda_stream(handle_)); + } +} +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq.cu b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq.cu new file mode 100644 index 000000000..3ffdd4a25 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_ivf_pq_wrapper.h" + +namespace cuvs::bench { +template class cuvs_ivf_pq; +// template class cuvs_ivf_pq; +template class cuvs_ivf_pq; +template class cuvs_ivf_pq; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h new file mode 100644 index 000000000..a305f14ea --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "cuvs_ann_bench_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs::bench { + +template +class cuvs_ivf_pq : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + using algo::dim_; + + struct search_param : public search_param_base { + cuvs::neighbors::ivf_pq::search_params pq_param; + float refine_ratio = 1.0f; + [[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } + }; + + using build_param = cuvs::neighbors::ivf_pq::index_params; + + cuvs_ivf_pq(Metric metric, int dim, const build_param& param) + : algo(metric, dim), index_params_(param), dimension_(dim) + { + index_params_.metric = parse_metric_type(metric); + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + void set_search_dataset(const T* dataset, size_t nrow) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + void search_base(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return handle_.get_sync_stream(); + } + + // to enable dataset access from GPU memory + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kDevice; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + std::unique_ptr> copy() override; + + private: + // handle_ must go first to make sure it dies last and all memory allocated in pool + configured_raft_resources handle_{}; + build_param index_params_; + cuvs::neighbors::ivf_pq::search_params search_params_; + std::shared_ptr> index_; + int dimension_; + float refine_ratio_ = 1.0; + raft::device_matrix_view dataset_; +}; + +template +void cuvs_ivf_pq::save(const std::string& file) const +{ + cuvs::neighbors::ivf_pq::serialize(handle_, file, *index_); +} + +template +void cuvs_ivf_pq::load(const std::string& file) +{ + index_ = std::make_shared>(handle_, index_params_, dim_); + cuvs::neighbors::ivf_pq::deserialize(handle_, file, index_.get()); +} + +template +void cuvs_ivf_pq::build(const T* dataset, size_t nrow) +{ + auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), dim_); + std::make_shared>( + std::move(cuvs::neighbors::ivf_pq::build(handle_, index_params_, dataset_v))) + .swap(index_); +} + +template +std::unique_ptr> cuvs_ivf_pq::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void cuvs_ivf_pq::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + search_params_ = sp.pq_param; + refine_ratio_ = sp.refine_ratio; + 
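+  // refine_ratio > 1 enables a re-ranking pass. For example, with refine_ratio = 4 and
+  // k = 10, search() first fetches k0 = 40 PQ-scored candidates and then re-ranks them
+  // with exact distances against the dataset provided via set_search_dataset(). This is
+  // also why search_param::needs_dataset() returns true only when refine_ratio > 1.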
assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void cuvs_ivf_pq::set_search_dataset(const T* dataset, size_t nrow) +{ + dataset_ = raft::make_device_matrix_view(dataset, nrow, index_->dim()); +} + +template +void cuvs_ivf_pq::search_base( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + IdxT* neighbors_idx_t; + std::optional> neighbors_storage{std::nullopt}; + if constexpr (sizeof(IdxT) == sizeof(algo_base::index_type)) { + neighbors_idx_t = reinterpret_cast(neighbors); + } else { + neighbors_storage.emplace(batch_size * k, raft::resource::get_cuda_stream(handle_)); + neighbors_idx_t = neighbors_storage->data(); + } + + auto queries_view = + raft::make_device_matrix_view(queries, batch_size, dimension_); + auto neighbors_view = + raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); + auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); + + cuvs::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + + if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { + raft::linalg::unaryOp(neighbors, + neighbors_idx_t, + batch_size * k, + raft::cast_op(), + raft::resource::get_cuda_stream(handle_)); + } +} + +template +void cuvs_ivf_pq::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + auto k0 = static_cast(refine_ratio_ * k); + const bool disable_refinement = k0 <= static_cast(k); + const raft::resources& res = handle_; + + if (disable_refinement) { + search_base(queries, batch_size, k, neighbors, distances); + } else { + auto queries_v = raft::make_device_matrix_view( + queries, batch_size, dimension_); + auto candidate_ixs = + raft::make_device_matrix(res, batch_size, k0); + auto candidate_dists = + raft::make_device_matrix(res, batch_size, k0); + search_base( + queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle()); + refine_helper( + res, dataset_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric()); + } +} +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_wrapper.h new file mode 100644 index 000000000..0954e6051 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_wrapper.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "../common/ann_types.hpp"
+#include "cuvs_ann_bench_utils.h"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace raft_temp {
+
+inline auto parse_metric_type(cuvs::bench::Metric metric) -> cuvs::distance::DistanceType
+{
+  switch (metric) {
+    case cuvs::bench::Metric::kInnerProduct: return cuvs::distance::DistanceType::InnerProduct;
+    case cuvs::bench::Metric::kEuclidean: return cuvs::distance::DistanceType::L2Expanded;
+    default: throw std::runtime_error("raft supports only metric type of inner product and L2");
+  }
+}
+}  // namespace raft_temp
+
+namespace cuvs::bench {
+
+// brute force KNN - RAFT
+template
+class cuvs_gpu : public algo, public algo_gpu {
+ public:
+  using search_param_base = typename algo::search_param;
+
+  struct search_param : public search_param_base {
+    [[nodiscard]] auto needs_dataset() const -> bool override { return true; }
+  };
+
+  cuvs_gpu(Metric metric, int dim);
+
+  void build(const T*, size_t) final;
+
+  void set_search_param(const search_param_base& param) override;
+
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              algo_base::index_type* neighbors,
+              float* distances) const final;
+
+  // to enable dataset access from GPU memory
+  [[nodiscard]] auto get_preference() const -> algo_property override
+  {
+    algo_property property;
+    property.dataset_memory_type = MemoryType::kDevice;
+    property.query_memory_type   = MemoryType::kDevice;
+    return property;
+  }
+  [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
+  {
+    return handle_.get_sync_stream();
+  }
+  void set_search_dataset(const T* dataset, size_t nrow) override;
+  void save(const std::string& file) const override;
+  void load(const std::string&) override;
+  std::unique_ptr> copy() override;
+
+ protected:
+  // handle_ must go first to make sure it dies last and all memory allocated in pool
+  configured_raft_resources handle_{};
+  std::shared_ptr> index_;
+  cuvs::distance::DistanceType metric_type_;
+  int device_;
+  const T* dataset_;
+  size_t nrow_;
+};
+
+template
+cuvs_gpu::cuvs_gpu(Metric metric, int dim)
+  : algo(metric, dim), metric_type_(raft_temp::parse_metric_type(metric))
+{
+  static_assert(std::is_same_v || std::is_same_v,
+                "raft bfknn only supports float/double");
+  RAFT_CUDA_TRY(cudaGetDevice(&device_));
+}
+
+template
+void cuvs_gpu::build(const T* dataset, size_t nrow)
+{
+  auto dataset_view = raft::make_device_matrix_view(dataset, nrow, this->dim_);
+  index_ = std::make_shared>(
+    std::move(cuvs::neighbors::brute_force::build(handle_, dataset_view, metric_type_)));
+}
+
+template
+void cuvs_gpu::set_search_param(const search_param_base&)
+{
+  // Nothing to set here, as this is a brute-force implementation.
+}
+
+template
+void cuvs_gpu::set_search_dataset(const T* dataset, size_t nrow)
+{
+  dataset_ = dataset;
+  nrow_    = nrow;
+  // Wrap the dataset with an index.
+  auto dataset_view = raft::make_device_matrix_view(dataset, nrow, this->dim_);
+  index_ = std::make_shared>(
+    std::move(cuvs::neighbors::brute_force::build(handle_, dataset_view, metric_type_)));
+}
+
+template
+void cuvs_gpu::save(const std::string& file) const
+{
+  // The index is just the dataset with metadata (shape). The dataset already exists on
+  // disk, therefore we do not need to save it here.
+  // We create an empty file because the benchmark logic requires an index file to be created.
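+  // (Equivalent to `touch <file>`: the ofstream below creates the file if it is
+  // missing and truncates it otherwise; no index bytes are written.)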
+  std::ofstream of(file);
+  of.close();
+}
+
+template
+void cuvs_gpu::load(const std::string& file)
+{
+  // We do not have serialization of the brute force index. We simply wrap the
+  // dataset into a brute force index, as is done in set_search_dataset.
+}
+
+template
+void cuvs_gpu::search(
+  const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const
+{
+  auto queries_view =
+    raft::make_device_matrix_view(queries, batch_size, this->dim_);
+
+  auto neighbors_view =
+    raft::make_device_matrix_view(neighbors, batch_size, k);
+  auto distances_view = raft::make_device_matrix_view(distances, batch_size, k);
+
+  cuvs::neighbors::brute_force::search(
+    handle_, *index_, queries_view, neighbors_view, distances_view, std::nullopt);
+}
+
+template
+std::unique_ptr> cuvs_gpu::copy()
+{
+  return std::make_unique>(*this);  // use copy constructor
+}
+
+}  // namespace cuvs::bench
diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp
new file mode 100644
index 000000000..354a9b291
--- /dev/null
+++ b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "../common/ann_types.hpp" +#include "faiss_cpu_wrapper.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +void parse_base_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu::build_param& param) +{ + param.nlist = conf.at("nlist"); + if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu_ivf_flat::build_param& param) +{ + parse_base_build_param(conf, param); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu_ivfpq::build_param& param) +{ + parse_base_build_param(conf, param); + param.m = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.use_precomputed = conf.at("usePrecomputed"); + } else { + param.use_precomputed = false; + } + if (conf.contains("bitsPerCode")) { + param.bits_per_code = conf.at("bitsPerCode"); + } else { + param.bits_per_code = 8; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu_ivfsq::build_param& param) +{ + parse_base_build_param(conf, param); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_cpu::search_param& param) +{ + param.nprobe = conf.at("nprobe"); + if (conf.contains("refine_ratio")) { param.refine_ratio = conf.at("refine_ratio"); } + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template class Algo> +auto make_algo(cuvs::bench::Metric metric, int dim, const nlohmann::json& conf) + -> std::unique_ptr> +{ + typename Algo::build_param param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template +auto create_algo(const std::string& algo_name, + const std::string& distance, + int dim, + const nlohmann::json& conf) -> std::unique_ptr> +{ + std::unique_ptr> a; + + if constexpr (std::is_same_v) { + cuvs::bench::Metric metric = parse_metric(distance); + if (algo_name == "faiss_cpu_ivf_flat") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_cpu_ivf_pq") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_cpu_ivf_sq") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_cpu_flat") { + a = std::make_unique>(metric, dim); + } + } + + if constexpr (std::is_same_v) {} + + if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); } + + return a; +} + +template +auto create_search_param(const std::string& algo_name, const nlohmann::json& conf) + -> std::unique_ptr::search_param> +{ + if (algo_name == "faiss_cpu_ivf_flat" || algo_name == "faiss_cpu_ivf_pq" || + algo_name == "faiss_cpu_ivf_sq") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } else if (algo_name == "faiss_cpu_flat") { + auto param = std::make_unique::search_param>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo_name + "'"); +} + +} // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h new file mode 100644 
index 000000000..0cc40de37 --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/thread_pool.hpp" +#include "../common/util.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +auto parse_metric_type(cuvs::bench::Metric metric) -> faiss::MetricType +{ + if (metric == cuvs::bench::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == cuvs::bench::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} +} // namespace + +namespace cuvs::bench { + +template +class faiss_cpu : public algo { + public: + using search_param_base = typename algo::search_param; + struct search_param : public search_param_base { + int nprobe; + float refine_ratio = 1.0; + int num_threads = omp_get_num_procs(); + }; + + struct build_param { + int nlist = 1; + int ratio = 2; + }; + + faiss_cpu(Metric metric, int dim, const build_param& param) + : algo(metric, dim), + metric_type_(parse_metric_type(metric)), + nlist_{param.nlist}, + training_sample_fraction_{1.0 / double(param.ratio)} + { + static_assert(std::is_same_v, "faiss support only float type"); + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + + void init_quantizer(int dim) + { + if (this->metric_type_ == faiss::MetricType::METRIC_L2) { + this->quantizer_ = std::make_shared(dim); + } else if (this->metric_type_ == faiss::MetricType::METRIC_INNER_PRODUCT) { + this->quantizer_ = std::make_shared(dim); + } + } + + // TODO(snanditale): if the number of results is less than k, the remaining elements of + // 'neighbors' will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const final; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + // to enable building big dataset which is larger than memory + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + protected: + template + void save_(const std::string& file) const; // NOLINT + + template + void load_(const std::string& file); // NOLINT + + std::shared_ptr index_; + std::shared_ptr quantizer_; + std::shared_ptr index_refine_; + faiss::MetricType metric_type_; + int nlist_; + double training_sample_fraction_; + + int num_threads_; + std::shared_ptr thread_pool_; +}; + +template +void faiss_cpu::build(const T* dataset, size_t nrow) +{ + auto index_ivf = dynamic_cast(index_.get()); + if (index_ivf != nullptr) { + // set the min/max training size for clustering to use the whole provided training 
set. + double trainset_size = training_sample_fraction_ * static_cast(nrow); + double points_per_centroid = trainset_size / static_cast(nlist_); + int max_ppc = std::ceil(points_per_centroid); + int min_ppc = std::floor(points_per_centroid); + if (min_ppc < index_ivf->cp.min_points_per_centroid) { + log_warn( + "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d " + "points per cluster (n_lists = %d). This is smaller than the FAISS default " + "min_points_per_centroid = %d.", + static_cast(trainset_size), + nrow, + training_sample_fraction_, + min_ppc, + nlist_, + index_ivf->cp.min_points_per_centroid); + } + index_ivf->cp.max_points_per_centroid = max_ppc; + index_ivf->cp.min_points_per_centroid = min_ppc; + } + index_->train(nrow, dataset); // faiss::IndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); + index_refine_ = std::make_shared(this->index_.get(), dataset); +} + +template +void faiss_cpu::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + int nprobe = sp.nprobe; + assert(nprobe <= nlist_); + dynamic_cast(index_.get())->nprobe = nprobe; + + if (sp.refine_ratio > 1.0) { this->index_refine_.get()->k_factor = sp.refine_ratio; } + + if (!thread_pool_ || num_threads_ != sp.num_threads) { + num_threads_ = sp.num_threads; + thread_pool_ = std::make_shared(num_threads_); + } +} + +template +void faiss_cpu::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::idx_t), + "sizes of size_t and faiss::idx_t are different"); + + thread_pool_->submit( + [&](int i) { + // Use thread pool for batch size = 1. FAISS multi-threads internally for batch size > 1. 
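      // (Clarifying note: the static_assert above is what makes the reinterpret_cast in
      // the call below safe; faiss::idx_t is int64_t, so FAISS writes its ids directly
      // into the benchmark's size_t-typed neighbor buffer with no conversion step.)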
+ index_->search(batch_size, queries, k, distances, reinterpret_cast(neighbors)); + }, + 1); +} + +template +template +void faiss_cpu::save_(const std::string& file) const +{ + faiss::write_index(index_.get(), file.c_str()); +} + +template +template +void faiss_cpu::load_(const std::string& file) +{ + index_ = std::shared_ptr(dynamic_cast(faiss::read_index(file.c_str()))); +} + +template +class faiss_cpu_ivf_flat : public faiss_cpu { + public: + using typename faiss_cpu::build_param; + + faiss_cpu_ivf_flat(Metric metric, int dim, const build_param& param) + : faiss_cpu(metric, dim, param) + { + this->init_quantizer(dim); + this->index_ = std::make_shared( + this->quantizer_.get(), dim, param.nlist, this->metric_type_); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } +}; + +template +class faiss_cpu_ivfpq : public faiss_cpu { + public: + struct build_param : public faiss_cpu::build_param { + int m; + int bits_per_code; + bool use_precomputed; + }; + + faiss_cpu_ivfpq(Metric metric, int dim, const build_param& param) + : faiss_cpu(metric, dim, param) + { + this->init_quantizer(dim); + this->index_ = std::make_shared( + this->quantizer_.get(), dim, param.nlist, param.m, param.bits_per_code, this->metric_type_); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } +}; + +// TODO(snanditale): Enable this in cmake +// ref: https://github.com/rapidsai/raft/issues/1876 +template +class faiss_cpu_ivfsq : public faiss_cpu { + public: + struct build_param : public faiss_cpu::build_param { + std::string quantizer_type; + }; + + faiss_cpu_ivfsq(Metric metric, int dim, const build_param& param) + : faiss_cpu(metric, dim, param) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if (param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + throw std::runtime_error("faiss_cpu_ivfsq supports only fp16 and int8 but got " + + param.quantizer_type); + } + + this->init_quantizer(dim); + this->index_ = std::make_shared( + this->quantizer_.get(), dim, param.nlist, qtype, this->metric_type_, true); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } +}; + +template +class faiss_cpu_flat : public faiss_cpu { + public: + faiss_cpu_flat(Metric metric, int dim) + : faiss_cpu(metric, dim, typename faiss_cpu::build_param{}) + { + this->index_ = std::make_shared(dim, this->metric_type_); + } + + // class faiss_cpu is more like a IVF class, so need special treating here + void set_search_param(const typename algo::search_param& param) override + { + auto search_param = dynamic_cast::search_param&>(param); + if (!this->thread_pool_ || this->num_threads_ != search_param.num_threads) { + this->num_threads_ = search_param.num_threads; + this->thread_pool_ = std::make_shared(this->num_threads_); + } + }; + + void save(const std::string& file) const override 
+ { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } +}; + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu new file mode 100644 index 000000000..2d9271639 --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../common/ann_types.hpp" + +#undef WARP_SIZE +#include "faiss_gpu_wrapper.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +void parse_base_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu::build_param& param) +{ + param.nlist = conf.at("nlist"); + if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu_ivf_flat::build_param& param) +{ + parse_base_build_param(conf, param); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu_ivfpq::build_param& param) +{ + parse_base_build_param(conf, param); + param.m = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.use_precomputed = conf.at("usePrecomputed"); + } else { + param.use_precomputed = false; + } + if (conf.contains("useFloat16")) { + param.use_float16 = conf.at("useFloat16"); + } else { + param.use_float16 = false; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu_ivfsq::build_param& param) +{ + parse_base_build_param(conf, param); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::faiss_gpu::search_param& param) +{ + param.nprobe = conf.at("nprobe"); + if (conf.contains("refine_ratio")) { param.refine_ratio = conf.at("refine_ratio"); } +} + +template class Algo> +auto make_algo(cuvs::bench::Metric metric, int dim, const nlohmann::json& conf) + -> std::unique_ptr> +{ + typename Algo::build_param param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template +auto create_algo(const std::string& algo_name, + const std::string& distance, + int dim, + const nlohmann::json& conf) -> std::unique_ptr> +{ + std::unique_ptr> a; + + if constexpr (std::is_same_v) { + cuvs::bench::Metric metric = parse_metric(distance); + if (algo_name == "faiss_gpu_ivf_flat") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_gpu_ivf_pq") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_gpu_ivf_sq") { + a = make_algo(metric, dim, conf); + } else if (algo_name == "faiss_gpu_flat") { + a = std::make_unique>(metric, dim); + } + } + + if (!a) { throw std::runtime_error("invalid algo: '" 
+ algo_name + "'"); } + + return a; +} + +template +auto create_search_param(const std::string& algo_name, const nlohmann::json& conf) + -> std::unique_ptr::search_param> +{ + if (algo_name == "faiss_gpu_ivf_flat" || algo_name == "faiss_gpu_ivf_pq" || + algo_name == "faiss_gpu_ivf_sq") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } else if (algo_name == "faiss_gpu_flat") { + auto param = std::make_unique::search_param>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo_name + "'"); +} + +} // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h new file mode 100644 index 000000000..f935e365f --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/util.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +auto parse_metric_type(cuvs::bench::Metric metric) -> faiss::MetricType +{ + if (metric == cuvs::bench::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == cuvs::bench::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} + +// note BLAS library can still use multi-threading, and +// setting environment variable like OPENBLAS_NUM_THREADS can control it +class omp_single_thread_scope { + public: + omp_single_thread_scope() + { + max_threads_ = omp_get_max_threads(); + omp_set_num_threads(1); + } + ~omp_single_thread_scope() + { + // the best we can do + omp_set_num_threads(max_threads_); + } + + private: + int max_threads_; +}; + +} // namespace + +namespace cuvs::bench { + +template +class faiss_gpu : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + struct search_param : public search_param_base { + int nprobe; + float refine_ratio = 1.0; + [[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } + }; + + struct build_param { + int nlist = 1; + int ratio = 2; + }; + + faiss_gpu(Metric metric, int dim, const build_param& param) + : algo(metric, dim), + gpu_resource_{std::make_shared()}, + metric_type_(parse_metric_type(metric)), + nlist_{param.nlist}, + training_sample_fraction_{1.0 / double(param.ratio)} + { + static_assert(std::is_same_v, "faiss support only float type"); + 
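    // (Note: the device current at construction time is recorded here; get_sync_stream()
    // below returns gpu_resource_->getDefaultStream(device_), so all benchmark-side
    // stream synchronization is tied to this same device for the index's lifetime.)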
cudaGetDevice(&device_); + } + + void build(const T* dataset, size_t nrow) final; + + virtual void set_search_param(const search_param_base& param) {} + + void set_search_dataset(const T* dataset, size_t nrow) override { dataset_ = dataset; } + + // TODO(snanditale): if the number of results is less than k, the remaining elements of + // 'neighbors' will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const final; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return gpu_resource_->getDefaultStream(device_); + } + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + // to enable building big dataset which is larger than GPU memory + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + protected: + template + void save_(const std::string& file) const; // NOLINT + + template + void load_(const std::string& file); // NOLINT + + /** [NOTE Multithreading] + * + * `gpu_resource_` is a shared resource: + * 1. It uses a shared_ptr under the hood, so the copies of it refer to the same + * resource implementation instance + * 2. GpuIndex is probably keeping a reference to it, as it's passed to the constructor + * + * To avoid copying the index (database) in each thread, we make both the index and + * the gpu_resource shared. + * This means faiss GPU streams are possibly shared among the CPU threads; + * the throughput search mode may be inaccurate. + * + * WARNING: we haven't investigated whether faiss::gpu::GpuIndex or + * faiss::gpu::StandardGpuResources are thread-safe. + * + */ + mutable std::shared_ptr gpu_resource_; + std::shared_ptr index_; + std::shared_ptr index_refine_{nullptr}; + faiss::MetricType metric_type_; + int nlist_; + int device_; + double training_sample_fraction_; + std::shared_ptr search_params_; + const T* dataset_; + float refine_ratio_ = 1.0; +}; + +template +void faiss_gpu::build(const T* dataset, size_t nrow) +{ + omp_single_thread_scope omp_single_thread; + auto index_ivf = dynamic_cast(index_.get()); + if (index_ivf != nullptr) { + // set the min/max training size for clustering to use the whole provided training set. + double trainset_size = training_sample_fraction_ * static_cast(nrow); + double points_per_centroid = trainset_size / static_cast(nlist_); + int max_ppc = std::ceil(points_per_centroid); + int min_ppc = std::floor(points_per_centroid); + if (min_ppc < index_ivf->cp.min_points_per_centroid) { + log_warn( + "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d " + "points per cluster (n_lists = %d). 
This is smaller than the FAISS default " + "min_points_per_centroid = %d.", + static_cast(trainset_size), + nrow, + training_sample_fraction_, + min_ppc, + nlist_, + index_ivf->cp.min_points_per_centroid); + } + index_ivf->cp.max_points_per_centroid = max_ppc; + index_ivf->cp.min_points_per_centroid = min_ppc; + } + index_->train(nrow, dataset); // faiss::gpu::GpuIndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); +} + +template +void faiss_gpu::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::idx_t), + "sizes of size_t and faiss::idx_t are different"); + + if (this->refine_ratio_ > 1.0) { + // TODO(snanditale): FAISS changed their search APIs to accept the search parameters as a struct + // object but their refine API doesn't allow the struct to be passed in. Once this is fixed, we + // need to re-enable refinement below + // index_refine_->search(batch_size, queries, k, distances, + // reinterpret_cast(neighbors), this->search_params_.get()); Related FAISS issue: + // https://github.com/facebookresearch/faiss/issues/3118 + throw std::runtime_error( + "FAISS doesn't support refinement in their new APIs so this feature is disabled in the " + "benchmarks for the time being."); + } else { + index_->search(batch_size, + queries, + k, + distances, + reinterpret_cast(neighbors), + this->search_params_.get()); + } +} + +template +template +void faiss_gpu::save_(const std::string& file) const +{ + omp_single_thread_scope omp_single_thread; + + auto cpu_index = std::make_unique(); + dynamic_cast(index_.get())->copyTo(cpu_index.get()); + faiss::write_index(cpu_index.get(), file.c_str()); +} + +template +template +void faiss_gpu::load_(const std::string& file) +{ + omp_single_thread_scope omp_single_thread; + + std::unique_ptr cpu_index(dynamic_cast(faiss::read_index(file.c_str()))); + assert(cpu_index); + + try { + dynamic_cast(index_.get())->copyFrom(cpu_index.get()); + + } catch (const std::exception& e) { + std::cout << "Error loading index file: " << std::string(e.what()) << std::endl; + } +} + +template +class faiss_gpu_ivf_flat : public faiss_gpu { + public: + using typename faiss_gpu::build_param; + using typename faiss_gpu::search_param_base; + + faiss_gpu_ivf_flat(Metric metric, int dim, const build_param& param) + : faiss_gpu(metric, dim, param) + { + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = this->device_; + this->index_ = std::make_shared( + this->gpu_resource_.get(), dim, param.nlist, this->metric_type_, config); + } + + void set_search_param(const search_param_base& param) override + { + auto sp = dynamic_cast::search_param&>(param); + int nprobe = sp.nprobe; + assert(nprobe <= this->nlist_); + + faiss::IVFSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + this->search_params_ = std::make_shared(faiss_search_params); + this->refine_ratio_ = sp.refine_ratio; + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } + std::unique_ptr> copy() override + { + return std::make_unique>(*this); + }; +}; + +template +class faiss_gpu_ivfpq : public faiss_gpu { + public: + struct build_param : public faiss_gpu::build_param { + int m; + bool use_float16; + bool use_precomputed; + }; + using typename faiss_gpu::search_param_base; + + faiss_gpu_ivfpq(Metric metric, int dim, const 
build_param& param) + : faiss_gpu(metric, dim, param) + { + faiss::gpu::GpuIndexIVFPQConfig config; + config.useFloat16LookupTables = param.use_float16; + config.usePrecomputedTables = param.use_precomputed; + config.device = this->device_; + + this->index_ = + std::make_shared(this->gpu_resource_.get(), + dim, + param.nlist, + param.m, + 8, // FAISS only supports bitsPerCode=8 + this->metric_type_, + config); + } + + void set_search_param(const search_param_base& param) override + { + auto sp = dynamic_cast::search_param&>(param); + int nprobe = sp.nprobe; + assert(nprobe <= this->nlist_); + this->refine_ratio_ = sp.refine_ratio; + faiss::IVFPQSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + + this->search_params_ = std::make_shared(faiss_search_params); + + if (sp.refine_ratio > 1.0) { + this->index_refine_ = + std::make_shared(this->index_.get(), this->dataset_); + this->index_refine_.get()->k_factor = sp.refine_ratio; + } + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; +}; + +// TODO(snanditale): Enable this in cmake +// ref: https://github.com/rapidsai/raft/issues/1876 +template +class faiss_gpu_ivfsq : public faiss_gpu { + public: + struct build_param : public faiss_gpu::build_param { + std::string quantizer_type; + }; + using typename faiss_gpu::search_param_base; + + faiss_gpu_ivfsq(Metric metric, int dim, const build_param& param) + : faiss_gpu(metric, dim, param) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if (param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + throw std::runtime_error("faiss_gpu_ivfsq supports only fp16 and int8 but got " + + param.quantizer_type); + } + + faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; + config.device = this->device_; + this->index_ = std::make_shared( + this->gpu_resource_.get(), dim, param.nlist, qtype, this->metric_type_, true, config); + } + + void set_search_param(const search_param_base& param) override + { + auto sp = dynamic_cast::search_param&>(param); + int nprobe = sp.nprobe; + assert(nprobe <= this->nlist_); + + faiss::IVFSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + + this->search_params_ = std::make_shared(faiss_search_params); + this->refine_ratio_ = sp.refine_ratio; + if (sp.refine_ratio > 1.0) { + this->index_refine_ = + std::make_shared(this->index_.get(), this->dataset_); + this->index_refine_.get()->k_factor = sp.refine_ratio; + } + } + + void save(const std::string& file) const override + { + this->template save_( + file); + } + void load(const std::string& file) override + { + this->template load_( + file); + } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; +}; + +template +class faiss_gpu_flat : public faiss_gpu { + public: + using typename faiss_gpu::search_param_base; + + faiss_gpu_flat(Metric metric, int dim) + : faiss_gpu(metric, dim, typename faiss_gpu::build_param{}) + { + faiss::gpu::GpuIndexFlatConfig config; + config.device = this->device_; + this->index_ = std::make_shared( + this->gpu_resource_.get(), dim, this->metric_type_, config); + } + void set_search_param(const search_param_base& param) override + { + auto sp = dynamic_cast::search_param&>(param); + int nprobe = sp.nprobe; + 
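    // (Note: nprobe is the number of coarse cells scanned per query and may not exceed
    // the nlist_ cells the index was built with, which the assert below enforces. When
    // refine_ratio > 1, the IndexRefineFlat configured below fetches refine_ratio * k
    // candidates and re-ranks them exactly, trading extra distance computations for recall.)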
assert(nprobe <= this->nlist_); + + this->search_params_ = std::make_shared(); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; +}; + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu new file mode 100644 index 000000000..6b1a1eaf7 --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../common/ann_types.hpp" +#include "ggnn_wrapper.cuh" + +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::ggnn::build_param& param) +{ + param.k = conf.at("k"); + + if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } + if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } + if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } + if (conf.contains("tau")) { param.tau = conf.at("tau"); } + if (conf.contains("refine_iterations")) { + param.refine_iterations = conf.at("refine_iterations"); + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::ggnn::search_param& param) +{ + param.tau = conf.at("tau"); + + if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } + if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } +} + +template class Algo> +auto make_algo(cuvs::bench::Metric metric, int dim, const nlohmann::json& conf) + -> std::unique_ptr> +{ + typename Algo::build_param param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template +auto create_algo(const std::string& algo_name, + const std::string& distance, + int dim, + const nlohmann::json& conf) -> std::unique_ptr> +{ + cuvs::bench::Metric metric = parse_metric(distance); + std::unique_ptr> a; + + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "ggnn") { a = make_algo(metric, dim, conf); } + } + if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); } + + return a; +} + +template +auto create_search_param(const std::string& algo_name, const nlohmann::json& conf) + -> std::unique_ptr::search_param> +{ + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "ggnn") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } + } + // else + throw std::runtime_error("invalid algo: '" + 
algo_name + "'"); +} + +} // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh new file mode 100644 index 000000000..e2ca18e22 --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/util.hpp" + +// #include + +#include + +#include +#include + +namespace cuvs::bench { + +template +class ggnn_impl; + +template +class ggnn : public algo, public algo_gpu { + public: + struct build_param { + int k_build{24}; // KBuild + int segment_size{32}; // S + int num_layers{4}; // L + float tau{0.5}; + int refine_iterations{2}; + int k; // GGNN requires to know k during building + }; + + using search_param_base = typename algo::search_param; + struct search_param : public search_param_base { + float tau; + int block_dim{32}; + int max_iterations{400}; + int cache_size{512}; + int sorted_size{256}; + [[nodiscard]] auto needs_dataset() const -> bool override { return true; } + }; + + ggnn(Metric metric, int dim, const build_param& param); + + void build(const T* dataset, size_t nrow) override { impl_->build(dataset, nrow); } + + void set_search_param(const search_param_base& param) override { impl_->set_search_param(param); } + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override + { + impl_->search(queries, batch_size, k, neighbors, distances); + } + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return dynamic_cast(impl_.get())->get_sync_stream(); + } + + void save(const std::string& file) const override { impl_->save(file); } + void load(const std::string& file) override { impl_->load(file); } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + return impl_->get_preference(); + } + + void set_search_dataset(const T* dataset, size_t nrow) override + { + impl_->set_search_dataset(dataset, nrow); + }; + + private: + std::shared_ptr> impl_; +}; + +template +ggnn::ggnn(Metric metric, int dim, const build_param& param) : algo(metric, dim) +{ + // ggnn/src/sift1m.cu + if (metric == Metric::kEuclidean && dim == 128 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = std::make_shared>(metric, dim, param); + } + // ggnn/src/deep1b_multi_gpu.cu, and adapt it deep1B + else if (metric == Metric::kEuclidean && dim == 96 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = std::make_shared>(metric, dim, param); + } else 
if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = std::make_shared>(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = std::make_shared>(metric, dim, param); + } + // ggnn/src/glove200.cu, adapt it to glove100 + else if (metric == Metric::kInnerProduct && dim == 100 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = std::make_shared>(metric, dim, param); + } else { + throw std::runtime_error( + "ggnn: not supported combination of metric, dim and build param; " + "see Ggnn's constructor in ggnn_wrapper.cuh for available combinations"); + } +} + +template +class ggnn_impl : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + + ggnn_impl(Metric metric, int dim, const typename ggnn::build_param& param); + + void build(const T* dataset, size_t nrow) override; + + void set_search_param(const search_param_base& param) override; + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { return stream_; } + + void save(const std::string& file) const override; + void load(const std::string& file) override; + std::unique_ptr> copy() override + { + auto r = std::make_unique>(*this); + // set the thread-local stream to the copied handle. + r->stream_ = cuvs::bench::get_stream_from_global_pool(); + return r; + }; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kDevice; + property.query_memory_type = MemoryType::kDevice; + return property; + } + + void set_search_dataset(const T* dataset, size_t nrow) override; + + private: + using algo::metric_; + using algo::dim_; + + using ggnngpu_instance = GGNNGPUInstance; + std::shared_ptr ggnn_; + typename ggnn::build_param build_param_; + typename ggnn::search_param search_param_; + cudaStream_t stream_; + const T* base_dataset_ = nullptr; + size_t base_n_rows_ = 0; + std::optional graph_file_ = std::nullopt; + + void load_impl() + { + if (base_dataset_ == nullptr) { return; } + if (base_n_rows_ == 0) { return; } + int device; + cudaGetDevice(&device); + ggnn_ = std::make_shared( + device, base_n_rows_, build_param_.num_layers, true, build_param_.tau); + ggnn_->set_base_data(base_dataset_); + ggnn_->set_stream(get_sync_stream()); + if (graph_file_.has_value()) { + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(get_sync_stream()); + + ggnn_host.load(graph_file_.value()); + ggnn_host.uploadAsync(ggnn_device); + cudaStreamSynchronize(ggnn_device.stream); + } + } +}; + +template +ggnn_impl::ggnn_impl(Metric metric, + int dim, + const typename ggnn::build_param& param) + : algo(metric, dim), build_param_(param), stream_(cuvs::bench::get_stream_from_global_pool()) +{ + if (metric_ == Metric::kInnerProduct) { + if (measure != Cosine) { throw std::runtime_error("mis-matched metric"); } + } else if (metric_ == Metric::kEuclidean) { + if (measure != Euclidean) { throw std::runtime_error("mis-matched metric"); } + } else { + throw std::runtime_error( + "ggnn supports only metric type of InnerProduct, Cosine and Euclidean"); + } + + if (dim != D) { throw std::runtime_error("mis-matched 
dim"); } +} + +template +void ggnn_impl::build(const T* dataset, size_t nrow) +{ + base_dataset_ = dataset; + base_n_rows_ = nrow; + graph_file_ = std::nullopt; + load_impl(); + ggnn_->build(0); + for (int i = 0; i < build_param_.refine_iterations; ++i) { + ggnn_->refine(); + } +} + +template +void ggnn_impl::set_search_dataset(const T* dataset, size_t nrow) +{ + if (base_dataset_ != dataset || base_n_rows_ != nrow) { + base_dataset_ = dataset; + base_n_rows_ = nrow; + load_impl(); + } +} + +template +void ggnn_impl::set_search_param(const search_param_base& param) +{ + search_param_ = dynamic_cast::search_param&>(param); +} + +template +void ggnn_impl::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "sizes of size_t and GGNN's KeyT are different"); + if (k != KQuery) { + throw std::runtime_error( + "k = " + std::to_string(k) + + ", but this GGNN instance only supports k = " + std::to_string(KQuery)); + } + + ggnn_->set_stream(get_sync_stream()); + cudaMemcpyToSymbol(c_tau_query, &search_param_.tau, sizeof(float)); + + const int block_dim = search_param_.block_dim; + const int max_iterations = search_param_.max_iterations; + const int cache_size = search_param_.cache_size; + const int sorted_size = search_param_.sorted_size; + // default value + if (block_dim == 32 && max_iterations == 400 && cache_size == 512 && sorted_size == 256) { + ggnn_->template queryLayer<32, 400, 512, 256, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 200 && cache_size == 256 && sorted_size == 64) { + ggnn_->template queryLayer<32, 200, 256, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 400 && cache_size == 448 && sorted_size == 64) { + ggnn_->template queryLayer<32, 400, 448, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/glove200.cu + else if (block_dim == 128 && max_iterations == 2000 && cache_size == 2048 && sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 2048, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // for glove100 + else if (block_dim == 64 && max_iterations == 400 && cache_size == 512 && sorted_size == 32) { + ggnn_->template queryLayer<64, 400, 512, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else if (block_dim == 128 && max_iterations == 2000 && cache_size == 1024 && + sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 1024, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else { + throw std::runtime_error("ggnn: not supported search param"); + } +} + +template +void ggnn_impl::save(const std::string& file) const +{ + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(get_sync_stream()); + + ggnn_host.downloadAsync(ggnn_device); + cudaStreamSynchronize(ggnn_device.stream); + ggnn_host.store(file); +} + +template +void ggnn_impl::load(const std::string& file) +{ + if (!graph_file_.has_value() || graph_file_.value() != file) { + graph_file_ = file; + load_impl(); + } +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp new file mode 100644 index 
000000000..755c7c8d6
--- /dev/null
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../common/ann_types.hpp"
+#include "hnswlib_wrapper.h"
+
+#include <nlohmann/json.hpp>
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+namespace cuvs::bench {
+
+template <typename T>
+void parse_build_param(const nlohmann::json& conf,
+                       typename cuvs::bench::hnsw_lib<T>::build_param& param)
+{
+  param.ef_construction = conf.at("efConstruction");
+  param.m = conf.at("M");
+  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+}
+
+template <typename T>
+void parse_search_param(const nlohmann::json& conf,
+                        typename cuvs::bench::hnsw_lib<T>::search_param& param)
+{
+  param.ef = conf.at("ef");
+  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+}
+
+template <typename T, template <typename> class Algo>
+auto make_algo(cuvs::bench::Metric metric, int dim, const nlohmann::json& conf)
+  -> std::unique_ptr<cuvs::bench::algo<T>>
+{
+  typename Algo<T>::build_param param;
+  parse_build_param<T>(conf, param);
+  return std::make_unique<Algo<T>>(metric, dim, param);
+}
+
+template <typename T>
+auto create_algo(const std::string& algo_name,
+                 const std::string& distance,
+                 int dim,
+                 const nlohmann::json& conf) -> std::unique_ptr<cuvs::bench::algo<T>>
+{
+  cuvs::bench::Metric metric = parse_metric(distance);
+  std::unique_ptr<cuvs::bench::algo<T>> a;
+
+  if constexpr (std::is_same_v<T, float>) {
+    if (algo_name == "hnswlib") { a = make_algo<T, cuvs::bench::hnsw_lib>(metric, dim, conf); }
+  }
+
+  if constexpr (std::is_same_v<T, std::uint8_t>) {
+    if (algo_name == "hnswlib") { a = make_algo<T, cuvs::bench::hnsw_lib>(metric, dim, conf); }
+  }
+
+  if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); }
+  return a;
+}
+
+template <typename T>
+auto create_search_param(const std::string& algo_name, const nlohmann::json& conf)
+  -> std::unique_ptr<typename cuvs::bench::algo<T>::search_param>
+{
+  if (algo_name == "hnswlib") {
+    auto param = std::make_unique<typename cuvs::bench::hnsw_lib<T>::search_param>();
+    parse_search_param<T>(conf, *param);
+    return param;
+  }
+  // else
+  throw std::runtime_error("invalid algo: '" + algo_name + "'");
+}
+
+};  // namespace cuvs::bench
+
+REGISTER_ALGO_INSTANCE(float);
+REGISTER_ALGO_INSTANCE(std::int8_t);
+REGISTER_ALGO_INSTANCE(std::uint8_t);
+
+#ifdef ANN_BENCH_BUILD_MAIN
+#include "../common/benchmark.hpp"
+int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); }
+#endif
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
new file mode 100644
index 000000000..9d643f12a
--- /dev/null
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/thread_pool.hpp" +#include "../common/util.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::bench { + +template +struct hnsw_dist_t { + using type = void; +}; + +template <> +struct hnsw_dist_t { + using type = float; +}; + +template <> +struct hnsw_dist_t { + using type = int; +}; + +template <> +struct hnsw_dist_t { + using type = int; +}; + +template +class hnsw_lib : public algo { + public: + // https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md + struct build_param { + int m; + int ef_construction; + int num_threads = omp_get_num_procs(); + }; + + using search_param_base = typename algo::search_param; + struct search_param : public search_param_base { + int ef; + int num_threads = 1; + }; + + hnsw_lib(Metric metric, int dim, const build_param& param); + + void build(const T* dataset, size_t nrow) override; + + void set_search_param(const search_param_base& param) override; + void search(const T* query, + int batch_size, + int k, + algo_base::index_type* indices, + float* distances) const override; + + void save(const std::string& path_to_index) const override; + void load(const std::string& path_to_index) override; + auto copy() -> std::unique_ptr> override { return std::make_unique>(*this); }; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + void set_base_layer_only() { appr_alg_->base_layer_only = true; } + + private: + void get_search_knn_results(const T* query, + int k, + algo_base::index_type* indices, + float* distances) const; + + std::shared_ptr::type>> appr_alg_; + std::shared_ptr::type>> space_; + + using algo::metric_; + using algo::dim_; + int ef_construction_; + int m_; + int num_threads_; + std::shared_ptr thread_pool_; + Mode bench_mode_; +}; + +template +hnsw_lib::hnsw_lib(Metric metric, int dim, const build_param& param) : algo(metric, dim) +{ + assert(dim_ > 0); + static_assert(std::is_same_v || std::is_same_v); + if constexpr (std::is_same_v) { + if (metric_ != Metric::kEuclidean) { + throw std::runtime_error("hnswlib only supports Euclidean distance"); + } + } + + ef_construction_ = param.ef_construction; + m_ = param.m; + num_threads_ = param.num_threads; +} + +template +void hnsw_lib::build(const T* dataset, size_t nrow) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_shared(dim_); + } else { + space_ = std::make_shared(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_shared>(dim_); + } + + appr_alg_ = std::make_shared::type>>( + space_.get(), nrow, m_, ef_construction_); + + thread_pool_ = std::make_shared(num_threads_); + const size_t items_per_thread = nrow / (num_threads_ + 1); + + thread_pool_->submit( + [&](size_t i) { + if (i < items_per_thread && i % 10000 == 0) 
{ + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + printf("%s building %zu / %zu\n", buf, i, items_per_thread); + fflush(stdout); + } + + appr_alg_->addPoint(dataset + i * dim_, i); + }, + nrow); +} + +template +void hnsw_lib::set_search_param(const search_param_base& param_) +{ + auto param = dynamic_cast(param_); + appr_alg_->ef_ = param.ef; + num_threads_ = param.num_threads; + // bench_mode_ = param.metric_objective; + bench_mode_ = Mode::kLatency; // TODO(achirkin): pass the benchmark mode in the algo parameters + + // Create a pool if multiple query threads have been set and the pool hasn't been created already + bool create_pool = (bench_mode_ == Mode::kLatency && num_threads_ > 1 && !thread_pool_); + if (create_pool) { thread_pool_ = std::make_shared(num_threads_); } +} + +template +void hnsw_lib::search( + const T* query, int batch_size, int k, algo_base::index_type* indices, float* distances) const +{ + auto f = [&](int i) { + // hnsw can only handle a single vector at a time. + get_search_knn_results(query + i * dim_, k, indices + i * k, distances + i * k); + }; + if (bench_mode_ == Mode::kLatency && num_threads_ > 1) { + thread_pool_->submit(f, batch_size); + } else { + for (int i = 0; i < batch_size; i++) { + f(i); + } + } +} + +template +void hnsw_lib::save(const std::string& path_to_index) const +{ + appr_alg_->saveIndex(std::string(path_to_index)); +} + +template +void hnsw_lib::load(const std::string& path_to_index) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_shared(dim_); + } else { + space_ = std::make_shared(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_shared>(dim_); + } + + appr_alg_ = std::make_shared::type>>( + space_.get(), path_to_index); +} + +template +void hnsw_lib::get_search_knn_results(const T* query, + int k, + algo_base::index_type* indices, + float* distances) const +{ + auto result = appr_alg_->searchKnn(query, k); + assert(result.size() >= static_cast(k)); + + for (int i = k - 1; i >= 0; --i) { + indices[i] = result.top().second; + distances[i] = result.top().first; + result.pop(); + } +} + +}; // namespace cuvs::bench diff --git a/cpp/cmake/patches/ggnn.diff b/cpp/cmake/patches/ggnn.diff new file mode 100644 index 000000000..fc4529880 --- /dev/null +++ b/cpp/cmake/patches/ggnn.diff @@ -0,0 +1,230 @@ +--- a/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh ++++ b/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh +@@ -62,7 +62,7 @@ struct SimpleKNNSymCache { + const ValueT dist_half) + : dist_query(dist_query), dist_half(dist_half) {} + +- __device__ __forceinline__ DistQueryAndHalf() {} ++ DistQueryAndHalf() = default; + }; + + struct DistanceAndNorm { +@@ -98,8 +98,7 @@ struct SimpleKNNSymCache { + KeyT cache; + DistQueryAndHalf dist; + bool flag; +- +- __device__ __forceinline__ SyncTempStorage() {} ++ SyncTempStorage() = default; + }; + + public: +diff --git a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +index 8cbaf0d..6eb72ac 100644 +--- a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh ++++ b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +@@ -41,7 +41,6 @@ limitations under the License. 
+ #include "ggnn/sym/cuda_knn_sym_query_layer.cuh" + #include "ggnn/utils/cuda_knn_utils.cuh" + #include "ggnn/utils/cuda_knn_constants.cuh" +-#include "ggnn/utils/cuda_knn_dataset.cuh" + + template + __global__ void divide(ValueT* res, ValueT* input, ValueT N) { +@@ -98,9 +97,7 @@ struct GGNNGPUInstance { + typedef GGNNGraphDevice GGNNGraphDevice; + typedef GGNNGraphHost GGNNGraphHost; + +- const Dataset* dataset; + GGNNGraphBuffer* ggnn_buffer {nullptr}; +- GGNNQuery ggnn_query; + + // Graph Shards resident on the GPU + std::vector ggnn_shards; +@@ -117,13 +114,12 @@ struct GGNNGPUInstance { + // number of shards that need to be processed by this instance + const int num_parts; + +- GGNNGPUInstance(const int gpu_id, const Dataset* dataset, ++ GGNNGPUInstance(const int gpu_id, + const int N_shard, const int L, + const bool enable_construction, const float tau_build, + const int num_parts=1, const int num_cpu_buffers=1) : + N_shard{N_shard}, L{L}, tau_build{tau_build}, +- dataset{dataset}, gpu_id{gpu_id}, +- ggnn_query{dataset->N_query, D, KQuery, num_parts}, ++ gpu_id{gpu_id}, + num_parts{num_parts} + { + CHECK_LE(L, MAX_LAYER); +@@ -135,7 +131,6 @@ struct GGNNGPUInstance { + CHECK_EQ(current_gpu_id, gpu_id) << "cudaSetDevice() needs to be called in advance!"; + } + +- ggnn_query.loadQueriesAsync(dataset->h_query, 0); + + computeGraphParameters(); + +@@ -186,7 +181,7 @@ struct GGNNGPUInstance { + } + + GGNNGPUInstance(const GGNNGPUInstance& other) +- : dataset{nullptr}, ggnn_query{0, D, KQuery}, ++ : + gpu_id{0}, N_shard{0}, num_parts{0} { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. +@@ -305,6 +300,7 @@ struct GGNNGPUInstance { + + // io + ++ /* + void waitForDiskIO(const int shard_id) { + auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; + if (cpu_buffer.disk_io_thread.joinable()) +@@ -468,11 +464,12 @@ struct GGNNGPUInstance { + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } ++ */ + + // graph operations + + template +- void queryLayer(const int shard_id = 0) const { ++ void queryLayer(const BaseT* d_query, int batch_size, KeyT* d_query_result_ids, ValueT* d_query_result_dists, const int shard_id = 0) const { + CHECK_CUDA(cudaSetDevice(gpu_id)); + const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); + +@@ -482,21 +479,21 @@ struct GGNNGPUInstance { + + int* m_dist_statistics = nullptr; + if (DIST_STATS) +- cudaMallocManaged(&m_dist_statistics, dataset->N_query * sizeof(int)); ++ cudaMallocManaged(&m_dist_statistics, batch_size * sizeof(int)); + + QueryKernel query_kernel; + query_kernel.d_base = shard.d_base; +- query_kernel.d_query = ggnn_query.d_query; ++ query_kernel.d_query = d_query; + + query_kernel.d_graph = shard.d_graph; +- query_kernel.d_query_results = ggnn_query.d_query_result_ids; +- query_kernel.d_query_results_dists = ggnn_query.d_query_result_dists; ++ query_kernel.d_query_results = d_query_result_ids; ++ query_kernel.d_query_results_dists = d_query_result_dists; + + query_kernel.d_translation = shard.d_translation; + + query_kernel.d_nn1_stats = shard.d_nn1_stats; + +- query_kernel.N = dataset->N_query; ++ query_kernel.N = batch_size; + query_kernel.N_offset = 0; + + query_kernel.d_dist_stats = m_dist_statistics; +@@ -771,6 +768,16 @@ struct GGNNGPUInstance { + sym(layer, shard_id); + } + } ++ ++ void set_stream(cudaStream_t stream) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).stream = stream; ++ } ++ ++ void 
set_base_data(const BaseT* dataset) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).d_base = dataset; ++ } + }; + + #endif // INCLUDE_GGNN_CUDA_KNN_GGNN_GPU_INSTANCE_CUH_ +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +index c94a8f1..781226d 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +@@ -50,7 +50,7 @@ struct GGNNGraphDevice { + ValueT* d_nn1_stats; + + /// base data pointer for the shard. +- BaseT* d_base; ++ const BaseT* d_base; + + /// combined memory pool + char* d_memory; +@@ -69,7 +69,9 @@ struct GGNNGraphDevice { + const size_t selection_translation_size = align8(ST_all * sizeof(KeyT)); + const size_t nn1_stats_size = align8(2 * sizeof(ValueT)); + total_graph_size = graph_size + 2 * selection_translation_size + nn1_stats_size; +- base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ // base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ (void) N; ++ (void) D; + + const size_t total_size = base_size+total_graph_size; + +@@ -86,8 +88,7 @@ struct GGNNGraphDevice { + CHECK_CUDA(cudaMalloc(&d_memory, total_size)); + + size_t pos = 0; +- d_base = reinterpret_cast(d_memory+pos); +- pos += base_size; ++ d_base = nullptr; + d_graph = reinterpret_cast(d_memory+pos); + pos += graph_size; + d_translation = reinterpret_cast(d_memory+pos); +@@ -99,14 +100,14 @@ struct GGNNGraphDevice { + + CHECK_EQ(pos, total_size); + +- CHECK_CUDA(cudaStreamCreate(&stream)); ++ // CHECK_CUDA(cudaStreamCreate(&stream)); + + CHECK_CUDA(cudaPeekAtLastError()); + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphDevice(const GGNNGraphDevice& other) { ++ GGNNGraphDevice(const GGNNGraphDevice&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +@@ -116,7 +117,7 @@ struct GGNNGraphDevice { + ~GGNNGraphDevice() { + cudaFree(d_memory); + +- CHECK_CUDA(cudaStreamDestroy(stream)); ++ // CHECK_CUDA(cudaStreamDestroy(stream)); + } + }; + +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +index 2055f9e..ef5843a 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +@@ -92,7 +92,7 @@ struct GGNNGraphHost { + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphHost(const GGNNGraphHost& other) { ++ GGNNGraphHost(const GGNNGraphHost&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +diff --git a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +index 49d76a1..eef69e6 100644 +--- a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh ++++ b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +@@ -22,7 +22,6 @@ limitations under the License. 
+ #include + #include + +-#include + #include + + #include "ggnn/utils/cuda_knn_constants.cuh" +-- +2.43.0 + diff --git a/cpp/cmake/patches/hnswlib.diff b/cpp/cmake/patches/hnswlib.diff new file mode 100644 index 000000000..e7f89a8cc --- /dev/null +++ b/cpp/cmake/patches/hnswlib.diff @@ -0,0 +1,188 @@ +--- a/hnswlib/hnswalg.h ++++ b/hnswlib/hnswalg.h +@@ -3,6 +3,7 @@ + #include "visited_list_pool.h" + #include "hnswlib.h" + #include ++#include + #include + #include + #include +@@ -16,6 +17,8 @@ namespace hnswlib { + template + class HierarchicalNSW : public AlgorithmInterface { + public: ++ bool base_layer_only{false}; ++ int num_seeds=32; + static const tableint max_update_element_locks = 65536; + HierarchicalNSW(SpaceInterface *s) { + } +@@ -56,7 +59,7 @@ namespace hnswlib { + visited_list_pool_ = new VisitedListPool(1, max_elements); + + //initializations for special treatment of the first node +- enterpoint_node_ = -1; ++ enterpoint_node_ = std::numeric_limits::max(); + maxlevel_ = -1; + + linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); +@@ -527,7 +530,7 @@ namespace hnswlib { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +@@ -1067,7 +1070,7 @@ namespace hnswlib { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_); + if (d < curdist) { +@@ -1119,28 +1122,41 @@ namespace hnswlib { + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); + +- for (int level = maxlevel_; level > 0; level--) { +- bool changed = true; +- while (changed) { +- changed = false; +- unsigned int *data; ++ if (base_layer_only) { ++ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale ++ for (int i = 0; i < num_seeds; i++) { ++ tableint obj = i * (max_elements_ / num_seeds); ++ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); ++ if (dist < curdist) { ++ curdist = dist; ++ currObj = obj; ++ } ++ } ++ } ++ else{ ++ for (int level = maxlevel_; level > 0; level--) { ++ bool changed = true; ++ while (changed) { ++ changed = false; ++ unsigned int *data; + +- data = (unsigned int *) get_linklist(currObj, level); +- int size = getListCount(data); +- metric_hops++; +- metric_distance_computations+=size; ++ data = (unsigned int *) get_linklist(currObj, level); ++ int size = getListCount(data); ++ metric_hops++; ++ metric_distance_computations+=size; + +- tableint *datal = (tableint *) (data + 1); +- for (int i = 0; i < size; i++) { +- tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) +- throw std::runtime_error("cand error"); +- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); ++ tableint *datal = (tableint *) (data + 1); ++ for (int i = 0; i < size; i++) { ++ tableint cand = datal[i]; ++ if (cand > max_elements_) ++ throw std::runtime_error("cand error"); ++ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +- if (d < curdist) { +- curdist = d; +- currObj 
= cand; +- changed = true; ++ if (d < curdist) { ++ curdist = d; ++ currObj = cand; ++ changed = true; ++ } + } + } + } +diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h +index 4413537..c3240f3 100644 +--- a/hnswlib/space_l2.h ++++ b/hnswlib/space_l2.h +@@ -252,13 +252,14 @@ namespace hnswlib { + ~L2Space() {} + }; + ++ template + static int + L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { + + size_t qty = *((size_t *) qty_ptr); + int res = 0; +- unsigned char *a = (unsigned char *) pVect1; +- unsigned char *b = (unsigned char *) pVect2; ++ T *a = (T *) pVect1; ++ T *b = (T *) pVect2; + + qty = qty >> 2; + for (size_t i = 0; i < qty; i++) { +@@ -279,11 +280,12 @@ namespace hnswlib { + return (res); + } + ++ template + static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { + size_t qty = *((size_t*)qty_ptr); + int res = 0; +- unsigned char* a = (unsigned char*)pVect1; +- unsigned char* b = (unsigned char*)pVect2; ++ T* a = (T*)pVect1; ++ T* b = (T*)pVect2; + + for(size_t i = 0; i < qty; i++) + { +@@ -294,6 +296,7 @@ namespace hnswlib { + return (res); + } + ++ template + class L2SpaceI : public SpaceInterface { + + DISTFUNC fstdistfunc_; +@@ -302,10 +305,10 @@ namespace hnswlib { + public: + L2SpaceI(size_t dim) { + if(dim % 4 == 0) { +- fstdistfunc_ = L2SqrI4x; ++ fstdistfunc_ = L2SqrI4x; + } + else { +- fstdistfunc_ = L2SqrI; ++ fstdistfunc_ = L2SqrI; + } + dim_ = dim; + data_size_ = dim * sizeof(unsigned char); +diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h +index 5e1a4a5..4195ebd 100644 +--- a/hnswlib/visited_list_pool.h ++++ b/hnswlib/visited_list_pool.h +@@ -3,6 +3,7 @@ + #include + #include + #include ++#include + + namespace hnswlib { + typedef unsigned short int vl_type; +@@ -14,7 +15,7 @@ namespace hnswlib { + unsigned int numelements; + + VisitedList(int numelements1) { +- curV = -1; ++ curV = std::numeric_limits::max(); + numelements = numelements1; + mass = new vl_type[numelements]; + } +-- +2.43.0 + diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake new file mode 100644 index 000000000..89446332d --- /dev/null +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -0,0 +1,109 @@ +#============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================
+
+function(find_and_configure_faiss)
+  set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL ENABLE_GPU)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_find_generate_module(faiss
+                              HEADER_NAMES faiss/IndexFlat.h
+                              LIBRARY_NAMES faiss
+                              )
+
+  set(BUILD_SHARED_LIBS ON)
+  if (PKG_BUILD_STATIC_LIBS)
+    set(BUILD_SHARED_LIBS OFF)
+    set(CPM_DOWNLOAD_faiss ON)
+  endif()
+
+  include(cmake/modules/FindAVX.cmake)
+
+  # Link against the AVX CPU lib if it exists
+  set(CUVS_FAISS_GLOBAL_TARGETS faiss::faiss)
+  set(CUVS_FAISS_EXPORT_GLOBAL_TARGETS faiss)
+  set(CUVS_FAISS_OPT_LEVEL "generic")
+  if(CXX_AVX_FOUND)
+    set(CUVS_FAISS_OPT_LEVEL "avx2")
+    list(APPEND CUVS_FAISS_GLOBAL_TARGETS faiss::faiss_avx2)
+    list(APPEND CUVS_FAISS_EXPORT_GLOBAL_TARGETS faiss_avx2)
+  endif()
+
+  rapids_cpm_find(faiss ${PKG_VERSION}
+                  GLOBAL_TARGETS ${CUVS_FAISS_GLOBAL_TARGETS}
+                  CPM_ARGS
+                  GIT_REPOSITORY ${PKG_REPOSITORY}
+                  GIT_TAG ${PKG_PINNED_TAG}
+                  EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
+                  OPTIONS
+                  "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}"
+                  "FAISS_ENABLE_PYTHON OFF"
+                  "FAISS_OPT_LEVEL ${CUVS_FAISS_OPT_LEVEL}"
+                  "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}"
+                  "BUILD_TESTING OFF"
+                  "CMAKE_MESSAGE_LOG_LEVEL VERBOSE"
+                  )
+
+  if(TARGET faiss AND NOT TARGET faiss::faiss)
+    add_library(faiss::faiss ALIAS faiss)
+  endif()
+
+  if(CXX_AVX_FOUND)
+    if(TARGET faiss_avx2 AND NOT TARGET faiss::faiss_avx2)
+      add_library(faiss::faiss_avx2 ALIAS faiss_avx2)
+    endif()
+  endif()
+
+  if(faiss_ADDED)
+    rapids_export(BUILD faiss
+                  EXPORT_SET faiss-targets
+                  GLOBAL_TARGETS ${CUVS_FAISS_EXPORT_GLOBAL_TARGETS}
+                  NAMESPACE faiss::)
+  endif()
+
+  # We generate the faiss-config files when we build faiss locally, so always do `find_dependency`
+  rapids_export_package(BUILD OpenMP cuvs-ann-bench-exports) # faiss uses OpenMP but doesn't export a need for it
+  rapids_export_package(BUILD faiss cuvs-ann-bench-exports GLOBAL_TARGETS ${CUVS_FAISS_GLOBAL_TARGETS} ${CUVS_FAISS_EXPORT_GLOBAL_TARGETS})
+  rapids_export_package(INSTALL faiss cuvs-ann-bench-exports GLOBAL_TARGETS ${CUVS_FAISS_GLOBAL_TARGETS} ${CUVS_FAISS_EXPORT_GLOBAL_TARGETS})
+
+  # Tell cmake where it can find the generated faiss-config.cmake we wrote.
+ include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cuvs-ann-bench-exports) +endfunction() + +if(NOT CUVS_FAISS_GIT_TAG) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(CUVS_FAISS_GIT_TAG fea/statically-link-ctk) + # set(CUVS_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) +endif() + +if(NOT CUVS_FAISS_GIT_REPOSITORY) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(CUVS_FAISS_GIT_REPOSITORY https://github.com/cjnolet/faiss.git) + # set(CUVS_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) +endif() + +find_and_configure_faiss(VERSION 1.7.4 + REPOSITORY ${CUVS_FAISS_GIT_REPOSITORY} + PINNED_TAG ${CUVS_FAISS_GIT_TAG} + BUILD_STATIC_LIBS ${CUVS_USE_FAISS_STATIC} + EXCLUDE_FROM_ALL ${CUVS_EXCLUDE_FAISS_FROM_ALL} + ENABLE_GPU ${CUVS_FAISS_ENABLE_GPU}) diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake new file mode 100644 index 000000000..8137ef84e --- /dev/null +++ b/cpp/cmake/thirdparty/get_ggnn.cmake @@ -0,0 +1,58 @@ +#============================================================================= +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================
+
+function(find_and_configure_ggnn)
+  set(oneValueArgs VERSION REPOSITORY PINNED_TAG)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  set(patch_files_to_run "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/ggnn.diff")
+  set(patch_issues_to_ref "fix compile issues")
+  set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/ggnn/patch.cmake")
+  set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/ggnn/log")
+  string(TIMESTAMP current_year "%Y" UTC)
+  configure_file(${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}"
+                 @ONLY)
+
+  rapids_cpm_find(
+    ggnn ${PKG_VERSION}
+    GLOBAL_TARGETS ggnn::ggnn
+    CPM_ARGS
+    GIT_REPOSITORY ${PKG_REPOSITORY}
+    GIT_TAG ${PKG_PINNED_TAG}
+    GIT_SHALLOW TRUE
+    DOWNLOAD_ONLY ON
+    PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script}
+  )
+  if(NOT TARGET ggnn::ggnn)
+    add_library(ggnn INTERFACE)
+    target_include_directories(ggnn INTERFACE "$<BUILD_INTERFACE:${ggnn_SOURCE_DIR}/include>")
+    add_library(ggnn::ggnn ALIAS ggnn)
+  endif()
+
+endfunction()
+
+if(NOT RAFT_GGNN_GIT_TAG)
+  set(RAFT_GGNN_GIT_TAG release_0.5)
+endif()
+
+if(NOT RAFT_GGNN_GIT_REPOSITORY)
+  set(RAFT_GGNN_GIT_REPOSITORY https://github.com/cgtuebingen/ggnn.git)
+endif()
+find_and_configure_ggnn(VERSION 0.5
+                        REPOSITORY ${RAFT_GGNN_GIT_REPOSITORY}
+                        PINNED_TAG ${RAFT_GGNN_GIT_TAG}
+                        )
diff --git a/cpp/cmake/thirdparty/get_glog.cmake b/cpp/cmake/thirdparty/get_glog.cmake
new file mode 100644
index 000000000..5839ee3d5
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_glog.cmake
@@ -0,0 +1,42 @@
+#=============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_glog)
+  set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_cpm_find(glog ${PKG_VERSION}
+                  GLOBAL_TARGETS glog::glog
+                  CPM_ARGS
+                  GIT_REPOSITORY https://github.com/${PKG_FORK}/glog.git
+                  GIT_TAG ${PKG_PINNED_TAG}
+                  EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
+                  )
+
+  if(glog_ADDED)
+    message(VERBOSE "cuVS: Using glog located in ${glog_SOURCE_DIR}")
+  else()
+    message(VERBOSE "cuVS: Using glog located in ${glog_DIR}")
+  endif()
+
+endfunction()
+
+find_and_configure_glog(VERSION 0.6.0
+                        FORK google
+                        PINNED_TAG v0.6.0
+                        EXCLUDE_FROM_ALL ON
+                        )
diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake
new file mode 100644
index 000000000..054a12f1e
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_hnswlib.cmake
@@ -0,0 +1,92 @@
+#=============================================================================
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_hnswlib)
+  set(oneValueArgs VERSION REPOSITORY PINNED_TAG EXCLUDE_FROM_ALL)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  set(patch_files_to_run "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/hnswlib.diff")
+  set(patch_issues_to_ref "fix compile issues")
+  set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/hnswlib/patch.cmake")
+  set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/hnswlib/log")
+  string(TIMESTAMP current_year "%Y" UTC)
+  configure_file(${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}"
+                 @ONLY)
+
+  rapids_cpm_find(
+    hnswlib ${PKG_VERSION}
+    GLOBAL_TARGETS hnswlib::hnswlib
+    BUILD_EXPORT_SET raft-exports
+    INSTALL_EXPORT_SET raft-exports
+    CPM_ARGS
+    GIT_REPOSITORY ${PKG_REPOSITORY}
+    GIT_TAG ${PKG_PINNED_TAG}
+    GIT_SHALLOW TRUE
+    DOWNLOAD_ONLY ON
+    PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script}
+  )
+  if(NOT TARGET hnswlib::hnswlib)
+    add_library(hnswlib INTERFACE)
+    add_library(hnswlib::hnswlib ALIAS hnswlib)
+    target_include_directories(hnswlib INTERFACE
+      "$<BUILD_INTERFACE:${hnswlib_SOURCE_DIR}>"
+      "$<INSTALL_INTERFACE:include>")
+
+    if(NOT PKG_EXCLUDE_FROM_ALL)
+      install(TARGETS hnswlib EXPORT hnswlib-exports)
+      install(DIRECTORY "${hnswlib_SOURCE_DIR}/hnswlib/" DESTINATION include/hnswlib)
+
+      # write install export rules
+      rapids_export(
+        INSTALL hnswlib
+        VERSION ${PKG_VERSION}
+        EXPORT_SET hnswlib-exports
+        GLOBAL_TARGETS hnswlib
+        NAMESPACE hnswlib::)
+    endif()
+
+    # write build export rules
+    rapids_export(
+      BUILD hnswlib
+      VERSION ${PKG_VERSION}
+      EXPORT_SET hnswlib-exports
+      GLOBAL_TARGETS hnswlib
+      NAMESPACE hnswlib::)
+
+    include("${rapids-cmake-dir}/export/find_package_root.cmake")
+
+    # When using RAFT from the build dir, ensure hnswlib is also found in RAFT's build dir. This
+    # line adds `set(hnswlib_ROOT "${CMAKE_CURRENT_LIST_DIR}")` to build/raft-dependencies.cmake
+    rapids_export_find_package_root(
+      BUILD hnswlib [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET raft-exports
+    )
+  endif()
+endfunction()
+
+if(NOT CUVS_HNSWLIB_GIT_TAG)
+  set(CUVS_HNSWLIB_GIT_TAG v0.6.2)
+endif()
+
+if(NOT CUVS_HNSWLIB_GIT_REPOSITORY)
+  set(CUVS_HNSWLIB_GIT_REPOSITORY https://github.com/nmslib/hnswlib.git)
+endif()
+find_and_configure_hnswlib(VERSION 0.6.2
+                           REPOSITORY ${CUVS_HNSWLIB_GIT_REPOSITORY}
+                           PINNED_TAG ${CUVS_HNSWLIB_GIT_TAG}
+                           EXCLUDE_FROM_ALL OFF
+                           )
diff --git a/cpp/cmake/thirdparty/get_nlohmann_json.cmake b/cpp/cmake/thirdparty/get_nlohmann_json.cmake
new file mode 100644
index 000000000..8d0b671e4
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_nlohmann_json.cmake
@@ -0,0 +1,42 @@
+#=============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_nlohmann_json)
+  set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_cpm_find(nlohmann_json ${PKG_VERSION}
+                  GLOBAL_TARGETS nlohmann_json::nlohmann_json
+                  CPM_ARGS
+                  GIT_REPOSITORY https://github.com/${PKG_FORK}/json.git
+                  GIT_TAG ${PKG_PINNED_TAG}
+                  EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
+                  )
+
+  if(nlohmann_json_ADDED)
+    message(VERBOSE "cuVS: Using nlohmann_json located in ${nlohmann_json_SOURCE_DIR}")
+  else()
+    message(VERBOSE "cuVS: Using nlohmann_json located in ${nlohmann_json_DIR}")
+  endif()
+
+endfunction()
+
+find_and_configure_nlohmann_json(VERSION 3.11.2
+                                 FORK nlohmann
+                                 PINNED_TAG v3.11.2
+                                 EXCLUDE_FROM_ALL ON
+                                 )
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index b0668eeb0..141fba4ee 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -1043,6 +1043,166 @@ void serialize(raft::resources const& handle,
 void deserialize(raft::resources const& handle,
                  std::istream& is,
                  cuvs::neighbors::cagra::index<uint8_t, uint32_t>* index);
+
+/**
+ * Write the CAGRA built index as a base layer HNSW index to an output stream
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, os, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          std::ostream& os,
+                          const cuvs::neighbors::cagra::index<float, uint32_t>& index);
+
+/**
+ * Save a CAGRA built index in hnswlib base-layer-only serialized format
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, filename, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          const std::string& filename,
+                          const cuvs::neighbors::cagra::index<float, uint32_t>& index);
+
+/**
+ * Write the CAGRA built index as a base layer HNSW index to an output stream
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, os, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          std::ostream& os,
+                          const cuvs::neighbors::cagra::index<int8_t, uint32_t>& index);
+
+/**
+ * Save a CAGRA built index in hnswlib base-layer-only serialized format
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, filename, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          const std::string& filename,
+                          const cuvs::neighbors::cagra::index<int8_t, uint32_t>& index);
+
+/**
+ * Write the CAGRA built index as a base layer HNSW index to an output stream
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, os, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          std::ostream& os,
+                          const cuvs::neighbors::cagra::index<uint8_t, uint32_t>& index);
+
+/**
+ * Save a CAGRA built index in hnswlib base-layer-only serialized format
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/cagra.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = cuvs::neighbors::cagra::build(...);`
+ * cuvs::neighbors::cagra::serialize_to_hnswlib(handle, filename, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index CAGRA index
+ *
+ */
+void serialize_to_hnswlib(raft::resources const& handle,
+                          const std::string& filename,
+                          const cuvs::neighbors::cagra::index<uint8_t, uint32_t>& index);
+
 /**
  * @}
  */
diff --git a/cpp/src/neighbors/cagra_serialize.cuh b/cpp/src/neighbors/cagra_serialize.cuh
index 03f128cb9..e193c0630 100644
--- a/cpp/src/neighbors/cagra_serialize.cuh
+++ b/cpp/src/neighbors/cagra_serialize.cuh
@@ -19,102 +19,52 @@
 #include "detail/cagra/cagra_serialize.cuh"
 
 namespace cuvs::neighbors::cagra {
-/**
- * Write the CAGRA built index as a base layer HNSW index to an output stream
- *
- * Experimental, both the API and the serialization format are subject to change.
- *
- * @code{.cpp}
- * #include <raft/core/resources.hpp>
- * #include <cuvs/neighbors/cagra.hpp>
- *
- * raft::resources handle;
- *
- * // create an output stream
- * std::ostream os(std::cout.rdbuf());
- * // create an index with `auto index = raft::cagra::build(...);`
- * raft::cagra::serialize_to_hnswlib(handle, os, index);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] handle the raft handle
- * @param[in] os output stream
- * @param[in] index CAGRA index
- *
- */
-template <typename T, typename IdxT>
-void serialize_to_hnswlib(raft::resources const& handle,
-                          std::ostream& os,
-                          const cuvs::neighbors::cagra::index<T, IdxT>& index)
-{
-  detail::serialize_to_hnswlib<T, IdxT>(handle, os, index);
-}
-
-/**
- * Save a CAGRA build index in hnswlib base-layer-only serialized format
- *
- * Experimental, both the API and the serialization format are subject to change.
- *
- * @code{.cpp}
- * #include <raft/core/resources.hpp>
- * #include <cuvs/neighbors/cagra.hpp>
- *
- * raft::resources handle;
- *
- * // create a string with a filepath
- * std::string filename("/path/to/index");
- * // create an index with `auto index = raft::cagra::build(...);`
- * raft::cagra::serialize_to_hnswlib(handle, filename, index);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] handle the raft handle
- * @param[in] filename the file name for saving the index
- * @param[in] index CAGRA index
- *
- */
-template <typename T, typename IdxT>
-void serialize_to_hnswlib(raft::resources const& handle,
-                          const std::string& filename,
-                          const cuvs::neighbors::cagra::index<T, IdxT>& index)
-{
-  detail::serialize_to_hnswlib<T, IdxT>(handle, filename, index);
-}
 
-#define CUVS_INST_CAGRA_SERIALIZE(DTYPE)                                                    \
-  void serialize(raft::resources const& handle,                                             \
-                 const std::string& filename,                                               \
-                 const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index,               \
-                 bool include_dataset)                                                      \
-  {                                                                                         \
-    cuvs::neighbors::cagra::detail::serialize<DTYPE, uint32_t>(                             \
-      handle, filename, index, include_dataset);                                            \
-  };                                                                                        \
-                                                                                            \
-  void deserialize(raft::resources const& handle,                                           \
-                   const std::string& filename,                                             \
-                   cuvs::neighbors::cagra::index<DTYPE, uint32_t>* index)                   \
-  {                                                                                         \
-    cuvs::neighbors::cagra::detail::deserialize<DTYPE, uint32_t>(handle, filename, index);  \
-  };                                                                                        \
-  void serialize(raft::resources const& handle,                                             \
-                 std::ostream& os,                                                          \
-                 const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index,               \
-                 bool include_dataset)                                                      \
-  {                                                                                         \
-    cuvs::neighbors::cagra::detail::serialize<DTYPE, uint32_t>(                             \
-      handle, os, index, include_dataset);                                                  \
-  }                                                                                         \
-                                                                                            \
-  void deserialize(raft::resources const& handle,                                           \
-                   std::istream& is,                                                        \
-                   cuvs::neighbors::cagra::index<DTYPE, uint32_t>* index)                   \
-  {                                                                                         \
-    cuvs::neighbors::cagra::detail::deserialize<DTYPE, uint32_t>(handle, is, index);        \
-  }
+#define CUVS_INST_CAGRA_SERIALIZE(DTYPE)                                                    \
+  void serialize(raft::resources const& handle,                                             \
+                 const std::string& filename,                                               \
+                 const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index,               \
+                 bool include_dataset)                                                      \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::serialize<DTYPE, uint32_t>(                             \
+      handle, filename, index, include_dataset);                                            \
+  };                                                                                        \
+                                                                                            \
+  void deserialize(raft::resources const& handle,                                           \
+                   const std::string& filename,                                             \
+                   cuvs::neighbors::cagra::index<DTYPE, uint32_t>* index)                   \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::deserialize<DTYPE, uint32_t>(handle, filename, index);  \
+  };                                                                                        \
+  void serialize(raft::resources const& handle,                                             \
+                 std::ostream& os,                                                          \
+                 const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index,               \
+                 bool include_dataset)                                                      \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::serialize<DTYPE, uint32_t>(                             \
+      handle, os, index, include_dataset);                                                  \
+  }                                                                                         \
+                                                                                            \
+  void deserialize(raft::resources const& handle,                                           \
+                   std::istream& is,                                                        \
+                   cuvs::neighbors::cagra::index<DTYPE, uint32_t>* index)                   \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::deserialize<DTYPE, uint32_t>(handle, is, index);        \
+  }                                                                                         \
+                                                                                            \
+  void serialize_to_hnswlib(raft::resources const& handle,                                  \
+                            std::ostream& os,                                               \
+                            const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index)    \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::serialize_to_hnswlib<DTYPE, uint32_t>(handle, os, index); \
+  }                                                                                         \
+                                                                                            \
+  void serialize_to_hnswlib(raft::resources const& handle,                                  \
+                            const std::string& filename,                                    \
+                            const cuvs::neighbors::cagra::index<DTYPE, uint32_t>& index)    \
+  {                                                                                         \
+    cuvs::neighbors::cagra::detail::serialize_to_hnswlib<DTYPE, uint32_t>(                  \
+      handle, filename, index);                                                             \
+  }
 
 }  // namespace cuvs::neighbors::cagra
diff --git a/dependencies.yaml b/dependencies.yaml
index 22325d5df..689c98417 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -324,6 +324,7 @@ dependencies:
       - output_types: [conda]
         packages:
           - *cmake_ver
+          - openblas
   docs:
     common:
      - output_types: [conda]
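
----------------------------------------------------------------------
Reviewer note (not part of the patch): the pieces above compose into a
GPU-build / CPU-search workflow -- build a CAGRA index on the GPU, persist it
with the new serialize_to_hnswlib API, then load and search it on the CPU with
the patched hnswlib using its new base_layer_only mode. A minimal C++ sketch of
that flow follows; the file path, dataset shape, and helper name are
hypothetical, and the hnswlib constructor used is the stock load-from-file
overload.

    #include <cuvs/neighbors/cagra.hpp>
    #include <raft/core/device_mdspan.hpp>
    #include <raft/core/resources.hpp>

    #include <hnswlib/hnswlib.h>

    void build_then_search_on_cpu(
      raft::resources const& handle,
      raft::device_matrix_view<const float, int64_t> dataset)
    {
      // GPU: build the CAGRA graph.
      cuvs::neighbors::cagra::index_params params;
      auto index = cuvs::neighbors::cagra::build(handle, params, dataset);

      // Persist it as an hnswlib-compatible, base-layer-only index.
      cuvs::neighbors::cagra::serialize_to_hnswlib(handle, "/tmp/cagra_hnsw.bin", index);

      // CPU: load with the patched hnswlib. base_layer_only makes searchKnn
      // seed the search from num_seeds points spread evenly over the data
      // instead of descending the (absent) upper HNSW layers.
      hnswlib::L2Space space(static_cast<size_t>(dataset.extent(1)));
      hnswlib::HierarchicalNSW<float> hnsw(&space, "/tmp/cagra_hnsw.bin");
      hnsw.base_layer_only = true;
      hnsw.num_seeds = 32;  // the patch suggests 48 for 100M-scale datasets
    }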