From ff133d4de2c552b54f4639eca19e9a0a958f41bb Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <hendriksen.a.a@gmail.com>
Date: Fri, 2 Sep 2022 14:04:55 +0200
Subject: [PATCH] Complete the deprecation of duplicated hpp headers (#793)

Replace every .hpp header that has a .cuh header of the same name in the same
directory with a stub that emits a deprecation pragma message and then simply
includes the .cuh header.

This change hopefully prevents future head scratching when changes in a
file are seemingly not picked up by the compiler.

Care has been taken to copy the right start year for the copyright line.
Copyright lines have been updated to 2022 when necessary.

The following template has been used for the .hpp header replacement
text:


```
/*
 * %%COPYRIGHT_LINE%%
 *
 * [... snip license .. ]
 */
/**
 * This file is deprecated and will be removed in release 22.06.
 * Please use the cuh version instead.
 */

/**
 * DISCLAIMER: this file is deprecated: use %%CUH_FILE%% instead
 */
#pragma once

#pragma message(__FILE__                                                  \
                " is deprecated and will be removed in a future release." \
                " Please use the cuh version instead.")
```

Authors:
  - Allard Hendriksen (https://github.com/ahendriksen)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/793
---
 BUILD.md                                      |   4 +-
 cpp/bench/distance/distance_common.cuh        |   4 +-
 cpp/bench/linalg/add.cu                       |   2 +-
 cpp/bench/linalg/map_then_reduce.cu           |   2 +-
 cpp/bench/linalg/matrix_vector_op.cu          |   2 +-
 cpp/bench/linalg/reduce.cu                    |   2 +-
 cpp/bench/random/make_blobs.cu                |   2 +-
 cpp/bench/random/permute.cu                   |   2 +-
 cpp/bench/spatial/fused_l2_nn.cu              |   6 +-
 cpp/bench/spatial/selection.cu                |   2 +-
 cpp/include/raft/distance/fused_l2_nn.hpp     | 103 +------
 cpp/include/raft/distance/specializations.hpp |  11 +-
 cpp/include/raft/label/classlabels.hpp        | 109 +------
 cpp/include/raft/label/merge_labels.hpp       |  62 +---
 cpp/include/raft/lap/lap.hpp                  | 285 +-----------------
 cpp/include/raft/linalg/add.hpp               |  76 +----
 cpp/include/raft/linalg/axpy.hpp              |  43 +--
 cpp/include/raft/linalg/binary_op.hpp         |  46 +--
 .../raft/linalg/cholesky_r1_update.hpp        | 126 +-------
 .../raft/linalg/coalesced_reduction.hpp       |  64 +---
 cpp/include/raft/linalg/contractions.hpp      | 197 +-----------
 cpp/include/raft/linalg/divide.hpp            |  37 +--
 cpp/include/raft/linalg/eig.hpp               | 106 +------
 cpp/include/raft/linalg/eltwise.hpp           |  92 +-----
 cpp/include/raft/linalg/gemm.hpp              | 165 +---------
 cpp/include/raft/linalg/gemv.hpp              | 198 +-----------
 cpp/include/raft/linalg/init.hpp              |  46 +--
 cpp/include/raft/linalg/lanczos.hpp           | 148 +--------
 cpp/include/raft/linalg/lstsq.hpp             | 109 +------
 cpp/include/raft/linalg/map_then_reduce.hpp   |  77 +----
 cpp/include/raft/linalg/matrix_vector_op.hpp  |  91 +-----
 .../raft/linalg/mean_squared_error.hpp        |  35 +--
 cpp/include/raft/linalg/multiply.hpp          |  35 +--
 cpp/include/raft/linalg/norm.hpp              |  80 +----
 cpp/include/raft/linalg/power.hpp             |  55 +---
 cpp/include/raft/linalg/reduce.hpp            |  69 +----
 .../raft/linalg/reduce_cols_by_key.hpp        |  45 +--
 .../raft/linalg/reduce_rows_by_key.hpp        | 100 +-----
 cpp/include/raft/linalg/rsvd.hpp              | 129 +-------
 cpp/include/raft/linalg/sqrt.hpp              |  36 +--
 cpp/include/raft/linalg/strided_reduction.hpp |  64 +---
 cpp/include/raft/linalg/subtract.hpp          |  75 +----
 cpp/include/raft/linalg/svd.hpp               | 174 +----------
 cpp/include/raft/linalg/ternary_op.hpp        |  42 +--
 cpp/include/raft/linalg/transpose.hpp         |  47 +--
 cpp/include/raft/linalg/unary_op.hpp          |  63 +---
 cpp/include/raft/matrix/col_wise_sort.hpp     |  44 +--
 cpp/include/raft/matrix/matrix.hpp            | 263 +---------------
 cpp/include/raft/random/make_regression.hpp   |  93 +-----
 .../raft/random/multi_variable_gaussian.hpp   |  51 +---
 cpp/include/raft/random/permute.hpp           |  50 +--
 cpp/include/raft/sparse/linalg/add.hpp        |  85 +-----
 cpp/include/raft/sparse/linalg/degree.hpp     | 109 +------
 cpp/include/raft/sparse/linalg/norm.hpp       |  59 +---
 cpp/include/raft/sparse/linalg/spectral.hpp   |  33 +-
 cpp/include/raft/sparse/linalg/transpose.hpp  |  62 +---
 cpp/include/raft/sparse/op/filter.hpp         |  80 +----
 cpp/include/raft/sparse/op/reduce.hpp         |  75 +----
 cpp/include/raft/sparse/op/row_op.hpp         |  37 +--
 cpp/include/raft/sparse/op/slice.hpp          |  67 +---
 cpp/include/raft/sparse/op/sort.hpp           |  64 +---
 .../sparse/selection/connect_components.hpp   |  68 +----
 cpp/include/raft/sparse/selection/knn.hpp     |  90 +-----
 .../raft/sparse/selection/knn_graph.hpp       |  51 +---
 .../knn/detail/ann_kmeans_balanced.cuh        |   2 +-
 .../raft/spatial/knn/detail/ann_utils.cuh     |   2 +-
 .../raft/spatial/knn/epsilon_neighborhood.hpp |  51 +---
 .../raft/spatial/knn/specializations.hpp      |  13 +-
 cpp/include/raft/spectral/eigen_solvers.hpp   |  95 +-----
 cpp/include/raft/stats/accuracy.hpp           |  32 +-
 .../raft/stats/adjusted_rand_index.hpp        |  39 +--
 cpp/include/raft/stats/contingency_matrix.hpp |  91 +-----
 cpp/include/raft/stats/cov.hpp                |  50 +--
 .../raft/stats/detail/weighted_mean.cuh       |   4 +-
 cpp/include/raft/stats/dispersion.hpp         |  48 +--
 cpp/include/raft/stats/entropy.hpp            |  37 +--
 cpp/include/raft/stats/histogram.hpp          |  54 +---
 cpp/include/raft/stats/homogeneity_score.hpp  |  41 +--
 .../raft/stats/information_criterion.hpp      |  54 +---
 cpp/include/raft/stats/kl_divergence.hpp      |  34 +--
 cpp/include/raft/stats/mean.hpp               |  43 +--
 cpp/include/raft/stats/mean_center.hpp        |  69 +----
 cpp/include/raft/stats/meanvar.hpp            |  48 +--
 cpp/include/raft/stats/minmax.hpp             |  62 +---
 cpp/include/raft/stats/mutual_info_score.hpp  |  39 +--
 cpp/include/raft/stats/r2_score.hpp           |  38 +--
 cpp/include/raft/stats/rand_index.hpp         |  31 +-
 cpp/include/raft/stats/regression_metrics.hpp |  43 +--
 cpp/include/raft/stats/silhouette_score.hpp   |  65 +---
 cpp/include/raft/stats/specializations.hpp    |  12 +-
 cpp/include/raft/stats/stddev.hpp             |  79 +----
 cpp/include/raft/stats/sum.hpp                |  39 +--
 .../raft/stats/trustworthiness_score.hpp      |  41 +--
 cpp/include/raft/stats/v_measure.hpp          |  40 +--
 cpp/include/raft/stats/weighted_mean.hpp      |  82 +----
 cpp/test/spatial/ball_cover.cu                |   4 +-
 96 files changed, 567 insertions(+), 5464 deletions(-)

diff --git a/BUILD.md b/BUILD.md
index c4d8b1b356..3c6ad2bf20 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -205,8 +205,8 @@ The pre-compiled libraries contain template specializations for commonly used ty
 
 The following example tells the compiler to ignore the pre-compiled templates for the `libraft-distance` API so any symbols already compiled into pre-compiled shared library will be used instead:
 ```c++
-#include <raft/distance/distance.hpp>
-#include <raft/distance/specializations.hpp>
+#include <raft/distance/distance.cuh>
+#include <raft/distance/specializations.cuh>
 ```
 
 ### <a id="build_cxx_source"></a>Building RAFT C++ from source in cmake
diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/distance/distance_common.cuh
index dae2550326..4f1a8ccab1 100644
--- a/cpp/bench/distance/distance_common.cuh
+++ b/cpp/bench/distance/distance_common.cuh
@@ -16,9 +16,9 @@
 
 #include <common/benchmark.hpp>
 #include <raft/cudart_utils.h>
-#include <raft/distance/distance.hpp>
+#include <raft/distance/distance.cuh>
 #if defined RAFT_DISTANCE_COMPILED
-#include <raft/distance/specializations.hpp>
+#include <raft/distance/specializations.cuh>
 #endif
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/bench/linalg/add.cu b/cpp/bench/linalg/add.cu
index 7c651b61ed..7d00b8cbae 100644
--- a/cpp/bench/linalg/add.cu
+++ b/cpp/bench/linalg/add.cu
@@ -15,7 +15,7 @@
  */
 
 #include <common/benchmark.hpp>
-#include <raft/linalg/add.hpp>
+#include <raft/linalg/add.cuh>
 #include <rmm/device_uvector.hpp>
 
 namespace raft::bench::linalg {
diff --git a/cpp/bench/linalg/map_then_reduce.cu b/cpp/bench/linalg/map_then_reduce.cu
index 7eeb4a79b6..33a3e66264 100644
--- a/cpp/bench/linalg/map_then_reduce.cu
+++ b/cpp/bench/linalg/map_then_reduce.cu
@@ -15,7 +15,7 @@
  */
 
 #include <common/benchmark.hpp>
-#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/map_then_reduce.cuh>
 #include <rmm/device_uvector.hpp>
 
 namespace raft::bench::linalg {
diff --git a/cpp/bench/linalg/matrix_vector_op.cu b/cpp/bench/linalg/matrix_vector_op.cu
index d3a53ea345..aa8f2667ed 100644
--- a/cpp/bench/linalg/matrix_vector_op.cu
+++ b/cpp/bench/linalg/matrix_vector_op.cu
@@ -15,7 +15,7 @@
  */
 
 #include <common/benchmark.hpp>
-#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/matrix_vector_op.cuh>
 #include <rmm/device_uvector.hpp>
 
 namespace raft::bench::linalg {
diff --git a/cpp/bench/linalg/reduce.cu b/cpp/bench/linalg/reduce.cu
index 018086a689..015e0b8abe 100644
--- a/cpp/bench/linalg/reduce.cu
+++ b/cpp/bench/linalg/reduce.cu
@@ -15,7 +15,7 @@
  */
 
 #include <common/benchmark.hpp>
-#include <raft/linalg/reduce.hpp>
+#include <raft/linalg/reduce.cuh>
 
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/bench/random/make_blobs.cu b/cpp/bench/random/make_blobs.cu
index c449223040..fdd4ef61d2 100644
--- a/cpp/bench/random/make_blobs.cu
+++ b/cpp/bench/random/make_blobs.cu
@@ -15,7 +15,7 @@
  */
 
 #include <common/benchmark.hpp>
-#include <raft/random/make_blobs.hpp>
+#include <raft/random/make_blobs.cuh>
 #include <rmm/device_uvector.hpp>
 #include <vector>
 
diff --git a/cpp/bench/random/permute.cu b/cpp/bench/random/permute.cu
index a72eca3f87..5364bb44e3 100644
--- a/cpp/bench/random/permute.cu
+++ b/cpp/bench/random/permute.cu
@@ -16,7 +16,7 @@
 
 #include <common/benchmark.hpp>
 #include <raft/cudart_utils.h>
-#include <raft/random/permute.hpp>
+#include <raft/random/permute.cuh>
 #include <raft/random/rng.cuh>
 
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/bench/spatial/fused_l2_nn.cu b/cpp/bench/spatial/fused_l2_nn.cu
index dc3b507fbf..e5b5dc377a 100644
--- a/cpp/bench/spatial/fused_l2_nn.cu
+++ b/cpp/bench/spatial/fused_l2_nn.cu
@@ -17,13 +17,13 @@
 #include <common/benchmark.hpp>
 #include <limits>
 #include <raft/cudart_utils.h>
-#include <raft/distance/fused_l2_nn.hpp>
+#include <raft/distance/fused_l2_nn.cuh>
 #include <raft/handle.hpp>
-#include <raft/linalg/norm.hpp>
+#include <raft/linalg/norm.cuh>
 #include <raft/random/rng.cuh>
 
 #if defined RAFT_NN_COMPILED
-#include <raft/spatial/knn/specializations.hpp>
+#include <raft/spatial/knn/specializations.cuh>
 #endif
 
 namespace raft::bench::spatial {
diff --git a/cpp/bench/spatial/selection.cu b/cpp/bench/spatial/selection.cu
index c3a2bc6d3d..1f116c199f 100644
--- a/cpp/bench/spatial/selection.cu
+++ b/cpp/bench/spatial/selection.cu
@@ -18,7 +18,7 @@
 #include <raft/spatial/knn/knn.cuh>
 
 #if defined RAFT_NN_COMPILED
-#include <raft/spatial/knn/specializations.hpp>
+#include <raft/spatial/knn/specializations.cuh>
 #endif
 
 #include <raft/random/rng.cuh>
diff --git a/cpp/include/raft/distance/fused_l2_nn.hpp b/cpp/include/raft/distance/fused_l2_nn.hpp
index 768e33b3a7..74ad0974f4 100644
--- a/cpp/include/raft/distance/fused_l2_nn.hpp
+++ b/cpp/include/raft/distance/fused_l2_nn.hpp
@@ -18,105 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __FUSED_L2_NN_H
-#define __FUSED_L2_NN_H
-
-#pragma once
-
-#include <cub/cub.cuh>
-#include <limits>
-#include <raft/cuda_utils.cuh>
-#include <raft/distance/detail/fused_l2_nn.cuh>
-#include <raft/handle.hpp>
-#include <stdint.h>
-
-namespace raft {
-namespace distance {
-
-template <typename LabelT, typename DataT>
-using KVPMinReduce = detail::KVPMinReduceImpl<LabelT, DataT>;
-
-template <typename LabelT, typename DataT>
-using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl<LabelT, DataT>;
-
-template <typename LabelT, typename DataT>
-using MinReduceOp = detail::MinReduceOpImpl<LabelT, DataT>;
-
 /**
- * Initialize array using init value from reduction op
+ * DISCLAIMER: this file is deprecated: use fused_l2_nn.cuh instead
  */
-template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
-{
-  detail::initialize<DataT, OutT, IdxT, ReduceOpT>(min, m, maxVal, redOp, handle.get_stream());
-}
 
-/**
- * @brief Fused L2 distance and 1-nearest-neighbor computation in a single call.
- *
- * The benefits of such a call are 2-fold: 1) eliminate the need for an
- * intermediate buffer to store the output of gemm 2) reduce the memory read
- * traffic on this intermediate buffer, otherwise needed during the reduction
- * phase for 1-NN.
- *
- * @tparam DataT     data type
- * @tparam OutT      output type to either store 1-NN indices and their minimum
- *                   distances or store only the min distances. Accordingly, one
- *                   has to pass an appropriate `ReduceOpT`
- * @tparam IdxT      indexing arithmetic type
- * @tparam ReduceOpT A struct to perform the final needed reduction operation
- *                   and also to initialize the output array elements with the
- *                   appropriate initial value needed for reduction.
- *
- * @param[out] min           will contain the reduced output (Length = `m`)
- *                           (on device)
- * @param[in]  x             first matrix. Row major. Dim = `m x k`.
- *                           (on device).
- * @param[in]  y             second matrix. Row major. Dim = `n x k`.
- *                           (on device).
- * @param[in]  xn            L2 squared norm of `x`. Length = `m`. (on device).
- * @param[in]  yn            L2 squared norm of `y`. Length = `n`. (on device)
- * @param[in]  m             gemm m
- * @param[in]  n             gemm n
- * @param[in]  k             gemm k
- * @param[in]  workspace     temp workspace. Size = sizeof(int)*m. (on device)
- * @param[in]  redOp         reduction operator in the epilogue
- * @param[in] pairRedOp reduction operation on key value pairs
- * @param[in]  sqrt          Whether the output `minDist` should contain L2-sqrt
- * @param[in]  initOutBuffer whether to initialize the output buffer before the
- *                           main kernel launch
- * @param[in]  stream        cuda stream
- */
-template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT, typename KVPReduceOpT>
-void fusedL2NN(OutT* min,
-               const DataT* x,
-               const DataT* y,
-               const DataT* xn,
-               const DataT* yn,
-               IdxT m,
-               IdxT n,
-               IdxT k,
-               void* workspace,
-               ReduceOpT redOp,
-               KVPReduceOpT pairRedOp,
-               bool sqrt,
-               bool initOutBuffer,
-               cudaStream_t stream)
-{
-  size_t bytes = sizeof(DataT) * k;
-  if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
-    detail::fusedL2NNImpl<DataT, OutT, IdxT, 16 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
-    detail::fusedL2NNImpl<DataT, OutT, IdxT, 8 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
-  } else {
-    detail::fusedL2NNImpl<DataT, OutT, IdxT, 1, ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
-  }
-}
+#pragma once
 
-}  // namespace distance
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "fused_l2_nn.cuh"
diff --git a/cpp/include/raft/distance/specializations.hpp b/cpp/include/raft/distance/specializations.hpp
index 641968d9f1..04afb73036 100644
--- a/cpp/include/raft/distance/specializations.hpp
+++ b/cpp/include/raft/distance/specializations.hpp
@@ -18,11 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __DISTANCE_SPECIALIZATIONS_H
-#define __DISTANCE_SPECIALIZATIONS_H
+/**
+ * DISCLAIMER: this file is deprecated: use specializations.cuh instead
+ */
 
 #pragma once
 
-#include <raft/distance/specializations/distance.cuh>
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "specializations.cuh"
diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
index 189c26f69f..4f47b426c0 100644
--- a/cpp/include/raft/label/classlabels.hpp
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -13,110 +13,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#ifndef __CLASS_LABELS_H
-#define __CLASS_LABELS_H
-
-#pragma once
-
-#include <raft/label/detail/classlabels.cuh>
-
-namespace raft {
-namespace label {
-
 /**
- * Get unique class labels.
- *
- * The y array is assumed to store class labels. The unique values are selected
- * from this array.
- *
- * @tparam value_t numeric type of the arrays with class labels
- * @param [inout] unique output unique labels
- * @param [in] y device array of labels, size [n]
- * @param [in] n number of labels
- * @param [in] stream cuda stream
- * @returns unique device array of unique labels, unallocated on entry,
- *   on exit it has size
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
  */
-template <typename value_t>
-int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n, cudaStream_t stream)
-{
-  return detail::getUniquelabels<value_t>(unique, y, n, stream);
-}
 
 /**
- * Assign one versus rest labels.
- *
- * The output labels will have values +/-1:
- * y_out = (y == y_unique[idx]) ? +1 : -1;
- *
- * The output type currently is set to value_t, but for SVM in principle we are
- * free to choose other type for y_out (it should represent +/-1, and it is used
- * in floating point arithmetics).
- *
- * @param [in] y device array if input labels, size [n]
- * @param [in] n number of labels
- * @param [in] y_unique device array of unique labels, size [n_classes]
- * @param [in] n_classes number of unique labels
- * @param [out] y_out device array of output labels
- * @param [in] idx index of unique label that should be labeled as 1
- * @param [in] stream cuda stream
- */
-template <typename value_t>
-void getOvrlabels(
-  value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream)
-{
-  detail::getOvrlabels<value_t>(y, n, y_unique, n_classes, y_out, idx, stream);
-}
-/**
- * Maps an input array containing a series of numbers into a new array
- * where numbers have been mapped to a monotonically increasing set
- * of labels. This can be useful in machine learning algorithms, for instance,
- * where a given set of labels is not taken from a monotonically increasing
- * set. This can happen if they are filtered or if only a subset of the
- * total labels are used in a dataset. This is also useful in graph algorithms
- * where a set of vertices need to be labeled in a monotonically increasing
- * order.
- * @tparam Type the numeric type of the input and output arrays
- * @tparam Lambda the type of an optional filter function, which determines
- * which items in the array to map.
- * @param[out] out the output monotonic array
- * @param[in] in input label array
- * @param[in] N number of elements in the input array
- * @param[in] stream cuda stream to use
- * @param[in] filter_op an optional function for specifying which values
- * should have monotonically increasing labels applied to them.
- * @param[in] zero_based force monotonic set to start at 0?
+ * DISCLAIMER: this file is deprecated: use classlabels.cuh instead
  */
-template <typename Type, typename Lambda>
-void make_monotonic(
-  Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false)
-{
-  detail::make_monotonic<Type, Lambda>(out, in, N, stream, filter_op, zero_based);
-}
 
-/**
- * Maps an input array containing a series of numbers into a new array
- * where numbers have been mapped to a monotonically increasing set
- * of labels. This can be useful in machine learning algorithms, for instance,
- * where a given set of labels is not taken from a monotonically increasing
- * set. This can happen if they are filtered or if only a subset of the
- * total labels are used in a dataset. This is also useful in graph algorithms
- * where a set of vertices need to be labeled in a monotonically increasing
- * order.
- * @tparam Type the numeric type of the input and output arrays
- * @param[out] out output label array with labels assigned monotonically
- * @param[in] in input label array
- * @param[in] N number of elements in the input array
- * @param[in] stream cuda stream to use
- * @param[in] zero_based force monotonic label set to start at 0?
- */
-template <typename Type>
-void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false)
-{
-  detail::make_monotonic<Type>(out, in, N, stream, zero_based);
-}
-};  // namespace label
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "classlabels.cuh"
diff --git a/cpp/include/raft/label/merge_labels.hpp b/cpp/include/raft/label/merge_labels.hpp
index 2bf2fa830b..7c0c25d038 100644
--- a/cpp/include/raft/label/merge_labels.hpp
+++ b/cpp/include/raft/label/merge_labels.hpp
@@ -13,59 +13,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#ifndef __MERGE_LABELS_H
-#define __MERGE_LABELS_H
-
-#pragma once
-
-#include <raft/label/detail/merge_labels.cuh>
-
-namespace raft {
-namespace label {
+/**
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */
 
 /**
- * @brief Merge two labellings in-place, according to a core mask
- *
- * A labelling is a representation of disjoint sets (groups) where points that
- * belong to the same group have the same label. It is assumed that group
- * labels take values between 1 and N. labels relate to points, i.e a label i+1
- * means that you belong to the same group as the point i.
- * The special value MAX_LABEL is used to mark points that are not labelled.
- *
- * The two label arrays A and B induce two sets of groups over points 0..N-1.
- * If a point is labelled i in A and j in B and the mask is true for this
- * point, then i and j are equivalent labels and their groups are merged by
- * relabeling the elements of both groups to have the same label. The new label
- * is the smaller one from the original labels.
- * It is required that if the mask is true for a point, this point is labelled
- * (i.e its label is different than the special value MAX_LABEL).
- *
- * One use case is finding connected components: the two input label arrays can
- * represent the connected components of graphs G_A and G_B, and the output
- * would be the connected components labels of G_A \union G_B.
- *
- * @param[inout] labels_a    First input, and output label array (in-place)
- * @param[in]    labels_b    Second input label array
- * @param[in]    mask        Core point mask
- * @param[out]   R           label equivalence map
- * @param[in]    m           Working flag
- * @param[in]    N           Number of points in the dataset
- * @param[in]    stream      CUDA stream
+ * DISCLAIMER: this file is deprecated: use merge_labels.cuh instead
  */
-template <typename value_idx = int, int TPB_X = 256>
-void merge_labels(value_idx* labels_a,
-                  const value_idx* labels_b,
-                  const bool* mask,
-                  value_idx* R,
-                  bool* m,
-                  value_idx N,
-                  cudaStream_t stream)
-{
-  detail::merge_labels<value_idx, TPB_X>(labels_a, labels_b, mask, R, m, N, stream);
-}
 
-};  // namespace label
-};  // namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "merge_labels.cuh"
diff --git a/cpp/include/raft/lap/lap.hpp b/cpp/include/raft/lap/lap.hpp
index a9f205932c..badafb8afd 100644
--- a/cpp/include/raft/lap/lap.hpp
+++ b/cpp/include/raft/lap/lap.hpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- * Copyright 2020 KETAN DATE & RAKESH NAGI
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -12,289 +11,21 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License.+
- *
- *      CUDA Implementation of O(n^3) alternating tree Hungarian Algorithm
- *      Authors: Ketan Date and Rakesh Nagi
- *
- *      Article reference:
- *          Date, Ketan, and Rakesh Nagi. "GPU-accelerated Hungarian algorithms
- *          for the Linear Assignment Problem." Parallel Computing 57 (2016): 52-72.
- *
+ * limitations under the License.
  */
-
 /**
  * This file is deprecated and will be removed in release 22.06.
  * Please use the cuh version instead.
  */
 
-#ifndef __LAP_H
-#define __LAP_H
+/**
+ * DISCLAIMER: this file is deprecated: use lap.cuh instead
+ */
 
 #pragma once
 
-#include <raft/handle.hpp>
-#include <rmm/device_uvector.hpp>
-
-#include <thrust/execution_policy.h>
-#include <thrust/fill.h>
-
-#include "detail/d_structs.h"
-#include "detail/lap_functions.cuh"
-
-namespace raft {
-namespace lap {
-
-template <typename vertex_t, typename weight_t>
-class LinearAssignmentProblem {
-  vertex_t size_;
-  vertex_t batchsize_;
-  weight_t epsilon_;
-
-  weight_t const* d_costs_;
-
-  Vertices<vertex_t, weight_t> d_vertices_dev;
-  VertexData<vertex_t> d_row_data_dev, d_col_data_dev;
-
-  raft::handle_t const& handle_;
-  rmm::device_uvector<int> row_covers_v;
-  rmm::device_uvector<int> col_covers_v;
-  rmm::device_uvector<weight_t> row_duals_v;
-  rmm::device_uvector<weight_t> col_duals_v;
-  rmm::device_uvector<weight_t> col_slacks_v;
-  rmm::device_uvector<int> row_is_visited_v;
-  rmm::device_uvector<int> col_is_visited_v;
-  rmm::device_uvector<vertex_t> row_parents_v;
-  rmm::device_uvector<vertex_t> col_parents_v;
-  rmm::device_uvector<vertex_t> row_children_v;
-  rmm::device_uvector<vertex_t> col_children_v;
-  rmm::device_uvector<weight_t> obj_val_primal_v;
-  rmm::device_uvector<weight_t> obj_val_dual_v;
-
- public:
-  LinearAssignmentProblem(raft::handle_t const& handle,
-                          vertex_t size,
-                          vertex_t batchsize,
-                          weight_t epsilon)
-    : handle_(handle),
-      size_(size),
-      batchsize_(batchsize),
-      epsilon_(epsilon),
-      d_costs_(nullptr),
-      row_covers_v(0, handle_.get_stream()),
-      col_covers_v(0, handle_.get_stream()),
-      row_duals_v(0, handle_.get_stream()),
-      col_duals_v(0, handle_.get_stream()),
-      col_slacks_v(0, handle_.get_stream()),
-      row_is_visited_v(0, handle_.get_stream()),
-      col_is_visited_v(0, handle_.get_stream()),
-      row_parents_v(0, handle_.get_stream()),
-      col_parents_v(0, handle_.get_stream()),
-      row_children_v(0, handle_.get_stream()),
-      col_children_v(0, handle_.get_stream()),
-      obj_val_primal_v(0, handle_.get_stream()),
-      obj_val_dual_v(0, handle_.get_stream())
-  {
-  }
-
-  // Executes Hungarian algorithm on the input cost matrix.
-  void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment)
-  {
-    initializeDevice();
-
-    d_vertices_dev.row_assignments = d_row_assignment;
-    d_vertices_dev.col_assignments = d_col_assignment;
-
-    d_costs_ = d_cost_matrix;
-
-    int step = 0;
-
-    while (step != 100) {
-      switch (step) {
-        case 0: step = hungarianStep0(); break;
-        case 1: step = hungarianStep1(); break;
-        case 2: step = hungarianStep2(); break;
-        case 3: step = hungarianStep3(); break;
-        case 4: step = hungarianStep4(); break;
-        case 5: step = hungarianStep5(); break;
-        case 6: step = hungarianStep6(); break;
-      }
-    }
-
-    d_costs_ = nullptr;
-  }
-
-  // Function for getting optimal row dual vector for subproblem spId.
-  std::pair<const weight_t*, vertex_t> getRowDualVector(int spId) const
-  {
-    return std::make_pair(row_duals_v.data() + spId * size_, size_);
-  }
-
-  // Function for getting optimal col dual vector for subproblem spId.
-  std::pair<const weight_t*, vertex_t> getColDualVector(int spId)
-  {
-    return std::make_pair(col_duals_v.data() + spId * size_, size_);
-  }
-
-  // Function for getting optimal primal objective value for subproblem spId.
-  weight_t getPrimalObjectiveValue(int spId)
-  {
-    weight_t result;
-    raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream());
-    CHECK_CUDA(handle_.get_stream());
-    return result;
-  }
-
-  // Function for getting optimal dual objective value for subproblem spId.
-  weight_t getDualObjectiveValue(int spId)
-  {
-    weight_t result;
-    raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream());
-    CHECK_CUDA(handle_.get_stream());
-    return result;
-  }
-
- private:
-  // Helper function for initializing global variables and arrays on a single host.
-  void initializeDevice()
-  {
-    cudaStream_t stream = handle_.get_stream();
-    row_covers_v.resize(batchsize_ * size_, stream);
-    col_covers_v.resize(batchsize_ * size_, stream);
-    row_duals_v.resize(batchsize_ * size_, stream);
-    col_duals_v.resize(batchsize_ * size_, stream);
-    col_slacks_v.resize(batchsize_ * size_, stream);
-    row_is_visited_v.resize(batchsize_ * size_, stream);
-    col_is_visited_v.resize(batchsize_ * size_, stream);
-    row_parents_v.resize(batchsize_ * size_, stream);
-    col_parents_v.resize(batchsize_ * size_, stream);
-    row_children_v.resize(batchsize_ * size_, stream);
-    col_children_v.resize(batchsize_ * size_, stream);
-    obj_val_primal_v.resize(batchsize_, stream);
-    obj_val_dual_v.resize(batchsize_, stream);
-
-    d_vertices_dev.row_covers = row_covers_v.data();
-    d_vertices_dev.col_covers = col_covers_v.data();
-
-    d_vertices_dev.row_duals  = row_duals_v.data();
-    d_vertices_dev.col_duals  = col_duals_v.data();
-    d_vertices_dev.col_slacks = col_slacks_v.data();
-
-    d_row_data_dev.is_visited = row_is_visited_v.data();
-    d_col_data_dev.is_visited = col_is_visited_v.data();
-    d_row_data_dev.parents    = row_parents_v.data();
-    d_row_data_dev.children   = row_children_v.data();
-    d_col_data_dev.parents    = col_parents_v.data();
-    d_col_data_dev.children   = col_children_v.data();
-
-    thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0});
-    thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0});
-    thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0});
-    thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0});
-  }
-
-  // Function for calculating initial zeros by subtracting row and column minima from each element.
-  int hungarianStep0()
-  {
-    detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_);
-
-    return 1;
-  }
-
-  // Function for calculating initial zeros by subtracting row and column minima from each element.
-  int hungarianStep1()
-  {
-    detail::computeInitialAssignments(
-      handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_);
-
-    int next = 2;
-
-    while (true) {
-      if ((next = hungarianStep2()) == 6) break;
-
-      if ((next = hungarianStep3()) == 5) break;
-
-      hungarianStep4();
-    }
-
-    return next;
-  }
-
-  // Function for checking optimality and constructing predicates and covers.
-  int hungarianStep2()
-  {
-    int cover_count = detail::computeRowCovers(
-      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_);
-
-    int next = (cover_count == batchsize_ * size_) ? 6 : 3;
-
-    return next;
-  }
-
-  // Function for building alternating tree rooted at unassigned rows.
-  int hungarianStep3()
-  {
-    int next;
-
-    rmm::device_scalar<bool> flag_v(handle_.get_stream());
-
-    bool h_flag = false;
-    flag_v.set_value_async(h_flag, handle_.get_stream());
-
-    detail::executeZeroCover(handle_,
-                             d_costs_,
-                             d_vertices_dev,
-                             d_row_data_dev,
-                             d_col_data_dev,
-                             flag_v.data(),
-                             batchsize_,
-                             size_,
-                             epsilon_);
-
-    h_flag = flag_v.value(handle_.get_stream());
-
-    next = h_flag ? 4 : 5;
-
-    return next;
-  }
-
-  // Function for augmenting the solution along multiple node-disjoint alternating trees.
-  int hungarianStep4()
-  {
-    detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_);
-
-    detail::augmentationPass(
-      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_);
-
-    return 2;
-  }
-
-  // Function for updating dual solution to introduce new zero-cost arcs.
-  int hungarianStep5()
-  {
-    detail::dualUpdate(
-      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_);
-
-    return 3;
-  }
-
-  // Function for calculating primal and dual objective values at optimality.
-  int hungarianStep6()
-  {
-    detail::calcObjValPrimal(handle_,
-                             obj_val_primal_v.data(),
-                             d_costs_,
-                             d_vertices_dev.row_assignments,
-                             batchsize_,
-                             size_);
-
-    detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_);
-
-    return 100;
-  }
-};
-
-}  // namespace lap
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "lap.cuh"
diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp
index a80398fcad..e7f9610892 100644
--- a/cpp/include/raft/linalg/add.hpp
+++ b/cpp/include/raft/linalg/add.hpp
@@ -18,78 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __ADD_H
-#define __ADD_H
-
-#pragma once
-
-#include "detail/add.cuh"
-
-namespace raft {
-namespace linalg {
-
-using detail::adds_scalar;
-
-/**
- * @brief Elementwise scalar add operation on the input buffer
- *
- * @tparam InT     input data-type. Also the data-type upon which the math ops
- *                 will be performed
- * @tparam OutT    output data-type
- * @tparam IdxType Integer type used to for addressing
- *
- * @param out    the output buffer
- * @param in     the input buffer
- * @param scalar the scalar used in the operations
- * @param len    number of elements in the input buffer
- * @param stream cuda stream where to launch work
- */
-template <typename InT, typename OutT = InT, typename IdxType = int>
-void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
-{
-  detail::addScalar(out, in, scalar, len, stream);
-}
-
 /**
- * @brief Elementwise add operation on the input buffers
- * @tparam InT     input data-type. Also the data-type upon which the math ops
- *                 will be performed
- * @tparam OutT    output data-type
- * @tparam IdxType Integer type used to for addressing
- *
- * @param out    the output buffer
- * @param in1    the first input buffer
- * @param in2    the second input buffer
- * @param len    number of elements in the input buffers
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use add.cuh instead
  */
-template <typename InT, typename OutT = InT, typename IdxType = int>
-void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
-{
-  detail::add(out, in1, in2, len, stream);
-}
 
-/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
- * write result to outDev[i]
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param outDev the output buffer
- * @param inDev the input buffer
- * @param singleScalarDev pointer to the scalar located in device memory
- * @param len number of elements in the input and output buffer
- * @param stream cuda stream
- */
-template <typename math_t, typename IdxType = int>
-void addDevScalar(math_t* outDev,
-                  const math_t* inDev,
-                  const math_t* singleScalarDev,
-                  IdxType len,
-                  cudaStream_t stream)
-{
-  detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream);
-}
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "add.cuh"
diff --git a/cpp/include/raft/linalg/axpy.hpp b/cpp/include/raft/linalg/axpy.hpp
index c227ba66c8..8db4c5a6e8 100644
--- a/cpp/include/raft/linalg/axpy.hpp
+++ b/cpp/include/raft/linalg/axpy.hpp
@@ -18,43 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __AXPY_H
-#define __AXPY_H
-
-#pragma once
-
-#include "detail/axpy.cuh"
-
-namespace raft::linalg {
-
 /**
- * @brief the wrapper of cublas axpy function
- *  It computes the following equation: y = alpha * x + y
- *
- * @tparam T the element type
- * @tparam DevicePointerMode whether pointers alpha, beta point to device memory
- * @param [in] handle raft handle
- * @param [in] n number of elements in x and y
- * @param [in] alpha host or device scalar
- * @param [in] x vector of length n
- * @param [in] incx stride between consecutive elements of x
- * @param [inout] y vector of length n
- * @param [in] incy stride between consecutive elements of y
- * @param [in] stream
+ * DISCLAIMER: this file is deprecated: use axpy.cuh instead
  */
-template <typename T, bool DevicePointerMode = false>
-void axpy(const raft::handle_t& handle,
-          const int n,
-          const T* alpha,
-          const T* x,
-          const int incx,
-          T* y,
-          const int incy,
-          cudaStream_t stream)
-{
-  detail::axpy<T, DevicePointerMode>(handle, n, alpha, x, incx, y, incy, stream);
-}
 
-}  // namespace raft::linalg
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "axpy.cuh"
diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp
index 9983e8ab50..f0a54cb164 100644
--- a/cpp/include/raft/linalg/binary_op.hpp
+++ b/cpp/include/raft/linalg/binary_op.hpp
@@ -18,46 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __BINARY_OP_H
-#define __BINARY_OP_H
-
-#pragma once
-
-#include "detail/binary_op.cuh"
-
-#include <raft/cuda_utils.cuh>
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief perform element-wise binary operation on the input arrays
- * @tparam InType input data-type
- * @tparam Lambda the device-lambda performing the actual operation
- * @tparam OutType output data-type
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads-per-block in the final kernel launched
- * @param out the output array
- * @param in1 the first input array
- * @param in2 the second input array
- * @param len number of elements in the input array
- * @param op the device-lambda
- * @param stream cuda stream where to launch work
- * @note Lambda must be a functor with the following signature:
- *       `OutType func(const InType& val1, const InType& val2);`
+ * DISCLAIMER: this file is deprecated: use binary_op.cuh instead
  */
-template <typename InType,
-          typename Lambda,
-          typename OutType = InType,
-          typename IdxType = int,
-          int TPB          = 256>
-void binaryOp(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
-{
-  detail::binaryOp(out, in1, in2, len, op, stream);
-}
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "binary_op.cuh"
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp
index 1158ad3aa4..a1967c36cb 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp
@@ -18,126 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __CHOLESKY_R1_UPDATE_H
-#define __CHOLESKY_R1_UPDATE_H
+/**
+ * DISCLAIMER: this file is deprecated: use cholesky_r1_update.cuh instead
+ */
 
 #pragma once
 
-#include "detail/cholesky_r1_update.cuh"
-
-namespace raft {
-namespace linalg {
-
-/**
- * @brief Rank 1 update of Cholesky decomposition.
- *
- * This method is useful if an algorithm iteratively builds up matrix A, and
- * the Cholesky decomposition of A is required at each step.
- *
- * On entry, L is the Cholesky decomposition of matrix A, where both A and L
- * have size n-1 x n-1. We are interested in the Cholesky decomposition of a new
- * matrix A', which we get by adding a row and column to A. In Python notation:
- * - A'[0:n-1, 0:n-1] = A;
- * - A'[:,n-1] = A[n-1,:] = A_new
- *
- * On entry, the new column A_new, is stored as the n-th column of L if uplo ==
- * CUBLAS_FILL_MODE_UPPER, else A_new is stored as the n-th row of L.
- *
- * On exit L contains the Cholesky decomposition of A'. In practice the elements
- * of A_new are overwritten with new row/column of the L matrix.
- *
- * The uplo paramater is used to select the matrix layout.
- * If (uplo != CUBLAS_FILL_MODE_UPPER) then the input arg L stores the
- * lower triangular matrix L, so that A = L * L.T. Otherwise the input arg L
- * stores an upper triangular matrix U: A = U.T * U.
- *
- * On exit L will be updated to store the Cholesky decomposition of A'.
- *
- * If the matrix is not positive definit, or very ill conditioned then the new
- * diagonal element of L would be NaN. In such a case an exception is thrown.
- * The eps argument can be used to override this behavior: if eps >= 0 then
- * the diagonal element is replaced by eps in case the diagonal is NaN or
- * smaller than eps. Note: for an iterative solver it is probably better to
- * stop early in case of error, rather than relying on the eps parameter.
- *
- * Examples:
- *
- * - Lower triangular factorization:
- * @code{.cpp}
- * // Initialize arrays
- * int ld_L = n_rows;
- * rmm::device_uvector<math_t> L(ld_L * n_rows, stream);
- * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr,
- *                                       &n_bytes, CUBLAS_FILL_MODE_LOWER,
- *                                       stream);
- * rmm::device_uvector<char> workspace(n_bytes, stream);
- *
- * for (n=1; n<=n_rows; rank++) {
- *   // Calculate a new row/column of matrix A into A_new
- *   // ...
- *   // Copy new row to L[rank-1,:]
- *   RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1,
- *                           L + n - 1, ld_L, stream));
- *   // Update Cholesky factorization
- *   raft::linalg::choleskyRank1Update(
- *       handle, L, rank, ld_L, workspace, &n_bytes, CUBLAS_FILL_MODE_LOWER,
- *       stream);
- * }
- * Now L stores the Cholesky decomposition of A: A = L * L.T
- * @endcode
- *
- * - Upper triangular factorization:
- * @code{.cpp}
- * // Initialize arrays
- * int ld_U = n_rows;
- * rmm::device_uvector<math_t> U(ld_U * n_rows, stream);
- * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr,
- *                                       &n_bytes, CUBLAS_FILL_MODE_UPPER,
- *                                       stream);
- * rmm::device_uvector<char> workspace(stream, n_bytes, stream);
- *
- * for (n=1; n<=n_rows; n++) {
- *   // Calculate a new row/column of matrix A into array A_new
- *   // ...
- *   // Copy new row to U[:,n-1] (column major layout)
- *   raft::copy(U + ld_U * (n-1), A_new, n-1, stream);
- *   //
- *   // Update Cholesky factorization
- *   raft::linalg::choleskyRank1Update(
- *       handle, U, n, ld_U, workspace, &n_bytes, CUBLAS_FILL_MODE_UPPER,
- *       stream);
- * }
- * // Now U stores the Cholesky decomposition of A: A = U.T * U
- * @endcode
- *
- * @param handle RAFT handle (used to retrive cuBLAS handles).
- * @param L device array for to store the triangular matrix L, and the new
- *     column of A in column major format, size [n*n]
- * @param n number of elements in the new row.
- * @param ld stride of colums in L
- * @param workspace device pointer to workspace shall be nullptr ar an array
- *    of size [n_bytes].
- * @param n_bytes size of workspace is returned here if workspace==nullptr.
- * @param stream CUDA stream
- * @param uplo indicates whether L is stored as an upper or lower triangular
- *    matrix (CUBLAS_FILL_MODE_UPPER or CUBLAS_FILL_MODE_LOWER)
- * @param eps numerical parameter that can act as a regularizer for ill
- *    conditioned systems. Negative values mean no regularizaton.
- */
-template <typename math_t>
-void choleskyRank1Update(const raft::handle_t& handle,
-                         math_t* L,
-                         int n,
-                         int ld,
-                         void* workspace,
-                         int* n_bytes,
-                         cublasFillMode_t uplo,
-                         cudaStream_t stream,
-                         math_t eps = -1)
-{
-  detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps);
-}
-};  // namespace linalg
-};  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "cholesky_r1_update.cuh"
diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp
index 48f8798a03..8631a7e5ba 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.hpp
+++ b/cpp/include/raft/linalg/coalesced_reduction.hpp
@@ -18,64 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __COALESCED_REDUCTION_H
-#define __COALESCED_REDUCTION_H
-
-#pragma once
-
-#include "detail/coalesced_reduction.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief Compute reduction of the input matrix along the leading dimension
- *
- * @tparam InType the data type of the input
- * @tparam OutType the data type of the output (as well as the data type for
- *  which reduction is performed)
- * @tparam IdxType data type of the indices of the array
- * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*ReduceLambda)(OutType);</pre>
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param dots the output reduction vector
- * @param data the input matrix
- * @param D leading dimension of data
- * @param N second dimension data
- * @param init initial value to use for the reduction
- * @param main_op elementwise operation to apply before reduction
- * @param reduce_op binary reduction operation
- * @param final_op elementwise operation to apply before storing results
- * @param inplace reduction result added inplace or overwrites old values?
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use coalesced_reduction.cuh instead
  */
-template <typename InType,
-          typename OutType      = InType,
-          typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
-void coalescedReduction(OutType* dots,
-                        const InType* data,
-                        int D,
-                        int N,
-                        OutType init,
-                        cudaStream_t stream,
-                        bool inplace           = false,
-                        MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                        ReduceLambda reduce_op = raft::Sum<OutType>(),
-                        FinalLambda final_op   = raft::Nop<OutType>())
-{
-  detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
-}
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "coalesced_reduction.cuh"
diff --git a/cpp/include/raft/linalg/contractions.hpp b/cpp/include/raft/linalg/contractions.hpp
index 256593d9ae..7e5e9be403 100644
--- a/cpp/include/raft/linalg/contractions.hpp
+++ b/cpp/include/raft/linalg/contractions.hpp
@@ -18,199 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __CONTRACTIONS_H
-#define __CONTRACTIONS_H
-
-#pragma once
-
-#include "detail/contractions.cuh"
-
-namespace raft {
-namespace linalg {
-
-/**
- * @brief This is the central enum that should be used to configure the perf
- *        landscape of the Contraction kernel.
- *
- * Main goal of this Policy struct is to provide sufficient knobs to tune the
- * perf of Contraction kernel, as and when we see matrices of different shapes.
- *
- * @tparam DataT   the IO and math datatype
- * @tparam _veclen number of k-elements loaded by each thread for every LDG call
- *                 it makes. This should be configured based on the input 'k'
- *                 value and the input data type. For eg: if DataT = float and
- *                 k is multiples of 4, then setting this to 4 gives the best
- *                 LDG pattern. Possible values are {1, 2, 4}.
- * @tparam _kblk   number of k-elements operated upon per main-loop iteration.
- *                 Therefore total number of main-loop iterations will be
- *                 `ceil(k/_kblk)`. This must be multiples of `_veclen`. Do note
- *                 that bigger this value, the greater shared mem requirement.
- * @tparam _rpt    Defines the number of rows that a given thread accumulates on.
- *                 This directly results in increased register pressure. This
- *                 also is used to compute the number of m-elements worked upon
- *                 by each thread block.
- * @tparam _cpt    Defines the number of cols that a given thread accumulates on.
- *                 This directly results in increased register pressure. This
- *                 also is used to compute the number of n-elements worked upon
- *                 by each thread block.
- * @tparam _tr     Number of threads working on the same output column. This is
- *                 used to compute the number of m-elements worked upon by each
- *                 thread block. This also determines the number of threads per
- *                 thread block
- * @tparam _tc     Number of threads working on the same output row. This is
- *                 used to compute the number of m-elements worked upon by each
- *                 thread block. This also determines the number of threads per
- *                 thread block
- */
-template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
-struct KernelPolicy {
-  enum {
-    /** number of elements along K worked upon per main loop iteration */
-    Kblk = _kblk,
-    /** number of elements loaded per LDG */
-    Veclen = _veclen,
-    /** number of rows a thread works on for accumulation */
-    AccRowsPerTh = _rpt,
-    /** number of cols a thread works on for accumulation */
-    AccColsPerTh = _cpt,
-    /** number of threads working the same output col */
-    AccThRows = _tr,
-    /** number of threads working the same output row */
-    AccThCols = _tc,
-    /** total threads per block */
-    Nthreads = AccThRows * AccThCols,
-    /** output tile size along rows */
-    Mblk = AccRowsPerTh * AccThRows,
-    /** output tile size along cols */
-    Nblk = AccColsPerTh * AccThCols,
-    /** number of threads loading a single row */
-    LdgThRow = Kblk / Veclen,
-    /** number of LDGs issued by a single thread for X */
-    LdgPerThX = Mblk * LdgThRow / Nthreads,
-    /** number of LDGs issued by a single thread for Y */
-    LdgPerThY = Nblk * LdgThRow / Nthreads,
-    /** number of rows of X covered per LDG */
-    LdgRowsX = Mblk / LdgPerThX,
-    /** number of rows of Y covered per LDG */
-    LdgRowsY = Nblk / LdgPerThY,
-    /** stride for accessing X/Y data in shared mem */
-    SmemStride = Kblk + Veclen,
-    /** size of one page for storing X data */
-    SmemPageX = SmemStride * Mblk,
-    /** size of one page for storing Y data */
-    SmemPageY = SmemStride * Nblk,
-    /** size of one smem page */
-    SmemPage = SmemPageX + SmemPageY,
-    /** size (in B) for smem needed */
-    SmemSize = 2 * SmemPage * sizeof(DataT),
-  };  // enum
-
-};  // struct KernelPolicy
-
-template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
-struct ColKernelPolicy {
-  enum {
-    /** number of elements along K worked upon per main loop iteration */
-    Kblk = _kblk,
-    /** number of elements loaded per LDG */
-    Veclen = _veclen,
-    /** number of rows a thread works on for accumulation */
-    AccRowsPerTh = _rpt,
-    /** number of cols a thread works on for accumulation */
-    AccColsPerTh = _cpt,
-    /** number of threads working the same output col */
-    AccThRows = _tr,
-    /** number of threads working the same output row */
-    AccThCols = _tc,
-    /** total threads per block */
-    Nthreads = AccThRows * AccThCols,
-    /** output tile size along rows */
-    Mblk = AccRowsPerTh * AccThRows,
-    /** output tile size along cols */
-    Nblk = AccColsPerTh * AccThCols,
-    /** number of threads loading a single col */
-    LdgThRow = Mblk / Veclen,
-    /** number of LDGs issued by a single thread for X */
-    LdgPerThX = Kblk * LdgThRow / Nthreads,
-    /** number of LDGs issued by a single thread for Y */
-    LdgPerThY = Kblk * LdgThRow / Nthreads,
-    /** number of rows of X covered per LDG */
-    LdgRowsX = Kblk / LdgPerThX,
-    /** number of rows of Y covered per LDG */
-    LdgRowsY = Kblk / LdgPerThY,
-    /** stride for accessing X/Y data in shared mem */
-    SmemStride = Mblk + Veclen,
-    /** size of one page for storing X data */
-    SmemPageX = SmemStride * Kblk,
-    /** size of one page for storing Y data */
-    SmemPageY = SmemStride * Kblk,
-    /** size of one smem page */
-    SmemPage = SmemPageX + SmemPageY,
-    /** size (in B) for smem needed */
-    SmemSize = 2 * SmemPage * sizeof(DataT),
-  };  // colMajor enum
-  static_assert(Mblk == Nblk, "Mblk should be equal to Nblk");
-};
 /**
- * @defgroup Policy4x4 16 elements per thread Policy with k-block = 32
- * @{
+ * DISCLAIMER: this file is deprecated: use contractions.cuh instead
  */
-template <typename DataT, int _veclen>
-struct Policy4x4 {
-};
-
-template <int _veclen>
-struct Policy4x4<float, _veclen> {
-  typedef KernelPolicy<float, _veclen, 32, 4, 4, 16, 16> Policy;
-  typedef ColKernelPolicy<float, _veclen, 32, 4, 4, 16, 16> ColPolicy;
-};
 
-template <int _veclen>
-struct Policy4x4<double, _veclen> {
-  typedef KernelPolicy<double, _veclen, 16, 4, 4, 16, 16> Policy;
-  typedef ColKernelPolicy<double, _veclen, 16, 4, 4, 16, 16> ColPolicy;
-};
-/** @} */
-
-/**
- * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16
- * @{
- */
-template <typename DataT, int _veclen = 1>
-struct Policy2x8 {
-};
-
-template <int _veclen>
-struct Policy2x8<float, _veclen> {
-  typedef KernelPolicy<float, _veclen, 16, 2, 8, 8, 32> Policy;
-  typedef ColKernelPolicy<float, _veclen, 16, 2, 8, 8, 32> ColPolicy;
-};
-
-template <int _veclen>
-struct Policy2x8<double, _veclen> {
-  // this is not used just for keeping compiler happy.
-  typedef KernelPolicy<double, _veclen, 32, 1, 2, 8, 32> Policy;
-  typedef ColKernelPolicy<double, _veclen, 32, 1, 2, 8, 32> ColPolicy;
-};
-/** @} */
-
-/**
- * @brief Base class for gemm-like NT contractions
- *
- * This class does not provide any arithmetic operations, but only provides the
- * memory-related operations of loading the `x` and `y` matrix blocks from the
- * global memory into shared memory and then from shared into registers. Thus,
- * this class acts as a basic building block for further composing gemm-like NT
- * contractions on input matrices which are row-major (and so does the output)
- *
- * @tparam DataT  IO and math data type
- * @tparam IdxT   indexing type
- * @tparam Policy policy used to customize memory access behavior.
- *                See documentation for `KernelPolicy` to know more.
- */
-using detail::Contractions_NT;
+#pragma once
 
-}  // namespace linalg
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "contractions.cuh"
diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp
index 8d1bd37186..57f4376fcc 100644
--- a/cpp/include/raft/linalg/divide.hpp
+++ b/cpp/include/raft/linalg/divide.hpp
@@ -18,37 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __DIVIDE_H
-#define __DIVIDE_H
-
-#pragma once
-
-#include "detail/divide.cuh"
-
-namespace raft {
-namespace linalg {
-
-using detail::divides_scalar;
-
 /**
- * @defgroup ScalarOps Scalar operations on the input buffer
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param out the output buffer
- * @param in the input buffer
- * @param scalar the scalar used in the operations
- * @param len number of elements in the input buffer
- * @param stream cuda stream where to launch work
- * @{
+ * DISCLAIMER: this file is deprecated: use divide.cuh instead
  */
-template <typename math_t, typename IdxType = int>
-void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
-{
-  detail::divideScalar(out, in, scalar, len, stream);
-}
-/** @} */
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "divide.cuh"
diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp
index 032c4e97f9..175a2aaccc 100644
--- a/cpp/include/raft/linalg/eig.hpp
+++ b/cpp/include/raft/linalg/eig.hpp
@@ -18,108 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __EIG_H
-#define __EIG_H
-
-#pragma once
-
-#include "detail/eig.cuh"
-
-namespace raft {
-namespace linalg {
-
-/**
- * @defgroup eig Eigen Decomposition Methods
- * @{
- */
-
 /**
- * @brief eig decomp with divide and conquer method for the column-major
- * symmetric matrices
- * @param handle raft handle
- * @param in the input buffer (symmetric matrix that has real eig values and
- * vectors.
- * @param n_rows: number of rows of the input
- * @param n_cols: number of cols of the input
- * @param eig_vectors: eigenvectors
- * @param eig_vals: eigen values
- * @param stream cuda stream
+ * DISCLAIMER: this file is deprecated: use eig.cuh instead
  */
-template <typename math_t>
-void eigDC(const raft::handle_t& handle,
-           const math_t* in,
-           std::size_t n_rows,
-           std::size_t n_cols,
-           math_t* eig_vectors,
-           math_t* eig_vals,
-           cudaStream_t stream)
-{
-  detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream);
-}
 
-using detail::COPY_INPUT;
-using detail::EigVecMemUsage;
-using detail::OVERWRITE_INPUT;
-
-/**
- * @brief eig sel decomp with divide and conquer method for the column-major
- * symmetric matrices
- * @param handle raft handle
- * @param in the input buffer (symmetric matrix that has real eig values and
- * vectors.
- * @param n_rows: number of rows of the input
- * @param n_cols: number of cols of the input
- * @param n_eig_vals: number of eigenvectors to be generated
- * @param eig_vectors: eigenvectors
- * @param eig_vals: eigen values
- * @param memUsage: the memory selection for eig vector output
- * @param stream cuda stream
- */
-template <typename math_t>
-void eigSelDC(const raft::handle_t& handle,
-              math_t* in,
-              int n_rows,
-              int n_cols,
-              int n_eig_vals,
-              math_t* eig_vectors,
-              math_t* eig_vals,
-              EigVecMemUsage memUsage,
-              cudaStream_t stream)
-{
-  detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream);
-}
-
-/**
- * @brief overloaded function for eig decomp with Jacobi method for the
- * column-major symmetric matrices (in parameter)
- * @param handle: raft handle
- * @param in: input matrix
- * @param n_rows: number of rows of the input
- * @param n_cols: number of cols of the input
- * @param eig_vectors: eigenvectors
- * @param eig_vals: eigen values
- * @param stream: stream on which this function will be run
- * @param tol: error tolerance for the jacobi method. Algorithm stops when the
- * error is below tol
- * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better
- * accuracy.
- */
-template <typename math_t>
-void eigJacobi(const raft::handle_t& handle,
-               const math_t* in,
-               int n_rows,
-               int n_cols,
-               math_t* eig_vectors,
-               math_t* eig_vals,
-               cudaStream_t stream,
-               math_t tol = 1.e-7,
-               int sweeps = 15)
-{
-  detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps);
-}
-/** @} */  // end of eig
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "eig.cuh"
diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp
index 62624f6eeb..8931c88241 100644
--- a/cpp/include/raft/linalg/eltwise.hpp
+++ b/cpp/include/raft/linalg/eltwise.hpp
@@ -18,94 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __ELTWISE_H
-#define __ELTWISE_H
-
-#pragma once
-
-#include "detail/eltwise.cuh"
-
-namespace raft {
-namespace linalg {
-
-using detail::adds_scalar;
-
-/**
- * @defgroup ScalarOps Scalar operations on the input buffer
- * @tparam InType data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param out the output buffer
- * @param in the input buffer
- * @param scalar the scalar used in the operations
- * @param len number of elements in the input buffer
- * @param stream cuda stream where to launch work
- * @{
- */
-template <typename InType, typename IdxType, typename OutType = InType>
-void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
-{
-  detail::scalarAdd(out, in, scalar, len, stream);
-}
-
-using detail::multiplies_scalar;
-
-template <typename InType, typename IdxType, typename OutType = InType>
-void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
-{
-  detail::scalarMultiply(out, in, scalar, len, stream);
-}
-/** @} */
-
 /**
- * @defgroup BinaryOps Element-wise binary operations on the input buffers
- * @tparam InType data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param out the output buffer
- * @param in1 the first input buffer
- * @param in2 the second input buffer
- * @param len number of elements in the input buffers
- * @param stream cuda stream where to launch work
- * @{
+ * DISCLAIMER: this file is deprecated: use eltwise.cuh instead
  */
-template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseAdd(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
-  detail::eltwiseAdd(out, in1, in2, len, stream);
-}
-
-template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseSub(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
-  detail::eltwiseSub(out, in1, in2, len, stream);
-}
 
-template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseMultiply(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
-  detail::eltwiseMultiply(out, in1, in2, len, stream);
-}
-
-template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivide(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
-  detail::eltwiseDivide(out, in1, in2, len, stream);
-}
-
-using detail::divides_check_zero;
-
-template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivideCheckZero(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
-  detail::eltwiseDivideCheckZero(out, in1, in2, len, stream);
-}
-/** @} */
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "eltwise.cuh"
diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp
index 37c6b2d552..6ad2f1fbe1 100644
--- a/cpp/include/raft/linalg/gemm.hpp
+++ b/cpp/include/raft/linalg/gemm.hpp
@@ -18,167 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __GEMM_H
-#define __GEMM_H
-
-#pragma once
-
-#include "detail/gemm.hpp"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief the wrapper of cublas gemm function
- *  It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C
- *
- * @tparam math_t the element type
- * @tparam DevicePointerMode whether pointers alpha, beta point to device memory
- * @param [in] handle raft handle
- * @param [in] trans_a cublas transpose op for A
- * @param [in] trans_b cublas transpose op for B
- * @param [in] m number of rows of C
- * @param [in] n number of columns of C
- * @param [in] k number of rows of opB(B) / number of columns of opA(A)
- * @param [in] alpha host or device scalar
- * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k]
- * @param [in] lda leading dimension of A
- * @param [in] B such a matrix that the shape of column-major opA(B) is [k, n]
- * @param [in] ldb leading dimension of B
- * @param [in] beta host or device scalar
- * @param [inout] C column-major matrix of size [m, n]
- * @param [in] ldc leading dimension of C
- * @param [in] stream
+ * DISCLAIMER: this file is deprecated: use gemm.cuh instead
  */
-template <typename math_t, bool DevicePointerMode = false>
-void gemm(const raft::handle_t& handle,
-          const bool trans_a,
-          const bool trans_b,
-          const int m,
-          const int n,
-          const int k,
-          const math_t* alpha,
-          const math_t* A,
-          const int lda,
-          const math_t* B,
-          const int ldb,
-          const math_t* beta,
-          math_t* C,
-          const int ldc,
-          cudaStream_t stream)
-{
-  detail::gemm<math_t, DevicePointerMode>(
-    handle, trans_a, trans_b, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, stream);
-}
 
-/**
- * @brief the wrapper of cublas gemm function
- *  It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C
- * @tparam math_t the type of input/output matrices
- * @param handle raft handle
- * @param a input matrix
- * @param n_rows_a number of rows of A
- * @param n_cols_a number of columns of A
- * @param b input matrix
- * @param c output matrix
- * @param n_rows_c number of rows of C
- * @param n_cols_c number of columns of C
- * @param trans_a cublas transpose op for A
- * @param trans_b cublas transpose op for B
- * @param alpha scalar
- * @param beta scalar
- * @param stream cuda stream
- */
-template <typename math_t>
-void gemm(const raft::handle_t& handle,
-          const math_t* a,
-          int n_rows_a,
-          int n_cols_a,
-          const math_t* b,
-          math_t* c,
-          int n_rows_c,
-          int n_cols_c,
-          cublasOperation_t trans_a,
-          cublasOperation_t trans_b,
-          math_t alpha,
-          math_t beta,
-          cudaStream_t stream)
-{
-  detail::gemm(
-    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
-}
-
-/**
- * @brief the wrapper of cublas gemm function
- *  It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C
- * @tparam math_t the type of input/output matrices
- * @param handle raft handle
- * @param a input matrix
- * @param n_rows_a number of rows of A
- * @param n_cols_a number of columns of A
- * @param b input matrix
- * @param c output matrix
- * @param n_rows_c number of rows of C
- * @param n_cols_c number of columns of C
- * @param trans_a cublas transpose op for A
- * @param trans_b cublas transpose op for B
- * @param stream cuda stream
- */
-template <typename math_t>
-void gemm(const raft::handle_t& handle,
-          const math_t* a,
-          int n_rows_a,
-          int n_cols_a,
-          const math_t* b,
-          math_t* c,
-          int n_rows_c,
-          int n_cols_c,
-          cublasOperation_t trans_a,
-          cublasOperation_t trans_b,
-          cudaStream_t stream)
-{
-  detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream);
-}
-
-/**
- * @brief A wrapper for CUBLS GEMM function designed for handling all possible
- * combinations of operand layouts.
- * It computes the following equation: Z = alpha . X * Y + beta . Z
- * @tparam T Data type of input/output matrices (float/double)
- * @param handle raft handle
- * @param z output matrix of size M rows x N columns
- * @param x input matrix of size M rows x K columns
- * @param y input matrix of size K rows x N columns
- * @param _M number of rows of X and Z
- * @param _N number of rows of Y and columns of Z
- * @param _K number of columns of X and rows of Y
- * @param isZColMajor Storage layout of Z. true = col major, false = row major
- * @param isXColMajor Storage layout of X. true = col major, false = row major
- * @param isYColMajor Storage layout of Y. true = col major, false = row major
- * @param stream cuda stream
- * @param alpha scalar
- * @param beta scalar
- */
-template <typename T>
-void gemm(const raft::handle_t& handle,
-          T* z,
-          T* x,
-          T* y,
-          int _M,
-          int _N,
-          int _K,
-          bool isZColMajor,
-          bool isXColMajor,
-          bool isYColMajor,
-          cudaStream_t stream,
-          T alpha = T(1.0),
-          T beta  = T(0.0))
-{
-  detail::gemm(
-    handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta);
-}
+#pragma once
 
-}  // end namespace linalg
-}  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
+#include "gemm.cuh"
diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp
index 3b6b60263b..8161631fd3 100644
--- a/cpp/include/raft/linalg/gemv.hpp
+++ b/cpp/include/raft/linalg/gemv.hpp
@@ -18,200 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __GEMV_H
-#define __GEMV_H
-
-#pragma once
-
-#include "detail/gemv.hpp"
-
-namespace raft {
-namespace linalg {
-
-/**
- * @brief the wrapper of cublas gemv function
- *  It computes the following equation: y = alpha .* op(A) * x + beta .* y
- *
- * @tparam math_t the element type
- * @tparam DevicePointerMode whether pointers alpha, beta point to device memory
- * @param [in] handle raft handle
- * @param [in] trans_a cublas transpose op for A
- * @param [in] m number of rows of A
- * @param [in] n number of columns of A
- * @param [in] alpha host or device scalar
- * @param [in] A column-major matrix of size [m, n]
- * @param [in] lda leading dimension of A
- * @param [in] x vector of length n if trans_a else m
- * @param [in] incx stride between consecutive elements of x
- * @param [in] beta host or device scalar
- * @param [inout] y vector of length m if trans_a else n
- * @param [in] incy stride between consecutive elements of y
- * @param [in] stream
- */
-template <typename math_t, bool DevicePointerMode = false>
-void gemv(const raft::handle_t& handle,
-          const bool trans_a,
-          const int m,
-          const int n,
-          const math_t* alpha,
-          const math_t* A,
-          const int lda,
-          const math_t* x,
-          const int incx,
-          const math_t* beta,
-          math_t* y,
-          const int incy,
-          cudaStream_t stream)
-{
-  detail::gemv<math_t, DevicePointerMode>(
-    handle, trans_a, m, n, alpha, A, lda, x, incx, beta, y, incy, stream);
-}
-
-template <typename math_t>
-void gemv(const raft::handle_t& handle,
-          const math_t* A,
-          const int n_rows,
-          const int n_cols,
-          const math_t* x,
-          const int incx,
-          math_t* y,
-          const int incy,
-          const bool trans_a,
-          const math_t alpha,
-          const math_t beta,
-          cudaStream_t stream)
-{
-  detail::gemv(handle, A, n_rows, n_cols, x, incx, y, incy, trans_a, alpha, beta, stream);
-}
-
-/**
- * y = alpha * op(A) * x + beta * y
- *
- * where
- *
- * @param handle raft handle
- * @param A is a column-major matrix of size n_rows_a * n_cols_a.
- *   op(A) is either the transpose operation (trans_a == true) or identity.
- * @param n_rows_a number of rows in A
- * @param n_cols_a number of cols in A
- * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`.
- * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
- * @param trans_a whether to take transpose of a
- * @param alpha is a scalar scale of Ax.
- * @param beta is a scalar scale of y.
- * @param stream stream on which this function is run
- */
-template <typename math_t>
-void gemv(const raft::handle_t& handle,
-          const math_t* A,
-          const int n_rows_a,
-          const int n_cols_a,
-          const math_t* x,
-          math_t* y,
-          const bool trans_a,
-          const math_t alpha,
-          const math_t beta,
-          cudaStream_t stream)
-{
-  detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, alpha, beta, stream);
-}
-
 /**
- * y = op(A) * x
- *
- * where
- *
- * @param handle raft handle
- * @param A is a column-major matrix of size n_rows_a * n_cols_a.
- *   op(A) is either the transpose operation (trans_a == true) or identity.
- * @param n_rows_a number of rows in A
- * @param n_cols_a number of cols in A
- * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`.
- * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
- * @param trans_a whether to take transpose of a
- * @param stream stream on which this function is run
+ * DISCLAIMER: this file is deprecated: use gemv.cuh instead
  */
-template <typename math_t>
-void gemv(const raft::handle_t& handle,
-          const math_t* A,
-          const int n_rows_a,
-          const int n_cols_a,
-          const math_t* x,
-          math_t* y,
-          const bool trans_a,
-          cudaStream_t stream)
-{
-  detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, stream);
-}
 
-/**
- * y = alpha * op(A) * x + beta * y
- *
- * where
- * @param handle raft handle
- * @param A is a column-major matrix of size n_rows_a * n_cols_a.
- *   op(A) is either the transpose operation (trans_a == true) or identity.
- * @param n_rows_a number of rows in A
- * @param n_cols_a number of cols in A
- * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a.
- *     set it when you need to use only the first n_rows_a rows of the matrix A, which has
- *     (perhaps, due to padding) lda rows.
- * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`.
- * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
- * @param trans_a whether to take transpose of a
- * @param alpha is a scalar scale of Ax.
- * @param beta is a scalar scale of y.
- * @param stream stream on which this function is run
- */
-template <typename math_t>
-void gemv(const raft::handle_t& handle,
-          const math_t* A,
-          const int n_rows_a,
-          const int n_cols_a,
-          const int lda,
-          const math_t* x,
-          math_t* y,
-          const bool trans_a,
-          const math_t alpha,
-          const math_t beta,
-          cudaStream_t stream)
-{
-  detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream);
-}
-
-/**
- * y = op(A) * x
- *
- * where
- * @param handle raft handle
- * @param A is a column-major matrix of size n_rows_a * n_cols_a.
- *   op(A) is either the transpose operation (trans_a == true) or identity.
- * @param n_rows_a number of rows in A
- * @param n_cols_a number of cols in A
- * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a.
- *     set it when you need to use only the first n_rows_a rows of the matrix A, which has
- *     (perhaps, due to padding) lda rows.
- * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`.
- * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
- * @param trans_a whether to take transpose of a
- * @param stream stream on which this function is run
- *
- */
-template <typename math_t>
-void gemv(const raft::handle_t& handle,
-          const math_t* A,
-          const int n_rows_a,
-          const int n_cols_a,
-          const int lda,
-          const math_t* x,
-          math_t* y,
-          const bool trans_a,
-          cudaStream_t stream)
-{
-  detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, stream);
-}
+#pragma once
 
-};  // namespace linalg
-};  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "gemv.cuh"
diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp
index db7b0f9cfe..9c59c886c9 100644
--- a/cpp/include/raft/linalg/init.hpp
+++ b/cpp/include/raft/linalg/init.hpp
@@ -18,48 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __INIT_H
-#define __INIT_H
-
-#pragma once
-
-#include "detail/init.hpp"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief Like Python range.
- *
- * Fills the output as out[i] = i.
- *
- * \param [out] out device array, size [end-start]
- * \param [in] start of the range
- * \param [in] end of range (exclusive)
- * \param [in] stream cuda stream
+ * DISCLAIMER: this file is deprecated: use init.cuh instead
  */
-template <typename T>
-void range(T* out, int start, int end, cudaStream_t stream)
-{
-  detail::range(out, start, end, stream);
-}
 
-/**
- * @brief Like Python range.
- *
- * Fills the output as out[i] = i.
- *
- * \param [out] out device array, size [n]
- * \param [in] n length of the array
- * \param [in] stream cuda stream
- */
-template <typename T, int TPB = 256>
-void range(T* out, int n, cudaStream_t stream)
-{
-  detail::range(out, n, stream);
-}
+#pragma once
 
-}  // namespace linalg
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "init.cuh"
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
index 75e3d11444..0529db6b5b 100644
--- a/cpp/include/raft/linalg/lanczos.hpp
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -18,150 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __LANCZOS_H
-#define __LANCZOS_H
-
-#pragma once
-
-#include "detail/lanczos.cuh"
-#include <raft/spectral/matrix_wrappers.hpp>
-
-namespace raft {
-namespace linalg {
-
-// =========================================================
-// Eigensolver
-// =========================================================
-
 /**
- *  @brief  Compute smallest eigenvectors of symmetric matrix
- *    Computes eigenvalues and eigenvectors that are least
- *    positive. If matrix is positive definite or positive
- *    semidefinite, the computed eigenvalues are smallest in
- *    magnitude.
- *    The largest eigenvalue is estimated by performing several
- *    Lanczos iterations. An implicitly restarted Lanczos method is
- *    then applied to A+s*I, where s is negative the largest
- *    eigenvalue.
- *  @tparam index_type_t the type of data used for indexing.
- *  @tparam value_type_t the type of data used for weights, distances.
- *  @param handle the raft handle.
- *  @param A Matrix.
- *  @param nEigVecs Number of eigenvectors to compute.
- *  @param maxIter Maximum number of Lanczos steps. Does not include
- *    Lanczos steps used to estimate largest eigenvalue.
- *  @param restartIter Maximum size of Lanczos system before
- *    performing an implicit restart. Should be at least 4.
- *  @param tol Convergence tolerance. Lanczos iteration will
- *    terminate when the residual norm is less than tol*theta, where
- *    theta is an estimate for the smallest unwanted eigenvalue
- *    (i.e. the (nEigVecs+1)th smallest eigenvalue).
- *  @param reorthogonalize Whether to reorthogonalize Lanczos
- *    vectors.
- *  @param iter On exit, pointer to total number of Lanczos
- *    iterations performed. Does not include Lanczos steps used to
- *    estimate largest eigenvalue.
- *  @param eigVals_dev (Output, device memory, nEigVecs entries)
- *    Smallest eigenvalues of matrix.
- *  @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
- *    Eigenvectors corresponding to smallest eigenvalues of
- *    matrix. Vectors are stored as columns of a column-major matrix
- *    with dimensions n x nEigVecs.
- *  @param seed random seed.
- *  @return error flag.
+ * DISCLAIMER: this file is deprecated: use lanczos.cuh instead
  */
-template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(
-  handle_t const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
-  index_type_t nEigVecs,
-  index_type_t maxIter,
-  index_type_t restartIter,
-  value_type_t tol,
-  bool reorthogonalize,
-  index_type_t& iter,
-  value_type_t* __restrict__ eigVals_dev,
-  value_type_t* __restrict__ eigVecs_dev,
-  unsigned long long seed = 1234567)
-{
-  return detail::computeSmallestEigenvectors(handle,
-                                             A,
-                                             nEigVecs,
-                                             maxIter,
-                                             restartIter,
-                                             tol,
-                                             reorthogonalize,
-                                             iter,
-                                             eigVals_dev,
-                                             eigVecs_dev,
-                                             seed);
-}
 
-/**
- *  @brief  Compute largest eigenvectors of symmetric matrix
- *    Computes eigenvalues and eigenvectors that are least
- *    positive. If matrix is positive definite or positive
- *    semidefinite, the computed eigenvalues are largest in
- *    magnitude.
- *    The largest eigenvalue is estimated by performing several
- *    Lanczos iterations. An implicitly restarted Lanczos method is
- *    then applied to A+s*I, where s is negative the largest
- *    eigenvalue.
- *  @tparam index_type_t the type of data used for indexing.
- *  @tparam value_type_t the type of data used for weights, distances.
- *  @param handle the raft handle.
- *  @param A Matrix.
- *  @param nEigVecs Number of eigenvectors to compute.
- *  @param maxIter Maximum number of Lanczos steps. Does not include
- *    Lanczos steps used to estimate largest eigenvalue.
- *  @param restartIter Maximum size of Lanczos system before
- *    performing an implicit restart. Should be at least 4.
- *  @param tol Convergence tolerance. Lanczos iteration will
- *    terminate when the residual norm is less than tol*theta, where
- *    theta is an estimate for the largest unwanted eigenvalue
- *    (i.e. the (nEigVecs+1)th largest eigenvalue).
- *  @param reorthogonalize Whether to reorthogonalize Lanczos
- *    vectors.
- *  @param iter On exit, pointer to total number of Lanczos
- *    iterations performed. Does not include Lanczos steps used to
- *    estimate largest eigenvalue.
- *  @param eigVals_dev (Output, device memory, nEigVecs entries)
- *    Largest eigenvalues of matrix.
- *  @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
- *    Eigenvectors corresponding to largest eigenvalues of
- *    matrix. Vectors are stored as columns of a column-major matrix
- *    with dimensions n x nEigVecs.
- *  @param seed random seed.
- *  @return error flag.
- */
-template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(
-  handle_t const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
-  index_type_t nEigVecs,
-  index_type_t maxIter,
-  index_type_t restartIter,
-  value_type_t tol,
-  bool reorthogonalize,
-  index_type_t& iter,
-  value_type_t* __restrict__ eigVals_dev,
-  value_type_t* __restrict__ eigVecs_dev,
-  unsigned long long seed = 123456)
-{
-  return detail::computeLargestEigenvectors(handle,
-                                            A,
-                                            nEigVecs,
-                                            maxIter,
-                                            restartIter,
-                                            tol,
-                                            reorthogonalize,
-                                            iter,
-                                            eigVals_dev,
-                                            eigVecs_dev,
-                                            seed);
-}
+#pragma once
 
-}  // namespace linalg
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "lanczos.cuh"
diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp
index f90cd00ea3..3dfbea0629 100644
--- a/cpp/include/raft/linalg/lstsq.hpp
+++ b/cpp/include/raft/linalg/lstsq.hpp
@@ -18,109 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __LSTSQ_H
-#define __LSTSQ_H
-
-#pragma once
-
-#include <raft/handle.hpp>
-#include <raft/linalg/detail/lstsq.cuh>
-namespace raft {
-namespace linalg {
-
-/** Solves the linear ordinary least squares problem `Aw = b`
- *  Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine.
- *
- * @param[in] handle raft handle
- * @param[inout] A input feature matrix.
- *            Warning: the content of this matrix is modified by the cuSOLVER routines.
- * @param[in] n_rows number of rows in A
- * @param[in] n_cols number of columns in A
- * @param[inout] b input target vector.
- *            Warning: the content of this vector is modified by the cuSOLVER routines.
- * @param[out] w output coefficient vector
- * @param[in] stream cuda stream for ordering operations
- */
-template <typename math_t>
-void lstsqSvdQR(const raft::handle_t& handle,
-                math_t* A,
-                const int n_rows,
-                const int n_cols,
-                const math_t* b,
-                math_t* w,
-                cudaStream_t stream)
-{
-  detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream);
-}
-
-/** Solves the linear ordinary least squares problem `Aw = b`
- *  Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER).
- *
- * @param[in] handle raft handle
- * @param[inout] A input feature matrix.
- *            Warning: the content of this matrix is modified by the cuSOLVER routines.
- * @param[in] n_rows number of rows in A
- * @param[in] n_cols number of columns in A
- * @param[inout] b input target vector.
- *            Warning: the content of this vector is modified by the cuSOLVER routines.
- * @param[out] w output coefficient vector
- * @param[in] stream cuda stream for ordering operations
- */
-template <typename math_t>
-void lstsqSvdJacobi(const raft::handle_t& handle,
-                    math_t* A,
-                    const int n_rows,
-                    const int n_cols,
-                    const math_t* b,
-                    math_t* w,
-                    cudaStream_t stream)
-{
-  detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream);
-}
-
-/** Solves the linear ordinary least squares problem `Aw = b`
- *  via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A).
- *  (`w = (A^T A)^-1  A^T b`)
+/**
+ * DISCLAIMER: this file is deprecated: use lstsq.cuh instead
  */
-template <typename math_t>
-void lstsqEig(const raft::handle_t& handle,
-              const math_t* A,
-              const int n_rows,
-              const int n_cols,
-              const math_t* b,
-              math_t* w,
-              cudaStream_t stream)
-{
-  detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream);
-}
 
-/** Solves the linear ordinary least squares problem `Aw = b`
- *  via QR decomposition of `A = QR`.
- *  (triangular system of equations `Rw = Q^T b`)
- *
- * @param[in] handle raft handle
- * @param[inout] A input feature matrix.
- *            Warning: the content of this matrix is modified by the cuSOLVER routines.
- * @param[in] n_rows number of rows in A
- * @param[in] n_cols number of columns in A
- * @param[inout] b input target vector.
- *            Warning: the content of this vector is modified by the cuSOLVER routines.
- * @param[out] w output coefficient vector
- * @param[in] stream cuda stream for ordering operations
- */
-template <typename math_t>
-void lstsqQR(const raft::handle_t& handle,
-             math_t* A,
-             const int n_rows,
-             const int n_cols,
-             math_t* b,
-             math_t* w,
-             cudaStream_t stream)
-{
-  detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream);
-}
+#pragma once
 
-};  // namespace linalg
-};  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "lstsq.cuh"
diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp
index 235485926b..6502a84edb 100644
--- a/cpp/include/raft/linalg/map_then_reduce.hpp
+++ b/cpp/include/raft/linalg/map_then_reduce.hpp
@@ -18,79 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MAP_THEN_REDUCE_H
-#define __MAP_THEN_REDUCE_H
-
-#pragma once
-
-#include "detail/map_then_reduce.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief CUDA version of map and then sum reduction operation
- * @tparam Type data-type upon which the math operation will be performed
- * @tparam MapOp the device-lambda performing the actual operation
- * @tparam TPB threads-per-block in the final kernel launched
- * @tparam Args additional parameters
- * @param out the output sum-reduced value (assumed to be a device pointer)
- * @param len number of elements in the input array
- * @param map the device-lambda
- * @param stream cuda-stream where to launch this kernel
- * @param in the input array
- * @param args additional input arrays
+ * DISCLAIMER: this file is deprecated: use map_then_reduce.cuh instead
  */
 
-template <typename InType,
-          typename MapOp,
-          int TPB = 256,
-          typename... Args,
-          typename OutType = InType>
-void mapThenSumReduce(
-  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
-{
-  detail::mapThenReduceImpl<InType, OutType, MapOp, detail::sum_tag, TPB, Args...>(
-    out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...);
-}
-
-/**
- * @brief CUDA version of map and then generic reduction operation
- * @tparam Type data-type upon which the math operation will be performed
- * @tparam MapOp the device-lambda performing the actual map operation
- * @tparam ReduceLambda the device-lambda performing the actual reduction
- * @tparam TPB threads-per-block in the final kernel launched
- * @tparam Args additional parameters
- * @param out the output reduced value (assumed to be a device pointer)
- * @param len number of elements in the input array
- * @param neutral The neutral element of the reduction operation. For example:
- *    0 for sum, 1 for multiply, +Inf for Min, -Inf for Max
- * @param map the device-lambda
- * @param op the reduction device lambda
- * @param stream cuda-stream where to launch this kernel
- * @param in the input array
- * @param args additional input arrays
- */
+#pragma once
 
-template <typename InType,
-          typename MapOp,
-          typename ReduceLambda,
-          int TPB          = 256,
-          typename OutType = InType,
-          typename... Args>
-void mapThenReduce(OutType* out,
-                   size_t len,
-                   OutType neutral,
-                   MapOp map,
-                   ReduceLambda op,
-                   cudaStream_t stream,
-                   const InType* in,
-                   Args... args)
-{
-  detail::mapThenReduceImpl<InType, OutType, MapOp, ReduceLambda, TPB, Args...>(
-    out, len, neutral, map, op, stream, in, args...);
-}
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "map_then_reduce.cuh"
diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp
index 574d4aee63..1237961ceb 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.hpp
+++ b/cpp/include/raft/linalg/matrix_vector_op.hpp
@@ -18,93 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MATRIX_VECTOR_OP_H
-#define __MATRIX_VECTOR_OP_H
-
-#pragma once
-
-#include "detail/matrix_vector_op.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief Operations for all the columns or rows with a given vector.
- * Caution : Threads process multiple elements to speed up processing. These
- * are loaded in a single read thanks to type promotion. Faster processing
- * would thus only be enabled when adresses are optimally aligned for it.
- * Note : the function will also check that the size of the window of accesses
- * is a multiple of the number of elements processed by a thread in order to
- * enable faster processing
- * @tparam Type the matrix/vector type
- * @tparam Lambda a device function which represents a binary operator
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads per block of the cuda kernel launched
- * @param out the output matrix (passing out = matrix makes it in-place)
- * @param matrix the input matrix
- * @param vec the vector
- * @param D number of columns of matrix
- * @param N number of rows of matrix
- * @param rowMajor whether input is row or col major
- * @param bcastAlongRows whether the broadcast of vector needs to happen along
- * the rows of the matrix or columns
- * @param op the mathematical operation
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use matrix_vector_op.cuh instead
  */
-template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type* out,
-                    const Type* matrix,
-                    const Type* vec,
-                    IdxType D,
-                    IdxType N,
-                    bool rowMajor,
-                    bool bcastAlongRows,
-                    Lambda op,
-                    cudaStream_t stream)
-{
-  detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-}
 
-/**
- * @brief Operations for all the columns or rows with the given vectors.
- * Caution : Threads process multiple elements to speed up processing. These
- * are loaded in a single read thanks to type promotion. Faster processing
- * would thus only be enabled when adresses are optimally aligned for it.
- * Note : the function will also check that the size of the window of accesses
- * is a multiple of the number of elements processed by a thread in order to
- * enable faster processing
- * @tparam Type the matrix/vector type
- * @tparam Lambda a device function which represents a binary operator
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads per block of the cuda kernel launched
- * @param out the output matrix (passing out = matrix makes it in-place)
- * @param matrix the input matrix
- * @param vec1 the first vector
- * @param vec2 the second vector
- * @param D number of columns of matrix
- * @param N number of rows of matrix
- * @param rowMajor whether input is row or col major
- * @param bcastAlongRows whether the broadcast of vector needs to happen along
- * the rows of the matrix or columns
- * @param op the mathematical operation
- * @param stream cuda stream where to launch work
- */
-template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type* out,
-                    const Type* matrix,
-                    const Type* vec1,
-                    const Type* vec2,
-                    IdxType D,
-                    IdxType N,
-                    bool rowMajor,
-                    bool bcastAlongRows,
-                    Lambda op,
-                    cudaStream_t stream)
-{
-  detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-}
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "matrix_vector_op.cuh"
diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp
index 7a7f03ee18..cbb974e516 100644
--- a/cpp/include/raft/linalg/mean_squared_error.hpp
+++ b/cpp/include/raft/linalg/mean_squared_error.hpp
@@ -18,35 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MSE_H
-#define __MSE_H
-
-#pragma once
-
-#include "detail/mean_squared_error.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief CUDA version mean squared error function mean((A-B)**2)
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam TPB threads-per-block
- * @param out the output mean squared error value (assumed to be a device pointer)
- * @param A input array (assumed to be a device pointer)
- * @param B input array (assumed to be a device pointer)
- * @param len number of elements in the input arrays
- * @param weight weight to apply to every term in the mean squared error calculation
- * @param stream cuda-stream where to launch this kernel
+ * DISCLAIMER: this file is deprecated: use mean_squared_error.cuh instead
  */
-template <typename math_t, int TPB = 256>
-void meanSquaredError(
-  math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream)
-{
-  detail::meanSquaredError(out, A, B, len, weight, stream);
-}
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "mean_squared_error.cuh"
diff --git a/cpp/include/raft/linalg/multiply.hpp b/cpp/include/raft/linalg/multiply.hpp
index eb933cd607..5aa481a894 100644
--- a/cpp/include/raft/linalg/multiply.hpp
+++ b/cpp/include/raft/linalg/multiply.hpp
@@ -18,35 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MULTIPLY_H
-#define __MULTIPLY_H
-
-#pragma once
-
-#include "detail/multiply.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @defgroup ScalarOps Scalar operations on the input buffer
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param out the output buffer
- * @param in the input buffer
- * @param scalar the scalar used in the operations
- * @param len number of elements in the input buffer
- * @param stream cuda stream where to launch work
- * @{
+ * DISCLAIMER: this file is deprecated: use multiply.cuh instead
  */
-template <typename math_t, typename IdxType = int>
-void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
-{
-  detail::multiplyScalar(out, in, scalar, len, stream);
-}
-/** @} */
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "multiply.cuh"
diff --git a/cpp/include/raft/linalg/norm.hpp b/cpp/include/raft/linalg/norm.hpp
index 958784d67e..b750367f05 100644
--- a/cpp/include/raft/linalg/norm.hpp
+++ b/cpp/include/raft/linalg/norm.hpp
@@ -18,82 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __NORM_H
-#define __NORM_H
-
-#pragma once
-
-#include "detail/norm.cuh"
-
-namespace raft {
-namespace linalg {
-
-/** different types of norms supported on the input buffers */
-using detail::L1Norm;
-using detail::L2Norm;
-using detail::NormType;
-
 /**
- * @brief Compute row-wise norm of the input matrix and perform fin_op lambda
- *
- * Row-wise norm is useful while computing pairwise distance matrix, for
- * example.
- * This is used in many clustering algos like knn, kmeans, dbscan, etc... The
- * current implementation is optimized only for bigger values of 'D'.
- *
- * @tparam Type the data type
- * @tparam Lambda device final lambda
- * @tparam IdxType Integer type used to for addressing
- * @param dots the output vector of row-wise dot products
- * @param data the input matrix (currently assumed to be row-major)
- * @param D number of columns of data
- * @param N number of rows of data
- * @param type the type of norm to be applied
- * @param rowMajor whether the input is row-major or not
- * @param stream cuda stream where to launch work
- * @param fin_op the final lambda op
+ * DISCLAIMER: this file is deprecated: use norm.cuh instead
  */
-template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
-void rowNorm(Type* dots,
-             const Type* data,
-             IdxType D,
-             IdxType N,
-             NormType type,
-             bool rowMajor,
-             cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>())
-{
-  detail::rowNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op);
-}
 
-/**
- * @brief Compute column-wise norm of the input matrix and perform fin_op
- * @tparam Type the data type
- * @tparam Lambda device final lambda
- * @tparam IdxType Integer type used to for addressing
- * @param dots the output vector of column-wise dot products
- * @param data the input matrix (currently assumed to be row-major)
- * @param D number of columns of data
- * @param N number of rows of data
- * @param type the type of norm to be applied
- * @param rowMajor whether the input is row-major or not
- * @param stream cuda stream where to launch work
- * @param fin_op the final lambda op
- */
-template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
-void colNorm(Type* dots,
-             const Type* data,
-             IdxType D,
-             IdxType N,
-             NormType type,
-             bool rowMajor,
-             cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>())
-{
-  detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op);
-}
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "norm.cuh"
diff --git a/cpp/include/raft/linalg/power.hpp b/cpp/include/raft/linalg/power.hpp
index d1506ff7a9..1e4a56d4fb 100644
--- a/cpp/include/raft/linalg/power.hpp
+++ b/cpp/include/raft/linalg/power.hpp
@@ -18,57 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __POWER_H
-#define __POWER_H
-
-#pragma once
-
-#include <raft/cuda_utils.cuh>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/unary_op.cuh>
-
-namespace raft {
-namespace linalg {
-
 /**
- * @defgroup ScalarOps Scalar operations on the input buffer
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param out the output buffer
- * @param in the input buffer
- * @param scalar the scalar used in the operations
- * @param len number of elements in the input buffer
- * @param stream cuda stream where to launch work
- * @{
+ * DISCLAIMER: this file is deprecated: use power.cuh instead
  */
-template <typename math_t, typename IdxType = int>
-void powerScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
-{
-  raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return raft::myPow(in, scalar); }, stream);
-}
-/** @} */
 
-/**
- * @defgroup BinaryOps Element-wise binary operations on the input buffers
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param out the output buffer
- * @param in1 the first input buffer
- * @param in2 the second input buffer
- * @param len number of elements in the input buffers
- * @param stream cuda stream where to launch work
- * @{
- */
-template <typename math_t, typename IdxType = int>
-void power(math_t* out, const math_t* in1, const math_t* in2, IdxType len, cudaStream_t stream)
-{
-  raft::linalg::binaryOp(
-    out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream);
-}
-/** @} */
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "power.cuh"
diff --git a/cpp/include/raft/linalg/reduce.hpp b/cpp/include/raft/linalg/reduce.hpp
index b9cc2c6e9d..b965cfac7b 100644
--- a/cpp/include/raft/linalg/reduce.hpp
+++ b/cpp/include/raft/linalg/reduce.hpp
@@ -18,69 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __REDUCE_H
-#define __REDUCE_H
-
-#pragma once
-
-#include "detail/reduce.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief Compute reduction of the input matrix along the requested dimension
- *
- * @tparam InType the data type of the input
- * @tparam OutType the data type of the output (as well as the data type for
- *  which reduction is performed)
- * @tparam IdxType data type of the indices of the array
- * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*ReduceLambda)(OutType);</pre>
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param dots the output reduction vector
- * @param data the input matrix
- * @param D number of columns
- * @param N number of rows
- * @param init initial value to use for the reduction
- * @param rowMajor input matrix is row-major or not
- * @param alongRows whether to reduce along rows or columns
- * @param stream cuda stream where to launch work
- * @param inplace reduction result added inplace or overwrites old values?
- * @param main_op elementwise operation to apply before reduction
- * @param reduce_op binary reduction operation
- * @param final_op elementwise operation to apply before storing results
+ * DISCLAIMER: this file is deprecated: use reduce.cuh instead
  */
-template <typename InType,
-          typename OutType      = InType,
-          typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
-void reduce(OutType* dots,
-            const InType* data,
-            int D,
-            int N,
-            OutType init,
-            bool rowMajor,
-            bool alongRows,
-            cudaStream_t stream,
-            bool inplace           = false,
-            MainLambda main_op     = raft::Nop<InType, IdxType>(),
-            ReduceLambda reduce_op = raft::Sum<OutType>(),
-            FinalLambda final_op   = raft::Nop<OutType>())
-{
-  detail::reduce(
-    dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op);
-}
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "reduce.cuh"
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.hpp b/cpp/include/raft/linalg/reduce_cols_by_key.hpp
index c24baa60de..70851c2b69 100644
--- a/cpp/include/raft/linalg/reduce_cols_by_key.hpp
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.hpp
@@ -18,45 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __REDUCE_COLS_BY_KEY
-#define __REDUCE_COLS_BY_KEY
+/**
+ * DISCLAIMER: this file is deprecated: use reduce_cols_by_key.cuh instead
+ */
 
 #pragma once
 
-#include <raft/linalg/detail/reduce_cols_by_key.cuh>
-
-namespace raft {
-namespace linalg {
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-/**
- * @brief Computes the sum-reduction of matrix columns for each given key
- * @tparam T the input data type (as well as the output reduced matrix)
- * @tparam KeyType data type of the keys
- * @tparam IdxType indexing arithmetic type
- * @param data the input data (dim = nrows x ncols). This is assumed to be in
- * row-major layout
- * @param keys keys array (len = ncols). It is assumed that each key in this
- * array is between [0, nkeys). In case this is not true, the caller is expected
- * to have called make_monotonic primitive to prepare such a contiguous and
- * monotonically increasing keys array.
- * @param out the output reduced matrix along columns (dim = nrows x nkeys).
- * This will be assumed to be in row-major layout
- * @param nrows number of rows in the input data
- * @param ncols number of colums in the input data
- * @param nkeys number of unique keys in the keys array
- * @param stream cuda stream to launch the kernel onto
- */
-template <typename T, typename KeyIteratorT, typename IdxType = int>
-void reduce_cols_by_key(const T* data,
-                        const KeyIteratorT keys,
-                        T* out,
-                        IdxType nrows,
-                        IdxType ncols,
-                        IdxType nkeys,
-                        cudaStream_t stream)
-{
-  detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream);
-}
-};  // end namespace linalg
-};  // end namespace raft
-#endif
\ No newline at end of file
+#include "reduce_cols_by_key.cuh"
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.hpp b/cpp/include/raft/linalg/reduce_rows_by_key.hpp
index d18a00aa1d..4b5e76ea8f 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.hpp
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.hpp
@@ -18,102 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __REDUCE_ROWS_BY_KEY
-#define __REDUCE_ROWS_BY_KEY
-
-#pragma once
-
-#include <raft/linalg/detail/reduce_rows_by_key.cuh>
-
-namespace raft {
-namespace linalg {
-
-/**
- Small helper function to convert from int->char and char->int
- Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars
-**/
-template <typename IteratorT1, typename IteratorT2>
-void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
-{
-  detail::convert_array(dst, src, n, st);
-}
-
 /**
- * @brief Computes the weighted reduction of matrix rows for each given key
- *
- * @tparam DataIteratorT Random-access iterator type, for reading input matrix
- *                       (may be a simple pointer type)
- * @tparam KeysIteratorT Random-access iterator type, for reading input keys
- *                       (may be a simple pointer type)
- *
- * @param[in]  d_A         Input data array (lda x nrows)
- * @param[in]  lda         Real row size for input data, d_A
- * @param[in]  d_keys      Keys for each row (1 x nrows)
- * @param[in]  d_weights   Weights for each observation in d_A (1 x nrows)
- * @param[out] d_keys_char Scratch memory for conversion of keys to char
- * @param[in]  nrows       Number of rows in d_A and d_keys
- * @param[in]  ncols       Number of data columns in d_A
- * @param[in]  nkeys       Number of unique keys in d_keys
- * @param[out] d_sums      Row sums by key (ncols x d_keys)
- * @param[in]  stream      CUDA stream
+ * DISCLAIMER: this file is deprecated: use reduce_rows_by_key.cuh instead
  */
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void reduce_rows_by_key(const DataIteratorT d_A,
-                        int lda,
-                        const KeysIteratorT d_keys,
-                        const WeightT* d_weights,
-                        char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT d_sums,
-                        cudaStream_t stream)
-{
-  detail::reduce_rows_by_key(
-    d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream);
-}
 
-/**
- * @brief Computes the reduction of matrix rows for each given key
- * @tparam DataIteratorT Random-access iterator type, for reading input matrix (may be a simple
- * pointer type)
- * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple
- * pointer type)
- * @param[in]  d_A         Input data array (lda x nrows)
- * @param[in]  lda         Real row size for input data, d_A
- * @param[in]  d_keys      Keys for each row (1 x nrows)
- * @param      d_keys_char Scratch memory for conversion of keys to char
- * @param[in]  nrows       Number of rows in d_A and d_keys
- * @param[in]  ncols       Number of data columns in d_A
- * @param[in]  nkeys       Number of unique keys in d_keys
- * @param[out] d_sums      Row sums by key (ncols x d_keys)
- * @param[in]  stream      CUDA stream
- */
-template <typename DataIteratorT, typename KeysIteratorT>
-void reduce_rows_by_key(const DataIteratorT d_A,
-                        int lda,
-                        const KeysIteratorT d_keys,
-                        char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT d_sums,
-                        cudaStream_t stream)
-{
-  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
-  reduce_rows_by_key(d_A,
-                     lda,
-                     d_keys,
-                     static_cast<DataType*>(nullptr),
-                     d_keys_char,
-                     nrows,
-                     ncols,
-                     nkeys,
-                     d_sums,
-                     stream);
-}
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "reduce_rows_by_key.cuh"
diff --git a/cpp/include/raft/linalg/rsvd.hpp b/cpp/include/raft/linalg/rsvd.hpp
index ac6e13b555..7e2fffba75 100644
--- a/cpp/include/raft/linalg/rsvd.hpp
+++ b/cpp/include/raft/linalg/rsvd.hpp
@@ -18,131 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __RSVD_H
-#define __RSVD_H
-
-#pragma once
-
-#include <raft/linalg/detail/rsvd.cuh>
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief randomized singular value decomposition (RSVD) on the column major
- * float type input matrix (Jacobi-based), by specifying no. of PCs and
- * upsamples directly
- * @param handle: raft handle
- * @param M: input matrix
- * @param n_rows: number rows of input matrix
- * @param n_cols: number columns of input matrix
- * @param S_vec: singular values of input matrix
- * @param U: left singular values of input matrix
- * @param V: right singular values of input matrix
- * @param k: no. of singular values to be computed
- * @param p: no. of upsamples
- * @param use_bbt: whether use eigen decomposition in computation or not
- * @param gen_left_vec: left vector needs to be generated or not?
- * @param gen_right_vec: right vector needs to be generated or not?
- * @param use_jacobi: whether to jacobi solver for decomposition
- * @param tol: tolerance for Jacobi-based solvers
- * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
- * @param stream cuda stream
+ * DISCLAIMER: this file is deprecated: use rsvd.cuh instead
  */
-template <typename math_t>
-void rsvdFixedRank(const raft::handle_t& handle,
-                   math_t* M,
-                   int n_rows,
-                   int n_cols,
-                   math_t* S_vec,
-                   math_t* U,
-                   math_t* V,
-                   int k,
-                   int p,
-                   bool use_bbt,
-                   bool gen_left_vec,
-                   bool gen_right_vec,
-                   bool use_jacobi,
-                   math_t tol,
-                   int max_sweeps,
-                   cudaStream_t stream)
-{
-  detail::rsvdFixedRank(handle,
-                        M,
-                        n_rows,
-                        n_cols,
-                        S_vec,
-                        U,
-                        V,
-                        k,
-                        p,
-                        use_bbt,
-                        gen_left_vec,
-                        gen_right_vec,
-                        use_jacobi,
-                        tol,
-                        max_sweeps,
-                        stream);
-}
 
-/**
- * @brief randomized singular value decomposition (RSVD) on the column major
- * float type input matrix (Jacobi-based), by specifying the PC and upsampling
- * ratio
- * @param handle: raft handle
- * @param M: input matrix
- * @param n_rows: number rows of input matrix
- * @param n_cols: number columns of input matrix
- * @param S_vec: singular values of input matrix
- * @param U: left singular values of input matrix
- * @param V: right singular values of input matrix
- * @param PC_perc: percentage of singular values to be computed
- * @param UpS_perc: upsampling percentage
- * @param use_bbt: whether use eigen decomposition in computation or not
- * @param gen_left_vec: left vector needs to be generated or not?
- * @param gen_right_vec: right vector needs to be generated or not?
- * @param use_jacobi: whether to jacobi solver for decomposition
- * @param tol: tolerance for Jacobi-based solvers
- * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
- * @param stream cuda stream
- */
-template <typename math_t>
-void rsvdPerc(const raft::handle_t& handle,
-              math_t* M,
-              int n_rows,
-              int n_cols,
-              math_t* S_vec,
-              math_t* U,
-              math_t* V,
-              math_t PC_perc,
-              math_t UpS_perc,
-              bool use_bbt,
-              bool gen_left_vec,
-              bool gen_right_vec,
-              bool use_jacobi,
-              math_t tol,
-              int max_sweeps,
-              cudaStream_t stream)
-{
-  detail::rsvdPerc(handle,
-                   M,
-                   n_rows,
-                   n_cols,
-                   S_vec,
-                   U,
-                   V,
-                   PC_perc,
-                   UpS_perc,
-                   use_bbt,
-                   gen_left_vec,
-                   gen_right_vec,
-                   use_jacobi,
-                   tol,
-                   max_sweeps,
-                   stream);
-}
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "rsvd.cuh"
diff --git a/cpp/include/raft/linalg/sqrt.hpp b/cpp/include/raft/linalg/sqrt.hpp
index 9c66ee2d14..e0f77f0ab9 100644
--- a/cpp/include/raft/linalg/sqrt.hpp
+++ b/cpp/include/raft/linalg/sqrt.hpp
@@ -18,36 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SQRT_H
-#define __SQRT_H
-
-#pragma once
-
-#include <raft/cuda_utils.cuh>
-#include <raft/linalg/unary_op.cuh>
-
-namespace raft {
-namespace linalg {
-
 /**
- * @defgroup ScalarOps Scalar operations on the input buffer
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param out the output buffer
- * @param in the input buffer
- * @param len number of elements in the input buffer
- * @param stream cuda stream where to launch work
- * @{
+ * DISCLAIMER: this file is deprecated: use sqrt.cuh instead
  */
-template <typename math_t, typename IdxType = int>
-void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream)
-{
-  raft::linalg::unaryOp(
-    out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream);
-}
-/** @} */
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "sqrt.cuh"
diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp
index 3b1597dfc3..6720a302ea 100644
--- a/cpp/include/raft/linalg/strided_reduction.hpp
+++ b/cpp/include/raft/linalg/strided_reduction.hpp
@@ -18,64 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __STRIDED_REDUCTION_H
-#define __STRIDED_REDUCTION_H
-
-#pragma once
-
-#include "detail/strided_reduction.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief Compute reduction of the input matrix along the strided dimension
- *
- * @tparam InType the data type of the input
- * @tparam OutType the data type of the output (as well as the data type for
- *  which reduction is performed)
- * @tparam IdxType data type of the indices of the array
- * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*ReduceLambda)(OutType);</pre>
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param dots the output reduction vector
- * @param data the input matrix
- * @param D leading dimension of data
- * @param N second dimension data
- * @param init initial value to use for the reduction
- * @param main_op elementwise operation to apply before reduction
- * @param reduce_op binary reduction operation
- * @param final_op elementwise operation to apply before storing results
- * @param inplace reduction result added inplace or overwrites old values?
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use strided_reduction.cuh instead
  */
-template <typename InType,
-          typename OutType      = InType,
-          typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
-void stridedReduction(OutType* dots,
-                      const InType* data,
-                      IdxType D,
-                      IdxType N,
-                      OutType init,
-                      cudaStream_t stream,
-                      bool inplace           = false,
-                      MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                      ReduceLambda reduce_op = raft::Sum<OutType>(),
-                      FinalLambda final_op   = raft::Nop<OutType>())
-{
-  detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
-}
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "strided_reduction.cuh"
diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp
index accf57a939..b0c6508ffe 100644
--- a/cpp/include/raft/linalg/subtract.hpp
+++ b/cpp/include/raft/linalg/subtract.hpp
@@ -18,77 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SUBTRACT_H
-#define __SUBTRACT_H
-
-#pragma once
-
-#include "detail/subtract.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief Elementwise scalar subtraction operation on the input buffer
- *
- * @tparam InT     input data-type. Also the data-type upon which the math ops
- *                 will be performed
- * @tparam OutT    output data-type
- * @tparam IdxType Integer type used to for addressing
- *
- * @param out    the output buffer
- * @param in     the input buffer
- * @param scalar the scalar used in the operations
- * @param len    number of elements in the input buffer
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use subtract.cuh instead
  */
-template <typename InT, typename OutT = InT, typename IdxType = int>
-void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
-{
-  detail::subtractScalar(out, in, scalar, len, stream);
-}
 
-/**
- * @brief Elementwise subtraction operation on the input buffers
- * @tparam InT     input data-type. Also the data-type upon which the math ops
- *                 will be performed
- * @tparam OutT    output data-type
- * @tparam IdxType Integer type used to for addressing
- *
- * @param out    the output buffer
- * @param in1    the first input buffer
- * @param in2    the second input buffer
- * @param len    number of elements in the input buffers
- * @param stream cuda stream where to launch work
- */
-template <typename InT, typename OutT = InT, typename IdxType = int>
-void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
-{
-  detail::subtract(out, in1, in2, len, stream);
-}
-
-/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
- * write result to outDev[i]
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param outDev the output buffer
- * @param inDev the input buffer
- * @param singleScalarDev pointer to the scalar located in device memory
- * @param len number of elements in the input and output buffer
- * @param stream cuda stream
- * @remark block size has not been tuned
- */
-template <typename math_t, typename IdxType = int, int TPB = 256>
-void subtractDevScalar(math_t* outDev,
-                       const math_t* inDev,
-                       const math_t* singleScalarDev,
-                       IdxType len,
-                       cudaStream_t stream)
-{
-  detail::subtractDevScalar(outDev, inDev, singleScalarDev, len, stream);
-}
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "subtract.cuh"
diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp
index 01788a4188..26bce80388 100644
--- a/cpp/include/raft/linalg/svd.hpp
+++ b/cpp/include/raft/linalg/svd.hpp
@@ -18,176 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SVD_H
-#define __SVD_H
-
-#pragma once
-
-#include "detail/svd.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief singular value decomposition (SVD) on the column major float type
- * input matrix using QR method
- * @param handle: raft handle
- * @param in: input matrix
- * @param n_rows: number rows of input matrix
- * @param n_cols: number columns of input matrix
- * @param sing_vals: singular values of input matrix
- * @param left_sing_vecs: left singular values of input matrix
- * @param right_sing_vecs: right singular values of input matrix
- * @param trans_right: transpose right vectors or not
- * @param gen_left_vec: generate left eig vector. Not activated.
- * @param gen_right_vec: generate right eig vector. Not activated.
- * @param stream cuda stream
+ * DISCLAIMER: this file is deprecated: use svd.cuh instead
  */
-// TODO: activate gen_left_vec and gen_right_vec options
-// TODO: couldn't template this function due to cusolverDnSgesvd and
-// cusolverSnSgesvd. Check if there is any other way.
-template <typename T>
-void svdQR(const raft::handle_t& handle,
-           T* in,
-           int n_rows,
-           int n_cols,
-           T* sing_vals,
-           T* left_sing_vecs,
-           T* right_sing_vecs,
-           bool trans_right,
-           bool gen_left_vec,
-           bool gen_right_vec,
-           cudaStream_t stream)
-{
-  detail::svdQR(handle,
-                in,
-                n_rows,
-                n_cols,
-                sing_vals,
-                left_sing_vecs,
-                right_sing_vecs,
-                trans_right,
-                gen_left_vec,
-                gen_right_vec,
-                stream);
-}
-
-template <typename T>
-void svdEig(const raft::handle_t& handle,
-            T* in,
-            int n_rows,
-            int n_cols,
-            T* S,
-            T* U,
-            T* V,
-            bool gen_left_vec,
-            cudaStream_t stream)
-{
-  detail::svdEig(handle, in, n_rows, n_cols, S, U, V, gen_left_vec, stream);
-}
 
-/**
- * @brief on the column major input matrix using Jacobi method
- * @param handle: raft handle
- * @param in: input matrix
- * @param n_rows: number rows of input matrix
- * @param n_cols: number columns of input matrix
- * @param sing_vals: singular values of input matrix
- * @param left_sing_vecs: left singular vectors of input matrix
- * @param right_sing_vecs: right singular vectors of input matrix
- * @param gen_left_vec: generate left eig vector. Not activated.
- * @param gen_right_vec: generate right eig vector. Not activated.
- * @param tol: error tolerance for the jacobi method. Algorithm stops when the
- * error is below tol
- * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more the better
- * accuracy.
- * @param stream cuda stream
- */
-template <typename math_t>
-void svdJacobi(const raft::handle_t& handle,
-               math_t* in,
-               int n_rows,
-               int n_cols,
-               math_t* sing_vals,
-               math_t* left_sing_vecs,
-               math_t* right_sing_vecs,
-               bool gen_left_vec,
-               bool gen_right_vec,
-               math_t tol,
-               int max_sweeps,
-               cudaStream_t stream)
-{
-  detail::svdJacobi(handle,
-                    in,
-                    n_rows,
-                    n_cols,
-                    sing_vals,
-                    left_sing_vecs,
-                    right_sing_vecs,
-                    gen_left_vec,
-                    gen_right_vec,
-                    tol,
-                    max_sweeps,
-                    stream);
-}
-
-/**
- * @brief reconstruct a matrix use left and right singular vectors and
- * singular values
- * @param handle: raft handle
- * @param U: left singular vectors of size n_rows x k
- * @param S: square matrix with singular values on its diagonal, k x k
- * @param V: right singular vectors of size n_cols x k
- * @param out: reconstructed matrix to be returned
- * @param n_rows: number rows of output matrix
- * @param n_cols: number columns of output matrix
- * @param k: number of singular values
- * @param stream cuda stream
- */
-template <typename math_t>
-void svdReconstruction(const raft::handle_t& handle,
-                       math_t* U,
-                       math_t* S,
-                       math_t* V,
-                       math_t* out,
-                       int n_rows,
-                       int n_cols,
-                       int k,
-                       cudaStream_t stream)
-{
-  detail::svdReconstruction(handle, U, S, V, out, n_rows, n_cols, k, stream);
-}
-
-/**
- * @brief reconstruct a matrix use left and right singular vectors and
- * singular values
- * @param handle: raft handle
- * @param A_d: input matrix
- * @param U: left singular vectors of size n_rows x k
- * @param S_vec: singular values as a vector
- * @param V: right singular vectors of size n_cols x k
- * @param n_rows: number rows of output matrix
- * @param n_cols: number columns of output matrix
- * @param k: number of singular values to be computed, 1.0 for normal SVD
- * @param tol: tolerance for the evaluation
- * @param stream cuda stream
- */
-template <typename math_t>
-bool evaluateSVDByL2Norm(const raft::handle_t& handle,
-                         math_t* A_d,
-                         math_t* U,
-                         math_t* S_vec,
-                         math_t* V,
-                         int n_rows,
-                         int n_cols,
-                         int k,
-                         math_t tol,
-                         cudaStream_t stream)
-{
-  return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, k, tol, stream);
-}
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "svd.cuh"
diff --git a/cpp/include/raft/linalg/ternary_op.hpp b/cpp/include/raft/linalg/ternary_op.hpp
index bce9eacb11..58dab89609 100644
--- a/cpp/include/raft/linalg/ternary_op.hpp
+++ b/cpp/include/raft/linalg/ternary_op.hpp
@@ -18,42 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __TERNARY_OP_H
-#define __TERNARY_OP_H
-
-#pragma once
-
-#include <raft/linalg/detail/ternary_op.cuh>
-
-namespace raft {
-namespace linalg {
 /**
- * @brief perform element-wise ternary operation on the input arrays
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam Lambda the device-lambda performing the actual operation
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads-per-block in the final kernel launched
- * @param out the output array
- * @param in1 the first input array
- * @param in2 the second input array
- * @param in3 the third input array
- * @param len number of elements in the input array
- * @param op the device-lambda
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use ternary_op.cuh instead
  */
-template <typename math_t, typename Lambda, typename IdxType = int, int TPB = 256>
-void ternaryOp(math_t* out,
-               const math_t* in1,
-               const math_t* in2,
-               const math_t* in3,
-               IdxType len,
-               Lambda op,
-               cudaStream_t stream)
-{
-  detail::ternaryOp(out, in1, in2, in3, len, op, stream);
-}
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "ternary_op.cuh"
diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp
index caa6bafedf..4c3f9224e4 100644
--- a/cpp/include/raft/linalg/transpose.hpp
+++ b/cpp/include/raft/linalg/transpose.hpp
@@ -18,49 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __TRANSPOSE_H
-#define __TRANSPOSE_H
-
-#pragma once
-
-#include "detail/transpose.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief transpose on the column major input matrix using Jacobi method
- * @param handle: raft handle
- * @param in: input matrix
- * @param out: output. Transposed input matrix
- * @param n_rows: number rows of input matrix
- * @param n_cols: number columns of input matrix
- * @param stream: cuda stream
+ * DISCLAIMER: this file is deprecated: use transpose.cuh instead
  */
-template <typename math_t>
-void transpose(const raft::handle_t& handle,
-               math_t* in,
-               math_t* out,
-               int n_rows,
-               int n_cols,
-               cudaStream_t stream)
-{
-  detail::transpose(handle, in, out, n_rows, n_cols, stream);
-}
 
-/**
- * @brief transpose on the column major input matrix using Jacobi method
- * @param inout: input and output matrix
- * @param n: number of rows and columns of input matrix
- * @param stream: cuda stream
- */
-template <typename math_t>
-void transpose(math_t* inout, int n, cudaStream_t stream)
-{
-  detail::transpose(inout, n, stream);
-}
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "transpose.cuh"
diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp
index ca1e3f9875..2ace126ff1 100644
--- a/cpp/include/raft/linalg/unary_op.hpp
+++ b/cpp/include/raft/linalg/unary_op.hpp
@@ -18,65 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __UNARY_OP_H
-#define __UNARY_OP_H
-
-#pragma once
-
-#include "detail/unary_op.cuh"
-
-namespace raft {
-namespace linalg {
-
 /**
- * @brief perform element-wise unary operation in the input array
- * @tparam InType input data-type
- * @tparam Lambda the device-lambda performing the actual operation
- * @tparam OutType output data-type
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads-per-block in the final kernel launched
- * @param out the output array
- * @param in the input array
- * @param len number of elements in the input array
- * @param op the device-lambda
- * @param stream cuda stream where to launch work
- * @note Lambda must be a functor with the following signature:
- *       `OutType func(const InType& val);`
+ * DISCLAIMER: this file is deprecated: use unary_op.cuh instead
  */
-template <typename InType,
-          typename Lambda,
-          typename IdxType = int,
-          typename OutType = InType,
-          int TPB          = 256>
-void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
-{
-  detail::unaryOpCaller(out, in, len, op, stream);
-}
 
-/**
- * @brief Perform an element-wise unary operation into the output array
- *
- * Compared to `unaryOp()`, this method does not do any reads from any inputs
- *
- * @tparam OutType output data-type
- * @tparam Lambda  the device-lambda performing the actual operation
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB     threads-per-block in the final kernel launched
- *
- * @param[out] out    the output array [on device] [len = len]
- * @param[in]  len    number of elements in the input array
- * @param[in]  op     the device-lambda which must be of the form:
- *                    `void func(OutType* outLocationOffset, IdxType idx);`
- *                    where outLocationOffset will be out + idx.
- * @param[in]  stream cuda stream where to launch work
- */
-template <typename OutType, typename Lambda, typename IdxType = int, int TPB = 256>
-void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
-{
-  detail::writeOnlyUnaryOpCaller(out, len, op, stream);
-}
+#pragma once
 
-};  // end namespace linalg
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
+#include "unary_op.cuh"
diff --git a/cpp/include/raft/matrix/col_wise_sort.hpp b/cpp/include/raft/matrix/col_wise_sort.hpp
index 83a8738219..60c36db9e2 100644
--- a/cpp/include/raft/matrix/col_wise_sort.hpp
+++ b/cpp/include/raft/matrix/col_wise_sort.hpp
@@ -18,44 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __COL_WISE_SORT_H
-#define __COL_WISE_SORT_H
+/**
+ * DISCLAIMER: this file is deprecated: use col_wise_sort.cuh instead
+ */
 
 #pragma once
 
-#include <raft/matrix/detail/columnWiseSort.cuh>
-
-namespace raft {
-namespace matrix {
-
-/**
- * @brief sort columns within each row of row-major input matrix and return sorted indexes
- * modelled as key-value sort with key being input matrix and value being index of values
- * @param in: input matrix
- * @param out: output value(index) matrix
- * @param n_rows: number rows of input matrix
- * @param n_columns: number columns of input matrix
- * @param bAllocWorkspace: check returned value, if true allocate workspace passed in workspaceSize
- * @param workspacePtr: pointer to workspace memory
- * @param workspaceSize: Size of workspace to be allocated
- * @param stream: cuda stream to execute prim on
- * @param sortedKeys: Optional, output matrix for sorted keys (input)
- */
-template <typename InType, typename OutType>
-void sort_cols_per_row(const InType* in,
-                       OutType* out,
-                       int n_rows,
-                       int n_columns,
-                       bool& bAllocWorkspace,
-                       void* workspacePtr,
-                       size_t& workspaceSize,
-                       cudaStream_t stream,
-                       InType* sortedKeys = nullptr)
-{
-  detail::sortColumnsPerRow<InType, OutType>(
-    in, out, n_rows, n_columns, bAllocWorkspace, workspacePtr, workspaceSize, stream, sortedKeys);
-}
-};  // end namespace matrix
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "col_wise_sort.cuh"
diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp
index 7409140d7c..428c914784 100644
--- a/cpp/include/raft/matrix/matrix.hpp
+++ b/cpp/include/raft/matrix/matrix.hpp
@@ -18,265 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MATRIX_H
-#define __MATRIX_H
-
-#pragma once
-
-#include "detail/linewise_op.cuh"
-#include "detail/matrix.cuh"
-
-#include <raft/common/nvtx.hpp>
-
-namespace raft {
-namespace matrix {
-
-using namespace std;
-
-/**
- * @brief Copy selected rows of the input matrix into contiguous space.
- *
- * On exit out[i + k*n_rows] = in[indices[i] + k*n_rows],
- * where i = 0..n_rows_indices-1, and k = 0..n_cols-1.
- *
- * @param in input matrix
- * @param n_rows number of rows of output matrix
- * @param n_cols number of columns of output matrix
- * @param out output matrix
- * @param indices of the rows to be copied
- * @param n_rows_indices number of rows to copy
- * @param stream cuda stream
- * @param rowMajor whether the matrix has row major layout
- */
-template <typename m_t, typename idx_array_t = int, typename idx_t = size_t>
-void copyRows(const m_t* in,
-              idx_t n_rows,
-              idx_t n_cols,
-              m_t* out,
-              const idx_array_t* indices,
-              idx_t n_rows_indices,
-              cudaStream_t stream,
-              bool rowMajor = false)
-{
-  detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, rowMajor);
-}
-
-/**
- * @brief copy matrix operation for column major matrices.
- * @param in: input matrix
- * @param out: output matrix
- * @param n_rows: number of rows of output matrix
- * @param n_cols: number of columns of output matrix
- * @param stream: cuda stream
- */
-template <typename m_t, typename idx_t = int>
-void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
-  raft::copy_async(out, in, n_rows * n_cols, stream);
-}
-
-/**
- * @brief copy matrix operation for column major matrices. First n_rows and
- * n_cols of input matrix "in" is copied to "out" matrix.
- * @param in: input matrix
- * @param in_n_rows: number of rows of input matrix
- * @param out: output matrix
- * @param out_n_rows: number of rows of output matrix
- * @param out_n_cols: number of columns of output matrix
- * @param stream: cuda stream
- */
-template <typename m_t, typename idx_t = int>
-void truncZeroOrigin(
-  m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream)
-{
-  detail::truncZeroOrigin(in, in_n_rows, out, out_n_rows, out_n_cols, stream);
-}
-
-/**
- * @brief Columns of a column major matrix is reversed (i.e. first column and
- * last column are swapped)
- * @param inout: input and output matrix
- * @param n_rows: number of rows of input matrix
- * @param n_cols: number of columns of input matrix
- * @param stream: cuda stream
- */
-template <typename m_t, typename idx_t = int>
-void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
-  detail::colReverse(inout, n_rows, n_cols, stream);
-}
-
-/**
- * @brief Rows of a column major matrix is reversed (i.e. first row and last
- * row are swapped)
- * @param inout: input and output matrix
- * @param n_rows: number of rows of input matrix
- * @param n_cols: number of columns of input matrix
- * @param stream: cuda stream
- */
-template <typename m_t, typename idx_t = int>
-void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
-  detail::rowReverse(inout, n_rows, n_cols, stream);
-}
-
-/**
- * @brief Prints the data stored in GPU memory
- * @param in: input matrix
- * @param n_rows: number of rows of input matrix
- * @param n_cols: number of columns of input matrix
- * @param h_separator: horizontal separator character
- * @param v_separator: vertical separator character
- * @param stream: cuda stream
- */
-template <typename m_t, typename idx_t = int>
-void print(const m_t* in,
-           idx_t n_rows,
-           idx_t n_cols,
-           char h_separator    = ' ',
-           char v_separator    = '\n',
-           cudaStream_t stream = rmm::cuda_stream_default)
-{
-  detail::print(in, n_rows, n_cols, h_separator, v_separator, stream);
-}
-
-/**
- * @brief Prints the data stored in CPU memory
- * @param in: input matrix
- * @param n_rows: number of rows of input matrix
- * @param n_cols: number of columns of input matrix
- */
-template <typename m_t, typename idx_t = int>
-void printHost(const m_t* in, idx_t n_rows, idx_t n_cols)
-{
-  detail::printHost(in, n_rows, n_cols);
-}
-
-/**
- * @brief Slice a matrix (in-place)
- * @param in: input matrix
- * @param n_rows: number of rows of input matrix
- * @param n_cols: number of columns of input matrix
- * @param out: output matrix
- * @param x1, y1: coordinate of the top-left point of the wanted area (0-based)
- * @param x2, y2: coordinate of the bottom-right point of the wanted area
- * (1-based)
- * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice_matrix(M_d, 4,
- * 3, 0, 1, 4, 3);
- * @param stream: cuda stream
- */
-template <typename m_t, typename idx_t = int>
-void sliceMatrix(m_t* in,
-                 idx_t n_rows,
-                 idx_t n_cols,
-                 m_t* out,
-                 idx_t x1,
-                 idx_t y1,
-                 idx_t x2,
-                 idx_t y2,
-                 cudaStream_t stream)
-{
-  detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream);
-}
-
 /**
- * @brief Copy the upper triangular part of a matrix to another
- * @param src: input matrix with a size of n_rows x n_cols
- * @param dst: output matrix with a size of kxk, k = min(n_rows, n_cols)
- * @param n_rows: number of rows of input matrix
- * @param n_cols: number of columns of input matrix
- * @param stream: cuda stream
+ * DISCLAIMER: this file is deprecated: use matrix.cuh instead
  */
-template <typename m_t, typename idx_t = int>
-void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
-  detail::copyUpperTriangular(src, dst, n_rows, n_cols, stream);
-}
 
-/**
- * @brief Initialize a diagonal matrix with a vector
- * @param vec: vector of length k = min(n_rows, n_cols)
- * @param matrix: matrix of size n_rows x n_cols
- * @param n_rows: number of rows of the matrix
- * @param n_cols: number of columns of the matrix
- * @param stream: cuda stream
- */
-template <typename m_t, typename idx_t = int>
-void initializeDiagonalMatrix(
-  m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
-  detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream);
-}
-
-/**
- * @brief Get a square matrix with elements on diagonal reversed (in-place)
- * @param in: square input matrix with size len x len
- * @param len: size of one side of the matrix
- * @param stream: cuda stream
- */
-template <typename m_t, typename idx_t = int>
-void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
-{
-  detail::getDiagonalInverseMatrix(in, len, stream);
-}
-
-/**
- * @brief Get the L2/F-norm of a matrix/vector
- * @param handle
- * @param in: input matrix/vector with totally size elements
- * @param size: size of the matrix/vector
- * @param stream: cuda stream
- */
-template <typename m_t, typename idx_t = int>
-m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream)
-{
-  return detail::getL2Norm(handle, in, size, stream);
-}
-
-/**
- * Run a function over matrix lines (rows or columns) with a variable number
- * row-vectors or column-vectors.
- * The term `line` here signifies that the lines can be either columns or rows,
- * depending on the matrix layout.
- * What matters is if the vectors are applied along lines (indices of vectors correspond to
- * indices within lines), or across lines (indices of vectors correspond to line numbers).
- *
- * @param [out] out result of the operation; can be same as `in`; should be aligned the same
- *        as `in` to allow faster vectorized memory transfers.
- * @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long.
- * @param [in] lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in
- * col-major)
- * @param [in] nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major)
- * @param [in] alongLines whether vectors are indices along or across lines.
- * @param [in] op the operation applied on each line:
- *    for i in [0..lineLen) and j in [0..nLines):
- *      out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i])   if alongLines = true
- *      out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j])   if alongLines = false
- *    where matrix indexing is row-major ([i, j] = [i + lineLen * j]).
- * @param [in] stream a cuda stream for the kernels
- * @param [in] vecs zero or more vectors to be passed as arguments,
- *    size of each vector is `alongLines ? lineLen : nLines`.
- */
-template <typename m_t, typename idx_t = int, typename Lambda, typename... Vecs>
-void linewiseOp(m_t* out,
-                const m_t* in,
-                const idx_t lineLen,
-                const idx_t nLines,
-                const bool alongLines,
-                Lambda op,
-                cudaStream_t stream,
-                Vecs... vecs)
-{
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("linewiseOp-%c-%zu (%zu, %zu)",
-                                                            alongLines ? 'l' : 'x',
-                                                            sizeof...(Vecs),
-                                                            size_t(lineLen),
-                                                            size_t(nLines));
-  detail::MatrixLinewiseOp<16, 256>::run<m_t, idx_t, Lambda, Vecs...>(
-    out, in, lineLen, nLines, alongLines, op, stream, vecs...);
-}
+#pragma once
 
-};  // end namespace matrix
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "matrix.cuh"
diff --git a/cpp/include/raft/random/make_regression.hpp b/cpp/include/raft/random/make_regression.hpp
index 4f6b2717f6..f3e2113f80 100644
--- a/cpp/include/raft/random/make_regression.hpp
+++ b/cpp/include/raft/random/make_regression.hpp
@@ -13,98 +13,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-/* Adapted from scikit-learn
- * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/_samples_generator.py
- */
-
 /**
  * This file is deprecated and will be removed in release 22.06.
  * Please use the cuh version instead.
  */
 
-#ifndef __MAKE_REGRESSION_H
-#define __MAKE_REGRESSION_H
-
-#pragma once
-
-#include <algorithm>
-
-#include "detail/make_regression.cuh"
-
-namespace raft::random {
-
 /**
- * @brief GPU-equivalent of sklearn.datasets.make_regression as documented at:
- * https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html
- *
- * @tparam  DataT  Scalar type
- * @tparam  IdxT   Index type
- *
- * @param[in]   handle          RAFT handle
- * @param[out]  out             Row-major (samples, features) matrix to store
- *                              the problem data
- * @param[out]  values          Row-major (samples, targets) matrix to store
- *                              the values for the regression problem
- * @param[in]   n_rows          Number of samples
- * @param[in]   n_cols          Number of features
- * @param[in]   n_informative   Number of informative features (non-zero
- *                              coefficients)
- * @param[in]   stream          CUDA stream
- * @param[out]  coef            Row-major (features, targets) matrix to store
- *                              the coefficients used to generate the values
- *                              for the regression problem. If nullptr is
- *                              given, nothing will be written
- * @param[in]   n_targets       Number of targets (generated values per sample)
- * @param[in]   bias            A scalar that will be added to the values
- * @param[in]   effective_rank  The approximate rank of the data matrix (used
- *                              to create correlations in the data). -1 is the
- *                              code to use well-conditioned data
- * @param[in]   tail_strength   The relative importance of the fat noisy tail
- *                              of the singular values profile if
- *                              effective_rank is not -1
- * @param[in]   noise           Standard deviation of the gaussian noise
- *                              applied to the output
- * @param[in]   shuffle         Shuffle the samples and the features
- * @param[in]   seed            Seed for the random number generator
- * @param[in]   type            Random generator type
+ * DISCLAIMER: this file is deprecated: use make_regression.cuh instead
  */
-template <typename DataT, typename IdxT>
-void make_regression(const raft::handle_t& handle,
-                     DataT* out,
-                     DataT* values,
-                     IdxT n_rows,
-                     IdxT n_cols,
-                     IdxT n_informative,
-                     cudaStream_t stream,
-                     DataT* coef         = nullptr,
-                     IdxT n_targets      = (IdxT)1,
-                     DataT bias          = (DataT)0.0,
-                     IdxT effective_rank = (IdxT)-1,
-                     DataT tail_strength = (DataT)0.5,
-                     DataT noise         = (DataT)0.0,
-                     bool shuffle        = true,
-                     uint64_t seed       = 0ULL,
-                     GeneratorType type  = GenPhilox)
-{
-  detail::make_regression_caller(handle,
-                                 out,
-                                 values,
-                                 n_rows,
-                                 n_cols,
-                                 n_informative,
-                                 stream,
-                                 coef,
-                                 n_targets,
-                                 bias,
-                                 effective_rank,
-                                 tail_strength,
-                                 noise,
-                                 shuffle,
-                                 seed,
-                                 type);
-}
 
-}  // namespace raft::random
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "make_regression.cuh"
diff --git a/cpp/include/raft/random/multi_variable_gaussian.hpp b/cpp/include/raft/random/multi_variable_gaussian.hpp
index 6b85ec6a14..e7d78938a2 100644
--- a/cpp/include/raft/random/multi_variable_gaussian.hpp
+++ b/cpp/include/raft/random/multi_variable_gaussian.hpp
@@ -18,51 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MVG_H
-#define __MVG_H
+/**
+ * DISCLAIMER: this file is deprecated: use multi_variable_gaussian.cuh instead
+ */
 
 #pragma once
 
-#include "detail/multi_variable_gaussian.cuh"
-
-namespace raft::random {
-
-template <typename T>
-class multi_variable_gaussian : public detail::multi_variable_gaussian_impl<T> {
- public:
-  // using Decomposer = typename detail::multi_variable_gaussian_impl<T>::Decomposer;
-  // using detail::multi_variable_gaussian_impl<T>::Decomposer::chol_decomp;
-  // using detail::multi_variable_gaussian_impl<T>::Decomposer::jacobi;
-  // using detail::multi_variable_gaussian_impl<T>::Decomposer::qr;
-
-  multi_variable_gaussian() = delete;
-  multi_variable_gaussian(const raft::handle_t& handle,
-                          const int dim,
-                          typename detail::multi_variable_gaussian_impl<T>::Decomposer method)
-    : detail::multi_variable_gaussian_impl<T>{handle, dim, method}
-  {
-  }
-
-  std::size_t get_workspace_size()
-  {
-    return detail::multi_variable_gaussian_impl<T>::get_workspace_size();
-  }
-
-  void set_workspace(T* workarea)
-  {
-    detail::multi_variable_gaussian_impl<T>::set_workspace(workarea);
-  }
-
-  void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0)
-  {
-    detail::multi_variable_gaussian_impl<T>::give_gaussian(nPoints, P, X, x);
-  }
-
-  void deinit() { detail::multi_variable_gaussian_impl<T>::deinit(); }
-
-  ~multi_variable_gaussian() { deinit(); }
-};  // end of multi_variable_gaussian
-
-};  // end of namespace raft::random
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "multi_variable_gaussian.cuh"
diff --git a/cpp/include/raft/random/permute.hpp b/cpp/include/raft/random/permute.hpp
index 26e22e403b..a2fafa6574 100644
--- a/cpp/include/raft/random/permute.hpp
+++ b/cpp/include/raft/random/permute.hpp
@@ -18,50 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __PERMUTE_H
-#define __PERMUTE_H
-
-#pragma once
-
-#include "detail/permute.cuh"
-
-namespace raft::random {
-
 /**
- * @brief Generate permutations of the input array. Pretty useful primitive for
- * shuffling the input datasets in ML algos. See note at the end for some of its
- * limitations!
- * @tparam Type Data type of the array to be shuffled
- * @tparam IntType Integer type used for ther perms array
- * @tparam IdxType Integer type used for addressing indices
- * @tparam TPB threads per block
- * @param perms the output permutation indices. Typically useful only when
- * one wants to refer back. If you don't need this, pass a nullptr
- * @param out the output shuffled array. Pass nullptr if you don't want this to
- * be written. For eg: when you only want the perms array to be filled.
- * @param in input array (in-place is not supported due to race conditions!)
- * @param D number of columns of the input array
- * @param N length of the input array (or number of rows)
- * @param rowMajor whether the input/output matrices are row or col major
- * @param stream cuda stream where to launch the work
- *
- * @note This is NOT a uniform permutation generator! In fact, it only generates
- * very small percentage of permutations. If your application really requires a
- * high quality permutation generator, it is recommended that you pick
- * Knuth Shuffle.
+ * DISCLAIMER: this file is deprecated: use permute.cuh instead
  */
-template <typename Type, typename IntType = int, typename IdxType = int, int TPB = 256>
-void permute(IntType* perms,
-             Type* out,
-             const Type* in,
-             IntType D,
-             IntType N,
-             bool rowMajor,
-             cudaStream_t stream)
-{
-  detail::permute<Type, IntType, IdxType, TPB>(perms, out, in, D, N, rowMajor, stream);
-}
 
-};  // end namespace raft::random
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "permute.cuh"
diff --git a/cpp/include/raft/sparse/linalg/add.hpp b/cpp/include/raft/sparse/linalg/add.hpp
index 39ab2d6450..e6930eaee7 100644
--- a/cpp/include/raft/sparse/linalg/add.hpp
+++ b/cpp/include/raft/sparse/linalg/add.hpp
@@ -18,87 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SPARSE_ADD_H
-#define __SPARSE_ADD_H
-
-#pragma once
-
-#include <raft/sparse/linalg/detail/add.cuh>
-
-namespace raft {
-namespace sparse {
-namespace linalg {
-
 /**
- * @brief Calculate the CSR row_ind array that would result
- * from summing together two CSR matrices
- * @param a_ind: left hand row_ind array
- * @param a_indptr: left hand index_ptr array
- * @param a_val: left hand data array
- * @param nnz1: size of left hand index_ptr and val arrays
- * @param b_ind: right hand row_ind array
- * @param b_indptr: right hand index_ptr array
- * @param b_val: right hand data array
- * @param nnz2: size of right hand index_ptr and val arrays
- * @param m: size of output array (number of rows in final matrix)
- * @param out_ind: output row_ind array
- * @param stream: cuda stream to use
+ * DISCLAIMER: this file is deprecated: use add.cuh instead
  */
-template <typename T>
-size_t csr_add_calc_inds(const int* a_ind,
-                         const int* a_indptr,
-                         const T* a_val,
-                         int nnz1,
-                         const int* b_ind,
-                         const int* b_indptr,
-                         const T* b_val,
-                         int nnz2,
-                         int m,
-                         int* out_ind,
-                         cudaStream_t stream)
-{
-  return detail::csr_add_calc_inds(
-    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, out_ind, stream);
-}
 
-/**
- * @brief Calculate the CSR row_ind array that would result
- * from summing together two CSR matrices
- * @param a_ind: left hand row_ind array
- * @param a_indptr: left hand index_ptr array
- * @param a_val: left hand data array
- * @param nnz1: size of left hand index_ptr and val arrays
- * @param b_ind: right hand row_ind array
- * @param b_indptr: right hand index_ptr array
- * @param b_val: right hand data array
- * @param nnz2: size of right hand index_ptr and val arrays
- * @param m: size of output array (number of rows in final matrix)
- * @param c_ind: output row_ind array
- * @param c_indptr: output ind_ptr array
- * @param c_val: output data array
- * @param stream: cuda stream to use
- */
-template <typename T>
-void csr_add_finalize(const int* a_ind,
-                      const int* a_indptr,
-                      const T* a_val,
-                      int nnz1,
-                      const int* b_ind,
-                      const int* b_indptr,
-                      const T* b_val,
-                      int nnz2,
-                      int m,
-                      int* c_ind,
-                      int* c_indptr,
-                      T* c_val,
-                      cudaStream_t stream)
-{
-  detail::csr_add_finalize(
-    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val, stream);
-}
+#pragma once
 
-};  // end NAMESPACE linalg
-};  // end NAMESPACE sparse
-};  // end NAMESPACE raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "add.cuh"
diff --git a/cpp/include/raft/sparse/linalg/degree.hpp b/cpp/include/raft/sparse/linalg/degree.hpp
index 7cece7908e..240cfd452f 100644
--- a/cpp/include/raft/sparse/linalg/degree.hpp
+++ b/cpp/include/raft/sparse/linalg/degree.hpp
@@ -18,111 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SPARSE_DEGREE_H
-#define __SPARSE_DEGREE_H
-
-#pragma once
-
-#include <raft/sparse/coo.hpp>
-#include <raft/sparse/linalg/detail/degree.cuh>
-
-namespace raft {
-namespace sparse {
-namespace linalg {
-
-/**
- * @brief Count the number of values for each row
- * @tparam TPB_X: number of threads to use per block
- * @param rows: rows array of the COO matrix
- * @param nnz: size of the rows array
- * @param results: output result array
- * @param stream: cuda stream to use
- */
-template <typename T = int>
-void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream)
-{
-  detail::coo_degree<64, T>(rows, nnz, results, stream);
-}
-
-/**
- * @brief Count the number of values for each row
- * @tparam TPB_X: number of threads to use per block
- * @tparam T: type name of underlying values array
- * @param in: input COO object for counting rows
- * @param results: output array with row counts (size=in->n_rows)
- * @param stream: cuda stream to use
- */
-template <typename T>
-void coo_degree(COO<T>* in, int* results, cudaStream_t stream)
-{
-  coo_degree(in->rows(), in->nnz, results, stream);
-}
-
-/**
- * @brief Count the number of values for each row that doesn't match a particular scalar
- * @tparam TPB_X: number of threads to use per block
- * @tparam T: the type name of the underlying value arrays
- * @param rows: Input COO row array
- * @param vals: Input COO val arrays
- * @param nnz: size of input COO arrays
- * @param scalar: scalar to match for counting rows
- * @param results: output row counts
- * @param stream: cuda stream to use
- */
-template <typename T>
-void coo_degree_scalar(
-  const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0)
-{
-  detail::coo_degree_scalar<64>(rows, vals, nnz, scalar, results, stream);
-}
-
-/**
- * @brief Count the number of values for each row that doesn't match a particular scalar
- * @tparam TPB_X: number of threads to use per block
- * @tparam T: the type name of the underlying value arrays
- * @param in: Input COO array
- * @param scalar: scalar to match for counting rows
- * @param results: output row counts
- * @param stream: cuda stream to use
- */
-template <typename T>
-void coo_degree_scalar(COO<T>* in, T scalar, int* results, cudaStream_t stream)
-{
-  coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, results, stream);
-}
-
 /**
- * @brief Count the number of nonzeros for each row
- * @tparam TPB_X: number of threads to use per block
- * @tparam T: the type name of the underlying value arrays
- * @param rows: Input COO row array
- * @param vals: Input COO val arrays
- * @param nnz: size of input COO arrays
- * @param results: output row counts
- * @param stream: cuda stream to use
+ * DISCLAIMER: this file is deprecated: use degree.cuh instead
  */
-template <typename T>
-void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream)
-{
-  detail::coo_degree_nz<64>(rows, vals, nnz, results, stream);
-}
 
-/**
- * @brief Count the number of nonzero values for each row
- * @tparam TPB_X: number of threads to use per block
- * @tparam T: the type name of the underlying value arrays
- * @param in: Input COO array
- * @param results: output row counts
- * @param stream: cuda stream to use
- */
-template <typename T>
-void coo_degree_nz(COO<T>* in, int* results, cudaStream_t stream)
-{
-  coo_degree_nz(in->rows(), in->vals(), in->nnz, results, stream);
-}
+#pragma once
 
-};  // end NAMESPACE linalg
-};  // end NAMESPACE sparse
-};  // end NAMESPACE raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "degree.cuh"
diff --git a/cpp/include/raft/sparse/linalg/norm.hpp b/cpp/include/raft/sparse/linalg/norm.hpp
index 1f054e63ab..64261f1178 100644
--- a/cpp/include/raft/sparse/linalg/norm.hpp
+++ b/cpp/include/raft/sparse/linalg/norm.hpp
@@ -18,61 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SPARSE_NORM_H
-#define __SPARSE_NORM_H
-
-#pragma once
-
-#include <raft/sparse/linalg/detail/norm.cuh>
-
-namespace raft {
-namespace sparse {
-namespace linalg {
-
 /**
- * @brief Perform L1 normalization on the rows of a given CSR-formatted sparse matrix
- *
- * @param ia: row_ind array
- * @param vals: data array
- * @param nnz: size of data array
- * @param m: size of row_ind array
- * @param result: l1 normalized data array
- * @param stream: cuda stream to use
+ * DISCLAIMER: this file is deprecated: use norm.cuh instead
  */
-template <typename T>
-void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
-                          const T* vals,
-                          int nnz,  // array of values and number of non-zeros
-                          int m,    // num rows in csr
-                          T* result,
-                          cudaStream_t stream)
-{  // output array
-  detail::csr_row_normalize_l1(ia, vals, nnz, m, result, stream);
-}
 
-/**
- * @brief Perform L_inf normalization on a given CSR-formatted sparse matrix
- *
- * @param ia: row_ind array
- * @param vals: data array
- * @param nnz: size of data array
- * @param m: size of row_ind array
- * @param result: l1 normalized data array
- * @param stream: cuda stream to use
- */
-template <typename T>
-void csr_row_normalize_max(const int* ia,  // csr row ind array (sorted by row)
-                           const T* vals,
-                           int nnz,  // array of values and number of non-zeros
-                           int m,    // num total rows in csr
-                           T* result,
-                           cudaStream_t stream)
-{
-  detail::csr_row_normalize_max(ia, vals, nnz, m, result, stream);
-}
+#pragma once
 
-};  // end NAMESPACE linalg
-};  // end NAMESPACE sparse
-};  // end NAMESPACE raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "norm.cuh"
diff --git a/cpp/include/raft/sparse/linalg/spectral.hpp b/cpp/include/raft/sparse/linalg/spectral.hpp
index ff400f1f0f..d7009db03f 100644
--- a/cpp/include/raft/sparse/linalg/spectral.hpp
+++ b/cpp/include/raft/sparse/linalg/spectral.hpp
@@ -18,31 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SPARSE_SPECTRAL_H
-#define __SPARSE_SPECTRAL_H
-
-#include <raft/handle.hpp>
-#include <raft/sparse/linalg/detail/spectral.cuh>
+/**
+ * DISCLAIMER: this file is deprecated: use spectral.cuh instead
+ */
 
-namespace raft {
-namespace sparse {
-namespace spectral {
+#pragma once
 
-template <typename T>
-void fit_embedding(const raft::handle_t& handle,
-                   int* rows,
-                   int* cols,
-                   T* vals,
-                   int nnz,
-                   int n,
-                   int n_components,
-                   T* out,
-                   unsigned long long seed = 1234567)
-{
-  detail::fit_embedding(handle, rows, cols, vals, nnz, n, n_components, out, seed);
-}
-};  // namespace spectral
-};  // namespace sparse
-};  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
+#include "spectral.cuh"
diff --git a/cpp/include/raft/sparse/linalg/transpose.hpp b/cpp/include/raft/sparse/linalg/transpose.hpp
index c709c20473..a6a0539319 100644
--- a/cpp/include/raft/sparse/linalg/transpose.hpp
+++ b/cpp/include/raft/sparse/linalg/transpose.hpp
@@ -18,62 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __TRANSPOSE_H
-#define __TRANSPOSE_H
-
-#pragma once
-
-#include <raft/handle.hpp>
-#include <raft/sparse/linalg/detail/transpose.h>
-
-namespace raft {
-namespace sparse {
-namespace linalg {
-
 /**
- * Transpose a set of CSR arrays into a set of CSC arrays.
- * @tparam value_idx : data type of the CSR index arrays
- * @tparam value_t : data type of the CSR data array
- * @param[in] handle : used for invoking cusparse
- * @param[in] csr_indptr : CSR row index array
- * @param[in] csr_indices : CSR column indices array
- * @param[in] csr_data : CSR data array
- * @param[out] csc_indptr : CSC row index array
- * @param[out] csc_indices : CSC column indices array
- * @param[out] csc_data : CSC data array
- * @param[in] csr_nrows : Number of rows in CSR
- * @param[in] csr_ncols : Number of columns in CSR
- * @param[in] nnz : Number of nonzeros of CSR
- * @param[in] stream : Cuda stream for ordering events
+ * DISCLAIMER: this file is deprecated: use transpose.cuh instead
  */
-template <typename value_idx, typename value_t>
-void csr_transpose(const raft::handle_t& handle,
-                   const value_idx* csr_indptr,
-                   const value_idx* csr_indices,
-                   const value_t* csr_data,
-                   value_idx* csc_indptr,
-                   value_idx* csc_indices,
-                   value_t* csc_data,
-                   value_idx csr_nrows,
-                   value_idx csr_ncols,
-                   value_idx nnz,
-                   cudaStream_t stream)
-{
-  detail::csr_transpose(handle.get_cusparse_handle(),
-                        csr_indptr,
-                        csr_indices,
-                        csr_data,
-                        csc_indptr,
-                        csc_indices,
-                        csc_data,
-                        csr_nrows,
-                        csr_ncols,
-                        nnz,
-                        stream);
-}
 
-};  // end NAMESPACE linalg
-};  // end NAMESPACE sparse
-};  // end NAMESPACE raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "transpose.cuh"
diff --git a/cpp/include/raft/sparse/op/filter.hpp b/cpp/include/raft/sparse/op/filter.hpp
index 3821d963b0..6a59148fd7 100644
--- a/cpp/include/raft/sparse/op/filter.hpp
+++ b/cpp/include/raft/sparse/op/filter.hpp
@@ -18,82 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __FILTER_H
-#define __FILTER_H
-
-#pragma once
-
-#include <raft/handle.hpp>
-#include <raft/sparse/coo.hpp>
-#include <raft/sparse/op/detail/filter.cuh>
-
-namespace raft {
-namespace sparse {
-namespace op {
-
-/**
- * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix.
- *
- * @param rows: input array of rows (size n)
- * @param cols: input array of cols (size n)
- * @param vals: input array of vals (size n)
- * @param nnz: size of current rows/cols/vals arrays
- * @param crows: compressed array of rows
- * @param ccols: compressed array of cols
- * @param cvals: compressed array of vals
- * @param cnnz: array of non-zero counts per row
- * @param cur_cnnz array of counts per row
- * @param scalar: scalar to remove from arrays
- * @param n: number of rows in dense matrix
- * @param stream: cuda stream to use
- */
-template <typename T>
-void coo_remove_scalar(const int* rows,
-                       const int* cols,
-                       const T* vals,
-                       int nnz,
-                       int* crows,
-                       int* ccols,
-                       T* cvals,
-                       int* cnnz,
-                       int* cur_cnnz,
-                       T scalar,
-                       int n,
-                       cudaStream_t stream)
-{
-  detail::coo_remove_scalar<128, T>(
-    rows, cols, vals, nnz, crows, ccols, cvals, cnnz, cur_cnnz, scalar, n, stream);
-}
-
 /**
- * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix.
- *
- * @param in: input COO matrix
- * @param out: output COO matrix
- * @param scalar: scalar to remove from arrays
- * @param stream: cuda stream to use
+ * DISCLAIMER: this file is deprecated: use filter.cuh instead
  */
-template <typename T>
-void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
-{
-  detail::coo_remove_scalar<128, T>(in, out, scalar, stream);
-}
 
-/**
- * @brief Removes zeros from a COO formatted sparse matrix.
- *
- * @param in: input COO matrix
- * @param out: output COO matrix
- * @param stream: cuda stream to use
- */
-template <typename T>
-void coo_remove_zeros(COO<T>* in, COO<T>* out, cudaStream_t stream)
-{
-  coo_remove_scalar<T>(in, out, T(0.0), stream);
-}
+#pragma once
 
-};  // namespace op
-};  // end NAMESPACE sparse
-};  // end NAMESPACE raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "filter.cuh"
diff --git a/cpp/include/raft/sparse/op/reduce.hpp b/cpp/include/raft/sparse/op/reduce.hpp
index bb7560fa3d..37923e070c 100644
--- a/cpp/include/raft/sparse/op/reduce.hpp
+++ b/cpp/include/raft/sparse/op/reduce.hpp
@@ -18,75 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SPARSE_REDUCE_H
-#define __SPARSE_REDUCE_H
-
-#pragma once
-
-#include <raft/handle.hpp>
-#include <raft/sparse/coo.hpp>
-#include <raft/sparse/op/detail/reduce.cuh>
-
-namespace raft {
-namespace sparse {
-namespace op {
 /**
- * Computes a mask from a sorted COO matrix where 0's denote
- * duplicate values and 1's denote new values. This mask can
- * be useful for computing an exclusive scan to pre-build offsets
- * for reducing duplicates, such as when symmetrizing
- * or taking the min of each duplicated value.
- *
- * Note that this function always marks the first value as 0 so that
- * a cumulative sum can be performed as a follow-on. However, even
- * if the mask is used direclty, any duplicates should always have a
- * 1 when first encountered so it can be assumed that the first element
- * is always a 1 otherwise.
- *
- * @tparam value_idx
- * @param[out] mask output mask, size nnz
- * @param[in] rows COO rows array, size nnz
- * @param[in] cols COO cols array, size nnz
- * @param[in] nnz number of nonzeros in input arrays
- * @param[in] stream cuda ops will be ordered wrt this stream
+ * DISCLAIMER: this file is deprecated: use reduce.cuh instead
  */
-template <typename value_idx>
-void compute_duplicates_mask(
-  value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream)
-{
-  detail::compute_duplicates_mask(mask, rows, cols, nnz, stream);
-}
 
-/**
- * Performs a COO reduce of duplicate columns per row, taking the max weight
- * for duplicate columns in each row. This function assumes the input COO
- * has been sorted by both row and column but makes no assumption on
- * the sorting of values.
- * @tparam value_idx
- * @tparam value_t
- * @param[in] handle
- * @param[out] out output COO, the nnz will be computed allocate() will be called in this function.
- * @param[in] rows COO rows array, size nnz
- * @param[in] cols COO cols array, size nnz
- * @param[in] vals COO vals array, size nnz
- * @param[in] nnz number of nonzeros in COO input arrays
- * @param[in] m number of rows in COO input matrix
- * @param[in] n number of columns in COO input matrix
- */
-template <typename value_idx, typename value_t>
-void max_duplicates(const raft::handle_t& handle,
-                    raft::sparse::COO<value_t, value_idx>& out,
-                    const value_idx* rows,
-                    const value_idx* cols,
-                    const value_t* vals,
-                    size_t nnz,
-                    size_t m,
-                    size_t n)
-{
-  detail::max_duplicates(handle, out, rows, cols, vals, nnz, m, n);
-}
-};  // END namespace op
-};  // END namespace sparse
-};  // END namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "reduce.cuh"
diff --git a/cpp/include/raft/sparse/op/row_op.hpp b/cpp/include/raft/sparse/op/row_op.hpp
index ac12432e92..8443f9f090 100644
--- a/cpp/include/raft/sparse/op/row_op.hpp
+++ b/cpp/include/raft/sparse/op/row_op.hpp
@@ -18,37 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SPARSE_ROW_OP_H
-#define __SPARSE_ROW_OP_H
-
-#pragma once
-
-#include <raft/handle.hpp>
-#include <raft/sparse/op/detail/row_op.cuh>
-
-namespace raft {
-namespace sparse {
-namespace op {
-
 /**
- * @brief Perform a custom row operation on a CSR matrix in batches.
- * @tparam T numerical type of row_ind array
- * @tparam TPB_X number of threads per block to use for underlying kernel
- * @tparam Lambda type of custom operation function
- * @param row_ind the CSR row_ind array to perform parallel operations over
- * @param n_rows total number vertices in graph
- * @param nnz number of non-zeros
- * @param op custom row operation functor accepting the row and beginning index.
- * @param stream cuda stream to use
+ * DISCLAIMER: this file is deprecated: use row_op.cuh instead
  */
-template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream)
-{
-  detail::csr_row_op<Index_, 128, Lambda>(row_ind, n_rows, nnz, op, stream);
-}
 
-};  // namespace op
-};  // end NAMESPACE sparse
-};  // end NAMESPACE raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "row_op.cuh"
diff --git a/cpp/include/raft/sparse/op/slice.hpp b/cpp/include/raft/sparse/op/slice.hpp
index 75b7e478e5..4d7e1858de 100644
--- a/cpp/include/raft/sparse/op/slice.hpp
+++ b/cpp/include/raft/sparse/op/slice.hpp
@@ -18,69 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SLICE_H
-#define __SLICE_H
-
-#pragma once
-
-#include <raft/handle.hpp>
-#include <raft/sparse/op/detail/slice.cuh>
-
-namespace raft {
-namespace sparse {
-namespace op {
-
 /**
- * Slice consecutive rows from a CSR array and populate newly sliced indptr array
- * @tparam value_idx
- * @param[in] start_row : beginning row to slice
- * @param[in] stop_row : ending row to slice
- * @param[in] indptr : indptr of input CSR to slice
- * @param[out] indptr_out : output sliced indptr to populate
- * @param[in] start_offset : beginning column offset of input indptr
- * @param[in] stop_offset : ending column offset of input indptr
- * @param[in] stream : cuda stream for ordering events
+ * DISCLAIMER: this file is deprecated: use slice.cuh instead
  */
-template <typename value_idx>
-void csr_row_slice_indptr(value_idx start_row,
-                          value_idx stop_row,
-                          const value_idx* indptr,
-                          value_idx* indptr_out,
-                          value_idx* start_offset,
-                          value_idx* stop_offset,
-                          cudaStream_t stream)
-{
-  detail::csr_row_slice_indptr(
-    start_row, stop_row, indptr, indptr_out, start_offset, stop_offset, stream);
-}
 
-/**
- * Slice rows from a CSR, populate column and data arrays
- * @tparam value_idx : data type of CSR index arrays
- * @tparam value_t : data type of CSR data array
- * @param[in] start_offset : beginning column offset to slice
- * @param[in] stop_offset : ending column offset to slice
- * @param[in] indices : column indices array from input CSR
- * @param[in] data : data array from input CSR
- * @param[out] indices_out : output column indices array
- * @param[out] data_out : output data array
- * @param[in] stream : cuda stream for ordering events
- */
-template <typename value_idx, typename value_t>
-void csr_row_slice_populate(value_idx start_offset,
-                            value_idx stop_offset,
-                            const value_idx* indices,
-                            const value_t* data,
-                            value_idx* indices_out,
-                            value_t* data_out,
-                            cudaStream_t stream)
-{
-  detail::csr_row_slice_populate(
-    start_offset, stop_offset, indices, data, indices_out, data_out, stream);
-}
+#pragma once
 
-};  // namespace op
-};  // end NAMESPACE sparse
-};  // end NAMESPACE raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "slice.cuh"
diff --git a/cpp/include/raft/sparse/op/sort.hpp b/cpp/include/raft/sparse/op/sort.hpp
index cd363582fb..867bb1bf35 100644
--- a/cpp/include/raft/sparse/op/sort.hpp
+++ b/cpp/include/raft/sparse/op/sort.hpp
@@ -18,66 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SPARSE_SORT_H
-#define __SPARSE_SORT_H
-
-#pragma once
-
-#include <raft/handle.hpp>
-#include <raft/sparse/op/detail/sort.h>
-
-namespace raft {
-namespace sparse {
-namespace op {
-
 /**
- * @brief Sorts the arrays that comprise the coo matrix
- * by row and then by column.
- *
- * @param m number of rows in coo matrix
- * @param n number of cols in coo matrix
- * @param nnz number of non-zeros
- * @param rows rows array from coo matrix
- * @param cols cols array from coo matrix
- * @param vals vals array from coo matrix
- * @param stream: cuda stream to use
+ * DISCLAIMER: this file is deprecated: use sort.cuh instead
  */
-template <typename T>
-void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t stream)
-{
-  detail::coo_sort(m, n, nnz, rows, cols, vals, stream);
-}
 
-/**
- * @brief Sort the underlying COO arrays by row
- * @tparam T: the type name of the underlying value array
- * @param in: COO to sort by row
- * @param stream: the cuda stream to use
- */
-template <typename T>
-void coo_sort(COO<T>* const in, cudaStream_t stream)
-{
-  coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream);
-}
+#pragma once
 
-/**
- * Sorts a COO by its weight
- * @tparam value_idx
- * @tparam value_t
- * @param[inout] rows source edges
- * @param[inout] cols dest edges
- * @param[inout] data edge weights
- * @param[in] nnz number of edges in edge list
- * @param[in] stream cuda stream for which to order cuda operations
- */
-template <typename value_idx, typename value_t>
-void coo_sort_by_weight(
-  value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream)
-{
-  detail::coo_sort_by_weight(rows, cols, data, nnz, stream);
-}
-};  // namespace op
-};  // end NAMESPACE sparse
-};  // end NAMESPACE raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "sort.cuh"
diff --git a/cpp/include/raft/sparse/selection/connect_components.hpp b/cpp/include/raft/sparse/selection/connect_components.hpp
index 25d71367db..b6597babc8 100644
--- a/cpp/include/raft/sparse/selection/connect_components.hpp
+++ b/cpp/include/raft/sparse/selection/connect_components.hpp
@@ -18,70 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __CONNECT_COMPONENTS_H
-#define __CONNECT_COMPONENTS_H
-
-#include <raft/handle.hpp>
-#include <raft/sparse/coo.hpp>
-#include <raft/sparse/selection/detail/connect_components.cuh>
-
-namespace raft {
-namespace linkage {
-
-template <typename value_idx, typename value_t>
-using FixConnectivitiesRedOp = detail::FixConnectivitiesRedOp<value_idx, value_t>;
-
 /**
- * Gets the number of unique components from array of
- * colors or labels. This does not assume the components are
- * drawn from a monotonically increasing set.
- * @tparam value_idx
- * @param[in] colors array of components
- * @param[in] n_rows size of components array
- * @param[in] stream cuda stream for which to order cuda operations
- * @return total number of components
+ * DISCLAIMER: this file is deprecated: use connect_components.cuh instead
  */
-template <typename value_idx>
-value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream)
-{
-  return detail::get_n_components(colors, n_rows, stream);
-}
 
-/**
- * Connects the components of an otherwise unconnected knn graph
- * by computing a 1-nn to neighboring components of each data point
- * (e.g. component(nn) != component(self)) and reducing the results to
- * include the set of smallest destination components for each source
- * component. The result will not necessarily contain
- * n_components^2 - n_components number of elements because many components
- * will likely not be contained in the neighborhoods of 1-nns.
- * @tparam value_idx
- * @tparam value_t
- * @param[in] handle raft handle
- * @param[out] out output edge list containing nearest cross-component
- *             edges.
- * @param[in] X original (row-major) dense matrix for which knn graph should be constructed.
- * @param[in] orig_colors array containing component number for each row of X
- * @param[in] n_rows number of rows in X
- * @param[in] n_cols number of cols in X
- * @param[in] reduction_op
- * @param[in] metric
- */
-template <typename value_idx, typename value_t, typename red_op>
-void connect_components(
-  const raft::handle_t& handle,
-  raft::sparse::COO<value_t, value_idx>& out,
-  const value_t* X,
-  const value_idx* orig_colors,
-  size_t n_rows,
-  size_t n_cols,
-  red_op reduction_op,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded)
-{
-  detail::connect_components(handle, out, X, orig_colors, n_rows, n_cols, reduction_op, metric);
-}
+#pragma once
 
-};  // end namespace linkage
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "connect_components.cuh"
diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp
index bd6dd39fdf..6924e0b5a7 100644
--- a/cpp/include/raft/sparse/selection/knn.hpp
+++ b/cpp/include/raft/sparse/selection/knn.hpp
@@ -18,90 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SPARSE_KNN_H
-#define __SPARSE_KNN_H
-
-#pragma once
-
-#include <raft/distance/distance_type.hpp>
-#include <raft/handle.hpp>
-#include <raft/sparse/selection/detail/knn.cuh>
-
-namespace raft {
-namespace sparse {
-namespace selection {
-
 /**
- * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors
- * using some distance implementation
- * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
- * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz)
- * @param[in] idxData csr data array of the index matrix (size idxNNZ)
- * @param[in] idxNNZ number of non-zeros for sparse index matrix
- * @param[in] n_idx_rows number of data samples in index matrix
- * @param[in] n_idx_cols
- * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
- * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
- * @param[in] queryData csr data array of the query matrix (size queryNNZ)
- * @param[in] queryNNZ number of non-zeros for sparse query matrix
- * @param[in] n_query_rows number of data samples in query matrix
- * @param[in] n_query_cols number of features in query matrix
- * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
- * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
- * @param[in] k the number of neighbors to query
- * @param[in] handle CUDA handle.get_stream() to order operations with respect to
- * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
- * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
- * @param[in] metric distance metric/measure to use
- * @param[in] metricArg potential argument for metric (currently unused)
+ * DISCLAIMER: this file is deprecated: use knn.cuh instead
  */
-template <typename value_idx = int, typename value_t = float, int TPB_X = 32>
-void brute_force_knn(const value_idx* idxIndptr,
-                     const value_idx* idxIndices,
-                     const value_t* idxData,
-                     size_t idxNNZ,
-                     int n_idx_rows,
-                     int n_idx_cols,
-                     const value_idx* queryIndptr,
-                     const value_idx* queryIndices,
-                     const value_t* queryData,
-                     size_t queryNNZ,
-                     int n_query_rows,
-                     int n_query_cols,
-                     value_idx* output_indices,
-                     value_t* output_dists,
-                     int k,
-                     const raft::handle_t& handle,
-                     size_t batch_size_index             = 2 << 14,  // approx 1M
-                     size_t batch_size_query             = 2 << 14,
-                     raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
-                     float metricArg                     = 0)
-{
-  detail::sparse_knn_t<value_idx, value_t>(idxIndptr,
-                                           idxIndices,
-                                           idxData,
-                                           idxNNZ,
-                                           n_idx_rows,
-                                           n_idx_cols,
-                                           queryIndptr,
-                                           queryIndices,
-                                           queryData,
-                                           queryNNZ,
-                                           n_query_rows,
-                                           n_query_cols,
-                                           output_indices,
-                                           output_dists,
-                                           k,
-                                           handle,
-                                           batch_size_index,
-                                           batch_size_query,
-                                           metric,
-                                           metricArg)
-    .run();
-}
 
-};  // namespace selection
-};  // namespace sparse
-};  // namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "knn.cuh"
diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp
index be47a6a9ef..833bdb61d2 100644
--- a/cpp/include/raft/sparse/selection/knn_graph.hpp
+++ b/cpp/include/raft/sparse/selection/knn_graph.hpp
@@ -18,51 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __KNN_GRAPH_H
-#define __KNN_GRAPH_H
-
-#pragma once
-
-#include <raft/distance/distance_type.hpp>
-#include <raft/sparse/coo.hpp>
-#include <raft/sparse/selection/detail/knn_graph.cuh>
-
-#include <cstdint>
-
-namespace raft {
-namespace sparse {
-namespace selection {
-
 /**
- * Constructs a (symmetrized) knn graph edge list from
- * dense input vectors.
- *
- * Note: The resulting KNN graph is not guaranteed to be connected.
- *
- * @tparam value_idx
- * @tparam value_t
- * @param[in] handle raft handle
- * @param[in] X dense matrix of input data samples and observations
- * @param[in] m number of data samples (rows) in X
- * @param[in] n number of observations (columns) in X
- * @param[in] metric distance metric to use when constructing neighborhoods
- * @param[out] out output edge list
- * @param c
+ * DISCLAIMER: this file is deprecated: use knn_graph.cuh instead
  */
-template <typename value_idx = int, typename value_t = float>
-void knn_graph(const handle_t& handle,
-               const value_t* X,
-               std::size_t m,
-               std::size_t n,
-               raft::distance::DistanceType metric,
-               raft::sparse::COO<value_t, value_idx>& out,
-               int c = 15)
-{
-  detail::knn_graph(handle, X, m, n, metric, out, c);
-}
 
-};  // namespace selection
-};  // namespace sparse
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "knn_graph.cuh"
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index fca5d05465..123f6cf70f 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -22,7 +22,7 @@
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/cuda_utils.cuh>
-#include <raft/distance/distance.hpp>
+#include <raft/distance/distance.cuh>
 #include <raft/distance/distance_type.hpp>
 #include <raft/linalg/gemm.cuh>
 #include <raft/linalg/unary_op.cuh>
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index d4bce1fdf4..9ff19c2747 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -18,7 +18,7 @@
 
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/distance/distance.hpp>
+#include <raft/distance/distance.cuh>
 #include <raft/distance/distance_type.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
index 7674ac0d46..1f1a3d8f8e 100644
--- a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
+++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
@@ -18,51 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __EPSILON_NEIGH_H
-#define __EPSILON_NEIGH_H
+/**
+ * DISCLAIMER: this file is deprecated: use epsilon_neighborhood.cuh instead
+ */
 
 #pragma once
 
-#include <raft/spatial/knn/detail/epsilon_neighborhood.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-
-/**
- * @brief Computes epsilon neighborhood for the L2-Squared distance metric
- *
- * @tparam DataT   IO and math type
- * @tparam IdxT    Index type
- *
- * @param[out] adj    adjacency matrix [row-major] [on device] [dim = m x n]
- * @param[out] vd     vertex degree array [on device] [len = m + 1]
- *                    `vd + m` stores the total number of edges in the adjacency
- *                    matrix. Pass a nullptr if you don't need this info.
- * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
- * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
- * @param[in]  m      number of rows in x
- * @param[in]  n      number of rows in y
- * @param[in]  k      number of columns in x and k
- * @param[in]  eps    defines epsilon neighborhood radius (should be passed as
- *                    squared as we compute L2-squared distance in this method)
- * @param[in]  stream cuda stream
- */
-template <typename DataT, typename IdxT>
-void epsUnexpL2SqNeighborhood(bool* adj,
-                              IdxT* vd,
-                              const DataT* x,
-                              const DataT* y,
-                              IdxT m,
-                              IdxT n,
-                              IdxT k,
-                              DataT eps,
-                              cudaStream_t stream)
-{
-  detail::epsUnexpL2SqNeighborhood<DataT, IdxT>(adj, vd, x, y, m, n, k, eps, stream);
-}
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "epsilon_neighborhood.cuh"
diff --git a/cpp/include/raft/spatial/knn/specializations.hpp b/cpp/include/raft/spatial/knn/specializations.hpp
index 13721a975f..04afb73036 100644
--- a/cpp/include/raft/spatial/knn/specializations.hpp
+++ b/cpp/include/raft/spatial/knn/specializations.hpp
@@ -18,13 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __KNN_SPECIALIZATIONS_H
-#define __KNN_SPECIALIZATIONS_H
+/**
+ * DISCLAIMER: this file is deprecated: use specializations.cuh instead
+ */
 
 #pragma once
 
-#include <raft/spatial/knn/specializations/ball_cover.cuh>
-#include <raft/spatial/knn/specializations/fused_l2_knn.cuh>
-#include <raft/spatial/knn/specializations/knn.cuh>
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "specializations.cuh"
diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp
index e6b37f29ec..57553daedf 100644
--- a/cpp/include/raft/spectral/eigen_solvers.hpp
+++ b/cpp/include/raft/spectral/eigen_solvers.hpp
@@ -18,95 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __EIGEN_SOLVERS_H
-#define __EIGEN_SOLVERS_H
+/**
+ * DISCLAIMER: this file is deprecated: use eigen_solvers.cuh instead
+ */
 
 #pragma once
 
-#include <raft/linalg/lanczos.cuh>
-#include <raft/spectral/matrix_wrappers.hpp>
-
-namespace raft {
-namespace spectral {
-
-// aggregate of control params for Eigen Solver:
-//
-template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
-struct eigen_solver_config_t {
-  size_type_t n_eigVecs;
-  size_type_t maxIter;
-
-  size_type_t restartIter;
-  value_type_t tol;
-
-  bool reorthogonalize{false};
-  unsigned long long seed{
-    1234567};  // CAVEAT: this default value is now common to all instances of using seed in
-               // Lanczos; was not the case before: there were places where a default seed = 123456
-               // was used; this may trigger slightly different # solver iterations
-};
-
-template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
-struct lanczos_solver_t {
-  explicit lanczos_solver_t(
-    eigen_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
-    : config_(config)
-  {
-  }
-
-  index_type_t solve_smallest_eigenvectors(
-    handle_t const& handle,
-    matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
-    value_type_t* __restrict__ eigVals,
-    value_type_t* __restrict__ eigVecs) const
-  {
-    RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
-    RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
-    index_type_t iters{};
-    linalg::computeSmallestEigenvectors(handle,
-                                        A,
-                                        config_.n_eigVecs,
-                                        config_.maxIter,
-                                        config_.restartIter,
-                                        config_.tol,
-                                        config_.reorthogonalize,
-                                        iters,
-                                        eigVals,
-                                        eigVecs,
-                                        config_.seed);
-    return iters;
-  }
-
-  index_type_t solve_largest_eigenvectors(
-    handle_t const& handle,
-    matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
-    value_type_t* __restrict__ eigVals,
-    value_type_t* __restrict__ eigVecs) const
-  {
-    RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
-    RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
-    index_type_t iters{};
-    linalg::computeLargestEigenvectors(handle,
-                                       A,
-                                       config_.n_eigVecs,
-                                       config_.maxIter,
-                                       config_.restartIter,
-                                       config_.tol,
-                                       config_.reorthogonalize,
-                                       iters,
-                                       eigVals,
-                                       eigVecs,
-                                       config_.seed);
-    return iters;
-  }
-
-  auto const& get_config(void) const { return config_; }
-
- private:
-  eigen_solver_config_t<index_type_t, value_type_t, size_type_t> config_;
-};
-
-}  // namespace spectral
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
+#include "eigen_solvers.cuh"
diff --git a/cpp/include/raft/stats/accuracy.hpp b/cpp/include/raft/stats/accuracy.hpp
index 8cbb0f719e..a1b7321879 100644
--- a/cpp/include/raft/stats/accuracy.hpp
+++ b/cpp/include/raft/stats/accuracy.hpp
@@ -18,32 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __STATS_ACCURACY_H
-#define __STATS_ACCURACY_H
-
-#pragma once
-
-#include <raft/stats/detail/scores.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute accuracy of predictions. Useful for classification.
- * @tparam math_t: data type for predictions (e.g., int for classification)
- * @param[in] predictions: array of predictions (GPU pointer).
- * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer).
- * @param[in] n: number of elements in each of predictions, ref_predictions.
- * @param[in] stream: cuda stream.
- * @return: Accuracy score in [0, 1]; higher is better.
+ * DISCLAIMER: this file is deprecated: use accuracy.cuh instead
  */
-template <typename math_t>
-float accuracy(const math_t* predictions, const math_t* ref_predictions, int n, cudaStream_t stream)
-{
-  return detail::accuracy_score(predictions, ref_predictions, n, stream);
-}
 
-}  // namespace stats
-}  // namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "accuracy.cuh"
diff --git a/cpp/include/raft/stats/adjusted_rand_index.hpp b/cpp/include/raft/stats/adjusted_rand_index.hpp
index bc836eed86..3a990ac985 100644
--- a/cpp/include/raft/stats/adjusted_rand_index.hpp
+++ b/cpp/include/raft/stats/adjusted_rand_index.hpp
@@ -18,43 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __ADJUSTED_RAND_INDEX_H
-#define __ADJUSTED_RAND_INDEX_H
-
 /**
- * @file adjusted_rand_index.hpp
- * @brief The adjusted Rand index is the corrected-for-chance version of the Rand index.
- * Such a correction for chance establishes a baseline by using the expected similarity
- * of all pair-wise comparisons between clusterings specified by a random model.
+ * DISCLAIMER: this file is deprecated: use adjusted_rand_index.cuh instead
  */
 
 #pragma once
 
-#include <raft/stats/detail/adjusted_rand_index.cuh>
-
-namespace raft {
-namespace stats {
-
-/**
- * @brief Function to calculate Adjusted RandIndex as described
- *        <a href="https://en.wikipedia.org/wiki/Rand_index">here</a>
- * @tparam T data-type for input label arrays
- * @tparam MathT integral data-type used for computing n-choose-r
- * @param firstClusterArray: the array of classes
- * @param secondClusterArray: the array of classes
- * @param size: the size of the data points of type int
- * @param stream: the cudaStream object
- */
-template <typename T, typename MathT = int>
-double adjusted_rand_index(const T* firstClusterArray,
-                           const T* secondClusterArray,
-                           int size,
-                           cudaStream_t stream)
-{
-  return detail::compute_adjusted_rand_index(firstClusterArray, secondClusterArray, size, stream);
-}
-
-};  // end namespace stats
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
+#include "adjusted_rand_index.cuh"
diff --git a/cpp/include/raft/stats/contingency_matrix.hpp b/cpp/include/raft/stats/contingency_matrix.hpp
index 70800be1e6..141f678f94 100644
--- a/cpp/include/raft/stats/contingency_matrix.hpp
+++ b/cpp/include/raft/stats/contingency_matrix.hpp
@@ -18,93 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __CONTINGENCY_MATRIX_H
-#define __CONTINGENCY_MATRIX_H
-
-#pragma once
-
-#include <raft/stats/detail/contingencyMatrix.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief use this to allocate output matrix size
- * size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int)
- * @param groundTruth: device 1-d array for ground truth (num of rows)
- * @param nSamples: number of elements in input array
- * @param stream: cuda stream for execution
- * @param minLabel: [out] calculated min value in input array
- * @param maxLabel: [out] calculated max value in input array
+ * DISCLAIMER: this file is deprecated: use contingency_matrix.cuh instead
  */
-template <typename T>
-void getInputClassCardinality(
-  const T* groundTruth, const int nSamples, cudaStream_t stream, T& minLabel, T& maxLabel)
-{
-  detail::getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel);
-}
 
-/**
- * @brief Calculate workspace size for running contingency matrix calculations
- * @tparam T label type
- * @tparam OutT output matrix type
- * @param nSamples: number of elements in input array
- * @param groundTruth: device 1-d array for ground truth (num of rows)
- * @param stream: cuda stream for execution
- * @param minLabel: Optional, min value in input array
- * @param maxLabel: Optional, max value in input array
- */
-template <typename T, typename OutT = int>
-size_t getContingencyMatrixWorkspaceSize(int nSamples,
-                                         const T* groundTruth,
-                                         cudaStream_t stream,
-                                         T minLabel = std::numeric_limits<T>::max(),
-                                         T maxLabel = std::numeric_limits<T>::max())
-{
-  return detail::getContingencyMatrixWorkspaceSize(
-    nSamples, groundTruth, stream, minLabel, maxLabel);
-}
-
-/**
- * @brief contruct contingency matrix given input ground truth and prediction
- *        labels. Users should call function getInputClassCardinality to find
- *        and allocate memory for output. Similarly workspace requirements
- *        should be checked using function getContingencyMatrixWorkspaceSize
- * @tparam T label type
- * @tparam OutT output matrix type
- * @param groundTruth: device 1-d array for ground truth (num of rows)
- * @param predictedLabel: device 1-d array for prediction (num of columns)
- * @param nSamples: number of elements in input array
- * @param outMat: output buffer for contingecy matrix
- * @param stream: cuda stream for execution
- * @param workspace: Optional, workspace memory allocation
- * @param workspaceSize: Optional, size of workspace memory
- * @param minLabel: Optional, min value in input ground truth array
- * @param maxLabel: Optional, max value in input ground truth array
- */
-template <typename T, typename OutT = int>
-void contingencyMatrix(const T* groundTruth,
-                       const T* predictedLabel,
-                       int nSamples,
-                       OutT* outMat,
-                       cudaStream_t stream,
-                       void* workspace      = nullptr,
-                       size_t workspaceSize = 0,
-                       T minLabel           = std::numeric_limits<T>::max(),
-                       T maxLabel           = std::numeric_limits<T>::max())
-{
-  detail::contingencyMatrix<T, OutT>(groundTruth,
-                                     predictedLabel,
-                                     nSamples,
-                                     outMat,
-                                     stream,
-                                     workspace,
-                                     workspaceSize,
-                                     minLabel,
-                                     maxLabel);
-}
+#pragma once
 
-};  // namespace stats
-};  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "contingency_matrix.cuh"
diff --git a/cpp/include/raft/stats/cov.hpp b/cpp/include/raft/stats/cov.hpp
index a584dedc95..a6c653206a 100644
--- a/cpp/include/raft/stats/cov.hpp
+++ b/cpp/include/raft/stats/cov.hpp
@@ -18,50 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __COV_H
-#define __COV_H
+/**
+ * DISCLAIMER: this file is deprecated: use cov.cuh instead
+ */
 
 #pragma once
 
-#include <raft/stats/detail/cov.cuh>
-namespace raft {
-namespace stats {
-/**
- * @brief Compute covariance of the input matrix
- *
- * Mean operation is assumed to be performed on a given column.
- *
- * @tparam Type the data type
- * @param covar the output covariance matrix
- * @param data the input matrix (this will get mean-centered at the end!)
- * @param mu mean vector of the input matrix
- * @param D number of columns of data
- * @param N number of rows of data
- * @param sample whether to evaluate sample covariance or not. In other words,
- * whether to normalize the output using N-1 or N, for true or false,
- * respectively
- * @param rowMajor whether the input data is row or col major
- * @param stable whether to run the slower-but-numerically-stable version or not
- * @param handle cublas handle
- * @param stream cuda stream
- * @note if stable=true, then the input data will be mean centered after this
- * function returns!
- */
-template <typename Type>
-void cov(const raft::handle_t& handle,
-         Type* covar,
-         Type* data,
-         const Type* mu,
-         std::size_t D,
-         std::size_t N,
-         bool sample,
-         bool rowMajor,
-         bool stable,
-         cudaStream_t stream)
-{
-  detail::cov(handle, covar, data, mu, D, N, sample, rowMajor, stable, stream);
-}
-};  // end namespace stats
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "cov.cuh"
diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh
index 6d6f901fab..9c17d2ed0f 100644
--- a/cpp/include/raft/stats/detail/weighted_mean.cuh
+++ b/cpp/include/raft/stats/detail/weighted_mean.cuh
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/reduce.hpp>
-#include <raft/stats/sum.hpp>
+#include <raft/linalg/reduce.cuh>
+#include <raft/stats/sum.cuh>
 
 namespace raft {
 namespace stats {
diff --git a/cpp/include/raft/stats/dispersion.hpp b/cpp/include/raft/stats/dispersion.hpp
index 7fabf07992..820c9e27ea 100644
--- a/cpp/include/raft/stats/dispersion.hpp
+++ b/cpp/include/raft/stats/dispersion.hpp
@@ -18,48 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __DISPERSION_H
-#define __DISPERSION_H
-
-#pragma once
-
-#include <raft/stats/detail/dispersion.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute cluster dispersion metric. This is very useful for
- * automatically finding the 'k' (in kmeans) that improves this metric.
- * @tparam DataT data type
- * @tparam IdxT index type
- * @tparam TPB threads block for kernels launched
- * @param centroids the cluster centroids. This is assumed to be row-major
- *   and of dimension (nClusters x dim)
- * @param clusterSizes number of points in the dataset which belong to each
- *   cluster. This is of length nClusters
- * @param globalCentroid compute the global weighted centroid of all cluster
- *   centroids. This is of length dim. Pass a nullptr if this is not needed
- * @param nClusters number of clusters
- * @param nPoints number of points in the dataset
- * @param dim dataset dimensionality
- * @param stream cuda stream
- * @return the cluster dispersion value
+ * DISCLAIMER: this file is deprecated: use dispersion.cuh instead
  */
-template <typename DataT, typename IdxT = int, int TPB = 256>
-DataT dispersion(const DataT* centroids,
-                 const IdxT* clusterSizes,
-                 DataT* globalCentroid,
-                 IdxT nClusters,
-                 IdxT nPoints,
-                 IdxT dim,
-                 cudaStream_t stream)
-{
-  return detail::dispersion(
-    centroids, clusterSizes, globalCentroid, nClusters, nPoints, dim, stream);
-}
 
-}  // end namespace stats
-}  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "dispersion.cuh"
diff --git a/cpp/include/raft/stats/entropy.hpp b/cpp/include/raft/stats/entropy.hpp
index 37dc2b700c..d8e1c11125 100644
--- a/cpp/include/raft/stats/entropy.hpp
+++ b/cpp/include/raft/stats/entropy.hpp
@@ -18,37 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __ENTROPY_H
-#define __ENTROPY_H
-
-#pragma once
-#include <raft/stats/detail/entropy.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate entropy
- * <a href="https://en.wikipedia.org/wiki/Entropy_(information_theory)">more info on entropy</a>
- *
- * @param clusterArray: the array of classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- * @return the entropy score
+ * DISCLAIMER: this file is deprecated: use entropy.cuh instead
  */
-template <typename T>
-double entropy(const T* clusterArray,
-               const int size,
-               const T lowerLabelRange,
-               const T upperLabelRange,
-               cudaStream_t stream)
-{
-  return detail::entropy(clusterArray, size, lowerLabelRange, upperLabelRange, stream);
-}
 
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "entropy.cuh"
diff --git a/cpp/include/raft/stats/histogram.hpp b/cpp/include/raft/stats/histogram.hpp
index 627026c219..c123375597 100644
--- a/cpp/include/raft/stats/histogram.hpp
+++ b/cpp/include/raft/stats/histogram.hpp
@@ -18,54 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __HISTOGRAM_H
-#define __HISTOGRAM_H
-
-#pragma once
-
-#include <raft/stats/common.hpp>
-#include <raft/stats/detail/histogram.cuh>
-
-// This file is a shameless amalgamation of independent works done by
-// Lars Nyland and Andy Adinets
-
-///@todo: add cub's histogram as another option
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Perform histogram on the input data. It chooses the right load size
- * based on the input data vector length. It also supports large-bin cases
- * using a specialized smem-based hashing technique.
- * @tparam DataT input data type
- * @tparam IdxT data type used to compute indices
- * @tparam BinnerOp takes the input data and computes its bin index
- * @param type histogram implementation type to choose
- * @param bins the output bins (length = ncols * nbins)
- * @param nbins number of bins
- * @param data input data (length = ncols * nrows)
- * @param nrows data array length in each column (or batch)
- * @param ncols number of columsn (or batch size)
- * @param stream cuda stream
- * @param binner the operation that computes the bin index of the input data
- *
- * @note signature of BinnerOp is `int func(DataT, IdxT);`
+ * DISCLAIMER: this file is deprecated: use histogram.cuh instead
  */
-template <typename DataT, typename IdxT = int, typename BinnerOp = IdentityBinner<DataT, IdxT>>
-void histogram(HistType type,
-               int* bins,
-               IdxT nbins,
-               const DataT* data,
-               IdxT nrows,
-               IdxT ncols,
-               cudaStream_t stream,
-               BinnerOp binner = IdentityBinner<DataT, IdxT>())
-{
-  detail::histogram<DataT, IdxT, BinnerOp>(type, bins, nbins, data, nrows, ncols, stream, binner);
-}
 
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "histogram.cuh"
diff --git a/cpp/include/raft/stats/homogeneity_score.hpp b/cpp/include/raft/stats/homogeneity_score.hpp
index 4e119f2bc7..8d2433d1da 100644
--- a/cpp/include/raft/stats/homogeneity_score.hpp
+++ b/cpp/include/raft/stats/homogeneity_score.hpp
@@ -13,46 +13,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 /**
  * This file is deprecated and will be removed in release 22.06.
  * Please use the cuh version instead.
  */
 
-#ifndef __HOMOGENEITY_SCORE_H
-#define __HOMOGENEITY_SCORE_H
-
-#pragma once
-
-#include <raft/stats/detail/homogeneity_score.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate the homogeneity score between two clusters
- * <a href="https://en.wikipedia.org/wiki/Homogeneity_(statistics)">more info on mutual
- * information</a>
- * @param truthClusterArray: the array of truth classes of type T
- * @param predClusterArray: the array of predicted classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
+ * DISCLAIMER: this file is deprecated: use homogeneity_score.cuh instead
  */
-template <typename T>
-double homogeneity_score(const T* truthClusterArray,
-                         const T* predClusterArray,
-                         int size,
-                         T lowerLabelRange,
-                         T upperLabelRange,
-                         cudaStream_t stream)
-{
-  return detail::homogeneity_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-}
 
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "homogeneity_score.cuh"
diff --git a/cpp/include/raft/stats/information_criterion.hpp b/cpp/include/raft/stats/information_criterion.hpp
index 3a39e56c41..898ffbfa8e 100644
--- a/cpp/include/raft/stats/information_criterion.hpp
+++ b/cpp/include/raft/stats/information_criterion.hpp
@@ -18,56 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __INFORMATION_CRIT_H
-#define __INFORMATION_CRIT_H
-
 /**
- * @file information_criterion.hpp
- * @brief These information criteria are used to evaluate the quality of models
- *        by balancing the quality of the fit and the number of parameters.
- *
- * See:
- *  - AIC: https://en.wikipedia.org/wiki/Akaike_information_criterion
- *  - AICc: https://en.wikipedia.org/wiki/Akaike_information_criterion#AICc
- *  - BIC: https://en.wikipedia.org/wiki/Bayesian_information_criterion
+ * DISCLAIMER: this file is deprecated: use information_criterion.cuh instead
  */
-#pragma once
-
-#include <raft/stats/common.hpp>
-#include <raft/stats/detail/batched/information_criterion.cuh>
-
-namespace raft {
-namespace stats {
 
-/**
- * Compute the given type of information criterion
- *
- * @note: it is safe to do the computation in-place (i.e give same pointer
- *        as input and output)
- *
- * @param[out] d_ic             Information criterion to be returned for each
- *                              series (device)
- * @param[in]  d_loglikelihood  Log-likelihood for each series (device)
- * @param[in]  ic_type          Type of criterion to compute. See IC_Type
- * @param[in]  n_params         Number of parameters in the model
- * @param[in]  batch_size       Number of series in the batch
- * @param[in]  n_samples        Number of samples in each series
- * @param[in]  stream           CUDA stream
- */
-template <typename ScalarT, typename IdxT>
-void information_criterion_batched(ScalarT* d_ic,
-                                   const ScalarT* d_loglikelihood,
-                                   IC_Type ic_type,
-                                   IdxT n_params,
-                                   IdxT batch_size,
-                                   IdxT n_samples,
-                                   cudaStream_t stream)
-{
-  batched::detail::information_criterion(
-    d_ic, d_loglikelihood, ic_type, n_params, batch_size, n_samples, stream);
-}
+#pragma once
 
-}  // namespace stats
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "information_criterion.cuh"
diff --git a/cpp/include/raft/stats/kl_divergence.hpp b/cpp/include/raft/stats/kl_divergence.hpp
index 59db77246f..086d5f1d23 100644
--- a/cpp/include/raft/stats/kl_divergence.hpp
+++ b/cpp/include/raft/stats/kl_divergence.hpp
@@ -18,34 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __KL_DIVERGENCE_H
-#define __KL_DIVERGENCE_H
-
-#pragma once
-
-#include <raft/stats/detail/kl_divergence.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate KL Divergence
- * <a href="https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence">more info on KL
- * Divergence</a>
- *
- * @tparam DataT: Data type of the input array
- * @param modelPDF: the model array of probability density functions of type DataT
- * @param candidatePDF: the candidate array of probability density functions of type DataT
- * @param size: the size of the data points of type int
- * @param stream: the cudaStream object
+ * DISCLAIMER: this file is deprecated: use kl_divergence.cuh instead
  */
-template <typename DataT>
-DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size, cudaStream_t stream)
-{
-  return detail::kl_divergence(modelPDF, candidatePDF, size, stream);
-}
 
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "kl_divergence.cuh"
diff --git a/cpp/include/raft/stats/mean.hpp b/cpp/include/raft/stats/mean.hpp
index 2767b632e6..bce899d9d4 100644
--- a/cpp/include/raft/stats/mean.hpp
+++ b/cpp/include/raft/stats/mean.hpp
@@ -18,43 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MEAN_H
-#define __MEAN_H
-
-#pragma once
-
-#include "detail/mean.cuh"
-
-#include <raft/handle.hpp>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute mean of the input matrix
- *
- * Mean operation is assumed to be performed on a given column.
- *
- * @tparam Type: the data type
- * @tparam IdxType Integer type used to for addressing
- * @param mu: the output mean vector
- * @param data: the input matrix
- * @param D: number of columns of data
- * @param N: number of rows of data
- * @param sample: whether to evaluate sample mean or not. In other words,
- * whether
- *  to normalize the output using N-1 or N, for true or false, respectively
- * @param rowMajor: whether the input data is row or col major
- * @param stream: cuda stream
+ * DISCLAIMER: this file is deprecated: use mean.cuh instead
  */
-template <typename Type, typename IdxType = int>
-void mean(
-  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
-{
-  detail::mean(mu, data, D, N, sample, rowMajor, stream);
-}
 
-};  // namespace stats
-};  // namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "mean.cuh"
diff --git a/cpp/include/raft/stats/mean_center.hpp b/cpp/include/raft/stats/mean_center.hpp
index e219891cab..73e49e7307 100644
--- a/cpp/include/raft/stats/mean_center.hpp
+++ b/cpp/include/raft/stats/mean_center.hpp
@@ -18,71 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MEAN_CENTER_H
-#define __MEAN_CENTER_H
-
-#pragma once
-
-#include "detail/mean_center.cuh"
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Center the input matrix wrt its mean
- * @tparam Type the data type
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads per block of the cuda kernel launched
- * @param out the output mean-centered matrix
- * @param data input matrix
- * @param mu the mean vector
- * @param D number of columns of data
- * @param N number of rows of data
- * @param rowMajor whether input is row or col major
- * @param bcastAlongRows whether to broadcast vector along rows or columns
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use mean_center.cuh instead
  */
-template <typename Type, typename IdxType = int, int TPB = 256>
-void meanCenter(Type* out,
-                const Type* data,
-                const Type* mu,
-                IdxType D,
-                IdxType N,
-                bool rowMajor,
-                bool bcastAlongRows,
-                cudaStream_t stream)
-{
-  detail::meanCenter<Type, IdxType, TPB>(out, data, mu, D, N, rowMajor, bcastAlongRows, stream);
-}
 
-/**
- * @brief Add the input matrix wrt its mean
- * @tparam Type the data type
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads per block of the cuda kernel launched
- * @param out the output mean-added matrix
- * @param data input matrix
- * @param mu the mean vector
- * @param D number of columns of data
- * @param N number of rows of data
- * @param rowMajor whether input is row or col major
- * @param bcastAlongRows whether to broadcast vector along rows or columns
- * @param stream cuda stream where to launch work
- */
-template <typename Type, typename IdxType = int, int TPB = 256>
-void meanAdd(Type* out,
-             const Type* data,
-             const Type* mu,
-             IdxType D,
-             IdxType N,
-             bool rowMajor,
-             bool bcastAlongRows,
-             cudaStream_t stream)
-{
-  detail::meanAdd<Type, IdxType, TPB>(out, data, mu, D, N, rowMajor, bcastAlongRows, stream);
-}
+#pragma once
 
-};  // end namespace stats
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "mean_center.cuh"
diff --git a/cpp/include/raft/stats/meanvar.hpp b/cpp/include/raft/stats/meanvar.hpp
index d7ef935fbc..db67a68579 100644
--- a/cpp/include/raft/stats/meanvar.hpp
+++ b/cpp/include/raft/stats/meanvar.hpp
@@ -18,48 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MEANVAR_H
-#define __MEANVAR_H
-
-#pragma once
-
-#include "detail/meanvar.cuh"
-
-namespace raft::stats {
-
 /**
- * @brief Compute mean and variance for each column of a given matrix.
- *
- * The operation is performed in a single sweep. Consider using it when you need to compute
- * both mean and variance, or when you need to compute variance but don't have the mean.
- * It's almost twice faster than running `mean` and `vars` sequentially, because all three
- * kernels are memory-bound.
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used for addressing
- * @param [out] mean the output mean vector of size D
- * @param [out] var the output variance vector of size D
- * @param [in] data the input matrix of size [N, D]
- * @param [in] D number of columns of data
- * @param [in] N number of rows of data
- * @param [in] sample whether to evaluate sample variance or not. In other words, whether to
- * normalize the variance using N-1 or N, for true or false respectively.
- * @param [in] rowMajor whether the input data is row- or col-major, for true or false respectively.
- * @param [in] stream
+ * DISCLAIMER: this file is deprecated: use meanvar.cuh instead
  */
-template <typename Type, typename IdxType = int>
-void meanvar(Type* mean,
-             Type* var,
-             const Type* data,
-             IdxType D,
-             IdxType N,
-             bool sample,
-             bool rowMajor,
-             cudaStream_t stream)
-{
-  detail::meanvar(mean, var, data, D, N, sample, rowMajor, stream);
-}
 
-};  // namespace raft::stats
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "meanvar.cuh"
diff --git a/cpp/include/raft/stats/minmax.hpp b/cpp/include/raft/stats/minmax.hpp
index 97f06129fa..ad588a38d4 100644
--- a/cpp/include/raft/stats/minmax.hpp
+++ b/cpp/include/raft/stats/minmax.hpp
@@ -18,62 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MINMAX_H
-#define __MINMAX_H
-
-#pragma once
-
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/stats/detail/minmax.cuh>
-
-#include <limits>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Computes min/max across every column of the input matrix, as well as
- * optionally allow to subsample based on the given row/col ID mapping vectors
- *
- * @tparam T the data type
- * @tparam TPB number of threads per block
- * @param data input data
- * @param rowids actual row ID mappings. It is of length nrows. If you want to
- * skip this index lookup entirely, pass nullptr
- * @param colids actual col ID mappings. It is of length ncols. If you want to
- * skip this index lookup entirely, pass nullptr
- * @param nrows number of rows of data to be worked upon. The actual rows of the
- * input "data" can be bigger than this!
- * @param ncols number of cols of data to be worked upon. The actual cols of the
- * input "data" can be bigger than this!
- * @param row_stride stride (in number of elements) between 2 adjacent columns
- * @param globalmin final col-wise global minimum (size = ncols)
- * @param globalmax final col-wise global maximum (size = ncols)
- * @param sampledcols output sampled data. Pass nullptr if you don't need this
- * @param stream cuda stream
- * @note This method makes the following assumptions:
- * 1. input and output matrices are assumed to be col-major
- * 2. ncols is small enough to fit the whole of min/max values across all cols
- *    in shared memory
+ * DISCLAIMER: this file is deprecated: use minmax.cuh instead
  */
-template <typename T, int TPB = 512>
-void minmax(const T* data,
-            const unsigned* rowids,
-            const unsigned* colids,
-            int nrows,
-            int ncols,
-            int row_stride,
-            T* globalmin,
-            T* globalmax,
-            T* sampledcols,
-            cudaStream_t stream)
-{
-  detail::minmax<T, TPB>(
-    data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, sampledcols, stream);
-}
 
-};  // namespace stats
-};  // namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "minmax.cuh"
diff --git a/cpp/include/raft/stats/mutual_info_score.hpp b/cpp/include/raft/stats/mutual_info_score.hpp
index a080211c36..c3446e3963 100644
--- a/cpp/include/raft/stats/mutual_info_score.hpp
+++ b/cpp/include/raft/stats/mutual_info_score.hpp
@@ -18,39 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __MUTUAL_INFO_SCORE_H
-#define __MUTUAL_INFO_SCORE_H
-
-#pragma once
-
-#include <raft/stats/detail/mutual_info_score.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate the mutual information between two clusters
- * <a href="https://en.wikipedia.org/wiki/Mutual_information">more info on mutual information</a>
- * @param firstClusterArray: the array of classes of type T
- * @param secondClusterArray: the array of classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
+ * DISCLAIMER: this file is deprecated: use mutual_info_score.cuh instead
  */
-template <typename T>
-double mutual_info_score(const T* firstClusterArray,
-                         const T* secondClusterArray,
-                         int size,
-                         T lowerLabelRange,
-                         T upperLabelRange,
-                         cudaStream_t stream)
-{
-  return detail::mutual_info_score(
-    firstClusterArray, secondClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-}
 
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "mutual_info_score.cuh"
diff --git a/cpp/include/raft/stats/r2_score.hpp b/cpp/include/raft/stats/r2_score.hpp
index c88a1822ec..bc55a6596d 100644
--- a/cpp/include/raft/stats/r2_score.hpp
+++ b/cpp/include/raft/stats/r2_score.hpp
@@ -18,38 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __R2_SCORE_H
-#define __R2_SCORE_H
-
-#pragma once
-
-#include <raft/stats/detail/scores.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * Calculates the "Coefficient of Determination" (R-Squared) score
- * normalizing the sum of squared errors by the total sum of squares.
- *
- * This score indicates the proportionate amount of variation in an
- * expected response variable is explained by the independent variables
- * in a linear regression model. The larger the R-squared value, the
- * more variability is explained by the linear regression model.
- *
- * @param y: Array of ground-truth response variables
- * @param y_hat: Array of predicted response variables
- * @param n: Number of elements in y and y_hat
- * @param stream: cuda stream
- * @return: The R-squared value.
+ * DISCLAIMER: this file is deprecated: use r2_score.cuh instead
  */
-template <typename math_t>
-math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
-{
-  return detail::r2_score(y, y_hat, n, stream);
-}
 
-}  // namespace stats
-}  // namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "r2_score.cuh"
diff --git a/cpp/include/raft/stats/rand_index.hpp b/cpp/include/raft/stats/rand_index.hpp
index e8c3089371..7d398dddb4 100644
--- a/cpp/include/raft/stats/rand_index.hpp
+++ b/cpp/include/raft/stats/rand_index.hpp
@@ -18,31 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __RAND_INDEX_H
-#define __RAND_INDEX_H
-
-#pragma once
-
-#include <raft/stats/detail/rand_index.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate RandIndex
- * <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
- * @param firstClusterArray: the array of classes of type T
- * @param secondClusterArray: the array of classes of type T
- * @param size: the size of the data points of type uint64_t
- * @param stream: the cudaStream object
+ * DISCLAIMER: this file is deprecated: use rand_index.cuh instead
  */
-template <typename T>
-double rand_index(T* firstClusterArray, T* secondClusterArray, uint64_t size, cudaStream_t stream)
-{
-  return detail::compute_rand_index(firstClusterArray, secondClusterArray, size, stream);
-}
 
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "rand_index.cuh"
diff --git a/cpp/include/raft/stats/regression_metrics.hpp b/cpp/include/raft/stats/regression_metrics.hpp
index f65ad524ef..084f4f8fbc 100644
--- a/cpp/include/raft/stats/regression_metrics.hpp
+++ b/cpp/include/raft/stats/regression_metrics.hpp
@@ -18,43 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __REGRESSION_METRICS_H
-#define __REGRESSION_METRICS_H
+/**
+ * DISCLAIMER: this file is deprecated: use regression_metrics.cuh instead
+ */
 
 #pragma once
 
-#include <raft/stats/detail/scores.cuh>
-
-namespace raft {
-namespace stats {
-
-/**
- * @brief Compute regression metrics mean absolute error, mean squared error, median absolute error
- * @tparam T: data type for predictions (e.g., float or double for regression).
- * @param[in] predictions: array of predictions (GPU pointer).
- * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer).
- * @param[in] n: number of elements in each of predictions, ref_predictions. Should be > 0.
- * @param[in] stream: cuda stream.
- * @param[out] mean_abs_error: Mean Absolute Error. Sum over n of (|predictions[i] -
- * ref_predictions[i]|) / n.
- * @param[out] mean_squared_error: Mean Squared Error. Sum over n of ((predictions[i] -
- * ref_predictions[i])^2) / n.
- * @param[out] median_abs_error: Median Absolute Error. Median of |predictions[i] -
- * ref_predictions[i]| for i in [0, n).
- */
-template <typename T>
-void regression_metrics(const T* predictions,
-                        const T* ref_predictions,
-                        int n,
-                        cudaStream_t stream,
-                        double& mean_abs_error,
-                        double& mean_squared_error,
-                        double& median_abs_error)
-{
-  detail::regression_metrics(
-    predictions, ref_predictions, n, stream, mean_abs_error, mean_squared_error, median_abs_error);
-}
-}  // namespace stats
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "regression_metrics.cuh"
diff --git a/cpp/include/raft/stats/silhouette_score.hpp b/cpp/include/raft/stats/silhouette_score.hpp
index e6c84855c6..54981edbb6 100644
--- a/cpp/include/raft/stats/silhouette_score.hpp
+++ b/cpp/include/raft/stats/silhouette_score.hpp
@@ -18,67 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SILHOUETTE_SCORE_H
-#define __SILHOUETTE_SCORE_H
-
-#pragma once
-
-#include <raft/stats/detail/batched/silhouette_score.cuh>
-#include <raft/stats/detail/silhouette_score.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief main function that returns the average silhouette score for a given set of data and its
- * clusterings
- * @tparam DataT: type of the data samples
- * @tparam LabelT: type of the labels
- * @param handle: raft handle for managing expensive resources
- * @param X_in: pointer to the input Data samples array (nRows x nCols)
- * @param nRows: number of data samples
- * @param nCols: number of features
- * @param labels: the pointer to the array containing labels for every data sample (1 x nRows)
- * @param nLabels: number of Labels
- * @param silhouette_scorePerSample: pointer to the array that is optionally taken in as input and
- * is populated with the silhouette score for every sample (1 x nRows)
- * @param stream: the cuda stream where to launch this kernel
- * @param metric: the numerical value that maps to the type of distance metric to be used in the
- * calculations
+ * DISCLAIMER: this file is deprecated: use silhouette_score.cuh instead
  */
-template <typename DataT, typename LabelT>
-DataT silhouette_score(
-  const raft::handle_t& handle,
-  DataT* X_in,
-  int nRows,
-  int nCols,
-  LabelT* labels,
-  int nLabels,
-  DataT* silhouette_scorePerSample,
-  cudaStream_t stream,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)
-{
-  return detail::silhouette_score(
-    handle, X_in, nRows, nCols, labels, nLabels, silhouette_scorePerSample, stream, metric);
-}
 
-template <typename value_t, typename value_idx, typename label_idx>
-value_t silhouette_score_batched(
-  const raft::handle_t& handle,
-  value_t* X,
-  value_idx n_rows,
-  value_idx n_cols,
-  label_idx* y,
-  label_idx n_labels,
-  value_t* scores,
-  value_idx chunk,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)
-{
-  return batched::detail::silhouette_score(
-    handle, X, n_rows, n_cols, y, n_labels, scores, chunk, metric);
-}
+#pragma once
 
-};  // namespace stats
-};  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "silhouette_score.cuh"
diff --git a/cpp/include/raft/stats/specializations.hpp b/cpp/include/raft/stats/specializations.hpp
index 3929b3124c..0ae82f27e7 100644
--- a/cpp/include/raft/stats/specializations.hpp
+++ b/cpp/include/raft/stats/specializations.hpp
@@ -18,12 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __STATS_SPECIALIZATIONS_H
-#define __STATS_SPECIALIZATIONS_H
+/**
+ * DISCLAIMER: this file is deprecated: use specializations.cuh instead
+ */
 
 #pragma once
 
-#include <raft/distance/specializations.cuh>
-#include <raft/spatial/knn/specializations.cuh>
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "specializations.cuh"
diff --git a/cpp/include/raft/stats/stddev.hpp b/cpp/include/raft/stats/stddev.hpp
index f496b1fd30..2222a2706d 100644
--- a/cpp/include/raft/stats/stddev.hpp
+++ b/cpp/include/raft/stats/stddev.hpp
@@ -18,81 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __STDDEV_H
-#define __STDDEV_H
-
-#pragma once
-
-#include "detail/stddev.cuh"
-
-#include <raft/handle.hpp>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute stddev of the input matrix
- *
- * Stddev operation is assumed to be performed on a given column.
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used to for addressing
- * @param std the output stddev vector
- * @param data the input matrix
- * @param mu the mean vector
- * @param D number of columns of data
- * @param N number of rows of data
- * @param sample whether to evaluate sample stddev or not. In other words,
- * whether
- *  to normalize the output using N-1 or N, for true or false, respectively
- * @param rowMajor whether the input data is row or col major
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use stddev.cuh instead
  */
-template <typename Type, typename IdxType = int>
-void stddev(Type* std,
-            const Type* data,
-            const Type* mu,
-            IdxType D,
-            IdxType N,
-            bool sample,
-            bool rowMajor,
-            cudaStream_t stream)
-{
-  detail::stddev(std, data, mu, D, N, sample, rowMajor, stream);
-}
 
-/**
- * @brief Compute variance of the input matrix
- *
- * Variance operation is assumed to be performed on a given column.
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used to for addressing
- * @param var the output stddev vector
- * @param data the input matrix
- * @param mu the mean vector
- * @param D number of columns of data
- * @param N number of rows of data
- * @param sample whether to evaluate sample stddev or not. In other words,
- * whether
- *  to normalize the output using N-1 or N, for true or false, respectively
- * @param rowMajor whether the input data is row or col major
- * @param stream cuda stream where to launch work
- */
-template <typename Type, typename IdxType = int>
-void vars(Type* var,
-          const Type* data,
-          const Type* mu,
-          IdxType D,
-          IdxType N,
-          bool sample,
-          bool rowMajor,
-          cudaStream_t stream)
-{
-  detail::vars(var, data, mu, D, N, sample, rowMajor, stream);
-}
+#pragma once
 
-};  // namespace stats
-};  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "stddev.cuh"
diff --git a/cpp/include/raft/stats/sum.hpp b/cpp/include/raft/stats/sum.hpp
index e1c8c67777..0b11a6219e 100644
--- a/cpp/include/raft/stats/sum.hpp
+++ b/cpp/include/raft/stats/sum.hpp
@@ -18,39 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __SUM_H
-#define __SUM_H
-
-#pragma once
-
-#include "detail/sum.cuh"
-
-#include <raft/cudart_utils.h>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute sum of the input matrix
- *
- * Sum operation is assumed to be performed on a given column.
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used to for addressing
- * @param output the output mean vector
- * @param input the input matrix
- * @param D number of columns of data
- * @param N number of rows of data
- * @param rowMajor whether the input data is row or col major
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use sum.cuh instead
  */
-template <typename Type, typename IdxType = int>
-void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
-{
-  detail::sum(output, input, D, N, rowMajor, stream);
-}
 
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "sum.cuh"
diff --git a/cpp/include/raft/stats/trustworthiness_score.hpp b/cpp/include/raft/stats/trustworthiness_score.hpp
index 81edf2ea04..0053860a92 100644
--- a/cpp/include/raft/stats/trustworthiness_score.hpp
+++ b/cpp/include/raft/stats/trustworthiness_score.hpp
@@ -18,41 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __TRUSTWORTHINESS_SCORE_H
-#define __TRUSTWORTHINESS_SCORE_H
+/**
+ * DISCLAIMER: this file is deprecated: use trustworthiness_score.cuh instead
+ */
 
 #pragma once
-#include <raft/stats/detail/trustworthiness_score.cuh>
-
-namespace raft {
-namespace stats {
 
-/**
- * @brief Compute the trustworthiness score
- * @param[in] h: raft handle
- * @param[in] X: Data in original dimension
- * @param[in] X_embedded: Data in target dimension (embedding)
- * @param[in] n: Number of samples
- * @param[in] m: Number of features in high/original dimension
- * @param[in] d: Number of features in low/embedded dimension
- * @param[in] n_neighbors Number of neighbors considered by trustworthiness score
- * @param[in] batchSize Batch size
- * @return[out] Trustworthiness score
- */
-template <typename math_t, raft::distance::DistanceType distance_type>
-double trustworthiness_score(const raft::handle_t& h,
-                             const math_t* X,
-                             math_t* X_embedded,
-                             int n,
-                             int m,
-                             int d,
-                             int n_neighbors,
-                             int batchSize = 512)
-{
-  return detail::trustworthiness_score<math_t, distance_type>(
-    h, X, X_embedded, n, m, d, n_neighbors, batchSize);
-}
-}  // namespace stats
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "trustworthiness_score.cuh"
diff --git a/cpp/include/raft/stats/v_measure.hpp b/cpp/include/raft/stats/v_measure.hpp
index a137af844d..0179d2c856 100644
--- a/cpp/include/raft/stats/v_measure.hpp
+++ b/cpp/include/raft/stats/v_measure.hpp
@@ -18,40 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __V_MEASURE_H
-#define __V_MEASURE_H
-
-#pragma once
-#include <raft/stats/detail/v_measure.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate the v-measure between two clusters
- *
- * @param truthClusterArray: the array of truth classes of type T
- * @param predClusterArray: the array of predicted classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- * @param beta: v_measure parameter
+ * DISCLAIMER: this file is deprecated: use v_measure.cuh instead
  */
-template <typename T>
-double v_measure(const T* truthClusterArray,
-                 const T* predClusterArray,
-                 int size,
-                 T lowerLabelRange,
-                 T upperLabelRange,
-                 cudaStream_t stream,
-                 double beta = 1.0)
-{
-  return detail::v_measure(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream, beta);
-}
 
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "v_measure.cuh"
diff --git a/cpp/include/raft/stats/weighted_mean.hpp b/cpp/include/raft/stats/weighted_mean.hpp
index 5b3f4678d8..8bc4bf4623 100644
--- a/cpp/include/raft/stats/weighted_mean.hpp
+++ b/cpp/include/raft/stats/weighted_mean.hpp
@@ -18,84 +18,14 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __WEIGHTED_MEAN_H
-#define __WEIGHTED_MEAN_H
-
-#pragma once
-
-#include <raft/stats/detail/weighted_mean.cuh>
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute the weighted mean of the input matrix with a
- * vector of weights, along rows or along columns
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used to for addressing
- * @param mu the output mean vector
- * @param data the input matrix
- * @param weights weight of size D if along_row is true, else of size N
- * @param D number of columns of data
- * @param N number of rows of data
- * @param row_major data input matrix is row-major or not
- * @param along_rows whether to reduce along rows or columns
- * @param stream cuda stream to launch work on
+ * DISCLAIMER: this file is deprecated: use weighted_mean.cuh instead
  */
-template <typename Type, typename IdxType = int>
-void weightedMean(Type* mu,
-                  const Type* data,
-                  const Type* weights,
-                  IdxType D,
-                  IdxType N,
-                  bool row_major,
-                  bool along_rows,
-                  cudaStream_t stream)
-{
-  detail::weightedMean(mu, data, weights, D, N, row_major, along_rows, stream);
-}
 
-/**
- * @brief Compute the row-wise weighted mean of the input matrix with a
- * vector of column weights
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used to for addressing
- * @param mu the output mean vector
- * @param data the input matrix (assumed to be row-major)
- * @param weights per-column means
- * @param D number of columns of data
- * @param N number of rows of data
- * @param stream cuda stream to launch work on
- */
-template <typename Type, typename IdxType = int>
-void rowWeightedMean(
-  Type* mu, const Type* data, const Type* weights, IdxType D, IdxType N, cudaStream_t stream)
-{
-  weightedMean(mu, data, weights, D, N, true, true, stream);
-}
+#pragma once
 
-/**
- * @brief Compute the column-wise weighted mean of the input matrix with a
- * vector of row weights
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used to for addressing
- * @param mu the output mean vector
- * @param data the input matrix (assumed to be row-major)
- * @param weights per-row means
- * @param D number of columns of data
- * @param N number of rows of data
- * @param stream cuda stream to launch work on
- */
-template <typename Type, typename IdxType = int>
-void colWeightedMean(
-  Type* mu, const Type* data, const Type* weights, IdxType D, IdxType N, cudaStream_t stream)
-{
-  weightedMean(mu, data, weights, D, N, true, false, stream);
-}
-};  // end namespace stats
-};  // end namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "weighted_mean.cuh"
diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu
index d1bfe4a2e4..a23262fc8e 100644
--- a/cpp/test/spatial/ball_cover.cu
+++ b/cpp/test/spatial/ball_cover.cu
@@ -18,8 +18,8 @@
 #include "spatial_data.h"
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance_type.hpp>
-#include <raft/random/make_blobs.hpp>
-#include <raft/spatial/knn/ball_cover.hpp>
+#include <raft/random/make_blobs.cuh>
+#include <raft/spatial/knn/ball_cover.cuh>
 #include <raft/spatial/knn/detail/knn_brute_force_faiss.cuh>
 #if defined RAFT_NN_COMPILED
 #include <raft/spatial/knn/specializations.cuh>