diff --git a/BUILD.md b/BUILD.md
index c4d8b1b356..3c6ad2bf20 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -205,8 +205,8 @@ The pre-compiled libraries contain template specializations for commonly used types
 The following example tells the compiler to ignore the pre-compiled templates for the `libraft-distance` API so any symbols already compiled into the pre-compiled shared library will be used instead:
 ```c++
-#include <raft/distance/distance.hpp>
-#include <raft/distance/specializations.hpp>
+#include <raft/distance/distance.cuh>
+#include <raft/distance/specializations.cuh>
 ```
 
 ### Building RAFT C++ from source in cmake
diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/distance/distance_common.cuh
index dae2550326..4f1a8ccab1 100644
--- a/cpp/bench/distance/distance_common.cuh
+++ b/cpp/bench/distance/distance_common.cuh
@@ -16,9 +16,9 @@
 #include
 #include
-#include <raft/distance/distance.hpp>
+#include <raft/distance/distance.cuh>
 #if defined RAFT_DISTANCE_COMPILED
-#include <raft/distance/specializations.hpp>
+#include <raft/distance/specializations.cuh>
 #endif
 #include
diff --git a/cpp/bench/linalg/add.cu b/cpp/bench/linalg/add.cu
index 7c651b61ed..7d00b8cbae 100644
--- a/cpp/bench/linalg/add.cu
+++ b/cpp/bench/linalg/add.cu
@@ -15,7 +15,7 @@
  */
 
 #include
-#include <raft/linalg/add.hpp>
+#include <raft/linalg/add.cuh>
 #include
 
 namespace raft::bench::linalg {
diff --git a/cpp/bench/linalg/map_then_reduce.cu b/cpp/bench/linalg/map_then_reduce.cu
index 7eeb4a79b6..33a3e66264 100644
--- a/cpp/bench/linalg/map_then_reduce.cu
+++ b/cpp/bench/linalg/map_then_reduce.cu
@@ -15,7 +15,7 @@
  */
 
 #include
-#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/map_then_reduce.cuh>
 #include
 
 namespace raft::bench::linalg {
diff --git a/cpp/bench/linalg/matrix_vector_op.cu b/cpp/bench/linalg/matrix_vector_op.cu
index d3a53ea345..aa8f2667ed 100644
--- a/cpp/bench/linalg/matrix_vector_op.cu
+++ b/cpp/bench/linalg/matrix_vector_op.cu
@@ -15,7 +15,7 @@
  */
 
 #include
-#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/matrix_vector_op.cuh>
 #include
 
 namespace raft::bench::linalg {
diff --git a/cpp/bench/linalg/reduce.cu b/cpp/bench/linalg/reduce.cu
index 018086a689..015e0b8abe 100644
--- a/cpp/bench/linalg/reduce.cu
+++ b/cpp/bench/linalg/reduce.cu
@@ -15,7 +15,7 @@
  */
 
 #include
-#include <raft/linalg/reduce.hpp>
+#include <raft/linalg/reduce.cuh>
 
 #include
diff --git a/cpp/bench/random/make_blobs.cu b/cpp/bench/random/make_blobs.cu
index c449223040..fdd4ef61d2 100644
--- a/cpp/bench/random/make_blobs.cu
+++ b/cpp/bench/random/make_blobs.cu
@@ -15,7 +15,7 @@
  */
 
 #include
-#include <raft/random/make_blobs.hpp>
+#include <raft/random/make_blobs.cuh>
 #include
 #include
diff --git a/cpp/bench/random/permute.cu b/cpp/bench/random/permute.cu
index a72eca3f87..5364bb44e3 100644
--- a/cpp/bench/random/permute.cu
+++ b/cpp/bench/random/permute.cu
@@ -16,7 +16,7 @@
 #include
 #include
-#include <raft/random/permute.hpp>
+#include <raft/random/permute.cuh>
 #include
 #include
diff --git a/cpp/bench/spatial/fused_l2_nn.cu b/cpp/bench/spatial/fused_l2_nn.cu
index dc3b507fbf..e5b5dc377a 100644
--- a/cpp/bench/spatial/fused_l2_nn.cu
+++ b/cpp/bench/spatial/fused_l2_nn.cu
@@ -17,13 +17,13 @@
 #include
 #include
 #include
-#include <raft/distance/fused_l2_nn.hpp>
+#include <raft/distance/fused_l2_nn.cuh>
 #include
-#include <raft/linalg/norm.hpp>
+#include <raft/linalg/norm.cuh>
 #include
 #if defined RAFT_NN_COMPILED
-#include <raft/spatial/knn/specializations.hpp>
+#include <raft/spatial/knn/specializations.cuh>
 #endif
 
 namespace raft::bench::spatial {
diff --git a/cpp/bench/spatial/selection.cu b/cpp/bench/spatial/selection.cu
index c3a2bc6d3d..1f116c199f 100644
--- a/cpp/bench/spatial/selection.cu
+++ b/cpp/bench/spatial/selection.cu
@@ -18,7 +18,7 @@
 #include
 
 #if defined RAFT_NN_COMPILED
-#include <raft/spatial/knn/specializations.hpp>
+#include <raft/spatial/knn/specializations.cuh>
 #endif
 
 #include
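Not part of the diff itself: the `RAFT_*_COMPILED` guards touched above all follow one pattern, and a minimal sketch of it may help when reviewing. This assumes only what BUILD.md states, namely that `RAFT_DISTANCE_COMPILED` is defined by the build system when linking against the pre-compiled `libraft-distance`:

```c++
// Public distance API: header-only templates.
#include <raft/distance/distance.cuh>

// When the pre-compiled library is linked, also include the specializations
// header: its extern template declarations stop the compiler from
// re-instantiating the common type combinations, so the symbols already
// compiled into libraft-distance are used at link time instead.
#if defined RAFT_DISTANCE_COMPILED
#include <raft/distance/specializations.cuh>
#endif
```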
diff --git a/cpp/include/raft/distance/fused_l2_nn.hpp b/cpp/include/raft/distance/fused_l2_nn.hpp
index 768e33b3a7..74ad0974f4 100644
--- a/cpp/include/raft/distance/fused_l2_nn.hpp
+++ b/cpp/include/raft/distance/fused_l2_nn.hpp
@@ -18,105 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __FUSED_L2_NN_H
-#define __FUSED_L2_NN_H
-
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace raft {
-namespace distance {
-
-template <typename LabelT, typename DataT>
-using KVPMinReduce = detail::KVPMinReduceImpl<LabelT, DataT>;
-
-template <typename LabelT, typename DataT>
-using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl<LabelT, DataT>;
-
-template <typename LabelT, typename DataT>
-using MinReduceOp = detail::MinReduceOpImpl<LabelT, DataT>;
-
 /**
- * Initialize array using init value from reduction op
+ * DISCLAIMER: this file is deprecated: use fused_l2_nn.cuh instead
  */
-template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
-{
-  detail::initialize(min, m, maxVal, redOp, handle.get_stream());
-}
-
-/**
- * @brief Fused L2 distance and 1-nearest-neighbor computation in a single call.
- *
- * The benefits of such a call are 2-fold: 1) eliminate the need for an
- * intermediate buffer to store the output of gemm 2) reduce the memory read
- * traffic on this intermediate buffer, otherwise needed during the reduction
- * phase for 1-NN.
- *
- * @tparam DataT     data type
- * @tparam OutT      output type to either store 1-NN indices and their minimum
- *                   distances or store only the min distances. Accordingly, one
- *                   has to pass an appropriate `ReduceOpT`
- * @tparam IdxT      indexing arithmetic type
- * @tparam ReduceOpT A struct to perform the final needed reduction operation
- *                   and also to initialize the output array elements with the
- *                   appropriate initial value needed for reduction.
- *
- * @param[out] min           will contain the reduced output (Length = `m`)
- *                           (on device)
- * @param[in]  x             first matrix. Row major. Dim = `m x k`.
- *                           (on device).
- * @param[in]  y             second matrix. Row major. Dim = `n x k`.
- *                           (on device).
- * @param[in]  xn            L2 squared norm of `x`. Length = `m`. (on device).
- * @param[in]  yn            L2 squared norm of `y`. Length = `n`. (on device)
- * @param[in]  m             gemm m
- * @param[in]  n             gemm n
- * @param[in]  k             gemm k
- * @param[in]  workspace     temp workspace. Size = sizeof(int)*m. (on device)
- * @param[in]  redOp         reduction operator in the epilogue
- * @param[in]  pairRedOp     reduction operation on key value pairs
- * @param[in]  sqrt          Whether the output `minDist` should contain L2-sqrt
- * @param[in]  initOutBuffer whether to initialize the output buffer before the
- *                           main kernel launch
- * @param[in]  stream        cuda stream
- */
-template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT, typename KVPReduceOpT>
-void fusedL2NN(OutT* min,
-               const DataT* x,
-               const DataT* y,
-               const DataT* xn,
-               const DataT* yn,
-               IdxT m,
-               IdxT n,
-               IdxT k,
-               void* workspace,
-               ReduceOpT redOp,
-               KVPReduceOpT pairRedOp,
-               bool sqrt,
-               bool initOutBuffer,
-               cudaStream_t stream)
-{
-  size_t bytes = sizeof(DataT) * k;
-  if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
-    detail::fusedL2NNImpl<DataT, OutT, IdxT, 16 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
-    detail::fusedL2NNImpl<DataT, OutT, IdxT, 8 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
-  } else {
-    detail::fusedL2NNImpl<DataT, OutT, IdxT, 1, ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
-  }
-}
+#pragma once
-
-} // namespace distance
-} // namespace raft
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
-#endif
\ No newline at end of file
+#include "fused_l2_nn.cuh"
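The header removed above still documents the `fusedL2NN` contract, so a short usage sketch is included here for review context. It is illustrative only: the wrapper function is hypothetical, and the `cub::KeyValuePair` output type plus the `<int, float>` arguments to the reduction-op aliases are assumptions based on the signatures shown in the removed code:

```c++
#include <cub/cub.cuh>
#include <raft/distance/fused_l2_nn.cuh>
#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>

// Hypothetical wrapper: for each row of x, find its nearest row in y.
void fused_l2_1nn(const raft::handle_t& handle,
                  const float* x,   // m x k, row major (device)
                  const float* y,   // n x k, row major (device)
                  const float* xn,  // L2 squared norms of x, length m (device)
                  const float* yn,  // L2 squared norms of y, length n (device)
                  int m, int n, int k,
                  cub::KeyValuePair<int, float>* min)  // 1-NN index + distance per row
{
  cudaStream_t stream = handle.get_stream();
  // Temp workspace: sizeof(int) * m bytes, as documented above.
  rmm::device_uvector<char> workspace(sizeof(int) * m, stream);
  raft::distance::MinAndDistanceReduceOp<int, float> red_op;
  raft::distance::KVPMinReduce<int, float> pair_red_op;
  raft::distance::fusedL2NN<float, cub::KeyValuePair<int, float>, int>(
    min, x, y, xn, yn, m, n, k, workspace.data(),
    red_op, pair_red_op,
    /*sqrt=*/false,          // keep squared L2 distances
    /*initOutBuffer=*/true,  // initialize `min` before the main kernel launch
    stream);
}
```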
diff --git a/cpp/include/raft/distance/specializations.hpp b/cpp/include/raft/distance/specializations.hpp
index 641968d9f1..04afb73036 100644
--- a/cpp/include/raft/distance/specializations.hpp
+++ b/cpp/include/raft/distance/specializations.hpp
@@ -18,11 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __DISTANCE_SPECIALIZATIONS_H
-#define __DISTANCE_SPECIALIZATIONS_H
+/**
+ * DISCLAIMER: this file is deprecated: use specializations.cuh instead
+ */
 
 #pragma once
 
-#include
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
-#endif
\ No newline at end of file
+#include "specializations.cuh"
diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
index 189c26f69f..4f47b426c0 100644
--- a/cpp/include/raft/label/classlabels.hpp
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -13,110 +13,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#ifndef __CLASS_LABELS_H
-#define __CLASS_LABELS_H
-
-#pragma once
-
-#include
-
-namespace raft {
-namespace label {
-
 /**
- * Get unique class labels.
- *
- * The y array is assumed to store class labels. The unique values are selected
- * from this array.
- *
- * @tparam value_t numeric type of the arrays with class labels
- * @param [inout] unique output unique labels
- * @param [in] y device array of labels, size [n]
- * @param [in] n number of labels
- * @param [in] stream cuda stream
- * @returns unique device array of unique labels, unallocated on entry,
- *   on exit it has size
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
  */
-template <typename value_t>
-int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n, cudaStream_t stream)
-{
-  return detail::getUniquelabels(unique, y, n, stream);
-}
 /**
- * Assign one versus rest labels.
- *
- * The output labels will have values +/-1:
- *   y_out = (y == y_unique[idx]) ? +1 : -1;
- *
- * The output type currently is set to value_t, but for SVM in principle we are
- * free to choose other type for y_out (it should represent +/-1, and it is used
- * in floating point arithmetics).
- *
- * @param [in] y device array if input labels, size [n]
- * @param [in] n number of labels
- * @param [in] y_unique device array of unique labels, size [n_classes]
- * @param [in] n_classes number of unique labels
- * @param [out] y_out device array of output labels
- * @param [in] idx index of unique label that should be labeled as 1
- * @param [in] stream cuda stream
- */
-template <typename value_t>
-void getOvrlabels(
-  value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream)
-{
-  detail::getOvrlabels(y, n, y_unique, n_classes, y_out, idx, stream);
-}
-/**
- * Maps an input array containing a series of numbers into a new array
- * where numbers have been mapped to a monotonically increasing set
- * of labels. This can be useful in machine learning algorithms, for instance,
- * where a given set of labels is not taken from a monotonically increasing
- * set. This can happen if they are filtered or if only a subset of the
- * total labels are used in a dataset. This is also useful in graph algorithms
- * where a set of vertices need to be labeled in a monotonically increasing
- * order.
- * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param[out] out the output monotonic array - * @param[in] in input label array - * @param[in] N number of elements in the input array - * @param[in] stream cuda stream to use - * @param[in] filter_op an optional function for specifying which values - * should have monotonically increasing labels applied to them. - * @param[in] zero_based force monotonic set to start at 0? + * DISCLAIMER: this file is deprecated: use classlabels.cuh instead */ -template -void make_monotonic( - Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) -{ - detail::make_monotonic(out, in, N, stream, filter_op, zero_based); -} -/** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @param[out] out output label array with labels assigned monotonically - * @param[in] in input label array - * @param[in] N number of elements in the input array - * @param[in] stream cuda stream to use - * @param[in] zero_based force monotonic label set to start at 0? - */ -template -void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) -{ - detail::make_monotonic(out, in, N, stream, zero_based); -} -}; // namespace label -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "classlabels.cuh" diff --git a/cpp/include/raft/label/merge_labels.hpp b/cpp/include/raft/label/merge_labels.hpp index 2bf2fa830b..7c0c25d038 100644 --- a/cpp/include/raft/label/merge_labels.hpp +++ b/cpp/include/raft/label/merge_labels.hpp @@ -13,59 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#ifndef __MERGE_LABELS_H -#define __MERGE_LABELS_H - -#pragma once - -#include - -namespace raft { -namespace label { +/** + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ /** - * @brief Merge two labellings in-place, according to a core mask - * - * A labelling is a representation of disjoint sets (groups) where points that - * belong to the same group have the same label. It is assumed that group - * labels take values between 1 and N. labels relate to points, i.e a label i+1 - * means that you belong to the same group as the point i. - * The special value MAX_LABEL is used to mark points that are not labelled. - * - * The two label arrays A and B induce two sets of groups over points 0..N-1. - * If a point is labelled i in A and j in B and the mask is true for this - * point, then i and j are equivalent labels and their groups are merged by - * relabeling the elements of both groups to have the same label. The new label - * is the smaller one from the original labels. 
- * It is required that if the mask is true for a point, this point is labelled - * (i.e its label is different than the special value MAX_LABEL). - * - * One use case is finding connected components: the two input label arrays can - * represent the connected components of graphs G_A and G_B, and the output - * would be the connected components labels of G_A \union G_B. - * - * @param[inout] labels_a First input, and output label array (in-place) - * @param[in] labels_b Second input label array - * @param[in] mask Core point mask - * @param[out] R label equivalence map - * @param[in] m Working flag - * @param[in] N Number of points in the dataset - * @param[in] stream CUDA stream + * DISCLAIMER: this file is deprecated: use merge_labels.cuh instead */ -template -void merge_labels(value_idx* labels_a, - const value_idx* labels_b, - const bool* mask, - value_idx* R, - bool* m, - value_idx N, - cudaStream_t stream) -{ - detail::merge_labels(labels_a, labels_b, mask, R, m, N, stream); -} -}; // namespace label -}; // namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "merge_labels.cuh" diff --git a/cpp/include/raft/lap/lap.hpp b/cpp/include/raft/lap/lap.hpp index a9f205932c..badafb8afd 100644 --- a/cpp/include/raft/lap/lap.hpp +++ b/cpp/include/raft/lap/lap.hpp @@ -1,6 +1,5 @@ /* * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -12,289 +11,21 @@ * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and - * limitations under the License.+ - * - * CUDA Implementation of O(n^3) alternating tree Hungarian Algorithm - * Authors: Ketan Date and Rakesh Nagi - * - * Article reference: - * Date, Ketan, and Rakesh Nagi. "GPU-accelerated Hungarian algorithms - * for the Linear Assignment Problem." Parallel Computing 57 (2016): 52-72. - * + * limitations under the License. */ - /** * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ -#ifndef __LAP_H -#define __LAP_H +/** + * DISCLAIMER: this file is deprecated: use lap.cuh instead + */ #pragma once -#include -#include - -#include -#include - -#include "detail/d_structs.h" -#include "detail/lap_functions.cuh" - -namespace raft { -namespace lap { - -template -class LinearAssignmentProblem { - vertex_t size_; - vertex_t batchsize_; - weight_t epsilon_; - - weight_t const* d_costs_; - - Vertices d_vertices_dev; - VertexData d_row_data_dev, d_col_data_dev; - - raft::handle_t const& handle_; - rmm::device_uvector row_covers_v; - rmm::device_uvector col_covers_v; - rmm::device_uvector row_duals_v; - rmm::device_uvector col_duals_v; - rmm::device_uvector col_slacks_v; - rmm::device_uvector row_is_visited_v; - rmm::device_uvector col_is_visited_v; - rmm::device_uvector row_parents_v; - rmm::device_uvector col_parents_v; - rmm::device_uvector row_children_v; - rmm::device_uvector col_children_v; - rmm::device_uvector obj_val_primal_v; - rmm::device_uvector obj_val_dual_v; - - public: - LinearAssignmentProblem(raft::handle_t const& handle, - vertex_t size, - vertex_t batchsize, - weight_t epsilon) - : handle_(handle), - size_(size), - batchsize_(batchsize), - epsilon_(epsilon), - d_costs_(nullptr), - row_covers_v(0, handle_.get_stream()), - col_covers_v(0, handle_.get_stream()), - row_duals_v(0, handle_.get_stream()), - col_duals_v(0, handle_.get_stream()), - col_slacks_v(0, handle_.get_stream()), - row_is_visited_v(0, handle_.get_stream()), - col_is_visited_v(0, handle_.get_stream()), - row_parents_v(0, handle_.get_stream()), - col_parents_v(0, handle_.get_stream()), - row_children_v(0, handle_.get_stream()), - col_children_v(0, handle_.get_stream()), - obj_val_primal_v(0, handle_.get_stream()), - obj_val_dual_v(0, handle_.get_stream()) - { - } - - // Executes Hungarian algorithm on the input cost matrix. - void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) - { - initializeDevice(); - - d_vertices_dev.row_assignments = d_row_assignment; - d_vertices_dev.col_assignments = d_col_assignment; - - d_costs_ = d_cost_matrix; - - int step = 0; - - while (step != 100) { - switch (step) { - case 0: step = hungarianStep0(); break; - case 1: step = hungarianStep1(); break; - case 2: step = hungarianStep2(); break; - case 3: step = hungarianStep3(); break; - case 4: step = hungarianStep4(); break; - case 5: step = hungarianStep5(); break; - case 6: step = hungarianStep6(); break; - } - } - - d_costs_ = nullptr; - } - - // Function for getting optimal row dual vector for subproblem spId. - std::pair getRowDualVector(int spId) const - { - return std::make_pair(row_duals_v.data() + spId * size_, size_); - } - - // Function for getting optimal col dual vector for subproblem spId. - std::pair getColDualVector(int spId) - { - return std::make_pair(col_duals_v.data() + spId * size_, size_); - } - - // Function for getting optimal primal objective value for subproblem spId. - weight_t getPrimalObjectiveValue(int spId) - { - weight_t result; - raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); - return result; - } - - // Function for getting optimal dual objective value for subproblem spId. 
- weight_t getDualObjectiveValue(int spId) - { - weight_t result; - raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); - return result; - } - - private: - // Helper function for initializing global variables and arrays on a single host. - void initializeDevice() - { - cudaStream_t stream = handle_.get_stream(); - row_covers_v.resize(batchsize_ * size_, stream); - col_covers_v.resize(batchsize_ * size_, stream); - row_duals_v.resize(batchsize_ * size_, stream); - col_duals_v.resize(batchsize_ * size_, stream); - col_slacks_v.resize(batchsize_ * size_, stream); - row_is_visited_v.resize(batchsize_ * size_, stream); - col_is_visited_v.resize(batchsize_ * size_, stream); - row_parents_v.resize(batchsize_ * size_, stream); - col_parents_v.resize(batchsize_ * size_, stream); - row_children_v.resize(batchsize_ * size_, stream); - col_children_v.resize(batchsize_ * size_, stream); - obj_val_primal_v.resize(batchsize_, stream); - obj_val_dual_v.resize(batchsize_, stream); - - d_vertices_dev.row_covers = row_covers_v.data(); - d_vertices_dev.col_covers = col_covers_v.data(); - - d_vertices_dev.row_duals = row_duals_v.data(); - d_vertices_dev.col_duals = col_duals_v.data(); - d_vertices_dev.col_slacks = col_slacks_v.data(); - - d_row_data_dev.is_visited = row_is_visited_v.data(); - d_col_data_dev.is_visited = col_is_visited_v.data(); - d_row_data_dev.parents = row_parents_v.data(); - d_row_data_dev.children = row_children_v.data(); - d_col_data_dev.parents = col_parents_v.data(); - d_col_data_dev.children = col_children_v.data(); - - thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0}); - thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0}); - thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0}); - thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0}); - } - - // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep0() - { - detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_); - - return 1; - } - - // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep1() - { - detail::computeInitialAssignments( - handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_); - - int next = 2; - - while (true) { - if ((next = hungarianStep2()) == 6) break; - - if ((next = hungarianStep3()) == 5) break; - - hungarianStep4(); - } - - return next; - } - - // Function for checking optimality and constructing predicates and covers. - int hungarianStep2() - { - int cover_count = detail::computeRowCovers( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); - - int next = (cover_count == batchsize_ * size_) ? 6 : 3; - - return next; - } - - // Function for building alternating tree rooted at unassigned rows. - int hungarianStep3() - { - int next; - - rmm::device_scalar flag_v(handle_.get_stream()); - - bool h_flag = false; - flag_v.set_value_async(h_flag, handle_.get_stream()); - - detail::executeZeroCover(handle_, - d_costs_, - d_vertices_dev, - d_row_data_dev, - d_col_data_dev, - flag_v.data(), - batchsize_, - size_, - epsilon_); - - h_flag = flag_v.value(handle_.get_stream()); - - next = h_flag ? 4 : 5; - - return next; - } - - // Function for augmenting the solution along multiple node-disjoint alternating trees. 
- int hungarianStep4() - { - detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_); - - detail::augmentationPass( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); - - return 2; - } - - // Function for updating dual solution to introduce new zero-cost arcs. - int hungarianStep5() - { - detail::dualUpdate( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_); - - return 3; - } - - // Function for calculating primal and dual objective values at optimality. - int hungarianStep6() - { - detail::calcObjValPrimal(handle_, - obj_val_primal_v.data(), - d_costs_, - d_vertices_dev.row_assignments, - batchsize_, - size_); - - detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_); - - return 100; - } -}; - -} // namespace lap -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "lap.cuh" diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp index a80398fcad..e7f9610892 100644 --- a/cpp/include/raft/linalg/add.hpp +++ b/cpp/include/raft/linalg/add.hpp @@ -18,78 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __ADD_H -#define __ADD_H - -#pragma once - -#include "detail/add.cuh" - -namespace raft { -namespace linalg { - -using detail::adds_scalar; - -/** - * @brief Elementwise scalar add operation on the input buffer - * - * @tparam InT input data-type. Also the data-type upon which the math ops - * will be performed - * @tparam OutT output data-type - * @tparam IdxType Integer type used to for addressing - * - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - */ -template -void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) -{ - detail::addScalar(out, in, scalar, len, stream); -} - /** - * @brief Elementwise add operation on the input buffers - * @tparam InT input data-type. 
Also the data-type upon which the math ops - * will be performed - * @tparam OutT output data-type - * @tparam IdxType Integer type used to for addressing - * - * @param out the output buffer - * @param in1 the first input buffer - * @param in2 the second input buffer - * @param len number of elements in the input buffers - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use add.cuh instead */ -template -void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) -{ - detail::add(out, in1, in2, len, stream); -} -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and - * write result to outDev[i] - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param outDev the output buffer - * @param inDev the input buffer - * @param singleScalarDev pointer to the scalar located in device memory - * @param len number of elements in the input and output buffer - * @param stream cuda stream - */ -template -void addDevScalar(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len, - cudaStream_t stream) -{ - detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "add.cuh" diff --git a/cpp/include/raft/linalg/axpy.hpp b/cpp/include/raft/linalg/axpy.hpp index c227ba66c8..8db4c5a6e8 100644 --- a/cpp/include/raft/linalg/axpy.hpp +++ b/cpp/include/raft/linalg/axpy.hpp @@ -18,43 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __AXPY_H -#define __AXPY_H - -#pragma once - -#include "detail/axpy.cuh" - -namespace raft::linalg { - /** - * @brief the wrapper of cublas axpy function - * It computes the following equation: y = alpha * x + y - * - * @tparam T the element type - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @param [in] handle raft handle - * @param [in] n number of elements in x and y - * @param [in] alpha host or device scalar - * @param [in] x vector of length n - * @param [in] incx stride between consecutive elements of x - * @param [inout] y vector of length n - * @param [in] incy stride between consecutive elements of y - * @param [in] stream + * DISCLAIMER: this file is deprecated: use axpy.cuh instead */ -template -void axpy(const raft::handle_t& handle, - const int n, - const T* alpha, - const T* x, - const int incx, - T* y, - const int incy, - cudaStream_t stream) -{ - detail::axpy(handle, n, alpha, x, incx, y, incy, stream); -} -} // namespace raft::linalg +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "axpy.cuh" diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp index 9983e8ab50..f0a54cb164 100644 --- a/cpp/include/raft/linalg/binary_op.hpp +++ b/cpp/include/raft/linalg/binary_op.hpp @@ -18,46 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __BINARY_OP_H -#define __BINARY_OP_H - -#pragma once - -#include "detail/binary_op.cuh" - -#include - -namespace raft { -namespace linalg { - /** - * @brief perform element-wise binary operation on the input arrays - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in1 the first input array - * @param in2 the second input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val1, const InType& val2);` + * DISCLAIMER: this file is deprecated: use binary_op.cuh instead */ -template -void binaryOp( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) -{ - detail::binaryOp(out, in1, in2, len, op, stream); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "binary_op.cuh" diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp index 1158ad3aa4..a1967c36cb 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -18,126 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __CHOLESKY_R1_UPDATE_H -#define __CHOLESKY_R1_UPDATE_H +/** + * DISCLAIMER: this file is deprecated: use cholesky_r1_update.cuh instead + */ #pragma once -#include "detail/cholesky_r1_update.cuh" - -namespace raft { -namespace linalg { - -/** - * @brief Rank 1 update of Cholesky decomposition. - * - * This method is useful if an algorithm iteratively builds up matrix A, and - * the Cholesky decomposition of A is required at each step. - * - * On entry, L is the Cholesky decomposition of matrix A, where both A and L - * have size n-1 x n-1. We are interested in the Cholesky decomposition of a new - * matrix A', which we get by adding a row and column to A. In Python notation: - * - A'[0:n-1, 0:n-1] = A; - * - A'[:,n-1] = A[n-1,:] = A_new - * - * On entry, the new column A_new, is stored as the n-th column of L if uplo == - * CUBLAS_FILL_MODE_UPPER, else A_new is stored as the n-th row of L. - * - * On exit L contains the Cholesky decomposition of A'. In practice the elements - * of A_new are overwritten with new row/column of the L matrix. - * - * The uplo paramater is used to select the matrix layout. - * If (uplo != CUBLAS_FILL_MODE_UPPER) then the input arg L stores the - * lower triangular matrix L, so that A = L * L.T. Otherwise the input arg L - * stores an upper triangular matrix U: A = U.T * U. - * - * On exit L will be updated to store the Cholesky decomposition of A'. - * - * If the matrix is not positive definit, or very ill conditioned then the new - * diagonal element of L would be NaN. In such a case an exception is thrown. - * The eps argument can be used to override this behavior: if eps >= 0 then - * the diagonal element is replaced by eps in case the diagonal is NaN or - * smaller than eps. 
Note: for an iterative solver it is probably better to - * stop early in case of error, rather than relying on the eps parameter. - * - * Examples: - * - * - Lower triangular factorization: - * @code{.cpp} - * // Initialize arrays - * int ld_L = n_rows; - * rmm::device_uvector L(ld_L * n_rows, stream); - * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, - * &n_bytes, CUBLAS_FILL_MODE_LOWER, - * stream); - * rmm::device_uvector workspace(n_bytes, stream); - * - * for (n=1; n<=n_rows; rank++) { - * // Calculate a new row/column of matrix A into A_new - * // ... - * // Copy new row to L[rank-1,:] - * RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, - * L + n - 1, ld_L, stream)); - * // Update Cholesky factorization - * raft::linalg::choleskyRank1Update( - * handle, L, rank, ld_L, workspace, &n_bytes, CUBLAS_FILL_MODE_LOWER, - * stream); - * } - * Now L stores the Cholesky decomposition of A: A = L * L.T - * @endcode - * - * - Upper triangular factorization: - * @code{.cpp} - * // Initialize arrays - * int ld_U = n_rows; - * rmm::device_uvector U(ld_U * n_rows, stream); - * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, - * &n_bytes, CUBLAS_FILL_MODE_UPPER, - * stream); - * rmm::device_uvector workspace(stream, n_bytes, stream); - * - * for (n=1; n<=n_rows; n++) { - * // Calculate a new row/column of matrix A into array A_new - * // ... - * // Copy new row to U[:,n-1] (column major layout) - * raft::copy(U + ld_U * (n-1), A_new, n-1, stream); - * // - * // Update Cholesky factorization - * raft::linalg::choleskyRank1Update( - * handle, U, n, ld_U, workspace, &n_bytes, CUBLAS_FILL_MODE_UPPER, - * stream); - * } - * // Now U stores the Cholesky decomposition of A: A = U.T * U - * @endcode - * - * @param handle RAFT handle (used to retrive cuBLAS handles). - * @param L device array for to store the triangular matrix L, and the new - * column of A in column major format, size [n*n] - * @param n number of elements in the new row. - * @param ld stride of colums in L - * @param workspace device pointer to workspace shall be nullptr ar an array - * of size [n_bytes]. - * @param n_bytes size of workspace is returned here if workspace==nullptr. - * @param stream CUDA stream - * @param uplo indicates whether L is stored as an upper or lower triangular - * matrix (CUBLAS_FILL_MODE_UPPER or CUBLAS_FILL_MODE_LOWER) - * @param eps numerical parameter that can act as a regularizer for ill - * conditioned systems. Negative values mean no regularizaton. - */ -template -void choleskyRank1Update(const raft::handle_t& handle, - math_t* L, - int n, - int ld, - void* workspace, - int* n_bytes, - cublasFillMode_t uplo, - cudaStream_t stream, - math_t eps = -1) -{ - detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps); -} -}; // namespace linalg -}; // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "cholesky_r1_update.cuh" diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp index 48f8798a03..8631a7e5ba 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.hpp +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -18,64 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __COALESCED_REDUCTION_H -#define __COALESCED_REDUCTION_H - -#pragma once - -#include "detail/coalesced_reduction.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Compute reduction of the input matrix along the leading dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - *
- *                    <pre>OutType (*MainLambda)(InType, IdxType);</pre>
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
- *                      <pre>OutType (*ReduceLambda)(OutType);</pre>
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
- *                     <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use coalesced_reduction.cuh instead */ -template , - typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void coalescedReduction(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), - ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) -{ - detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "coalesced_reduction.cuh" diff --git a/cpp/include/raft/linalg/contractions.hpp b/cpp/include/raft/linalg/contractions.hpp index 256593d9ae..7e5e9be403 100644 --- a/cpp/include/raft/linalg/contractions.hpp +++ b/cpp/include/raft/linalg/contractions.hpp @@ -18,199 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __CONTRACTIONS_H -#define __CONTRACTIONS_H - -#pragma once - -#include "detail/contractions.cuh" - -namespace raft { -namespace linalg { - -/** - * @brief This is the central enum that should be used to configure the perf - * landscape of the Contraction kernel. - * - * Main goal of this Policy struct is to provide sufficient knobs to tune the - * perf of Contraction kernel, as and when we see matrices of different shapes. - * - * @tparam DataT the IO and math datatype - * @tparam _veclen number of k-elements loaded by each thread for every LDG call - * it makes. This should be configured based on the input 'k' - * value and the input data type. For eg: if DataT = float and - * k is multiples of 4, then setting this to 4 gives the best - * LDG pattern. Possible values are {1, 2, 4}. - * @tparam _kblk number of k-elements operated upon per main-loop iteration. - * Therefore total number of main-loop iterations will be - * `ceil(k/_kblk)`. This must be multiples of `_veclen`. Do note - * that bigger this value, the greater shared mem requirement. - * @tparam _rpt Defines the number of rows that a given thread accumulates on. - * This directly results in increased register pressure. This - * also is used to compute the number of m-elements worked upon - * by each thread block. - * @tparam _cpt Defines the number of cols that a given thread accumulates on. - * This directly results in increased register pressure. This - * also is used to compute the number of n-elements worked upon - * by each thread block. - * @tparam _tr Number of threads working on the same output column. This is - * used to compute the number of m-elements worked upon by each - * thread block. This also determines the number of threads per - * thread block - * @tparam _tc Number of threads working on the same output row. This is - * used to compute the number of m-elements worked upon by each - * thread block. 
This also determines the number of threads per - * thread block - */ -template -struct KernelPolicy { - enum { - /** number of elements along K worked upon per main loop iteration */ - Kblk = _kblk, - /** number of elements loaded per LDG */ - Veclen = _veclen, - /** number of rows a thread works on for accumulation */ - AccRowsPerTh = _rpt, - /** number of cols a thread works on for accumulation */ - AccColsPerTh = _cpt, - /** number of threads working the same output col */ - AccThRows = _tr, - /** number of threads working the same output row */ - AccThCols = _tc, - /** total threads per block */ - Nthreads = AccThRows * AccThCols, - /** output tile size along rows */ - Mblk = AccRowsPerTh * AccThRows, - /** output tile size along cols */ - Nblk = AccColsPerTh * AccThCols, - /** number of threads loading a single row */ - LdgThRow = Kblk / Veclen, - /** number of LDGs issued by a single thread for X */ - LdgPerThX = Mblk * LdgThRow / Nthreads, - /** number of LDGs issued by a single thread for Y */ - LdgPerThY = Nblk * LdgThRow / Nthreads, - /** number of rows of X covered per LDG */ - LdgRowsX = Mblk / LdgPerThX, - /** number of rows of Y covered per LDG */ - LdgRowsY = Nblk / LdgPerThY, - /** stride for accessing X/Y data in shared mem */ - SmemStride = Kblk + Veclen, - /** size of one page for storing X data */ - SmemPageX = SmemStride * Mblk, - /** size of one page for storing Y data */ - SmemPageY = SmemStride * Nblk, - /** size of one smem page */ - SmemPage = SmemPageX + SmemPageY, - /** size (in B) for smem needed */ - SmemSize = 2 * SmemPage * sizeof(DataT), - }; // enum - -}; // struct KernelPolicy - -template -struct ColKernelPolicy { - enum { - /** number of elements along K worked upon per main loop iteration */ - Kblk = _kblk, - /** number of elements loaded per LDG */ - Veclen = _veclen, - /** number of rows a thread works on for accumulation */ - AccRowsPerTh = _rpt, - /** number of cols a thread works on for accumulation */ - AccColsPerTh = _cpt, - /** number of threads working the same output col */ - AccThRows = _tr, - /** number of threads working the same output row */ - AccThCols = _tc, - /** total threads per block */ - Nthreads = AccThRows * AccThCols, - /** output tile size along rows */ - Mblk = AccRowsPerTh * AccThRows, - /** output tile size along cols */ - Nblk = AccColsPerTh * AccThCols, - /** number of threads loading a single col */ - LdgThRow = Mblk / Veclen, - /** number of LDGs issued by a single thread for X */ - LdgPerThX = Kblk * LdgThRow / Nthreads, - /** number of LDGs issued by a single thread for Y */ - LdgPerThY = Kblk * LdgThRow / Nthreads, - /** number of rows of X covered per LDG */ - LdgRowsX = Kblk / LdgPerThX, - /** number of rows of Y covered per LDG */ - LdgRowsY = Kblk / LdgPerThY, - /** stride for accessing X/Y data in shared mem */ - SmemStride = Mblk + Veclen, - /** size of one page for storing X data */ - SmemPageX = SmemStride * Kblk, - /** size of one page for storing Y data */ - SmemPageY = SmemStride * Kblk, - /** size of one smem page */ - SmemPage = SmemPageX + SmemPageY, - /** size (in B) for smem needed */ - SmemSize = 2 * SmemPage * sizeof(DataT), - }; // colMajor enum - static_assert(Mblk == Nblk, "Mblk should be equal to Nblk"); -}; /** - * @defgroup Policy4x4 16 elements per thread Policy with k-block = 32 - * @{ + * DISCLAIMER: this file is deprecated: use contractions.cuh instead */ -template -struct Policy4x4 { -}; - -template -struct Policy4x4 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; 
-template -struct Policy4x4 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; -/** @} */ - -/** - * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 - * @{ - */ -template -struct Policy2x8 { -}; - -template -struct Policy2x8 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; - -template -struct Policy2x8 { - // this is not used just for keeping compiler happy. - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; -/** @} */ - -/** - * @brief Base class for gemm-like NT contractions - * - * This class does not provide any arithmetic operations, but only provides the - * memory-related operations of loading the `x` and `y` matrix blocks from the - * global memory into shared memory and then from shared into registers. Thus, - * this class acts as a basic building block for further composing gemm-like NT - * contractions on input matrices which are row-major (and so does the output) - * - * @tparam DataT IO and math data type - * @tparam IdxT indexing type - * @tparam Policy policy used to customize memory access behavior. - * See documentation for `KernelPolicy` to know more. - */ -using detail::Contractions_NT; +#pragma once -} // namespace linalg -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "contractions.cuh" diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp index 8d1bd37186..57f4376fcc 100644 --- a/cpp/include/raft/linalg/divide.hpp +++ b/cpp/include/raft/linalg/divide.hpp @@ -18,37 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __DIVIDE_H -#define __DIVIDE_H - -#pragma once - -#include "detail/divide.cuh" - -namespace raft { -namespace linalg { - -using detail::divides_scalar; - /** - * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - * @{ + * DISCLAIMER: this file is deprecated: use divide.cuh instead */ -template -void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) -{ - detail::divideScalar(out, in, scalar, len, stream); -} -/** @} */ -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "divide.cuh" diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp index 032c4e97f9..175a2aaccc 100644 --- a/cpp/include/raft/linalg/eig.hpp +++ b/cpp/include/raft/linalg/eig.hpp @@ -18,108 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __EIG_H -#define __EIG_H - -#pragma once - -#include "detail/eig.cuh" - -namespace raft { -namespace linalg { - -/** - * @defgroup eig Eigen Decomposition Methods - * @{ - */ - /** - * @brief eig decomp with divide and conquer method for the column-major - * symmetric matrices - * @param handle raft handle - * @param in the input buffer (symmetric matrix that has real eig values and - * vectors. 
- * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param stream cuda stream + * DISCLAIMER: this file is deprecated: use eig.cuh instead */ -template -void eigDC(const raft::handle_t& handle, - const math_t* in, - std::size_t n_rows, - std::size_t n_cols, - math_t* eig_vectors, - math_t* eig_vals, - cudaStream_t stream) -{ - detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); -} -using detail::COPY_INPUT; -using detail::EigVecMemUsage; -using detail::OVERWRITE_INPUT; - -/** - * @brief eig sel decomp with divide and conquer method for the column-major - * symmetric matrices - * @param handle raft handle - * @param in the input buffer (symmetric matrix that has real eig values and - * vectors. - * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param n_eig_vals: number of eigenvectors to be generated - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param memUsage: the memory selection for eig vector output - * @param stream cuda stream - */ -template -void eigSelDC(const raft::handle_t& handle, - math_t* in, - int n_rows, - int n_cols, - int n_eig_vals, - math_t* eig_vectors, - math_t* eig_vals, - EigVecMemUsage memUsage, - cudaStream_t stream) -{ - detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream); -} - -/** - * @brief overloaded function for eig decomp with Jacobi method for the - * column-major symmetric matrices (in parameter) - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param stream: stream on which this function will be run - * @param tol: error tolerance for the jacobi method. Algorithm stops when the - * error is below tol - * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better - * accuracy. - */ -template -void eigJacobi(const raft::handle_t& handle, - const math_t* in, - int n_rows, - int n_cols, - math_t* eig_vectors, - math_t* eig_vals, - cudaStream_t stream, - math_t tol = 1.e-7, - int sweeps = 15) -{ - detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps); -} -/** @} */ // end of eig +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "eig.cuh" diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp index 62624f6eeb..8931c88241 100644 --- a/cpp/include/raft/linalg/eltwise.hpp +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -18,94 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __ELTWISE_H -#define __ELTWISE_H - -#pragma once - -#include "detail/eltwise.cuh" - -namespace raft { -namespace linalg { - -using detail::adds_scalar; - -/** - * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam InType data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - * @{ - */ -template -void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) -{ - detail::scalarAdd(out, in, scalar, len, stream); -} - -using detail::multiplies_scalar; - -template -void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) -{ - detail::scalarMultiply(out, in, scalar, len, stream); -} -/** @} */ - /** - * @defgroup BinaryOps Element-wise binary operations on the input buffers - * @tparam InType data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in1 the first input buffer - * @param in2 the second input buffer - * @param len number of elements in the input buffers - * @param stream cuda stream where to launch work - * @{ + * DISCLAIMER: this file is deprecated: use eltwise.cuh instead */ -template -void eltwiseAdd( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseAdd(out, in1, in2, len, stream); -} - -template -void eltwiseSub( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseSub(out, in1, in2, len, stream); -} -template -void eltwiseMultiply( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseMultiply(out, in1, in2, len, stream); -} - -template -void eltwiseDivide( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseDivide(out, in1, in2, len, stream); -} - -using detail::divides_check_zero; - -template -void eltwiseDivideCheckZero( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseDivideCheckZero(out, in1, in2, len, stream); -} -/** @} */ +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "eltwise.cuh" diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp index 37c6b2d552..6ad2f1fbe1 100644 --- a/cpp/include/raft/linalg/gemm.hpp +++ b/cpp/include/raft/linalg/gemm.hpp @@ -18,167 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __GEMM_H -#define __GEMM_H - -#pragma once - -#include "detail/gemm.hpp" - -namespace raft { -namespace linalg { - /** - * @brief the wrapper of cublas gemm function - * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C - * - * @tparam math_t the element type - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @param [in] handle raft handle - * @param [in] trans_a cublas transpose op for A - * @param [in] trans_b cublas transpose op for B - * @param [in] m number of rows of C - * @param [in] n number of columns of C - * @param [in] k number of rows of opB(B) / number of columns of opA(A) - * @param [in] alpha host or device scalar - * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k] - * @param [in] lda leading dimension of A - * @param [in] B such a matrix that the shape of column-major opA(B) is [k, n] - * @param [in] ldb leading dimension of B - * @param [in] beta host or device scalar - * @param [inout] C column-major matrix of size [m, n] - * @param [in] ldc leading dimension of C - * @param [in] stream + * DISCLAIMER: this file is deprecated: use gemm.cuh instead */ -template -void gemm(const raft::handle_t& handle, - const bool trans_a, - const bool trans_b, - const int m, - const int n, - const int k, - const math_t* alpha, - const math_t* A, - const int lda, - const math_t* B, - const int ldb, - const math_t* beta, - math_t* C, - const int ldc, - cudaStream_t stream) -{ - detail::gemm( - handle, trans_a, trans_b, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, stream); -} -/** - * @brief the wrapper of cublas gemm function - * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C - * @tparam math_t the type of input/output matrices - * @param handle raft handle - * @param a input matrix - * @param n_rows_a number of rows of A - * @param n_cols_a number of columns of A - * @param b input matrix - * @param c output matrix - * @param n_rows_c number of rows of C - * @param n_cols_c number of columns of C - * @param trans_a cublas transpose op for A - * @param trans_b cublas transpose op for B - * @param alpha scalar - * @param beta scalar - * @param stream cuda stream - */ -template -void gemm(const raft::handle_t& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - math_t alpha, - math_t beta, - cudaStream_t stream) -{ - detail::gemm( - handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); -} - -/** - * @brief the wrapper of cublas gemm function - * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . 
C - * @tparam math_t the type of input/output matrices - * @param handle raft handle - * @param a input matrix - * @param n_rows_a number of rows of A - * @param n_cols_a number of columns of A - * @param b input matrix - * @param c output matrix - * @param n_rows_c number of rows of C - * @param n_cols_c number of columns of C - * @param trans_a cublas transpose op for A - * @param trans_b cublas transpose op for B - * @param stream cuda stream - */ -template -void gemm(const raft::handle_t& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - cudaStream_t stream) -{ - detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream); -} - -/** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible - * combinations of operand layouts. - * It computes the following equation: Z = alpha . X * Y + beta . Z - * @tparam T Data type of input/output matrices (float/double) - * @param handle raft handle - * @param z output matrix of size M rows x N columns - * @param x input matrix of size M rows x K columns - * @param y input matrix of size K rows x N columns - * @param _M number of rows of X and Z - * @param _N number of rows of Y and columns of Z - * @param _K number of columns of X and rows of Y - * @param isZColMajor Storage layout of Z. true = col major, false = row major - * @param isXColMajor Storage layout of X. true = col major, false = row major - * @param isYColMajor Storage layout of Y. true = col major, false = row major - * @param stream cuda stream - * @param alpha scalar - * @param beta scalar - */ -template -void gemm(const raft::handle_t& handle, - T* z, - T* x, - T* y, - int _M, - int _N, - int _K, - bool isZColMajor, - bool isXColMajor, - bool isYColMajor, - cudaStream_t stream, - T alpha = T(1.0), - T beta = T(0.0)) -{ - detail::gemm( - handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta); -} +#pragma once -} // end namespace linalg -} // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif +#include "gemm.cuh" diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp index 3b6b60263b..8161631fd3 100644 --- a/cpp/include/raft/linalg/gemv.hpp +++ b/cpp/include/raft/linalg/gemv.hpp @@ -18,200 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __GEMV_H -#define __GEMV_H - -#pragma once - -#include "detail/gemv.hpp" - -namespace raft { -namespace linalg { - -/** - * @brief the wrapper of cublas gemv function - * It computes the following equation: y = alpha .* op(A) * x + beta .* y - * - * @tparam math_t the element type - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @param [in] handle raft handle - * @param [in] trans_a cublas transpose op for A - * @param [in] m number of rows of A - * @param [in] n number of columns of A - * @param [in] alpha host or device scalar - * @param [in] A column-major matrix of size [m, n] - * @param [in] lda leading dimension of A - * @param [in] x vector of length n if trans_a else m - * @param [in] incx stride between consecutive elements of x - * @param [in] beta host or device scalar - * @param [inout] y vector of length m if trans_a else n - * @param [in] incy stride between consecutive elements of y - * @param [in] stream - */ -template -void gemv(const raft::handle_t& handle, - const bool trans_a, - const int m, - const int n, - const math_t* alpha, - const math_t* A, - const int lda, - const math_t* x, - const int incx, - const math_t* beta, - math_t* y, - const int incy, - cudaStream_t stream) -{ - detail::gemv( - handle, trans_a, m, n, alpha, A, lda, x, incx, beta, y, incy, stream); -} - -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows, - const int n_cols, - const math_t* x, - const int incx, - math_t* y, - const int incy, - const bool trans_a, - const math_t alpha, - const math_t beta, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows, n_cols, x, incx, y, incy, trans_a, alpha, beta, stream); -} - -/** - * y = alpha * op(A) * x + beta * y - * - * where - * - * @param handle raft handle - * @param A is a column-major matrix of size n_rows_a * n_cols_a. - * op(A) is either the transpose operation (trans_a == true) or identity. - * @param n_rows_a number of rows in A - * @param n_cols_a number of cols in A - * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. - * @param trans_a whether to take transpose of a - * @param alpha is a scalar scale of Ax. - * @param beta is a scalar scale of y. - * @param stream stream on which this function is run - */ -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows_a, - const int n_cols_a, - const math_t* x, - math_t* y, - const bool trans_a, - const math_t alpha, - const math_t beta, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, alpha, beta, stream); -} - /** - * y = op(A) * x - * - * where - * - * @param handle raft handle - * @param A is a column-major matrix of size n_rows_a * n_cols_a. - * op(A) is either the transpose operation (trans_a == true) or identity. - * @param n_rows_a number of rows in A - * @param n_cols_a number of cols in A - * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. 
- * @param trans_a whether to take transpose of a - * @param stream stream on which this function is run + * DISCLAIMER: this file is deprecated: use gemv.cuh instead */ -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows_a, - const int n_cols_a, - const math_t* x, - math_t* y, - const bool trans_a, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, stream); -} -/** - * y = alpha * op(A) * x + beta * y - * - * where - * @param handle raft handle - * @param A is a column-major matrix of size n_rows_a * n_cols_a. - * op(A) is either the transpose operation (trans_a == true) or identity. - * @param n_rows_a number of rows in A - * @param n_cols_a number of cols in A - * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. - * set it when you need to use only the first n_rows_a rows of the matrix A, which has - * (perhaps, due to padding) lda rows. - * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. - * @param trans_a whether to take transpose of a - * @param alpha is a scalar scale of Ax. - * @param beta is a scalar scale of y. - * @param stream stream on which this function is run - */ -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows_a, - const int n_cols_a, - const int lda, - const math_t* x, - math_t* y, - const bool trans_a, - const math_t alpha, - const math_t beta, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); -} - -/** - * y = op(A) * x - * - * where - * @param handle raft handle - * @param A is a column-major matrix of size n_rows_a * n_cols_a. - * op(A) is either the transpose operation (trans_a == true) or identity. - * @param n_rows_a number of rows in A - * @param n_cols_a number of cols in A - * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. - * set it when you need to use only the first n_rows_a rows of the matrix A, which has - * (perhaps, due to padding) lda rows. - * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. - * @param trans_a whether to take transpose of a - * @param stream stream on which this function is run - * - */ -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows_a, - const int n_cols_a, - const int lda, - const math_t* x, - math_t* y, - const bool trans_a, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, stream); -} +#pragma once -}; // namespace linalg -}; // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "gemv.cuh" diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp index db7b0f9cfe..9c59c886c9 100644 --- a/cpp/include/raft/linalg/init.hpp +++ b/cpp/include/raft/linalg/init.hpp @@ -18,48 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __INIT_H -#define __INIT_H - -#pragma once - -#include "detail/init.hpp" - -namespace raft { -namespace linalg { - /** - * @brief Like Python range. - * - * Fills the output as out[i] = i. 
- * - * \param [out] out device array, size [end-start] - * \param [in] start of the range - * \param [in] end of range (exclusive) - * \param [in] stream cuda stream + * DISCLAIMER: this file is deprecated: use init.cuh instead */ -template -void range(T* out, int start, int end, cudaStream_t stream) -{ - detail::range(out, start, end, stream); -} -/** - * @brief Like Python range. - * - * Fills the output as out[i] = i. - * - * \param [out] out device array, size [n] - * \param [in] n length of the array - * \param [in] stream cuda stream - */ -template -void range(T* out, int n, cudaStream_t stream) -{ - detail::range(out, n, stream); -} +#pragma once -} // namespace linalg -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "init.cuh" diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 75e3d11444..0529db6b5b 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -18,150 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __LANCZOS_H -#define __LANCZOS_H - -#pragma once - -#include "detail/lanczos.cuh" -#include - -namespace raft { -namespace linalg { - -// ========================================================= -// Eigensolver -// ========================================================= - /** - * @brief Compute smallest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. Does not include - * Lanczos steps used to estimate largest eigenvalue. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the smallest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th smallest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param iter On exit, pointer to total number of Lanczos - * iterations performed. Does not include Lanczos steps used to - * estimate largest eigenvalue. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Smallest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to smallest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. 
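The eigensolver documented above keeps this exact signature in `lanczos.cuh`. The following call is a sketch only; the pre-built `sparse_matrix_t`, the device-resident output buffers, and all parameter values are illustrative assumptions taken from the doc comment:

```c++
#include <raft/linalg/lanczos.cuh>  // replaces the deprecated lanczos.hpp

// Sketch: compute the 4 smallest eigenpairs of a symmetric sparse matrix.
int smallest_eigs(raft::handle_t const& handle,
                  raft::spectral::matrix::sparse_matrix_t<int, float> const& A,
                  float* eigvals_dev,  // device buffer, 4 entries
                  float* eigvecs_dev)  // device buffer, n * 4 entries
{
  int iters = 0;  // receives the number of Lanczos iterations performed
  return raft::linalg::computeSmallestEigenvectors(
    handle, A, /*nEigVecs=*/4, /*maxIter=*/100, /*restartIter=*/40,
    /*tol=*/1e-4f, /*reorthogonalize=*/false, iters,
    eigvals_dev, eigvecs_dev);
}
```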
+ * DISCLAIMER: this file is deprecated: use lanczos.cuh instead */ -template -int computeSmallestEigenvectors( - handle_t const& handle, - spectral::matrix::sparse_matrix_t const& A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t& iter, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed = 1234567) -{ - return detail::computeSmallestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - iter, - eigVals_dev, - eigVecs_dev, - seed); -} -/** - * @brief Compute largest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. Does not include - * Lanczos steps used to estimate largest eigenvalue. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the largest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th largest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param iter On exit, pointer to total number of Lanczos - * iterations performed. Does not include Lanczos steps used to - * estimate largest eigenvalue. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Largest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to largest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. - */ -template -int computeLargestEigenvectors( - handle_t const& handle, - spectral::matrix::sparse_matrix_t const& A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t& iter, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed = 123456) -{ - return detail::computeLargestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - iter, - eigVals_dev, - eigVecs_dev, - seed); -} +#pragma once -} // namespace linalg -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "lanczos.cuh" diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp index f90cd00ea3..3dfbea0629 100644 --- a/cpp/include/raft/linalg/lstsq.hpp +++ b/cpp/include/raft/linalg/lstsq.hpp @@ -18,109 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __LSTSQ_H -#define __LSTSQ_H - -#pragma once - -#include -#include -namespace raft { -namespace linalg { - -/** Solves the linear ordinary least squares problem `Aw = b` - * Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine. - * - * @param[in] handle raft handle - * @param[inout] A input feature matrix. - * Warning: the content of this matrix is modified by the cuSOLVER routines. - * @param[in] n_rows number of rows in A - * @param[in] n_cols number of columns in A - * @param[inout] b input target vector. - * Warning: the content of this vector is modified by the cuSOLVER routines. - * @param[out] w output coefficient vector - * @param[in] stream cuda stream for ordering operations - */ -template -void lstsqSvdQR(const raft::handle_t& handle, - math_t* A, - const int n_rows, - const int n_cols, - const math_t* b, - math_t* w, - cudaStream_t stream) -{ - detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream); -} - -/** Solves the linear ordinary least squares problem `Aw = b` - * Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER). - * - * @param[in] handle raft handle - * @param[inout] A input feature matrix. - * Warning: the content of this matrix is modified by the cuSOLVER routines. - * @param[in] n_rows number of rows in A - * @param[in] n_cols number of columns in A - * @param[inout] b input target vector. - * Warning: the content of this vector is modified by the cuSOLVER routines. - * @param[out] w output coefficient vector - * @param[in] stream cuda stream for ordering operations - */ -template -void lstsqSvdJacobi(const raft::handle_t& handle, - math_t* A, - const int n_rows, - const int n_cols, - const math_t* b, - math_t* w, - cudaStream_t stream) -{ - detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream); -} - -/** Solves the linear ordinary least squares problem `Aw = b` - * via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A). - * (`w = (A^T A)^-1 A^T b`) +/** + * DISCLAIMER: this file is deprecated: use lstsq.cuh instead */ -template -void lstsqEig(const raft::handle_t& handle, - const math_t* A, - const int n_rows, - const int n_cols, - const math_t* b, - math_t* w, - cudaStream_t stream) -{ - detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream); -} -/** Solves the linear ordinary least squares problem `Aw = b` - * via QR decomposition of `A = QR`. - * (triangular system of equations `Rw = Q^T b`) - * - * @param[in] handle raft handle - * @param[inout] A input feature matrix. - * Warning: the content of this matrix is modified by the cuSOLVER routines. - * @param[in] n_rows number of rows in A - * @param[in] n_cols number of columns in A - * @param[inout] b input target vector. - * Warning: the content of this vector is modified by the cuSOLVER routines. - * @param[out] w output coefficient vector - * @param[in] stream cuda stream for ordering operations - */ -template -void lstsqQR(const raft::handle_t& handle, - math_t* A, - const int n_rows, - const int n_cols, - math_t* b, - math_t* w, - cudaStream_t stream) -{ - detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream); -} +#pragma once -}; // namespace linalg -}; // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "lstsq.cuh" diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp index 235485926b..6502a84edb 100644 --- a/cpp/include/raft/linalg/map_then_reduce.hpp +++ b/cpp/include/raft/linalg/map_then_reduce.hpp @@ -18,79 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MAP_THEN_REDUCE_H -#define __MAP_THEN_REDUCE_H - -#pragma once - -#include "detail/map_then_reduce.cuh" - -namespace raft { -namespace linalg { - /** - * @brief CUDA version of map and then sum reduction operation - * @tparam Type data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @param out the output sum-reduced value (assumed to be a device pointer) - * @param len number of elements in the input array - * @param map the device-lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays + * DISCLAIMER: this file is deprecated: use map_then_reduce.cuh instead */ -template -void mapThenSumReduce( - OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) -{ - detail::mapThenReduceImpl( - out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...); -} - -/** - * @brief CUDA version of map and then generic reduction operation - * @tparam Type data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual map operation - * @tparam ReduceLambda the device-lambda performing the actual reduction - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @param out the output reduced value (assumed to be a device pointer) - * @param len number of elements in the input array - * @param neutral The neutral element of the reduction operation. For example: - * 0 for sum, 1 for multiply, +Inf for Min, -Inf for Max - * @param map the device-lambda - * @param op the reduction device lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays - */ +#pragma once -template -void mapThenReduce(OutType* out, - size_t len, - OutType neutral, - MapOp map, - ReduceLambda op, - cudaStream_t stream, - const InType* in, - Args... args) -{ - detail::mapThenReduceImpl( - out, len, neutral, map, op, stream, in, args...); -} -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "map_then_reduce.cuh" diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp index 574d4aee63..1237961ceb 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.hpp +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -18,93 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MATRIX_VECTOR_OP_H -#define __MATRIX_VECTOR_OP_H - -#pragma once - -#include "detail/matrix_vector_op.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Operations for all the columns or rows with a given vector. - * Caution : Threads process multiple elements to speed up processing. These - * are loaded in a single read thanks to type promotion. 
Faster processing - * would thus only be enabled when adresses are optimally aligned for it. - * Note : the function will also check that the size of the window of accesses - * is a multiple of the number of elements processed by a thread in order to - * enable faster processing - * @tparam Type the matrix/vector type - * @tparam Lambda a device function which represents a binary operator - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads per block of the cuda kernel launched - * @param out the output matrix (passing out = matrix makes it in-place) - * @param matrix the input matrix - * @param vec the vector - * @param D number of columns of matrix - * @param N number of rows of matrix - * @param rowMajor whether input is row or col major - * @param bcastAlongRows whether the broadcast of vector needs to happen along - * the rows of the matrix or columns - * @param op the mathematical operation - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use matrix_vector_op.cuh instead */ -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); -} -/** - * @brief Operations for all the columns or rows with the given vectors. - * Caution : Threads process multiple elements to speed up processing. These - * are loaded in a single read thanks to type promotion. Faster processing - * would thus only be enabled when adresses are optimally aligned for it. - * Note : the function will also check that the size of the window of accesses - * is a multiple of the number of elements processed by a thread in order to - * enable faster processing - * @tparam Type the matrix/vector type - * @tparam Lambda a device function which represents a binary operator - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads per block of the cuda kernel launched - * @param out the output matrix (passing out = matrix makes it in-place) - * @param matrix the input matrix - * @param vec1 the first vector - * @param vec2 the second vector - * @param D number of columns of matrix - * @param N number of rows of matrix - * @param rowMajor whether input is row or col major - * @param bcastAlongRows whether the broadcast of vector needs to happen along - * the rows of the matrix or columns - * @param op the mathematical operation - * @param stream cuda stream where to launch work - */ -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "matrix_vector_op.cuh" diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp index 7a7f03ee18..cbb974e516 100644 --- a/cpp/include/raft/linalg/mean_squared_error.hpp +++ b/cpp/include/raft/linalg/mean_squared_error.hpp @@ -18,35 +18,14 @@ * Please use the cuh version instead. 
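Both `matrixVectorOp` overloads removed above survive unchanged in `matrix_vector_op.cuh`. A sketch of the single-vector form, adding a bias vector to every row of a row-major matrix (the device lambda assumes compilation with nvcc and extended lambdas enabled):

```c++
#include <raft/linalg/matrix_vector_op.cuh>  // replaces matrix_vector_op.hpp

// out[i][j] = matrix[i][j] + vec[j] for a row-major N x D matrix; all
// pointers are device memory, as the removed documentation requires.
void add_bias(float* out, const float* matrix, const float* vec,
              int D, int N, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(
    out, matrix, vec, D, N, /*rowMajor=*/true, /*bcastAlongRows=*/true,
    [] __device__(float m, float v) { return m + v; }, stream);
}
```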
*/ -#ifndef __MSE_H -#define __MSE_H - -#pragma once - -#include "detail/mean_squared_error.cuh" - -namespace raft { -namespace linalg { - /** - * @brief CUDA version mean squared error function mean((A-B)**2) - * @tparam math_t data-type upon which the math operation will be performed - * @tparam TPB threads-per-block - * @param out the output mean squared error value (assumed to be a device pointer) - * @param A input array (assumed to be a device pointer) - * @param B input array (assumed to be a device pointer) - * @param len number of elements in the input arrays - * @param weight weight to apply to every term in the mean squared error calculation - * @param stream cuda-stream where to launch this kernel + * DISCLAIMER: this file is deprecated: use mean_squared_error.cuh instead */ -template -void meanSquaredError( - math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) -{ - detail::meanSquaredError(out, A, B, len, weight, stream); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "mean_squared_error.cuh" diff --git a/cpp/include/raft/linalg/multiply.hpp b/cpp/include/raft/linalg/multiply.hpp index eb933cd607..5aa481a894 100644 --- a/cpp/include/raft/linalg/multiply.hpp +++ b/cpp/include/raft/linalg/multiply.hpp @@ -18,35 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MULTIPLY_H -#define __MULTIPLY_H - -#pragma once - -#include "detail/multiply.cuh" - -namespace raft { -namespace linalg { - /** - * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - * @{ + * DISCLAIMER: this file is deprecated: use multiply.cuh instead */ -template -void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) -{ - detail::multiplyScalar(out, in, scalar, len, stream); -} -/** @} */ -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "multiply.cuh" diff --git a/cpp/include/raft/linalg/norm.hpp b/cpp/include/raft/linalg/norm.hpp index 958784d67e..b750367f05 100644 --- a/cpp/include/raft/linalg/norm.hpp +++ b/cpp/include/raft/linalg/norm.hpp @@ -18,82 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __NORM_H -#define __NORM_H - -#pragma once - -#include "detail/norm.cuh" - -namespace raft { -namespace linalg { - -/** different types of norms supported on the input buffers */ -using detail::L1Norm; -using detail::L2Norm; -using detail::NormType; - /** - * @brief Compute row-wise norm of the input matrix and perform fin_op lambda - * - * Row-wise norm is useful while computing pairwise distance matrix, for - * example. - * This is used in many clustering algos like knn, kmeans, dbscan, etc... The - * current implementation is optimized only for bigger values of 'D'. 
- * - * @tparam Type the data type - * @tparam Lambda device final lambda - * @tparam IdxType Integer type used to for addressing - * @param dots the output vector of row-wise dot products - * @param data the input matrix (currently assumed to be row-major) - * @param D number of columns of data - * @param N number of rows of data - * @param type the type of norm to be applied - * @param rowMajor whether the input is row-major or not - * @param stream cuda stream where to launch work - * @param fin_op the final lambda op + * DISCLAIMER: this file is deprecated: use norm.cuh instead */ -template > -void rowNorm(Type* dots, - const Type* data, - IdxType D, - IdxType N, - NormType type, - bool rowMajor, - cudaStream_t stream, - Lambda fin_op = raft::Nop()) -{ - detail::rowNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); -} -/** - * @brief Compute column-wise norm of the input matrix and perform fin_op - * @tparam Type the data type - * @tparam Lambda device final lambda - * @tparam IdxType Integer type used to for addressing - * @param dots the output vector of column-wise dot products - * @param data the input matrix (currently assumed to be row-major) - * @param D number of columns of data - * @param N number of rows of data - * @param type the type of norm to be applied - * @param rowMajor whether the input is row-major or not - * @param stream cuda stream where to launch work - * @param fin_op the final lambda op - */ -template > -void colNorm(Type* dots, - const Type* data, - IdxType D, - IdxType N, - NormType type, - bool rowMajor, - cudaStream_t stream, - Lambda fin_op = raft::Nop()) -{ - detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "norm.cuh" diff --git a/cpp/include/raft/linalg/power.hpp b/cpp/include/raft/linalg/power.hpp index d1506ff7a9..1e4a56d4fb 100644 --- a/cpp/include/raft/linalg/power.hpp +++ b/cpp/include/raft/linalg/power.hpp @@ -18,57 +18,14 @@ * Please use the cuh version instead. 
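Likewise for the norm wrappers: `rowNorm` keeps the call shape shown above in `norm.cuh`, and the `L2Norm` enumerator remains re-exported (per the removed `using detail::L2Norm`). A small sketch computing squared L2 row norms:

```c++
#include <raft/linalg/norm.cuh>  // replaces the deprecated norm.hpp

// Row-wise squared L2 norms of a row-major N x D matrix; `dots` receives
// N values. Both pointers are device memory.
void row_l2(float* dots, const float* data, int D, int N, cudaStream_t stream)
{
  raft::linalg::rowNorm(dots, data, D, N, raft::linalg::L2Norm,
                        /*rowMajor=*/true, stream);
}
```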
*/ -#ifndef __POWER_H -#define __POWER_H - -#pragma once - -#include -#include -#include - -namespace raft { -namespace linalg { - /** - * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - * @{ + * DISCLAIMER: this file is deprecated: use power.cuh instead */ -template -void powerScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) -{ - raft::linalg::unaryOp( - out, in, len, [scalar] __device__(math_t in) { return raft::myPow(in, scalar); }, stream); -} -/** @} */ -/** - * @defgroup BinaryOps Element-wise binary operations on the input buffers - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in1 the first input buffer - * @param in2 the second input buffer - * @param len number of elements in the input buffers - * @param stream cuda stream where to launch work - * @{ - */ -template -void power(math_t* out, const math_t* in1, const math_t* in2, IdxType len, cudaStream_t stream) -{ - raft::linalg::binaryOp( - out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream); -} -/** @} */ +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "power.cuh" diff --git a/cpp/include/raft/linalg/reduce.hpp b/cpp/include/raft/linalg/reduce.hpp index b9cc2c6e9d..b965cfac7b 100644 --- a/cpp/include/raft/linalg/reduce.hpp +++ b/cpp/include/raft/linalg/reduce.hpp @@ -18,69 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __REDUCE_H -#define __REDUCE_H - -#pragma once - -#include "detail/reduce.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Compute reduction of the input matrix along the requested dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*MainLambda)(InType, IdxType);
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*ReduceLambda)(OutType, OutType);
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*FinalLambda)(OutType);
- * @param dots the output reduction vector - * @param data the input matrix - * @param D number of columns - * @param N number of rows - * @param init initial value to use for the reduction - * @param rowMajor input matrix is row-major or not - * @param alongRows whether to reduce along rows or columns - * @param stream cuda stream where to launch work - * @param inplace reduction result added inplace or overwrites old values? - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results + * DISCLAIMER: this file is deprecated: use reduce.cuh instead */ -template , - typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void reduce(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - bool rowMajor, - bool alongRows, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), - ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) -{ - detail::reduce( - dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "reduce.cuh" diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.hpp b/cpp/include/raft/linalg/reduce_cols_by_key.hpp index c24baa60de..70851c2b69 100644 --- a/cpp/include/raft/linalg/reduce_cols_by_key.hpp +++ b/cpp/include/raft/linalg/reduce_cols_by_key.hpp @@ -18,45 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __REDUCE_COLS_BY_KEY -#define __REDUCE_COLS_BY_KEY +/** + * DISCLAIMER: this file is deprecated: use reduce_cols_by_key.cuh instead + */ #pragma once -#include - -namespace raft { -namespace linalg { +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -/** - * @brief Computes the sum-reduction of matrix columns for each given key - * @tparam T the input data type (as well as the output reduced matrix) - * @tparam KeyType data type of the keys - * @tparam IdxType indexing arithmetic type - * @param data the input data (dim = nrows x ncols). This is assumed to be in - * row-major layout - * @param keys keys array (len = ncols). It is assumed that each key in this - * array is between [0, nkeys). In case this is not true, the caller is expected - * to have called make_monotonic primitive to prepare such a contiguous and - * monotonically increasing keys array. - * @param out the output reduced matrix along columns (dim = nrows x nkeys). 
- * This will be assumed to be in row-major layout - * @param nrows number of rows in the input data - * @param ncols number of colums in the input data - * @param nkeys number of unique keys in the keys array - * @param stream cuda stream to launch the kernel onto - */ -template -void reduce_cols_by_key(const T* data, - const KeyIteratorT keys, - T* out, - IdxType nrows, - IdxType ncols, - IdxType nkeys, - cudaStream_t stream) -{ - detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream); -} -}; // end namespace linalg -}; // end namespace raft -#endif \ No newline at end of file +#include "reduce_cols_by_key.cuh" diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.hpp b/cpp/include/raft/linalg/reduce_rows_by_key.hpp index d18a00aa1d..4b5e76ea8f 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.hpp +++ b/cpp/include/raft/linalg/reduce_rows_by_key.hpp @@ -18,102 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __REDUCE_ROWS_BY_KEY -#define __REDUCE_ROWS_BY_KEY - -#pragma once - -#include - -namespace raft { -namespace linalg { - -/** - Small helper function to convert from int->char and char->int - Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars -**/ -template -void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) -{ - detail::convert_array(dst, src, n, st); -} - /** - * @brief Computes the weighted reduction of matrix rows for each given key - * - * @tparam DataIteratorT Random-access iterator type, for reading input matrix - * (may be a simple pointer type) - * @tparam KeysIteratorT Random-access iterator type, for reading input keys - * (may be a simple pointer type) - * - * @param[in] d_A Input data array (lda x nrows) - * @param[in] lda Real row size for input data, d_A - * @param[in] d_keys Keys for each row (1 x nrows) - * @param[in] d_weights Weights for each observation in d_A (1 x nrows) - * @param[out] d_keys_char Scratch memory for conversion of keys to char - * @param[in] nrows Number of rows in d_A and d_keys - * @param[in] ncols Number of data columns in d_A - * @param[in] nkeys Number of unique keys in d_keys - * @param[out] d_sums Row sums by key (ncols x d_keys) - * @param[in] stream CUDA stream + * DISCLAIMER: this file is deprecated: use reduce_rows_by_key.cuh instead */ -template -void reduce_rows_by_key(const DataIteratorT d_A, - int lda, - const KeysIteratorT d_keys, - const WeightT* d_weights, - char* d_keys_char, - int nrows, - int ncols, - int nkeys, - DataIteratorT d_sums, - cudaStream_t stream) -{ - detail::reduce_rows_by_key( - d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream); -} -/** - * @brief Computes the reduction of matrix rows for each given key - * @tparam DataIteratorT Random-access iterator type, for reading input matrix (may be a simple - * pointer type) - * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple - * pointer type) - * @param[in] d_A Input data array (lda x nrows) - * @param[in] lda Real row size for input data, d_A - * @param[in] d_keys Keys for each row (1 x nrows) - * @param d_keys_char Scratch memory for conversion of keys to char - * @param[in] nrows Number of rows in d_A and d_keys - * @param[in] ncols Number of data columns in d_A - * @param[in] nkeys Number of unique keys in d_keys - * @param[out] d_sums Row sums by key (ncols x d_keys) - * @param[in] stream CUDA stream - */ -template -void reduce_rows_by_key(const DataIteratorT d_A, - int lda, - const KeysIteratorT 
d_keys, - char* d_keys_char, - int nrows, - int ncols, - int nkeys, - DataIteratorT d_sums, - cudaStream_t stream) -{ - typedef typename std::iterator_traits::value_type DataType; - reduce_rows_by_key(d_A, - lda, - d_keys, - static_cast(nullptr), - d_keys_char, - nrows, - ncols, - nkeys, - d_sums, - stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "reduce_rows_by_key.cuh" diff --git a/cpp/include/raft/linalg/rsvd.hpp b/cpp/include/raft/linalg/rsvd.hpp index ac6e13b555..7e2fffba75 100644 --- a/cpp/include/raft/linalg/rsvd.hpp +++ b/cpp/include/raft/linalg/rsvd.hpp @@ -18,131 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __RSVD_H -#define __RSVD_H - -#pragma once - -#include - -namespace raft { -namespace linalg { - /** - * @brief randomized singular value decomposition (RSVD) on the column major - * float type input matrix (Jacobi-based), by specifying no. of PCs and - * upsamples directly - * @param handle: raft handle - * @param M: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param S_vec: singular values of input matrix - * @param U: left singular values of input matrix - * @param V: right singular values of input matrix - * @param k: no. of singular values to be computed - * @param p: no. of upsamples - * @param use_bbt: whether use eigen decomposition in computation or not - * @param gen_left_vec: left vector needs to be generated or not? - * @param gen_right_vec: right vector needs to be generated or not? - * @param use_jacobi: whether to jacobi solver for decomposition - * @param tol: tolerance for Jacobi-based solvers - * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers - * @param stream cuda stream + * DISCLAIMER: this file is deprecated: use rsvd.cuh instead */ -template -void rsvdFixedRank(const raft::handle_t& handle, - math_t* M, - int n_rows, - int n_cols, - math_t* S_vec, - math_t* U, - math_t* V, - int k, - int p, - bool use_bbt, - bool gen_left_vec, - bool gen_right_vec, - bool use_jacobi, - math_t tol, - int max_sweeps, - cudaStream_t stream) -{ - detail::rsvdFixedRank(handle, - M, - n_rows, - n_cols, - S_vec, - U, - V, - k, - p, - use_bbt, - gen_left_vec, - gen_right_vec, - use_jacobi, - tol, - max_sweeps, - stream); -} -/** - * @brief randomized singular value decomposition (RSVD) on the column major - * float type input matrix (Jacobi-based), by specifying the PC and upsampling - * ratio - * @param handle: raft handle - * @param M: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param S_vec: singular values of input matrix - * @param U: left singular values of input matrix - * @param V: right singular values of input matrix - * @param PC_perc: percentage of singular values to be computed - * @param UpS_perc: upsampling percentage - * @param use_bbt: whether use eigen decomposition in computation or not - * @param gen_left_vec: left vector needs to be generated or not? - * @param gen_right_vec: right vector needs to be generated or not? 
- * @param use_jacobi: whether to jacobi solver for decomposition - * @param tol: tolerance for Jacobi-based solvers - * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers - * @param stream cuda stream - */ -template -void rsvdPerc(const raft::handle_t& handle, - math_t* M, - int n_rows, - int n_cols, - math_t* S_vec, - math_t* U, - math_t* V, - math_t PC_perc, - math_t UpS_perc, - bool use_bbt, - bool gen_left_vec, - bool gen_right_vec, - bool use_jacobi, - math_t tol, - int max_sweeps, - cudaStream_t stream) -{ - detail::rsvdPerc(handle, - M, - n_rows, - n_cols, - S_vec, - U, - V, - PC_perc, - UpS_perc, - use_bbt, - gen_left_vec, - gen_right_vec, - use_jacobi, - tol, - max_sweeps, - stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "rsvd.cuh" diff --git a/cpp/include/raft/linalg/sqrt.hpp b/cpp/include/raft/linalg/sqrt.hpp index 9c66ee2d14..e0f77f0ab9 100644 --- a/cpp/include/raft/linalg/sqrt.hpp +++ b/cpp/include/raft/linalg/sqrt.hpp @@ -18,36 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SQRT_H -#define __SQRT_H - -#pragma once - -#include -#include - -namespace raft { -namespace linalg { - /** - * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in the input buffer - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - * @{ + * DISCLAIMER: this file is deprecated: use sqrt.cuh instead */ -template -void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream) -{ - raft::linalg::unaryOp( - out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream); -} -/** @} */ -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "sqrt.cuh" diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp index 3b1597dfc3..6720a302ea 100644 --- a/cpp/include/raft/linalg/strided_reduction.hpp +++ b/cpp/include/raft/linalg/strided_reduction.hpp @@ -18,64 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __STRIDED_REDUCTION_H -#define __STRIDED_REDUCTION_H - -#pragma once - -#include "detail/strided_reduction.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Compute reduction of the input matrix along the strided dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*MainLambda)(InType, IdxType);
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*ReduceLambda)(OutType, OutType);
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*FinalLambda)(OutType);
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use strided_reduction.cuh instead */ -template , - typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void stridedReduction(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), - ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) -{ - detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "strided_reduction.cuh" diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp index accf57a939..b0c6508ffe 100644 --- a/cpp/include/raft/linalg/subtract.hpp +++ b/cpp/include/raft/linalg/subtract.hpp @@ -18,77 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SUBTRACT_H -#define __SUBTRACT_H - -#pragma once - -#include "detail/subtract.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Elementwise scalar subtraction operation on the input buffer - * - * @tparam InT input data-type. Also the data-type upon which the math ops - * will be performed - * @tparam OutT output data-type - * @tparam IdxType Integer type used to for addressing - * - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use subtract.cuh instead */ -template -void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) -{ - detail::subtractScalar(out, in, scalar, len, stream); -} -/** - * @brief Elementwise subtraction operation on the input buffers - * @tparam InT input data-type. 
Also the data-type upon which the math ops - * will be performed - * @tparam OutT output data-type - * @tparam IdxType Integer type used to for addressing - * - * @param out the output buffer - * @param in1 the first input buffer - * @param in2 the second input buffer - * @param len number of elements in the input buffers - * @param stream cuda stream where to launch work - */ -template -void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) -{ - detail::subtract(out, in1, in2, len, stream); -} - -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and - * write result to outDev[i] - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param outDev the output buffer - * @param inDev the input buffer - * @param singleScalarDev pointer to the scalar located in device memory - * @param len number of elements in the input and output buffer - * @param stream cuda stream - * @remark block size has not been tuned - */ -template -void subtractDevScalar(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len, - cudaStream_t stream) -{ - detail::subtractDevScalar(outDev, inDev, singleScalarDev, len, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "subtract.cuh" diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp index 01788a4188..26bce80388 100644 --- a/cpp/include/raft/linalg/svd.hpp +++ b/cpp/include/raft/linalg/svd.hpp @@ -18,176 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SVD_H -#define __SVD_H - -#pragma once - -#include "detail/svd.cuh" - -namespace raft { -namespace linalg { - /** - * @brief singular value decomposition (SVD) on the column major float type - * input matrix using QR method - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param sing_vals: singular values of input matrix - * @param left_sing_vecs: left singular values of input matrix - * @param right_sing_vecs: right singular values of input matrix - * @param trans_right: transpose right vectors or not - * @param gen_left_vec: generate left eig vector. Not activated. - * @param gen_right_vec: generate right eig vector. Not activated. - * @param stream cuda stream + * DISCLAIMER: this file is deprecated: use svd.cuh instead */ -// TODO: activate gen_left_vec and gen_right_vec options -// TODO: couldn't template this function due to cusolverDnSgesvd and -// cusolverSnSgesvd. Check if there is any other way. 
-template -void svdQR(const raft::handle_t& handle, - T* in, - int n_rows, - int n_cols, - T* sing_vals, - T* left_sing_vecs, - T* right_sing_vecs, - bool trans_right, - bool gen_left_vec, - bool gen_right_vec, - cudaStream_t stream) -{ - detail::svdQR(handle, - in, - n_rows, - n_cols, - sing_vals, - left_sing_vecs, - right_sing_vecs, - trans_right, - gen_left_vec, - gen_right_vec, - stream); -} - -template -void svdEig(const raft::handle_t& handle, - T* in, - int n_rows, - int n_cols, - T* S, - T* U, - T* V, - bool gen_left_vec, - cudaStream_t stream) -{ - detail::svdEig(handle, in, n_rows, n_cols, S, U, V, gen_left_vec, stream); -} -/** - * @brief on the column major input matrix using Jacobi method - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param sing_vals: singular values of input matrix - * @param left_sing_vecs: left singular vectors of input matrix - * @param right_sing_vecs: right singular vectors of input matrix - * @param gen_left_vec: generate left eig vector. Not activated. - * @param gen_right_vec: generate right eig vector. Not activated. - * @param tol: error tolerance for the jacobi method. Algorithm stops when the - * error is below tol - * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more the better - * accuracy. - * @param stream cuda stream - */ -template -void svdJacobi(const raft::handle_t& handle, - math_t* in, - int n_rows, - int n_cols, - math_t* sing_vals, - math_t* left_sing_vecs, - math_t* right_sing_vecs, - bool gen_left_vec, - bool gen_right_vec, - math_t tol, - int max_sweeps, - cudaStream_t stream) -{ - detail::svdJacobi(handle, - in, - n_rows, - n_cols, - sing_vals, - left_sing_vecs, - right_sing_vecs, - gen_left_vec, - gen_right_vec, - tol, - max_sweeps, - stream); -} - -/** - * @brief reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param U: left singular vectors of size n_rows x k - * @param S: square matrix with singular values on its diagonal, k x k - * @param V: right singular vectors of size n_cols x k - * @param out: reconstructed matrix to be returned - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values - * @param stream cuda stream - */ -template -void svdReconstruction(const raft::handle_t& handle, - math_t* U, - math_t* S, - math_t* V, - math_t* out, - int n_rows, - int n_cols, - int k, - cudaStream_t stream) -{ - detail::svdReconstruction(handle, U, S, V, out, n_rows, n_cols, k, stream); -} - -/** - * @brief reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param A_d: input matrix - * @param U: left singular vectors of size n_rows x k - * @param S_vec: singular values as a vector - * @param V: right singular vectors of size n_cols x k - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values to be computed, 1.0 for normal SVD - * @param tol: tolerance for the evaluation - * @param stream cuda stream - */ -template -bool evaluateSVDByL2Norm(const raft::handle_t& handle, - math_t* A_d, - math_t* U, - math_t* S_vec, - math_t* V, - int n_rows, - int n_cols, - int k, - math_t tol, - cudaStream_t stream) -{ - return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, k, tol, stream); -} +#pragma once 
-}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "svd.cuh" diff --git a/cpp/include/raft/linalg/ternary_op.hpp b/cpp/include/raft/linalg/ternary_op.hpp index bce9eacb11..58dab89609 100644 --- a/cpp/include/raft/linalg/ternary_op.hpp +++ b/cpp/include/raft/linalg/ternary_op.hpp @@ -18,42 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __TERNARY_OP_H -#define __TERNARY_OP_H - -#pragma once - -#include - -namespace raft { -namespace linalg { /** - * @brief perform element-wise ternary operation on the input arrays - * @tparam math_t data-type upon which the math operation will be performed - * @tparam Lambda the device-lambda performing the actual operation - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in1 the first input array - * @param in2 the second input array - * @param in3 the third input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use ternary_op.cuh instead */ -template -void ternaryOp(math_t* out, - const math_t* in1, - const math_t* in2, - const math_t* in3, - IdxType len, - Lambda op, - cudaStream_t stream) -{ - detail::ternaryOp(out, in1, in2, in3, len, op, stream); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "ternary_op.cuh" diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp index caa6bafedf..4c3f9224e4 100644 --- a/cpp/include/raft/linalg/transpose.hpp +++ b/cpp/include/raft/linalg/transpose.hpp @@ -18,49 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __TRANSPOSE_H -#define __TRANSPOSE_H - -#pragma once - -#include "detail/transpose.cuh" - -namespace raft { -namespace linalg { - /** - * @brief transpose on the column major input matrix using Jacobi method - * @param handle: raft handle - * @param in: input matrix - * @param out: output. Transposed input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream: cuda stream + * DISCLAIMER: this file is deprecated: use transpose.cuh instead */ -template -void transpose(const raft::handle_t& handle, - math_t* in, - math_t* out, - int n_rows, - int n_cols, - cudaStream_t stream) -{ - detail::transpose(handle, in, out, n_rows, n_cols, stream); -} -/** - * @brief transpose on the column major input matrix using Jacobi method - * @param inout: input and output matrix - * @param n: number of rows and columns of input matrix - * @param stream: cuda stream - */ -template -void transpose(math_t* inout, int n, cudaStream_t stream) -{ - detail::transpose(inout, n, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "transpose.cuh" diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp index ca1e3f9875..2ace126ff1 100644 --- a/cpp/include/raft/linalg/unary_op.hpp +++ b/cpp/include/raft/linalg/unary_op.hpp @@ -18,65 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __UNARY_OP_H -#define __UNARY_OP_H - -#pragma once - -#include "detail/unary_op.cuh" - -namespace raft { -namespace linalg { - /** - * @brief perform element-wise unary operation in the input array - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in the input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val);` + * DISCLAIMER: this file is deprecated: use unary_op.cuh instead */ -template -void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) -{ - detail::unaryOpCaller(out, in, len, op, stream); -} -/** - * @brief Perform an element-wise unary operation into the output array - * - * Compared to `unaryOp()`, this method does not do any reads from any inputs - * - * @tparam OutType output data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * - * @param[out] out the output array [on device] [len = len] - * @param[in] len number of elements in the input array - * @param[in] op the device-lambda which must be of the form: - * `void func(OutType* outLocationOffset, IdxType idx);` - * where outLocationOffset will be out + idx. - * @param[in] stream cuda stream where to launch work - */ -template -void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) -{ - detail::writeOnlyUnaryOpCaller(out, len, op, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif +#include "unary_op.cuh" diff --git a/cpp/include/raft/matrix/col_wise_sort.hpp b/cpp/include/raft/matrix/col_wise_sort.hpp index 83a8738219..60c36db9e2 100644 --- a/cpp/include/raft/matrix/col_wise_sort.hpp +++ b/cpp/include/raft/matrix/col_wise_sort.hpp @@ -18,44 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __COL_WISE_SORT_H -#define __COL_WISE_SORT_H +/** + * DISCLAIMER: this file is deprecated: use col_wise_sort.cuh instead + */ #pragma once -#include - -namespace raft { -namespace matrix { - -/** - * @brief sort columns within each row of row-major input matrix and return sorted indexes - * modelled as key-value sort with key being input matrix and value being index of values - * @param in: input matrix - * @param out: output value(index) matrix - * @param n_rows: number rows of input matrix - * @param n_columns: number columns of input matrix - * @param bAllocWorkspace: check returned value, if true allocate workspace passed in workspaceSize - * @param workspacePtr: pointer to workspace memory - * @param workspaceSize: Size of workspace to be allocated - * @param stream: cuda stream to execute prim on - * @param sortedKeys: Optional, output matrix for sorted keys (input) - */ -template -void sort_cols_per_row(const InType* in, - OutType* out, - int n_rows, - int n_columns, - bool& bAllocWorkspace, - void* workspacePtr, - size_t& workspaceSize, - cudaStream_t stream, - InType* sortedKeys = nullptr) -{ - detail::sortColumnsPerRow( - in, out, n_rows, n_columns, bAllocWorkspace, workspacePtr, workspaceSize, stream, sortedKeys); -} -}; // end namespace matrix -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "col_wise_sort.cuh" diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index 7409140d7c..428c914784 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -18,265 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MATRIX_H -#define __MATRIX_H - -#pragma once - -#include "detail/linewise_op.cuh" -#include "detail/matrix.cuh" - -#include - -namespace raft { -namespace matrix { - -using namespace std; - -/** - * @brief Copy selected rows of the input matrix into contiguous space. - * - * On exit out[i + k*n_rows] = in[indices[i] + k*n_rows], - * where i = 0..n_rows_indices-1, and k = 0..n_cols-1. - * - * @param in input matrix - * @param n_rows number of rows of output matrix - * @param n_cols number of columns of output matrix - * @param out output matrix - * @param indices of the rows to be copied - * @param n_rows_indices number of rows to copy - * @param stream cuda stream - * @param rowMajor whether the matrix has row major layout - */ -template -void copyRows(const m_t* in, - idx_t n_rows, - idx_t n_cols, - m_t* out, - const idx_array_t* indices, - idx_t n_rows_indices, - cudaStream_t stream, - bool rowMajor = false) -{ - detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, rowMajor); -} - -/** - * @brief copy matrix operation for column major matrices. - * @param in: input matrix - * @param out: output matrix - * @param n_rows: number of rows of output matrix - * @param n_cols: number of columns of output matrix - * @param stream: cuda stream - */ -template -void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - raft::copy_async(out, in, n_rows * n_cols, stream); -} - -/** - * @brief copy matrix operation for column major matrices. First n_rows and - * n_cols of input matrix "in" is copied to "out" matrix. 
- * @param in: input matrix - * @param in_n_rows: number of rows of input matrix - * @param out: output matrix - * @param out_n_rows: number of rows of output matrix - * @param out_n_cols: number of columns of output matrix - * @param stream: cuda stream - */ -template -void truncZeroOrigin( - m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) -{ - detail::truncZeroOrigin(in, in_n_rows, out, out_n_rows, out_n_cols, stream); -} - -/** - * @brief Columns of a column major matrix are reversed (i.e. first column and - * last column are swapped) - * @param inout: input and output matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream - */ -template -void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - detail::colReverse(inout, n_rows, n_cols, stream); -} - -/** - * @brief Rows of a column major matrix are reversed (i.e. first row and last - * row are swapped) - * @param inout: input and output matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream - */ -template -void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - detail::rowReverse(inout, n_rows, n_cols, stream); -} - -/** - * @brief Prints the data stored in GPU memory - * @param in: input matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param h_separator: horizontal separator character - * @param v_separator: vertical separator character - * @param stream: cuda stream - */ -template -void print(const m_t* in, - idx_t n_rows, - idx_t n_cols, - char h_separator = ' ', - char v_separator = '\n', - cudaStream_t stream = rmm::cuda_stream_default) -{ - detail::print(in, n_rows, n_cols, h_separator, v_separator, stream); -} - -/** - * @brief Prints the data stored in CPU memory - * @param in: input matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - */ -template -void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) -{ - detail::printHost(in, n_rows, n_cols); -} - -/** - * @brief Slice a matrix into a separate output matrix - * @param in: input matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param out: output matrix - * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) - * @param x2, y2: coordinate of the bottom-right point of the wanted area - * (1-based) - * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice_matrix(M_d, 4, - * 3, 0, 1, 4, 3); - * @param stream: cuda stream - */ -template -void sliceMatrix(m_t* in, - idx_t n_rows, - idx_t n_cols, - m_t* out, - idx_t x1, - idx_t y1, - idx_t x2, - idx_t y2, - cudaStream_t stream) -{ - detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream); -} - /** - * @brief Copy the upper triangular part of a matrix to another - * @param src: input matrix with a size of n_rows x n_cols - * @param dst: output matrix with a size of k x k, k = min(n_rows, n_cols) - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream + * DISCLAIMER: this file is deprecated: use matrix.cuh instead */ -template -void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - detail::copyUpperTriangular(src, dst, n_rows, n_cols,
stream); -} -/** - * @brief Initialize a diagonal matrix with a vector - * @param vec: vector of length k = min(n_rows, n_cols) - * @param matrix: matrix of size n_rows x n_cols - * @param n_rows: number of rows of the matrix - * @param n_cols: number of columns of the matrix - * @param stream: cuda stream - */ -template -void initializeDiagonalMatrix( - m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream); -} - -/** - * @brief Get a square matrix with elements on diagonal reversed (in-place) - * @param in: square input matrix with size len x len - * @param len: size of one side of the matrix - * @param stream: cuda stream - */ -template -void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) -{ - detail::getDiagonalInverseMatrix(in, len, stream); -} - -/** - * @brief Get the L2/F-norm of a matrix/vector - * @param handle - * @param in: input matrix/vector with totally size elements - * @param size: size of the matrix/vector - * @param stream: cuda stream - */ -template -m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) -{ - return detail::getL2Norm(handle, in, size, stream); -} - -/** - * Run a function over matrix lines (rows or columns) with a variable number - * row-vectors or column-vectors. - * The term `line` here signifies that the lines can be either columns or rows, - * depending on the matrix layout. - * What matters is if the vectors are applied along lines (indices of vectors correspond to - * indices within lines), or across lines (indices of vectors correspond to line numbers). - * - * @param [out] out result of the operation; can be same as `in`; should be aligned the same - * as `in` to allow faster vectorized memory transfers. - * @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long. - * @param [in] lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in - * col-major) - * @param [in] nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major) - * @param [in] alongLines whether vectors are indices along or across lines. - * @param [in] op the operation applied on each line: - * for i in [0..lineLen) and j in [0..nLines): - * out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true - * out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false - * where matrix indexing is row-major ([i, j] = [i + lineLen * j]). - * @param [in] stream a cuda stream for the kernels - * @param [in] vecs zero or more vectors to be passed as arguments, - * size of each vector is `alongLines ? lineLen : nLines`. - */ -template -void linewiseOp(m_t* out, - const m_t* in, - const idx_t lineLen, - const idx_t nLines, - const bool alongLines, - Lambda op, - cudaStream_t stream, - Vecs... vecs) -{ - common::nvtx::range fun_scope("linewiseOp-%c-%zu (%zu, %zu)", - alongLines ? 'l' : 'x', - sizeof...(Vecs), - size_t(lineLen), - size_t(nLines)); - detail::MatrixLinewiseOp<16, 256>::run( - out, in, lineLen, nLines, alongLines, op, stream, vecs...); -} +#pragma once -}; // end namespace matrix -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "matrix.cuh" diff --git a/cpp/include/raft/random/make_regression.hpp b/cpp/include/raft/random/make_regression.hpp index 4f6b2717f6..f3e2113f80 100644 --- a/cpp/include/raft/random/make_regression.hpp +++ b/cpp/include/raft/random/make_regression.hpp @@ -13,98 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -/* Adapted from scikit-learn - * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/_samples_generator.py - */ - /** * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ -#ifndef __MAKE_REGRESSION_H -#define __MAKE_REGRESSION_H - -#pragma once - -#include - -#include "detail/make_regression.cuh" - -namespace raft::random { - /** - * @brief GPU-equivalent of sklearn.datasets.make_regression as documented at: - * https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html - * - * @tparam DataT Scalar type - * @tparam IdxT Index type - * - * @param[in] handle RAFT handle - * @param[out] out Row-major (samples, features) matrix to store - * the problem data - * @param[out] values Row-major (samples, targets) matrix to store - * the values for the regression problem - * @param[in] n_rows Number of samples - * @param[in] n_cols Number of features - * @param[in] n_informative Number of informative features (non-zero - * coefficients) - * @param[in] stream CUDA stream - * @param[out] coef Row-major (features, targets) matrix to store - * the coefficients used to generate the values - * for the regression problem. If nullptr is - * given, nothing will be written - * @param[in] n_targets Number of targets (generated values per sample) - * @param[in] bias A scalar that will be added to the values - * @param[in] effective_rank The approximate rank of the data matrix (used - * to create correlations in the data). -1 is the - * code to use well-conditioned data - * @param[in] tail_strength The relative importance of the fat noisy tail - * of the singular values profile if - * effective_rank is not -1 - * @param[in] noise Standard deviation of the gaussian noise - * applied to the output - * @param[in] shuffle Shuffle the samples and the features - * @param[in] seed Seed for the random number generator - * @param[in] type Random generator type + * DISCLAIMER: this file is deprecated: use make_regression.cuh instead */ -template -void make_regression(const raft::handle_t& handle, - DataT* out, - DataT* values, - IdxT n_rows, - IdxT n_cols, - IdxT n_informative, - cudaStream_t stream, - DataT* coef = nullptr, - IdxT n_targets = (IdxT)1, - DataT bias = (DataT)0.0, - IdxT effective_rank = (IdxT)-1, - DataT tail_strength = (DataT)0.5, - DataT noise = (DataT)0.0, - bool shuffle = true, - uint64_t seed = 0ULL, - GeneratorType type = GenPhilox) -{ - detail::make_regression_caller(handle, - out, - values, - n_rows, - n_cols, - n_informative, - stream, - coef, - n_targets, - bias, - effective_rank, - tail_strength, - noise, - shuffle, - seed, - type); -} -} // namespace raft::random +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "make_regression.cuh" diff --git a/cpp/include/raft/random/multi_variable_gaussian.hpp b/cpp/include/raft/random/multi_variable_gaussian.hpp index 6b85ec6a14..e7d78938a2 100644 --- a/cpp/include/raft/random/multi_variable_gaussian.hpp +++ b/cpp/include/raft/random/multi_variable_gaussian.hpp @@ -18,51 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MVG_H -#define __MVG_H +/** + * DISCLAIMER: this file is deprecated: use multi_variable_gaussian.cuh instead + */ #pragma once -#include "detail/multi_variable_gaussian.cuh" - -namespace raft::random { - -template -class multi_variable_gaussian : public detail::multi_variable_gaussian_impl { - public: - // using Decomposer = typename detail::multi_variable_gaussian_impl::Decomposer; - // using detail::multi_variable_gaussian_impl::Decomposer::chol_decomp; - // using detail::multi_variable_gaussian_impl::Decomposer::jacobi; - // using detail::multi_variable_gaussian_impl::Decomposer::qr; - - multi_variable_gaussian() = delete; - multi_variable_gaussian(const raft::handle_t& handle, - const int dim, - typename detail::multi_variable_gaussian_impl::Decomposer method) - : detail::multi_variable_gaussian_impl{handle, dim, method} - { - } - - std::size_t get_workspace_size() - { - return detail::multi_variable_gaussian_impl::get_workspace_size(); - } - - void set_workspace(T* workarea) - { - detail::multi_variable_gaussian_impl::set_workspace(workarea); - } - - void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0) - { - detail::multi_variable_gaussian_impl::give_gaussian(nPoints, P, X, x); - } - - void deinit() { detail::multi_variable_gaussian_impl::deinit(); } - - ~multi_variable_gaussian() { deinit(); } -}; // end of multi_variable_gaussian - -}; // end of namespace raft::random +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "multi_variable_gaussian.cuh" diff --git a/cpp/include/raft/random/permute.hpp b/cpp/include/raft/random/permute.hpp index 26e22e403b..a2fafa6574 100644 --- a/cpp/include/raft/random/permute.hpp +++ b/cpp/include/raft/random/permute.hpp @@ -18,50 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __PERMUTE_H -#define __PERMUTE_H - -#pragma once - -#include "detail/permute.cuh" - -namespace raft::random { - /** - * @brief Generate permutations of the input array. Pretty useful primitive for - * shuffling the input datasets in ML algos. See note at the end for some of its - * limitations! - * @tparam Type Data type of the array to be shuffled - * @tparam IntType Integer type used for ther perms array - * @tparam IdxType Integer type used for addressing indices - * @tparam TPB threads per block - * @param perms the output permutation indices. Typically useful only when - * one wants to refer back. If you don't need this, pass a nullptr - * @param out the output shuffled array. Pass nullptr if you don't want this to - * be written. For eg: when you only want the perms array to be filled. - * @param in input array (in-place is not supported due to race conditions!) - * @param D number of columns of the input array - * @param N length of the input array (or number of rows) - * @param rowMajor whether the input/output matrices are row or col major - * @param stream cuda stream where to launch the work - * - * @note This is NOT a uniform permutation generator! 
In fact, it only generates - * very small percentage of permutations. If your application really requires a - * high quality permutation generator, it is recommended that you pick - * Knuth Shuffle. + * DISCLAIMER: this file is deprecated: use permute.cuh instead */ -template -void permute(IntType* perms, - Type* out, - const Type* in, - IntType D, - IntType N, - bool rowMajor, - cudaStream_t stream) -{ - detail::permute(perms, out, in, D, N, rowMajor, stream); -} -}; // end namespace raft::random +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "permute.cuh" diff --git a/cpp/include/raft/sparse/linalg/add.hpp b/cpp/include/raft/sparse/linalg/add.hpp index 39ab2d6450..e6930eaee7 100644 --- a/cpp/include/raft/sparse/linalg/add.hpp +++ b/cpp/include/raft/sparse/linalg/add.hpp @@ -18,87 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_ADD_H -#define __SPARSE_ADD_H - -#pragma once - -#include - -namespace raft { -namespace sparse { -namespace linalg { - /** - * @brief Calculate the CSR row_ind array that would result - * from summing together two CSR matrices - * @param a_ind: left hand row_ind array - * @param a_indptr: left hand index_ptr array - * @param a_val: left hand data array - * @param nnz1: size of left hand index_ptr and val arrays - * @param b_ind: right hand row_ind array - * @param b_indptr: right hand index_ptr array - * @param b_val: right hand data array - * @param nnz2: size of right hand index_ptr and val arrays - * @param m: size of output array (number of rows in final matrix) - * @param out_ind: output row_ind array - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use add.cuh instead */ -template -size_t csr_add_calc_inds(const int* a_ind, - const int* a_indptr, - const T* a_val, - int nnz1, - const int* b_ind, - const int* b_indptr, - const T* b_val, - int nnz2, - int m, - int* out_ind, - cudaStream_t stream) -{ - return detail::csr_add_calc_inds( - a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, out_ind, stream); -} -/** - * @brief Calculate the CSR row_ind array that would result - * from summing together two CSR matrices - * @param a_ind: left hand row_ind array - * @param a_indptr: left hand index_ptr array - * @param a_val: left hand data array - * @param nnz1: size of left hand index_ptr and val arrays - * @param b_ind: right hand row_ind array - * @param b_indptr: right hand index_ptr array - * @param b_val: right hand data array - * @param nnz2: size of right hand index_ptr and val arrays - * @param m: size of output array (number of rows in final matrix) - * @param c_ind: output row_ind array - * @param c_indptr: output ind_ptr array - * @param c_val: output data array - * @param stream: cuda stream to use - */ -template -void csr_add_finalize(const int* a_ind, - const int* a_indptr, - const T* a_val, - int nnz1, - const int* b_ind, - const int* b_indptr, - const T* b_val, - int nnz2, - int m, - int* c_ind, - int* c_indptr, - T* c_val, - cudaStream_t stream) -{ - detail::csr_add_finalize( - a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val, stream); -} +#pragma once -}; // end NAMESPACE linalg -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "add.cuh" diff --git a/cpp/include/raft/sparse/linalg/degree.hpp b/cpp/include/raft/sparse/linalg/degree.hpp index 7cece7908e..240cfd452f 100644 --- a/cpp/include/raft/sparse/linalg/degree.hpp +++ b/cpp/include/raft/sparse/linalg/degree.hpp @@ -18,111 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_DEGREE_H -#define __SPARSE_DEGREE_H - -#pragma once - -#include -#include - -namespace raft { -namespace sparse { -namespace linalg { - -/** - * @brief Count the number of values for each row - * @tparam TPB_X: number of threads to use per block - * @param rows: rows array of the COO matrix - * @param nnz: size of the rows array - * @param results: output result array - * @param stream: cuda stream to use - */ -template -void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream) -{ - detail::coo_degree<64, T>(rows, nnz, results, stream); -} - -/** - * @brief Count the number of values for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: type name of underlying values array - * @param in: input COO object for counting rows - * @param results: output array with row counts (size=in->n_rows) - * @param stream: cuda stream to use - */ -template -void coo_degree(COO* in, int* results, cudaStream_t stream) -{ - coo_degree(in->rows(), in->nnz, results, stream); -} - -/** - * @brief Count the number of values for each row that doesn't match a particular scalar - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param rows: Input COO row array - * @param vals: Input COO val arrays - * @param nnz: size of input COO arrays - * @param scalar: scalar to match for counting rows - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_degree_scalar( - const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0) -{ - detail::coo_degree_scalar<64>(rows, vals, nnz, scalar, results, stream); -} - -/** - * @brief Count the number of values for each row that doesn't match a particular scalar - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param in: Input COO array - * @param scalar: scalar to match for counting rows - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_degree_scalar(COO* in, T scalar, int* results, cudaStream_t stream) -{ - coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, results, stream); -} - /** - * @brief Count the number of nonzeros for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param rows: Input COO row array - * @param vals: Input COO val arrays - * @param nnz: size of input COO arrays - * @param results: output row counts - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use degree.cuh instead */ -template -void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream) -{ - detail::coo_degree_nz<64>(rows, vals, nnz, results, stream); -} -/** - * @brief Count the number of nonzero values for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param in: Input COO array - * @param results: output row counts - * @param stream: cuda stream to use - */ -template 
-void coo_degree_nz(COO* in, int* results, cudaStream_t stream) -{ - coo_degree_nz(in->rows(), in->vals(), in->nnz, results, stream); -} +#pragma once -}; // end NAMESPACE linalg -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "degree.cuh" diff --git a/cpp/include/raft/sparse/linalg/norm.hpp b/cpp/include/raft/sparse/linalg/norm.hpp index 1f054e63ab..64261f1178 100644 --- a/cpp/include/raft/sparse/linalg/norm.hpp +++ b/cpp/include/raft/sparse/linalg/norm.hpp @@ -18,61 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_NORM_H -#define __SPARSE_NORM_H - -#pragma once - -#include - -namespace raft { -namespace sparse { -namespace linalg { - /** - * @brief Perform L1 normalization on the rows of a given CSR-formatted sparse matrix - * - * @param ia: row_ind array - * @param vals: data array - * @param nnz: size of data array - * @param m: size of row_ind array - * @param result: l1 normalized data array - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use norm.cuh instead */ -template -void csr_row_normalize_l1(const int* ia, // csr row ex_scan (sorted by row) - const T* vals, - int nnz, // array of values and number of non-zeros - int m, // num rows in csr - T* result, - cudaStream_t stream) -{ // output array - detail::csr_row_normalize_l1(ia, vals, nnz, m, result, stream); -} -/** - * @brief Perform L_inf normalization on a given CSR-formatted sparse matrix - * - * @param ia: row_ind array - * @param vals: data array - * @param nnz: size of data array - * @param m: size of row_ind array - * @param result: l1 normalized data array - * @param stream: cuda stream to use - */ -template -void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) - const T* vals, - int nnz, // array of values and number of non-zeros - int m, // num total rows in csr - T* result, - cudaStream_t stream) -{ - detail::csr_row_normalize_max(ia, vals, nnz, m, result, stream); -} +#pragma once -}; // end NAMESPACE linalg -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "norm.cuh" diff --git a/cpp/include/raft/sparse/linalg/spectral.hpp b/cpp/include/raft/sparse/linalg/spectral.hpp index ff400f1f0f..d7009db03f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.hpp +++ b/cpp/include/raft/sparse/linalg/spectral.hpp @@ -18,31 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_SPECTRAL_H -#define __SPARSE_SPECTRAL_H - -#include -#include +/** + * DISCLAIMER: this file is deprecated: use spectral.cuh instead + */ -namespace raft { -namespace sparse { -namespace spectral { +#pragma once -template -void fit_embedding(const raft::handle_t& handle, - int* rows, - int* cols, - T* vals, - int nnz, - int n, - int n_components, - T* out, - unsigned long long seed = 1234567) -{ - detail::fit_embedding(handle, rows, cols, vals, nnz, n, n_components, out, seed); -} -}; // namespace spectral -}; // namespace sparse -}; // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif +#include "spectral.cuh" diff --git a/cpp/include/raft/sparse/linalg/transpose.hpp b/cpp/include/raft/sparse/linalg/transpose.hpp index c709c20473..a6a0539319 100644 --- a/cpp/include/raft/sparse/linalg/transpose.hpp +++ b/cpp/include/raft/sparse/linalg/transpose.hpp @@ -18,62 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __TRANSPOSE_H -#define __TRANSPOSE_H - -#pragma once - -#include -#include - -namespace raft { -namespace sparse { -namespace linalg { - /** - * Transpose a set of CSR arrays into a set of CSC arrays. - * @tparam value_idx : data type of the CSR index arrays - * @tparam value_t : data type of the CSR data array - * @param[in] handle : used for invoking cusparse - * @param[in] csr_indptr : CSR row index array - * @param[in] csr_indices : CSR column indices array - * @param[in] csr_data : CSR data array - * @param[out] csc_indptr : CSC row index array - * @param[out] csc_indices : CSC column indices array - * @param[out] csc_data : CSC data array - * @param[in] csr_nrows : Number of rows in CSR - * @param[in] csr_ncols : Number of columns in CSR - * @param[in] nnz : Number of nonzeros of CSR - * @param[in] stream : Cuda stream for ordering events + * DISCLAIMER: this file is deprecated: use transpose.cuh instead */ -template -void csr_transpose(const raft::handle_t& handle, - const value_idx* csr_indptr, - const value_idx* csr_indices, - const value_t* csr_data, - value_idx* csc_indptr, - value_idx* csc_indices, - value_t* csc_data, - value_idx csr_nrows, - value_idx csr_ncols, - value_idx nnz, - cudaStream_t stream) -{ - detail::csr_transpose(handle.get_cusparse_handle(), - csr_indptr, - csr_indices, - csr_data, - csc_indptr, - csc_indices, - csc_data, - csr_nrows, - csr_ncols, - nnz, - stream); -} -}; // end NAMESPACE linalg -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "transpose.cuh" diff --git a/cpp/include/raft/sparse/op/filter.hpp b/cpp/include/raft/sparse/op/filter.hpp index 3821d963b0..6a59148fd7 100644 --- a/cpp/include/raft/sparse/op/filter.hpp +++ b/cpp/include/raft/sparse/op/filter.hpp @@ -18,82 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __FILTER_H -#define __FILTER_H - -#pragma once - -#include -#include -#include - -namespace raft { -namespace sparse { -namespace op { - -/** - * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. 
- * - * @param rows: input array of rows (size n) - * @param cols: input array of cols (size n) - * @param vals: input array of vals (size n) - * @param nnz: size of current rows/cols/vals arrays - * @param crows: compressed array of rows - * @param ccols: compressed array of cols - * @param cvals: compressed array of vals - * @param cnnz: array of non-zero counts per row - * @param cur_cnnz: array of counts per row - * @param scalar: scalar to remove from arrays - * @param n: number of rows in dense matrix - * @param stream: cuda stream to use - */ -template -void coo_remove_scalar(const int* rows, - const int* cols, - const T* vals, - int nnz, - int* crows, - int* ccols, - T* cvals, - int* cnnz, - int* cur_cnnz, - T scalar, - int n, - cudaStream_t stream) -{ - detail::coo_remove_scalar<128, T>( - rows, cols, vals, nnz, crows, ccols, cvals, cnnz, cur_cnnz, scalar, n, stream); -} - /** - * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. - * - * @param in: input COO matrix - * @param out: output COO matrix - * @param scalar: scalar to remove from arrays - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use filter.cuh instead */ -template -void coo_remove_scalar(COO* in, COO* out, T scalar, cudaStream_t stream) -{ - detail::coo_remove_scalar<128, T>(in, out, scalar, stream); -} -/** - * @brief Removes zeros from a COO formatted sparse matrix. - * - * @param in: input COO matrix - * @param out: output COO matrix - * @param stream: cuda stream to use - */ -template -void coo_remove_zeros(COO* in, COO* out, cudaStream_t stream) -{ - coo_remove_scalar(in, out, T(0.0), stream); -} +#pragma once -}; // namespace op -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "filter.cuh" diff --git a/cpp/include/raft/sparse/op/reduce.hpp b/cpp/include/raft/sparse/op/reduce.hpp index bb7560fa3d..37923e070c 100644 --- a/cpp/include/raft/sparse/op/reduce.hpp +++ b/cpp/include/raft/sparse/op/reduce.hpp @@ -18,75 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_REDUCE_H -#define __SPARSE_REDUCE_H - -#pragma once - -#include -#include -#include - -namespace raft { -namespace sparse { -namespace op { /** - * Computes a mask from a sorted COO matrix where 0's denote - * duplicate values and 1's denote new values. This mask can - * be useful for computing an exclusive scan to pre-build offsets - * for reducing duplicates, such as when symmetrizing - * or taking the min of each duplicated value. - * - * Note that this function always marks the first value as 0 so that - * a cumulative sum can be performed as a follow-on. However, even - * if the mask is used directly, the first occurrence of each duplicated - * value is still marked with a 1, so the first element can otherwise - * be assumed to be a 1.
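Before moving on: the COO filters quoted above keep their shape in `filter.cuh`. A minimal sketch of the simplest overload, with both COO containers assumed to be managed by the caller (`drop_explicit_zeros` is an illustrative name):

```c++
#include <raft/sparse/op/filter.cuh>  // replaces the deprecated filter.hpp

// Sketch only: compacts `in` by removing explicit zeros and writes the
// result into `out`.
void drop_explicit_zeros(raft::sparse::COO<float>* in,
                         raft::sparse::COO<float>* out,
                         cudaStream_t stream)
{
  raft::sparse::op::coo_remove_zeros(in, out, stream);
}
```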
- * - * @tparam value_idx - * @param[out] mask output mask, size nnz - * @param[in] rows COO rows array, size nnz - * @param[in] cols COO cols array, size nnz - * @param[in] nnz number of nonzeros in input arrays - * @param[in] stream cuda ops will be ordered wrt this stream + * DISCLAIMER: this file is deprecated: use reduce.cuh instead */ -template -void compute_duplicates_mask( - value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream) -{ - detail::compute_duplicates_mask(mask, rows, cols, nnz, stream); -} -/** - * Performs a COO reduce of duplicate columns per row, taking the max weight - * for duplicate columns in each row. This function assumes the input COO - * has been sorted by both row and column but makes no assumption on - * the sorting of values. - * @tparam value_idx - * @tparam value_t - * @param[in] handle - * @param[out] out output COO; the nnz will be computed and allocate() will be called in this function. - * @param[in] rows COO rows array, size nnz - * @param[in] cols COO cols array, size nnz - * @param[in] vals COO vals array, size nnz - * @param[in] nnz number of nonzeros in COO input arrays - * @param[in] m number of rows in COO input matrix - * @param[in] n number of columns in COO input matrix - */ -template -void max_duplicates(const raft::handle_t& handle, - raft::sparse::COO& out, - const value_idx* rows, - const value_idx* cols, - const value_t* vals, - size_t nnz, - size_t m, - size_t n) -{ - detail::max_duplicates(handle, out, rows, cols, vals, nnz, m, n); -} -}; // END namespace op -}; // END namespace sparse -}; // END namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "reduce.cuh" diff --git a/cpp/include/raft/sparse/op/row_op.hpp b/cpp/include/raft/sparse/op/row_op.hpp index ac12432e92..8443f9f090 100644 --- a/cpp/include/raft/sparse/op/row_op.hpp +++ b/cpp/include/raft/sparse/op/row_op.hpp @@ -18,37 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_ROW_OP_H -#define __SPARSE_ROW_OP_H - -#pragma once - -#include -#include - -namespace raft { -namespace sparse { -namespace op { - /** - * @brief Perform a custom row operation on a CSR matrix in batches. - * @tparam T numerical type of row_ind array - * @tparam TPB_X number of threads per block to use for underlying kernel - * @tparam Lambda type of custom operation function - * @param row_ind the CSR row_ind array to perform parallel operations over - * @param n_rows total number of vertices in the graph - * @param nnz number of non-zeros - * @param op custom row operation functor accepting the row and beginning index. - * @param stream cuda stream to use + * DISCLAIMER: this file is deprecated: use row_op.cuh instead */ -template void> -void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream) -{ - detail::csr_row_op(row_ind, n_rows, nnz, op, stream); -} -}; // namespace op -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release."
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "row_op.cuh" diff --git a/cpp/include/raft/sparse/op/slice.hpp b/cpp/include/raft/sparse/op/slice.hpp index 75b7e478e5..4d7e1858de 100644 --- a/cpp/include/raft/sparse/op/slice.hpp +++ b/cpp/include/raft/sparse/op/slice.hpp @@ -18,69 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SLICE_H -#define __SLICE_H - -#pragma once - -#include -#include - -namespace raft { -namespace sparse { -namespace op { - /** - * Slice consecutive rows from a CSR array and populate newly sliced indptr array - * @tparam value_idx - * @param[in] start_row : beginning row to slice - * @param[in] stop_row : ending row to slice - * @param[in] indptr : indptr of input CSR to slice - * @param[out] indptr_out : output sliced indptr to populate - * @param[in] start_offset : beginning column offset of input indptr - * @param[in] stop_offset : ending column offset of input indptr - * @param[in] stream : cuda stream for ordering events + * DISCLAIMER: this file is deprecated: use slice.cuh instead */ -template -void csr_row_slice_indptr(value_idx start_row, - value_idx stop_row, - const value_idx* indptr, - value_idx* indptr_out, - value_idx* start_offset, - value_idx* stop_offset, - cudaStream_t stream) -{ - detail::csr_row_slice_indptr( - start_row, stop_row, indptr, indptr_out, start_offset, stop_offset, stream); -} -/** - * Slice rows from a CSR, populate column and data arrays - * @tparam value_idx : data type of CSR index arrays - * @tparam value_t : data type of CSR data array - * @param[in] start_offset : beginning column offset to slice - * @param[in] stop_offset : ending column offset to slice - * @param[in] indices : column indices array from input CSR - * @param[in] data : data array from input CSR - * @param[out] indices_out : output column indices array - * @param[out] data_out : output data array - * @param[in] stream : cuda stream for ordering events - */ -template -void csr_row_slice_populate(value_idx start_offset, - value_idx stop_offset, - const value_idx* indices, - const value_t* data, - value_idx* indices_out, - value_t* data_out, - cudaStream_t stream) -{ - detail::csr_row_slice_populate( - start_offset, stop_offset, indices, data, indices_out, data_out, stream); -} +#pragma once -}; // namespace op -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "slice.cuh" diff --git a/cpp/include/raft/sparse/op/sort.hpp b/cpp/include/raft/sparse/op/sort.hpp index cd363582fb..867bb1bf35 100644 --- a/cpp/include/raft/sparse/op/sort.hpp +++ b/cpp/include/raft/sparse/op/sort.hpp @@ -18,66 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_SORT_H -#define __SPARSE_SORT_H - -#pragma once - -#include -#include - -namespace raft { -namespace sparse { -namespace op { - /** - * @brief Sorts the arrays that comprise the coo matrix - * by row and then by column. 
- * - * @param m number of rows in coo matrix - * @param n number of cols in coo matrix - * @param nnz number of non-zeros - * @param rows rows array from coo matrix - * @param cols cols array from coo matrix - * @param vals vals array from coo matrix - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use sort.cuh instead */ -template -void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t stream) -{ - detail::coo_sort(m, n, nnz, rows, cols, vals, stream); -} -/** - * @brief Sort the underlying COO arrays by row - * @tparam T: the type name of the underlying value array - * @param in: COO to sort by row - * @param stream: the cuda stream to use - */ -template -void coo_sort(COO* const in, cudaStream_t stream) -{ - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream); -} +#pragma once -/** - * Sorts a COO by its weight - * @tparam value_idx - * @tparam value_t - * @param[inout] rows source edges - * @param[inout] cols dest edges - * @param[inout] data edge weights - * @param[in] nnz number of edges in edge list - * @param[in] stream cuda stream for which to order cuda operations - */ -template -void coo_sort_by_weight( - value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream) -{ - detail::coo_sort_by_weight(rows, cols, data, nnz, stream); -} -}; // namespace op -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "sort.cuh" diff --git a/cpp/include/raft/sparse/selection/connect_components.hpp b/cpp/include/raft/sparse/selection/connect_components.hpp index 25d71367db..b6597babc8 100644 --- a/cpp/include/raft/sparse/selection/connect_components.hpp +++ b/cpp/include/raft/sparse/selection/connect_components.hpp @@ -18,70 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __CONNECT_COMPONENTS_H -#define __CONNECT_COMPONENTS_H - -#include -#include -#include - -namespace raft { -namespace linkage { - -template -using FixConnectivitiesRedOp = detail::FixConnectivitiesRedOp; - /** - * Gets the number of unique components from array of - * colors or labels. This does not assume the components are - * drawn from a monotonically increasing set. - * @tparam value_idx - * @param[in] colors array of components - * @param[in] n_rows size of components array - * @param[in] stream cuda stream for which to order cuda operations - * @return total number of components + * DISCLAIMER: this file is deprecated: use connect_components.cuh instead */ -template -value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream) -{ - return detail::get_n_components(colors, n_rows, stream); -} -/** - * Connects the components of an otherwise unconnected knn graph - * by computing a 1-nn to neighboring components of each data point - * (e.g. component(nn) != component(self)) and reducing the results to - * include the set of smallest destination components for each source - * component. The result will not necessarily contain - * n_components^2 - n_components number of elements because many components - * will likely not be contained in the neighborhoods of 1-nns. - * @tparam value_idx - * @tparam value_t - * @param[in] handle raft handle - * @param[out] out output edge list containing nearest cross-component - * edges. 
- * @param[in] X original (row-major) dense matrix for which knn graph should be constructed. - * @param[in] orig_colors array containing component number for each row of X - * @param[in] n_rows number of rows in X - * @param[in] n_cols number of cols in X - * @param[in] reduction_op - * @param[in] metric - */ -template -void connect_components( - const raft::handle_t& handle, - raft::sparse::COO& out, - const value_t* X, - const value_idx* orig_colors, - size_t n_rows, - size_t n_cols, - red_op reduction_op, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) -{ - detail::connect_components(handle, out, X, orig_colors, n_rows, n_cols, reduction_op, metric); -} +#pragma once -}; // end namespace linkage -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "connect_components.cuh" diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp index bd6dd39fdf..6924e0b5a7 100644 --- a/cpp/include/raft/sparse/selection/knn.hpp +++ b/cpp/include/raft/sparse/selection/knn.hpp @@ -18,90 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_KNN_H -#define __SPARSE_KNN_H - -#pragma once - -#include -#include -#include - -namespace raft { -namespace sparse { -namespace selection { - /** - * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors - * using some distance implementation - * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) - * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) - * @param[in] idxData csr data array of the index matrix (size idxNNZ) - * @param[in] idxNNZ number of non-zeros for sparse index matrix - * @param[in] n_idx_rows number of data samples in index matrix - * @param[in] n_idx_cols - * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) - * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) - * @param[in] queryData csr data array of the query matrix (size queryNNZ) - * @param[in] queryNNZ number of non-zeros for sparse query matrix - * @param[in] n_query_rows number of data samples in query matrix - * @param[in] n_query_cols number of features in query matrix - * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) - * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) - * @param[in] k the number of neighbors to query - * @param[in] handle CUDA handle.get_stream() to order operations with respect to - * @param[in] batch_size_index maximum number of rows to use from index matrix per batch - * @param[in] batch_size_query maximum number of rows to use from query matrix per batch - * @param[in] metric distance metric/measure to use - * @param[in] metricArg potential argument for metric (currently unused) + * DISCLAIMER: this file is deprecated: use knn.cuh instead */ -template -void brute_force_knn(const value_idx* idxIndptr, - const value_idx* idxIndices, - const value_t* idxData, - size_t idxNNZ, - int n_idx_rows, - int n_idx_cols, - const value_idx* queryIndptr, - const value_idx* queryIndices, - const value_t* queryData, - size_t queryNNZ, - int n_query_rows, - int n_query_cols, - value_idx* output_indices, - value_t* output_dists, - int k, - const raft::handle_t& handle, - size_t batch_size_index = 2 << 14, // approx 1M - size_t 
batch_size_query = 2 << 14, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, - float metricArg = 0) -{ - detail::sparse_knn_t(idxIndptr, - idxIndices, - idxData, - idxNNZ, - n_idx_rows, - n_idx_cols, - queryIndptr, - queryIndices, - queryData, - queryNNZ, - n_query_rows, - n_query_cols, - output_indices, - output_dists, - k, - handle, - batch_size_index, - batch_size_query, - metric, - metricArg) - .run(); -} -}; // namespace selection -}; // namespace sparse -}; // namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "knn.cuh" diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp index be47a6a9ef..833bdb61d2 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.hpp +++ b/cpp/include/raft/sparse/selection/knn_graph.hpp @@ -18,51 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __KNN_GRAPH_H -#define __KNN_GRAPH_H - -#pragma once - -#include -#include -#include - -#include - -namespace raft { -namespace sparse { -namespace selection { - /** - * Constructs a (symmetrized) knn graph edge list from - * dense input vectors. - * - * Note: The resulting KNN graph is not guaranteed to be connected. - * - * @tparam value_idx - * @tparam value_t - * @param[in] handle raft handle - * @param[in] X dense matrix of input data samples and observations - * @param[in] m number of data samples (rows) in X - * @param[in] n number of observations (columns) in X - * @param[in] metric distance metric to use when constructing neighborhoods - * @param[out] out output edge list - * @param c + * DISCLAIMER: this file is deprecated: use knn_graph.cuh instead */ -template -void knn_graph(const handle_t& handle, - const value_t* X, - std::size_t m, - std::size_t n, - raft::distance::DistanceType metric, - raft::sparse::COO& out, - int c = 15) -{ - detail::knn_graph(handle, X, m, n, metric, out, c); -} -}; // namespace selection -}; // namespace sparse -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "knn_graph.cuh" diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index fca5d05465..123f6cf70f 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index d4bce1fdf4..9ff19c2747 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp index 7674ac0d46..1f1a3d8f8e 100644 --- a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp +++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp @@ -18,51 +18,14 @@ * Please use the cuh version instead. 
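Reviewer note on `knn_graph`, quoted above: only the header changes. A hedged sketch; the metric choice and the `build_knn_graph` name are illustrative, and the output COO is filled by the primitive:

```c++
#include <raft/sparse/selection/knn_graph.cuh>  // replaces the deprecated knn_graph.hpp

// Sketch only: builds a symmetrized knn graph edge list from a dense
// row-major matrix X with m rows and n columns.
void build_knn_graph(const raft::handle_t& handle,
                     const float* X, std::size_t m, std::size_t n,
                     raft::sparse::COO<float, int>& out)
{
  raft::sparse::selection::knn_graph(
    handle, X, m, n, raft::distance::DistanceType::L2SqrtExpanded, out);
}
```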
*/ -#ifndef __EPSILON_NEIGH_H -#define __EPSILON_NEIGH_H +/** + * DISCLAIMER: this file is deprecated: use epsilon_neighborhood.cuh instead + */ #pragma once -#include - -namespace raft { -namespace spatial { -namespace knn { - -/** - * @brief Computes epsilon neighborhood for the L2-Squared distance metric - * - * @tparam DataT IO and math type - * @tparam IdxT Index type - * - * @param[out] adj adjacency matrix [row-major] [on device] [dim = m x n] - * @param[out] vd vertex degree array [on device] [len = m + 1] - * `vd + m` stores the total number of edges in the adjacency - * matrix. Pass a nullptr if you don't need this info. - * @param[in] x first matrix [row-major] [on device] [dim = m x k] - * @param[in] y second matrix [row-major] [on device] [dim = n x k] - * @param[in] m number of rows in x - * @param[in] n number of rows in y - * @param[in] k number of columns in x and y - * @param[in] eps defines epsilon neighborhood radius (should be passed as - * squared as we compute L2-squared distance in this method) - * @param[in] stream cuda stream - */ -template -void epsUnexpL2SqNeighborhood(bool* adj, - IdxT* vd, - const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - DataT eps, - cudaStream_t stream) -{ - detail::epsUnexpL2SqNeighborhood(adj, vd, x, y, m, n, k, eps, stream); -} -} // namespace knn -} // namespace spatial -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "epsilon_neighborhood.cuh" diff --git a/cpp/include/raft/spatial/knn/specializations.hpp b/cpp/include/raft/spatial/knn/specializations.hpp index 13721a975f..04afb73036 100644 --- a/cpp/include/raft/spatial/knn/specializations.hpp +++ b/cpp/include/raft/spatial/knn/specializations.hpp @@ -18,13 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __KNN_SPECIALIZATIONS_H -#define __KNN_SPECIALIZATIONS_H +/** + * DISCLAIMER: this file is deprecated: use specializations.cuh instead + */ #pragma once -#include -#include -#include +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "specializations.cuh" diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index e6b37f29ec..57553daedf 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -18,95 +18,14 @@ * Please use the cuh version instead.
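The epsilon-neighborhood wrapper above also moves verbatim to its `.cuh`. A minimal sketch; note the deleted documentation's caveat that `eps` is the squared radius (`eps_neighbors` is an illustrative name):

```c++
#include <raft/spatial/knn/epsilon_neighborhood.cuh>  // replaces the deprecated .hpp

// Sketch only: adj (m x n, row-major) and vd (m + 1) are caller-allocated
// device buffers; eps_sq is the squared neighborhood radius.
void eps_neighbors(bool* adj, int* vd, const float* x, const float* y,
                   int m, int n, int k, float eps_sq, cudaStream_t stream)
{
  raft::spatial::knn::epsUnexpL2SqNeighborhood<float, int>(
    adj, vd, x, y, m, n, k, eps_sq, stream);
}
```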
*/ -#ifndef __EIGEN_SOLVERS_H -#define __EIGEN_SOLVERS_H +/** + * DISCLAIMER: this file is deprecated: use eigen_solvers.cuh instead + */ #pragma once -#include -#include - -namespace raft { -namespace spectral { - -// aggregate of control params for Eigen Solver: -// -template -struct eigen_solver_config_t { - size_type_t n_eigVecs; - size_type_t maxIter; - - size_type_t restartIter; - value_type_t tol; - - bool reorthogonalize{false}; - unsigned long long seed{ - 1234567}; // CAVEAT: this default value is now common to all instances of using seed in - // Lanczos; was not the case before: there were places where a default seed = 123456 - // was used; this may trigger slightly different # solver iterations -}; - -template -struct lanczos_solver_t { - explicit lanczos_solver_t( - eigen_solver_config_t const& config) - : config_(config) - { - } - - index_type_t solve_smallest_eigenvectors( - handle_t const& handle, - matrix::sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const - { - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - index_type_t iters{}; - linalg::computeSmallestEigenvectors(handle, - A, - config_.n_eigVecs, - config_.maxIter, - config_.restartIter, - config_.tol, - config_.reorthogonalize, - iters, - eigVals, - eigVecs, - config_.seed); - return iters; - } - - index_type_t solve_largest_eigenvectors( - handle_t const& handle, - matrix::sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const - { - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - index_type_t iters{}; - linalg::computeLargestEigenvectors(handle, - A, - config_.n_eigVecs, - config_.maxIter, - config_.restartIter, - config_.tol, - config_.reorthogonalize, - iters, - eigVals, - eigVecs, - config_.seed); - return iters; - } - - auto const& get_config(void) const { return config_; } - - private: - eigen_solver_config_t config_; -}; - -} // namespace spectral -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif +#include "eigen_solvers.cuh" diff --git a/cpp/include/raft/stats/accuracy.hpp b/cpp/include/raft/stats/accuracy.hpp index 8cbb0f719e..a1b7321879 100644 --- a/cpp/include/raft/stats/accuracy.hpp +++ b/cpp/include/raft/stats/accuracy.hpp @@ -18,32 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __STATS_ACCURACY_H -#define __STATS_ACCURACY_H - -#pragma once - -#include - -namespace raft { -namespace stats { - /** - * @brief Compute accuracy of predictions. Useful for classification. - * @tparam math_t: data type for predictions (e.g., int for classification) - * @param[in] predictions: array of predictions (GPU pointer). - * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). - * @param[in] n: number of elements in each of predictions, ref_predictions. - * @param[in] stream: cuda stream. - * @return: Accuracy score in [0, 1]; higher is better. 
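Reviewer note on the Lanczos wrapper quoted above: the configuration struct and solver survive unchanged in `eigen_solvers.cuh`. A hedged sketch; the template parameter order and buffer sizes follow my reading of the deleted code, and `solve_smallest` is an illustrative name:

```c++
#include <raft/spectral/eigen_solvers.cuh>  // replaces the deprecated eigen_solvers.hpp

// Sketch only: solve for the 4 smallest eigenpairs. eig_vals (4 entries) and
// eig_vecs (4 * n entries) are assumed to be caller-allocated device buffers.
int solve_smallest(raft::handle_t const& handle,
                   raft::matrix::sparse_matrix_t<int, float> const& A,
                   float* eig_vals, float* eig_vecs)
{
  raft::spectral::eigen_solver_config_t<float, int> config{
    /*n_eigVecs=*/4, /*maxIter=*/100, /*restartIter=*/40, /*tol=*/1e-4f};
  raft::spectral::lanczos_solver_t<int, float> solver{config};
  return solver.solve_smallest_eigenvectors(handle, A, eig_vals, eig_vecs);
}
```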
diff --git a/cpp/include/raft/stats/accuracy.hpp b/cpp/include/raft/stats/accuracy.hpp
index 8cbb0f719e..a1b7321879 100644
--- a/cpp/include/raft/stats/accuracy.hpp
+++ b/cpp/include/raft/stats/accuracy.hpp
@@ -18,32 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __STATS_ACCURACY_H
-#define __STATS_ACCURACY_H
-
-#pragma once
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute accuracy of predictions. Useful for classification.
- * @tparam math_t: data type for predictions (e.g., int for classification)
- * @param[in] predictions: array of predictions (GPU pointer).
- * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer).
- * @param[in] n: number of elements in each of predictions, ref_predictions.
- * @param[in] stream: cuda stream.
- * @return: Accuracy score in [0, 1]; higher is better.
+ * DISCLAIMER: this file is deprecated: use accuracy.cuh instead
  */
-template <typename math_t>
-float accuracy(const math_t* predictions, const math_t* ref_predictions, int n, cudaStream_t stream)
-{
-  return detail::accuracy_score(predictions, ref_predictions, n, stream);
-}
-}  // namespace stats
-}  // namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "accuracy.cuh"
diff --git a/cpp/include/raft/stats/adjusted_rand_index.hpp b/cpp/include/raft/stats/adjusted_rand_index.hpp
index bc836eed86..3a990ac985 100644
--- a/cpp/include/raft/stats/adjusted_rand_index.hpp
+++ b/cpp/include/raft/stats/adjusted_rand_index.hpp
@@ -18,43 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __ADJUSTED_RAND_INDEX_H
-#define __ADJUSTED_RAND_INDEX_H
-
 /**
- * @file adjusted_rand_index.hpp
- * @brief The adjusted Rand index is the corrected-for-chance version of the Rand index.
- *        Such a correction for chance establishes a baseline by using the expected similarity
- *        of all pair-wise comparisons between clusterings specified by a random model.
+ * DISCLAIMER: this file is deprecated: use adjusted_rand_index.cuh instead
  */
 
 #pragma once
 
-#include
-
-namespace raft {
-namespace stats {
-
-/**
- * @brief Function to calculate Adjusted RandIndex as described
- *        here
- * @tparam T data-type for input label arrays
- * @tparam MathT integral data-type used for computing n-choose-r
- * @param firstClusterArray: the array of classes
- * @param secondClusterArray: the array of classes
- * @param size: the size of the data points of type int
- * @param stream: the cudaStream object
- */
-template <typename T, typename MathT = int>
-double adjusted_rand_index(const T* firstClusterArray,
-                           const T* secondClusterArray,
-                           int size,
-                           cudaStream_t stream)
-{
-  return detail::compute_adjusted_rand_index(firstClusterArray, secondClusterArray, size, stream);
-}
-
-};  // end namespace stats
-};  // end namespace raft
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
+#include "adjusted_rand_index.cuh"
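For reference, a minimal sketch of the moved ARI entry point (device buffers and stream are assumed; the function itself is the one this diff relocates to `adjusted_rand_index.cuh`):

```c++
// Sketch: ARI of two device-side label arrays of length n.
// Returns 1.0 for identical clusterings and ~0.0 for chance-level agreement.
#include <raft/stats/adjusted_rand_index.cuh>

double score_clustering(const int* labels_a, const int* labels_b, int n, cudaStream_t stream)
{
  return raft::stats::adjusted_rand_index(labels_a, labels_b, n, stream);
}
```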
diff --git a/cpp/include/raft/stats/contingency_matrix.hpp b/cpp/include/raft/stats/contingency_matrix.hpp
index 70800be1e6..141f678f94 100644
--- a/cpp/include/raft/stats/contingency_matrix.hpp
+++ b/cpp/include/raft/stats/contingency_matrix.hpp
@@ -18,93 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __CONTINGENCY_MATRIX_H
-#define __CONTINGENCY_MATRIX_H
-
-#pragma once
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief use this to allocate output matrix size
- *        size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int)
- * @param groundTruth: device 1-d array for ground truth (num of rows)
- * @param nSamples: number of elements in input array
- * @param stream: cuda stream for execution
- * @param minLabel: [out] calculated min value in input array
- * @param maxLabel: [out] calculated max value in input array
+ * DISCLAIMER: this file is deprecated: use contingency_matrix.cuh instead
  */
-template <typename T>
-void getInputClassCardinality(
-  const T* groundTruth, const int nSamples, cudaStream_t stream, T& minLabel, T& maxLabel)
-{
-  detail::getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel);
-}
-/**
- * @brief Calculate workspace size for running contingency matrix calculations
- * @tparam T label type
- * @tparam OutT output matrix type
- * @param nSamples: number of elements in input array
- * @param groundTruth: device 1-d array for ground truth (num of rows)
- * @param stream: cuda stream for execution
- * @param minLabel: Optional, min value in input array
- * @param maxLabel: Optional, max value in input array
- */
-template <typename T, typename OutT = int>
-size_t getContingencyMatrixWorkspaceSize(int nSamples,
-                                         const T* groundTruth,
-                                         cudaStream_t stream,
-                                         T minLabel = std::numeric_limits<T>::max(),
-                                         T maxLabel = std::numeric_limits<T>::max())
-{
-  return detail::getContingencyMatrixWorkspaceSize(
-    nSamples, groundTruth, stream, minLabel, maxLabel);
-}
-
-/**
- * @brief construct contingency matrix given input ground truth and prediction
- *        labels. Users should call function getInputClassCardinality to find
- *        and allocate memory for output. Similarly workspace requirements
- *        should be checked using function getContingencyMatrixWorkspaceSize
- * @tparam T label type
- * @tparam OutT output matrix type
- * @param groundTruth: device 1-d array for ground truth (num of rows)
- * @param predictedLabel: device 1-d array for prediction (num of columns)
- * @param nSamples: number of elements in input array
- * @param outMat: output buffer for contingency matrix
- * @param stream: cuda stream for execution
- * @param workspace: Optional, workspace memory allocation
- * @param workspaceSize: Optional, size of workspace memory
- * @param minLabel: Optional, min value in input ground truth array
- * @param maxLabel: Optional, max value in input ground truth array
- */
-template <typename T, typename OutT = int>
-void contingencyMatrix(const T* groundTruth,
-                       const T* predictedLabel,
-                       int nSamples,
-                       OutT* outMat,
-                       cudaStream_t stream,
-                       void* workspace = nullptr,
-                       size_t workspaceSize = 0,
-                       T minLabel = std::numeric_limits<T>::max(),
-                       T maxLabel = std::numeric_limits<T>::max())
-{
-  detail::contingencyMatrix(groundTruth,
-                            predictedLabel,
-                            nSamples,
-                            outMat,
-                            stream,
-                            workspace,
-                            workspaceSize,
-                            minLabel,
-                            maxLabel);
-}
+#pragma once
-};  // namespace stats
-};  // namespace raft
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "contingency_matrix.cuh"
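The removed doc above prescribes a three-step protocol: query the label cardinality, size the workspace, then build the matrix. A hypothetical end-to-end sketch (RMM's `device_uvector` is assumed for scratch and output buffers):

```c++
#include <raft/stats/contingency_matrix.cuh>
#include <rmm/device_uvector.hpp>

void build_contingency(const int* truth, const int* pred, int n, cudaStream_t stream)
{
  // 1) Find min/max labels so the output can be sized (maxLabel - minLabel + 1)^2.
  int min_label, max_label;
  raft::stats::getInputClassCardinality(truth, n, stream, min_label, max_label);

  // 2) Allocate the temporary workspace the kernel may need.
  size_t ws_bytes =
    raft::stats::getContingencyMatrixWorkspaceSize(n, truth, stream, min_label, max_label);
  rmm::device_uvector<char> workspace(ws_bytes, stream);

  // 3) Fill the (n_classes x n_classes) matrix: rows = truth, columns = prediction.
  int n_classes = max_label - min_label + 1;
  rmm::device_uvector<int> out(n_classes * n_classes, stream);
  raft::stats::contingencyMatrix(truth, pred, n, out.data(), stream,
                                 workspace.data(), ws_bytes, min_label, max_label);
}
```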
diff --git a/cpp/include/raft/stats/cov.hpp b/cpp/include/raft/stats/cov.hpp
index a584dedc95..a6c653206a 100644
--- a/cpp/include/raft/stats/cov.hpp
+++ b/cpp/include/raft/stats/cov.hpp
@@ -18,50 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __COV_H
-#define __COV_H
+/**
+ * DISCLAIMER: this file is deprecated: use cov.cuh instead
+ */
 
 #pragma once
 
-#include
-namespace raft {
-namespace stats {
-/**
- * @brief Compute covariance of the input matrix
- *
- * Mean operation is assumed to be performed on a given column.
- *
- * @tparam Type the data type
- * @param covar the output covariance matrix
- * @param data the input matrix (this will get mean-centered at the end!)
- * @param mu mean vector of the input matrix
- * @param D number of columns of data
- * @param N number of rows of data
- * @param sample whether to evaluate sample covariance or not. In other words,
- *               whether to normalize the output using N-1 or N, for true or false,
- *               respectively
- * @param rowMajor whether the input data is row or col major
- * @param stable whether to run the slower-but-numerically-stable version or not
- * @param handle cublas handle
- * @param stream cuda stream
- * @note if stable=true, then the input data will be mean centered after this
- *       function returns!
- */
-template <typename Type>
-void cov(const raft::handle_t& handle,
-         Type* covar,
-         Type* data,
-         const Type* mu,
-         std::size_t D,
-         std::size_t N,
-         bool sample,
-         bool rowMajor,
-         bool stable,
-         cudaStream_t stream)
-{
-  detail::cov(handle, covar, data, mu, D, N, sample, rowMajor, stable, stream);
-}
-};  // end namespace stats
-};  // end namespace raft
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "cov.cuh"
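A minimal usage sketch of the relocated `cov` (allocations assumed); note the removed doc's caveat that `data` is mean-centered in place when `stable=true`:

```c++
#include <cstddef>
#include <raft/stats/cov.cuh>

void covariance(const raft::handle_t& handle,
                float* covar,     // D x D output, assumed allocated
                float* data,      // N x D input; modified on exit when stable=true
                const float* mu,  // length-D column means
                std::size_t D, std::size_t N, cudaStream_t stream)
{
  raft::stats::cov(handle, covar, data, mu, D, N,
                   /*sample=*/true, /*rowMajor=*/false, /*stable=*/true, stream);
}
```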
diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh
index 6d6f901fab..9c17d2ed0f 100644
--- a/cpp/include/raft/stats/detail/weighted_mean.cuh
+++ b/cpp/include/raft/stats/detail/weighted_mean.cuh
@@ -17,8 +17,8 @@
 #pragma once
 
 #include
-#include
-#include
+#include
+#include
 
 namespace raft {
 namespace stats {
diff --git a/cpp/include/raft/stats/dispersion.hpp b/cpp/include/raft/stats/dispersion.hpp
index 7fabf07992..820c9e27ea 100644
--- a/cpp/include/raft/stats/dispersion.hpp
+++ b/cpp/include/raft/stats/dispersion.hpp
@@ -18,48 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __DISPERSION_H
-#define __DISPERSION_H
-
-#pragma once
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute cluster dispersion metric. This is very useful for
- *        automatically finding the 'k' (in kmeans) that improves this metric.
- * @tparam DataT data type
- * @tparam IdxT index type
- * @tparam TPB threads per block for kernels launched
- * @param centroids the cluster centroids. This is assumed to be row-major
- *                  and of dimension (nClusters x dim)
- * @param clusterSizes number of points in the dataset which belong to each
- *                     cluster. This is of length nClusters
- * @param globalCentroid compute the global weighted centroid of all cluster
- *                       centroids. This is of length dim. Pass a nullptr
- *                       if this is not needed
- * @param nClusters number of clusters
- * @param nPoints number of points in the dataset
- * @param dim dataset dimensionality
- * @param stream cuda stream
- * @return the cluster dispersion value
+ * DISCLAIMER: this file is deprecated: use dispersion.cuh instead
  */
-template <typename DataT, typename IdxT, int TPB = 256>
-DataT dispersion(const DataT* centroids,
-                 const IdxT* clusterSizes,
-                 DataT* globalCentroid,
-                 IdxT nClusters,
-                 IdxT nPoints,
-                 IdxT dim,
-                 cudaStream_t stream)
-{
-  return detail::dispersion(
-    centroids, clusterSizes, globalCentroid, nClusters, nPoints, dim, stream);
-}
-}  // end namespace stats
-}  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "dispersion.cuh"
diff --git a/cpp/include/raft/stats/entropy.hpp b/cpp/include/raft/stats/entropy.hpp
index 37dc2b700c..d8e1c11125 100644
--- a/cpp/include/raft/stats/entropy.hpp
+++ b/cpp/include/raft/stats/entropy.hpp
@@ -18,37 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __ENTROPY_H
-#define __ENTROPY_H
-
-#pragma once
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate entropy
- *        more info on entropy
- *
- * @param clusterArray: the array of classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- * @return the entropy score
+ * DISCLAIMER: this file is deprecated: use entropy.cuh instead
  */
-template <typename T>
-double entropy(const T* clusterArray,
-               const int size,
-               const T lowerLabelRange,
-               const T upperLabelRange,
-               cudaStream_t stream)
-{
-  return detail::entropy(clusterArray, size, lowerLabelRange, upperLabelRange, stream);
-}
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "entropy.cuh"
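A small sketch of the relocated `entropy` call (device buffer and label range are assumed); the label bounds let the kernel size its internal histogram:

```c++
// Sketch: Shannon entropy of integer class labels drawn from [0, 9].
#include <raft/stats/entropy.cuh>

double label_entropy(const int* labels, int n, cudaStream_t stream)
{
  return raft::stats::entropy(labels, n, /*lowerLabelRange=*/0,
                              /*upperLabelRange=*/9, stream);
}
```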
diff --git a/cpp/include/raft/stats/histogram.hpp b/cpp/include/raft/stats/histogram.hpp
index 627026c219..c123375597 100644
--- a/cpp/include/raft/stats/histogram.hpp
+++ b/cpp/include/raft/stats/histogram.hpp
@@ -18,54 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __HISTOGRAM_H
-#define __HISTOGRAM_H
-
-#pragma once
-
-#include
-#include
-
-// This file is a shameless amalgamation of independent works done by
-// Lars Nyland and Andy Adinets
-
-///@todo: add cub's histogram as another option
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Perform histogram on the input data. It chooses the right load size
- *        based on the input data vector length. It also supports large-bin cases
- *        using a specialized smem-based hashing technique.
- * @tparam DataT input data type
- * @tparam IdxT data type used to compute indices
- * @tparam BinnerOp takes the input data and computes its bin index
- * @param type histogram implementation type to choose
- * @param bins the output bins (length = ncols * nbins)
- * @param nbins number of bins
- * @param data input data (length = ncols * nrows)
- * @param nrows data array length in each column (or batch)
- * @param ncols number of columns (or batch size)
- * @param stream cuda stream
- * @param binner the operation that computes the bin index of the input data
- *
- * @note signature of BinnerOp is `int func(DataT, IdxT);`
+ * DISCLAIMER: this file is deprecated: use histogram.cuh instead
  */
-template <typename DataT, typename IdxT = int, typename BinnerOp = IdentityBinner<DataT, IdxT>>
-void histogram(HistType type,
-               int* bins,
-               IdxT nbins,
-               const DataT* data,
-               IdxT nrows,
-               IdxT ncols,
-               cudaStream_t stream,
-               BinnerOp binner = IdentityBinner<DataT, IdxT>())
-{
-  detail::histogram(type, bins, nbins, data, nrows, ncols, stream, binner);
-}
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "histogram.cuh"
diff --git a/cpp/include/raft/stats/homogeneity_score.hpp b/cpp/include/raft/stats/homogeneity_score.hpp
index 4e119f2bc7..8d2433d1da 100644
--- a/cpp/include/raft/stats/homogeneity_score.hpp
+++ b/cpp/include/raft/stats/homogeneity_score.hpp
@@ -13,46 +13,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 /**
  * This file is deprecated and will be removed in release 22.06.
  * Please use the cuh version instead.
  */
-#ifndef __HOMOGENEITY_SCORE_H
-#define __HOMOGENEITY_SCORE_H
-
-#pragma once
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate the homogeneity score between two clusters
- *        more info on mutual
- *        information
- * @param truthClusterArray: the array of truth classes of type T
- * @param predClusterArray: the array of predicted classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
+ * DISCLAIMER: this file is deprecated: use homogeneity_score.cuh instead
  */
-template <typename T>
-double homogeneity_score(const T* truthClusterArray,
-                         const T* predClusterArray,
-                         int size,
-                         T lowerLabelRange,
-                         T upperLabelRange,
-                         cudaStream_t stream)
-{
-  return detail::homogeneity_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-}
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "homogeneity_score.cuh"
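For reference, a tiny sketch of the relocated homogeneity call (device buffers and label range assumed); a score of 1 means every predicted cluster contains members of a single true class:

```c++
#include <raft/stats/homogeneity_score.cuh>

double homogeneity(const int* truth, const int* pred, int n, cudaStream_t stream)
{
  return raft::stats::homogeneity_score(truth, pred, n,
                                        /*lowerLabelRange=*/0,
                                        /*upperLabelRange=*/9, stream);
}
```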
diff --git a/cpp/include/raft/stats/information_criterion.hpp b/cpp/include/raft/stats/information_criterion.hpp
index 3a39e56c41..898ffbfa8e 100644
--- a/cpp/include/raft/stats/information_criterion.hpp
+++ b/cpp/include/raft/stats/information_criterion.hpp
@@ -18,56 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __INFORMATION_CRIT_H
-#define __INFORMATION_CRIT_H
-
 /**
- * @file information_criterion.hpp
- * @brief These information criteria are used to evaluate the quality of models
- *        by balancing the quality of the fit and the number of parameters.
- *
- * See:
- *  - AIC: https://en.wikipedia.org/wiki/Akaike_information_criterion
- *  - AICc: https://en.wikipedia.org/wiki/Akaike_information_criterion#AICc
- *  - BIC: https://en.wikipedia.org/wiki/Bayesian_information_criterion
+ * DISCLAIMER: this file is deprecated: use information_criterion.cuh instead
  */
-#pragma once
-
-#include
-#include
-
-namespace raft {
-namespace stats {
-/**
- * Compute the given type of information criterion
- *
- * @note: it is safe to do the computation in-place (i.e give same pointer
- *        as input and output)
- *
- * @param[out] d_ic             Information criterion to be returned for each
- *                              series (device)
- * @param[in]  d_loglikelihood  Log-likelihood for each series (device)
- * @param[in]  ic_type          Type of criterion to compute. See IC_Type
- * @param[in]  n_params         Number of parameters in the model
- * @param[in]  batch_size       Number of series in the batch
- * @param[in]  n_samples        Number of samples in each series
- * @param[in]  stream           CUDA stream
- */
-template <typename ScalarT, typename IdxT>
-void information_criterion_batched(ScalarT* d_ic,
-                                   const ScalarT* d_loglikelihood,
-                                   IC_Type ic_type,
-                                   IdxT n_params,
-                                   IdxT batch_size,
-                                   IdxT n_samples,
-                                   cudaStream_t stream)
-{
-  batched::detail::information_criterion(
-    d_ic, d_loglikelihood, ic_type, n_params, batch_size, n_samples, stream);
-}
+#pragma once
-}  // namespace stats
-}  // namespace raft
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "information_criterion.cuh"
diff --git a/cpp/include/raft/stats/kl_divergence.hpp b/cpp/include/raft/stats/kl_divergence.hpp
index 59db77246f..086d5f1d23 100644
--- a/cpp/include/raft/stats/kl_divergence.hpp
+++ b/cpp/include/raft/stats/kl_divergence.hpp
@@ -18,34 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __KL_DIVERGENCE_H
-#define __KL_DIVERGENCE_H
-
-#pragma once
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate KL Divergence
- *        more info on KL Divergence
- *
- * @tparam DataT: Data type of the input array
- * @param modelPDF: the model array of probability density functions of type DataT
- * @param candidatePDF: the candidate array of probability density functions of type DataT
- * @param size: the size of the data points of type int
- * @param stream: the cudaStream object
+ * DISCLAIMER: this file is deprecated: use kl_divergence.cuh instead
  */
-template <typename DataT>
-DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size, cudaStream_t stream)
-{
-  return detail::kl_divergence(modelPDF, candidatePDF, size, stream);
-}
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "kl_divergence.cuh"
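A minimal sketch of the relocated KL divergence call, assuming two device-side PDFs of equal length; it computes the sum over i of modelPDF[i] * log(modelPDF[i] / candidatePDF[i]):

```c++
#include <raft/stats/kl_divergence.cuh>

float kl(const float* model_pdf, const float* candidate_pdf, int n, cudaStream_t stream)
{
  return raft::stats::kl_divergence(model_pdf, candidate_pdf, n, stream);
}
```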
diff --git a/cpp/include/raft/stats/mean.hpp b/cpp/include/raft/stats/mean.hpp
index 2767b632e6..bce899d9d4 100644
--- a/cpp/include/raft/stats/mean.hpp
+++ b/cpp/include/raft/stats/mean.hpp
@@ -18,43 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __MEAN_H
-#define __MEAN_H
-
-#pragma once
-
-#include "detail/mean.cuh"
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute mean of the input matrix
- *
- * Mean operation is assumed to be performed on a given column.
- *
- * @tparam Type: the data type
- * @tparam IdxType Integer type used for addressing
- * @param mu: the output mean vector
- * @param data: the input matrix
- * @param D: number of columns of data
- * @param N: number of rows of data
- * @param sample: whether to evaluate sample mean or not. In other words, whether
- *                to normalize the output using N-1 or N, for true or false, respectively
- * @param rowMajor: whether the input data is row or col major
- * @param stream: cuda stream
+ * DISCLAIMER: this file is deprecated: use mean.cuh instead
  */
-template <typename Type, typename IdxType = int>
-void mean(
-  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
-{
-  detail::mean(mu, data, D, N, sample, rowMajor, stream);
-}
-};  // namespace stats
-};  // namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "mean.cuh"
diff --git a/cpp/include/raft/stats/mean_center.hpp b/cpp/include/raft/stats/mean_center.hpp
index e219891cab..73e49e7307 100644
--- a/cpp/include/raft/stats/mean_center.hpp
+++ b/cpp/include/raft/stats/mean_center.hpp
@@ -18,71 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __MEAN_CENTER_H
-#define __MEAN_CENTER_H
-
-#pragma once
-
-#include "detail/mean_center.cuh"
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Center the input matrix wrt its mean
- * @tparam Type the data type
- * @tparam IdxType Integer type used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
- * @param out the output mean-centered matrix
- * @param data input matrix
- * @param mu the mean vector
- * @param D number of columns of data
- * @param N number of rows of data
- * @param rowMajor whether input is row or col major
- * @param bcastAlongRows whether to broadcast vector along rows or columns
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use mean_center.cuh instead
  */
-template <typename Type, typename IdxType = int, int TPB = 256>
-void meanCenter(Type* out,
-                const Type* data,
-                const Type* mu,
-                IdxType D,
-                IdxType N,
-                bool rowMajor,
-                bool bcastAlongRows,
-                cudaStream_t stream)
-{
-  detail::meanCenter(out, data, mu, D, N, rowMajor, bcastAlongRows, stream);
-}
-/**
- * @brief Add the input matrix wrt its mean
- * @tparam Type the data type
- * @tparam IdxType Integer type used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
- * @param out the output mean-added matrix
- * @param data input matrix
- * @param mu the mean vector
- * @param D number of columns of data
- * @param N number of rows of data
- * @param rowMajor whether input is row or col major
- * @param bcastAlongRows whether to broadcast vector along rows or columns
- * @param stream cuda stream where to launch work
- */
-template <typename Type, typename IdxType = int, int TPB = 256>
-void meanAdd(Type* out,
-             const Type* data,
-             const Type* mu,
-             IdxType D,
-             IdxType N,
-             bool rowMajor,
-             bool bcastAlongRows,
-             cudaStream_t stream)
-{
-  detail::meanAdd(out, data, mu, D, N, rowMajor, bcastAlongRows, stream);
-}
+#pragma once
-};  // end namespace stats
-};  // end namespace raft
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "mean_center.cuh"
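The two headers above compose naturally: compute per-column means, then subtract them. A minimal sketch under assumed allocations (row-major N x D input):

```c++
#include <raft/stats/mean.cuh>
#include <raft/stats/mean_center.cuh>

void center_columns(float* out, const float* data, float* mu,
                    int D, int N, cudaStream_t stream)
{
  // Per-column means into mu (length D).
  raft::stats::mean(mu, data, D, N, /*sample=*/false, /*rowMajor=*/true, stream);
  // Broadcast-subtract mu from every row of data, writing to out.
  raft::stats::meanCenter(out, data, mu, D, N, /*rowMajor=*/true,
                          /*bcastAlongRows=*/true, stream);
}
```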
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "mean_center.cuh" diff --git a/cpp/include/raft/stats/meanvar.hpp b/cpp/include/raft/stats/meanvar.hpp index d7ef935fbc..db67a68579 100644 --- a/cpp/include/raft/stats/meanvar.hpp +++ b/cpp/include/raft/stats/meanvar.hpp @@ -18,48 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MEANVAR_H -#define __MEANVAR_H - -#pragma once - -#include "detail/meanvar.cuh" - -namespace raft::stats { - /** - * @brief Compute mean and variance for each column of a given matrix. - * - * The operation is performed in a single sweep. Consider using it when you need to compute - * both mean and variance, or when you need to compute variance but don't have the mean. - * It's almost twice faster than running `mean` and `vars` sequentially, because all three - * kernels are memory-bound. - * - * @tparam Type the data type - * @tparam IdxType Integer type used for addressing - * @param [out] mean the output mean vector of size D - * @param [out] var the output variance vector of size D - * @param [in] data the input matrix of size [N, D] - * @param [in] D number of columns of data - * @param [in] N number of rows of data - * @param [in] sample whether to evaluate sample variance or not. In other words, whether to - * normalize the variance using N-1 or N, for true or false respectively. - * @param [in] rowMajor whether the input data is row- or col-major, for true or false respectively. - * @param [in] stream + * DISCLAIMER: this file is deprecated: use meanvar.cuh instead */ -template -void meanvar(Type* mean, - Type* var, - const Type* data, - IdxType D, - IdxType N, - bool sample, - bool rowMajor, - cudaStream_t stream) -{ - detail::meanvar(mean, var, data, D, N, sample, rowMajor, stream); -} -}; // namespace raft::stats +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "meanvar.cuh" diff --git a/cpp/include/raft/stats/minmax.hpp b/cpp/include/raft/stats/minmax.hpp index 97f06129fa..ad588a38d4 100644 --- a/cpp/include/raft/stats/minmax.hpp +++ b/cpp/include/raft/stats/minmax.hpp @@ -18,62 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MINMAX_H -#define __MINMAX_H - -#pragma once - -#include -#include -#include - -#include - -namespace raft { -namespace stats { - /** - * @brief Computes min/max across every column of the input matrix, as well as - * optionally allow to subsample based on the given row/col ID mapping vectors - * - * @tparam T the data type - * @tparam TPB number of threads per block - * @param data input data - * @param rowids actual row ID mappings. It is of length nrows. If you want to - * skip this index lookup entirely, pass nullptr - * @param colids actual col ID mappings. It is of length ncols. If you want to - * skip this index lookup entirely, pass nullptr - * @param nrows number of rows of data to be worked upon. The actual rows of the - * input "data" can be bigger than this! - * @param ncols number of cols of data to be worked upon. The actual cols of the - * input "data" can be bigger than this! - * @param row_stride stride (in number of elements) between 2 adjacent columns - * @param globalmin final col-wise global minimum (size = ncols) - * @param globalmax final col-wise global maximum (size = ncols) - * @param sampledcols output sampled data. 
diff --git a/cpp/include/raft/stats/minmax.hpp b/cpp/include/raft/stats/minmax.hpp
index 97f06129fa..ad588a38d4 100644
--- a/cpp/include/raft/stats/minmax.hpp
+++ b/cpp/include/raft/stats/minmax.hpp
@@ -18,62 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __MINMAX_H
-#define __MINMAX_H
-
-#pragma once
-
-#include
-#include
-#include
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Computes min/max across every column of the input matrix, as well as
- *        optionally allow to subsample based on the given row/col ID mapping vectors
- *
- * @tparam T the data type
- * @tparam TPB number of threads per block
- * @param data input data
- * @param rowids actual row ID mappings. It is of length nrows. If you want to
- *               skip this index lookup entirely, pass nullptr
- * @param colids actual col ID mappings. It is of length ncols. If you want to
- *               skip this index lookup entirely, pass nullptr
- * @param nrows number of rows of data to be worked upon. The actual rows of the
- *              input "data" can be bigger than this!
- * @param ncols number of cols of data to be worked upon. The actual cols of the
- *              input "data" can be bigger than this!
- * @param row_stride stride (in number of elements) between 2 adjacent columns
- * @param globalmin final col-wise global minimum (size = ncols)
- * @param globalmax final col-wise global maximum (size = ncols)
- * @param sampledcols output sampled data. Pass nullptr if you don't need this
- * @param stream cuda stream
- * @note This method makes the following assumptions:
- *       1. input and output matrices are assumed to be col-major
- *       2. ncols is small enough to fit the whole of min/max values across all cols
- *          in shared memory
+ * DISCLAIMER: this file is deprecated: use minmax.cuh instead
  */
-template <typename T, int TPB = 512>
-void minmax(const T* data,
-            const unsigned* rowids,
-            const unsigned* colids,
-            int nrows,
-            int ncols,
-            int row_stride,
-            T* globalmin,
-            T* globalmax,
-            T* sampledcols,
-            cudaStream_t stream)
-{
-  detail::minmax(
-    data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, sampledcols, stream);
-}
-};  // namespace stats
-};  // namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "minmax.cuh"
diff --git a/cpp/include/raft/stats/mutual_info_score.hpp b/cpp/include/raft/stats/mutual_info_score.hpp
index a080211c36..c3446e3963 100644
--- a/cpp/include/raft/stats/mutual_info_score.hpp
+++ b/cpp/include/raft/stats/mutual_info_score.hpp
@@ -18,39 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __MUTUAL_INFO_SCORE_H
-#define __MUTUAL_INFO_SCORE_H
-
-#pragma once
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Function to calculate the mutual information between two clusters
- *        more info on mutual information
- * @param firstClusterArray: the array of classes of type T
- * @param secondClusterArray: the array of classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
+ * DISCLAIMER: this file is deprecated: use mutual_info_score.cuh instead
  */
-template <typename T>
-double mutual_info_score(const T* firstClusterArray,
-                         const T* secondClusterArray,
-                         int size,
-                         T lowerLabelRange,
-                         T upperLabelRange,
-                         cudaStream_t stream)
-{
-  return detail::mutual_info_score(
-    firstClusterArray, secondClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-}
-};  // end namespace stats
-};  // end namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "mutual_info_score.cuh"
diff --git a/cpp/include/raft/stats/r2_score.hpp b/cpp/include/raft/stats/r2_score.hpp
index c88a1822ec..bc55a6596d 100644
--- a/cpp/include/raft/stats/r2_score.hpp
+++ b/cpp/include/raft/stats/r2_score.hpp
@@ -18,38 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __R2_SCORE_H
-#define __R2_SCORE_H
-
-#pragma once
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * Calculates the "Coefficient of Determination" (R-Squared) score
- * normalizing the sum of squared errors by the total sum of squares.
- *
- * This score indicates the proportionate amount of variation in an
- * expected response variable that is explained by the independent variables
- * in a linear regression model. The larger the R-squared value, the
- * more variability is explained by the linear regression model.
- *
- * @param y: Array of ground-truth response variables
- * @param y_hat: Array of predicted response variables
- * @param n: Number of elements in y and y_hat
- * @param stream: cuda stream
- * @return: The R-squared value.
+ * DISCLAIMER: this file is deprecated: use r2_score.cuh instead
  */
-template <typename math_t>
-math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
-{
-  return detail::r2_score(y, y_hat, n, stream);
-}
-}  // namespace stats
-}  // namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "r2_score.cuh"
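A minimal sketch of the relocated R-squared call; per the removed doc it computes 1 minus the ratio of the sum of squared errors to the total sum of squares (device buffers assumed):

```c++
#include <raft/stats/r2_score.cuh>

float fit_quality(float* y, float* y_hat, int n, cudaStream_t stream)
{
  // 1.0 is a perfect fit; values can go negative for models worse than the mean.
  return raft::stats::r2_score(y, y_hat, n, stream);
}
```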
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "regression_metrics.cuh" diff --git a/cpp/include/raft/stats/silhouette_score.hpp b/cpp/include/raft/stats/silhouette_score.hpp index e6c84855c6..54981edbb6 100644 --- a/cpp/include/raft/stats/silhouette_score.hpp +++ b/cpp/include/raft/stats/silhouette_score.hpp @@ -18,67 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SILHOUETTE_SCORE_H -#define __SILHOUETTE_SCORE_H - -#pragma once - -#include -#include - -namespace raft { -namespace stats { - /** - * @brief main function that returns the average silhouette score for a given set of data and its - * clusterings - * @tparam DataT: type of the data samples - * @tparam LabelT: type of the labels - * @param handle: raft handle for managing expensive resources - * @param X_in: pointer to the input Data samples array (nRows x nCols) - * @param nRows: number of data samples - * @param nCols: number of features - * @param labels: the pointer to the array containing labels for every data sample (1 x nRows) - * @param nLabels: number of Labels - * @param silhouette_scorePerSample: pointer to the array that is optionally taken in as input and - * is populated with the silhouette score for every sample (1 x nRows) - * @param stream: the cuda stream where to launch this kernel - * @param metric: the numerical value that maps to the type of distance metric to be used in the - * calculations + * DISCLAIMER: this file is deprecated: use silhouette_score.cuh instead */ -template -DataT silhouette_score( - const raft::handle_t& handle, - DataT* X_in, - int nRows, - int nCols, - LabelT* labels, - int nLabels, - DataT* silhouette_scorePerSample, - cudaStream_t stream, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded) -{ - return detail::silhouette_score( - handle, X_in, nRows, nCols, labels, nLabels, silhouette_scorePerSample, stream, metric); -} -template -value_t silhouette_score_batched( - const raft::handle_t& handle, - value_t* X, - value_idx n_rows, - value_idx n_cols, - label_idx* y, - label_idx n_labels, - value_t* scores, - value_idx chunk, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded) -{ - return batched::detail::silhouette_score( - handle, X, n_rows, n_cols, y, n_labels, scores, chunk, metric); -} +#pragma once -}; // namespace stats -}; // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "silhouette_score.cuh" diff --git a/cpp/include/raft/stats/specializations.hpp b/cpp/include/raft/stats/specializations.hpp index 3929b3124c..0ae82f27e7 100644 --- a/cpp/include/raft/stats/specializations.hpp +++ b/cpp/include/raft/stats/specializations.hpp @@ -18,12 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __STATS_SPECIALIZATIONS_H -#define __STATS_SPECIALIZATIONS_H +/** + * DISCLAIMER: this file is deprecated: use specializations.cuh instead + */ #pragma once -#include -#include +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "specializations.cuh" diff --git a/cpp/include/raft/stats/stddev.hpp b/cpp/include/raft/stats/stddev.hpp index f496b1fd30..2222a2706d 100644 --- a/cpp/include/raft/stats/stddev.hpp +++ b/cpp/include/raft/stats/stddev.hpp @@ -18,81 +18,14 @@ * Please use the cuh version instead. 
diff --git a/cpp/include/raft/stats/specializations.hpp b/cpp/include/raft/stats/specializations.hpp
index 3929b3124c..0ae82f27e7 100644
--- a/cpp/include/raft/stats/specializations.hpp
+++ b/cpp/include/raft/stats/specializations.hpp
@@ -18,12 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __STATS_SPECIALIZATIONS_H
-#define __STATS_SPECIALIZATIONS_H
+/**
+ * DISCLAIMER: this file is deprecated: use specializations.cuh instead
+ */
 
 #pragma once
 
-#include
-#include
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "specializations.cuh"
diff --git a/cpp/include/raft/stats/stddev.hpp b/cpp/include/raft/stats/stddev.hpp
index f496b1fd30..2222a2706d 100644
--- a/cpp/include/raft/stats/stddev.hpp
+++ b/cpp/include/raft/stats/stddev.hpp
@@ -18,81 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __STDDEV_H
-#define __STDDEV_H
-
-#pragma once
-
-#include "detail/stddev.cuh"
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute stddev of the input matrix
- *
- * Stddev operation is assumed to be performed on a given column.
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used for addressing
- * @param std the output stddev vector
- * @param data the input matrix
- * @param mu the mean vector
- * @param D number of columns of data
- * @param N number of rows of data
- * @param sample whether to evaluate sample stddev or not. In other words,
- *               whether to normalize the output using N-1 or N, for true or false, respectively
- * @param rowMajor whether the input data is row or col major
- * @param stream cuda stream where to launch work
+ * DISCLAIMER: this file is deprecated: use stddev.cuh instead
  */
-template <typename Type, typename IdxType = int>
-void stddev(Type* std,
-            const Type* data,
-            const Type* mu,
-            IdxType D,
-            IdxType N,
-            bool sample,
-            bool rowMajor,
-            cudaStream_t stream)
-{
-  detail::stddev(std, data, mu, D, N, sample, rowMajor, stream);
-}
-/**
- * @brief Compute variance of the input matrix
- *
- * Variance operation is assumed to be performed on a given column.
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used for addressing
- * @param var the output variance vector
- * @param data the input matrix
- * @param mu the mean vector
- * @param D number of columns of data
- * @param N number of rows of data
- * @param sample whether to evaluate sample variance or not. In other words,
- *               whether to normalize the output using N-1 or N, for true or false, respectively
- * @param rowMajor whether the input data is row or col major
- * @param stream cuda stream where to launch work
- */
-template <typename Type, typename IdxType = int>
-void vars(Type* var,
-          const Type* data,
-          const Type* mu,
-          IdxType D,
-          IdxType N,
-          bool sample,
-          bool rowMajor,
-          cudaStream_t stream)
-{
-  detail::vars(var, data, mu, D, N, sample, rowMajor, stream);
-}
+#pragma once
-};  // namespace stats
-};  // namespace raft
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "stddev.cuh"
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "sum.cuh" diff --git a/cpp/include/raft/stats/trustworthiness_score.hpp b/cpp/include/raft/stats/trustworthiness_score.hpp index 81edf2ea04..0053860a92 100644 --- a/cpp/include/raft/stats/trustworthiness_score.hpp +++ b/cpp/include/raft/stats/trustworthiness_score.hpp @@ -18,41 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __TRUSTWORTHINESS_SCORE_H -#define __TRUSTWORTHINESS_SCORE_H +/** + * DISCLAIMER: this file is deprecated: use trustworthiness_score.cuh instead + */ #pragma once -#include - -namespace raft { -namespace stats { -/** - * @brief Compute the trustworthiness score - * @param[in] h: raft handle - * @param[in] X: Data in original dimension - * @param[in] X_embedded: Data in target dimension (embedding) - * @param[in] n: Number of samples - * @param[in] m: Number of features in high/original dimension - * @param[in] d: Number of features in low/embedded dimension - * @param[in] n_neighbors Number of neighbors considered by trustworthiness score - * @param[in] batchSize Batch size - * @return[out] Trustworthiness score - */ -template -double trustworthiness_score(const raft::handle_t& h, - const math_t* X, - math_t* X_embedded, - int n, - int m, - int d, - int n_neighbors, - int batchSize = 512) -{ - return detail::trustworthiness_score( - h, X, X_embedded, n, m, d, n_neighbors, batchSize); -} -} // namespace stats -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "trustworthiness_score.cuh" diff --git a/cpp/include/raft/stats/v_measure.hpp b/cpp/include/raft/stats/v_measure.hpp index a137af844d..0179d2c856 100644 --- a/cpp/include/raft/stats/v_measure.hpp +++ b/cpp/include/raft/stats/v_measure.hpp @@ -18,40 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __V_MEASURE_H -#define __V_MEASURE_H - -#pragma once -#include - -namespace raft { -namespace stats { - /** - * @brief Function to calculate the v-measure between two clusters - * - * @param truthClusterArray: the array of truth classes of type T - * @param predClusterArray: the array of predicted classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - * @param beta: v_measure parameter + * DISCLAIMER: this file is deprecated: use v_measure.cuh instead */ -template -double v_measure(const T* truthClusterArray, - const T* predClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream, - double beta = 1.0) -{ - return detail::v_measure( - truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream, beta); -} -}; // end namespace stats -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "v_measure.cuh" diff --git a/cpp/include/raft/stats/weighted_mean.hpp b/cpp/include/raft/stats/weighted_mean.hpp index 5b3f4678d8..8bc4bf4623 100644 --- a/cpp/include/raft/stats/weighted_mean.hpp +++ b/cpp/include/raft/stats/weighted_mean.hpp @@ -18,84 +18,14 @@ * Please use the cuh version instead. 
diff --git a/cpp/include/raft/stats/weighted_mean.hpp b/cpp/include/raft/stats/weighted_mean.hpp
index 5b3f4678d8..8bc4bf4623 100644
--- a/cpp/include/raft/stats/weighted_mean.hpp
+++ b/cpp/include/raft/stats/weighted_mean.hpp
@@ -18,84 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __WEIGHTED_MEAN_H
-#define __WEIGHTED_MEAN_H
-
-#pragma once
-
-#include
-
-namespace raft {
-namespace stats {
-
 /**
- * @brief Compute the weighted mean of the input matrix with a
- *        vector of weights, along rows or along columns
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used for addressing
- * @param mu the output mean vector
- * @param data the input matrix
- * @param weights weight of size D if along_row is true, else of size N
- * @param D number of columns of data
- * @param N number of rows of data
- * @param row_major data input matrix is row-major or not
- * @param along_rows whether to reduce along rows or columns
- * @param stream cuda stream to launch work on
+ * DISCLAIMER: this file is deprecated: use weighted_mean.cuh instead
  */
-template <typename Type, typename IdxType = int>
-void weightedMean(Type* mu,
-                  const Type* data,
-                  const Type* weights,
-                  IdxType D,
-                  IdxType N,
-                  bool row_major,
-                  bool along_rows,
-                  cudaStream_t stream)
-{
-  detail::weightedMean(mu, data, weights, D, N, row_major, along_rows, stream);
-}
-/**
- * @brief Compute the row-wise weighted mean of the input matrix with a
- *        vector of column weights
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used for addressing
- * @param mu the output mean vector
- * @param data the input matrix (assumed to be row-major)
- * @param weights per-column weights
- * @param D number of columns of data
- * @param N number of rows of data
- * @param stream cuda stream to launch work on
- */
-template <typename Type, typename IdxType = int>
-void rowWeightedMean(
-  Type* mu, const Type* data, const Type* weights, IdxType D, IdxType N, cudaStream_t stream)
-{
-  weightedMean(mu, data, weights, D, N, true, true, stream);
-}
+#pragma once
-/**
- * @brief Compute the column-wise weighted mean of the input matrix with a
- *        vector of row weights
- *
- * @tparam Type the data type
- * @tparam IdxType Integer type used for addressing
- * @param mu the output mean vector
- * @param data the input matrix (assumed to be row-major)
- * @param weights per-row weights
- * @param D number of columns of data
- * @param N number of rows of data
- * @param stream cuda stream to launch work on
- */
-template <typename Type, typename IdxType = int>
-void colWeightedMean(
-  Type* mu, const Type* data, const Type* weights, IdxType D, IdxType N, cudaStream_t stream)
-{
-  weightedMean(mu, data, weights, D, N, true, false, stream);
-}
-};  // end namespace stats
-};  // end namespace raft
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "weighted_mean.cuh"
diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu
index d1bfe4a2e4..a23262fc8e 100644
--- a/cpp/test/spatial/ball_cover.cu
+++ b/cpp/test/spatial/ball_cover.cu
@@ -18,8 +18,8 @@
 #include "spatial_data.h"
 #include
 #include
-#include
-#include
+#include
+#include
 #include
 #if defined RAFT_NN_COMPILED
 #include