Java utilities to aid in accelerating aggregations on 128-bit types (#10201)

This adds a couple of custom kernels for Java to help accelerate sum aggregations on 128-bit types and check for overflows. The first kernel extracts a 32-bit chunk from a 128-bit type, which can be used to feed four 32-bit chunks into a sum aggregation. The second kernel takes the resulting upscaled 64-bit integer results and reassembles the parts into a 128-bit type column along with a boolean column to indicate whether the value overflowed. By splitting the 128-bit type into 32-bit chunks, a sum aggregation on DECIMAL128, which is a sort-based aggregation, can be turned into a hash-based aggregation on 32-bit integer inputs for improved performance. As a bonus, this approach can also check for overflow, which is difficult to do when aggregating on DECIMAL128 sums directly. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Robert (Bobby) Evans (https://github.com/revans2) URL: #10201
rapidsai · Feb 4, 2022 · 4e8cb4f · 4e8cb4f
1 parent c191d16
commit 4e8cb4f
Show file tree

Hide file tree

Showing 6 changed files with 394 additions and 2 deletions.
diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation128Utils.java b/java/src/main/java/ai/rapids/cudf/Aggregation128Utils.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+/**
+ * Utility methods for breaking apart and reassembling 128-bit values during aggregations
+ * to enable hash-based aggregations and detect overflows.
+ */
+public class Aggregation128Utils {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  /**
+   * Extract a 32-bit chunk from a 128-bit value.
+   * @param col column of 128-bit values (e.g.: DECIMAL128)
+   * @param outType integer type to use for the output column (e.g.: UINT32 or INT32)
+   * @param chunkIdx index of the 32-bit chunk to extract where 0 is the least significant chunk
+   *                 and 3 is the most significant chunk
+   * @return column containing the specified 32-bit chunk of the input column values. A null input
+   *                row will result in a corresponding null output row.
+   */
+  public static ColumnVector extractInt32Chunk(ColumnView col, DType outType, int chunkIdx) {
+    return new ColumnVector(extractInt32Chunk(col.getNativeView(),
+        outType.getTypeId().getNativeId(), chunkIdx));
+  }
+
+  /**
+   * Reassemble a column of 128-bit values from a table of four 64-bit integer columns and check
+   * for overflow. The 128-bit value is reconstructed by overlapping the 64-bit values by 32-bits.
+   * The least significant 32-bits of the least significant 64-bit value are used directly as the
+   * least significant 32-bits of the final 128-bit value, and the remaining 32-bits are added to
+   * the next most significant 64-bit value. The lower 32-bits of that sum become the next most
+   * significant 32-bits in the final 128-bit value, and the remaining 32-bits are added to the
+   * next most significant 64-bit input value, and so on.
+   *
+   * @param chunks table of four 64-bit integer columns with the columns ordered from least
+   *               significant to most significant. The last column must be of type INT64.
+   * @param type the type to use for the resulting 128-bit value column
+   * @return table containing a boolean column and a 128-bit value column of the requested type.
+   *         The boolean value will be true if an overflow was detected for that row's value when
+   *         it was reassembled. A null input row will result in a corresponding null output row.
+   */
+  public static Table combineInt64SumChunks(Table chunks, DType type) {
+    return new Table(combineInt64SumChunks(chunks.getNativeView(),
+        type.getTypeId().getNativeId(),
+        type.getScale()));
+  }
+
+  private static native long extractInt32Chunk(long columnView, int outTypeId, int chunkIdx);
+
+  private static native long[] combineInt64SumChunks(long chunksTableView, int dtype, int scale);
+}
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -219,7 +219,7 @@ endif()
 
 add_library(
   cudfjni SHARED
-  src/row_conversion.cu
+  src/Aggregation128UtilsJni.cpp
   src/AggregationJni.cpp
   src/CudfJni.cpp
   src/CudaJni.cpp
@@ -236,7 +236,9 @@ add_library(
   src/RmmJni.cpp
   src/ScalarJni.cpp
   src/TableJni.cpp
+  src/aggregation128_utils.cu
   src/map_lookup.cu
+  src/row_conversion.cu
   src/check_nvcomp_output_sizes.cu
 )
 

diff --git a/java/src/main/native/src/Aggregation128UtilsJni.cpp b/java/src/main/native/src/Aggregation128UtilsJni.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "aggregation128_utils.hpp"
+#include "cudf_jni_apis.hpp"
+#include "dtype_utils.hpp"
+
+extern "C" {
+
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation128Utils_extractInt32Chunk(
+    JNIEnv *env, jclass, jlong j_column_view, jint j_out_dtype, jint j_chunk_idx) {
+  JNI_NULL_CHECK(env, j_column_view, "column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const out_type = cudf::jni::make_data_type(j_out_dtype, 0); // 0 is passed as the scale
+    auto const *input = reinterpret_cast<cudf::column_view const *>(j_column_view);
+    return cudf::jni::release_as_jlong(cudf::jni::extract_chunk32(*input, out_type, j_chunk_idx));
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Aggregation128Utils_combineInt64SumChunks(
+    JNIEnv *env, jclass, jlong j_table_view, jint j_dtype, jint j_scale) {
+  JNI_NULL_CHECK(env, j_table_view, "table is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const out_type = cudf::jni::make_data_type(j_dtype, j_scale);
+    auto const *chunks = reinterpret_cast<cudf::table_view const *>(j_table_view);
+    std::unique_ptr<cudf::table> assembled = cudf::jni::assemble128_from_sum(*chunks, out_type);
+    return cudf::jni::convert_table_for_return(env, assembled);
+  }
+  CATCH_STD(env, 0);
+}
+}
diff --git a/java/src/main/native/src/aggregation128_utils.cu b/java/src/main/native/src/aggregation128_utils.cu
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/utilities/error.hpp>
+#include <rmm/exec_policy.hpp>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include "aggregation128_utils.hpp"
+
+namespace {
+
+// Functor that rebuilds a 128-bit value from four overlapping 64-bit chunks and records overflow.
+class chunk_assembler : public thrust::unary_function<cudf::size_type, __int128_t> {
+public:
+  chunk_assembler(bool *overflows, uint64_t const *chunks0, uint64_t const *chunks1,
+                  uint64_t const *chunks2, int64_t const *chunks3)
+      : overflows(overflows), chunks0(chunks0), chunks1(chunks1), chunks2(chunks2),
+        chunks3(chunks3) {}
+
+  __device__ __int128_t operator()(cudf::size_type i) const {
+    // Walk from the least significant chunk to the most significant one, adding the upper 32 bits
+    // (the "carry") of each 64-bit value into the next chunk. Left shifts are done on unsigned
+    // values because left-shifting a negative signed integer is undefined before C++20.
+    uint64_t const c0 = chunks0[i];
+    uint64_t const c1 = chunks1[i] + (c0 >> 32);
+    uint64_t const c2 = chunks2[i] + (c1 >> 32);
+    int64_t const c3 = chunks3[i] + (c2 >> 32);
+    uint64_t const lower64 = (c1 << 32) | static_cast<uint32_t>(c0);
+    uint64_t const upper64 = (static_cast<uint64_t>(c3) << 32) | static_cast<uint32_t>(c2);
+
+    // no overflow only when the carry bits above bit 127 all replicate the result's sign bit
+    int32_t const replicated_sign_bit = static_cast<int32_t>(c3) >> 31;
+    int32_t const top_carry_bits = static_cast<int32_t>(c3 >> 32);
+    overflows[i] = (replicated_sign_bit != top_carry_bits);
+
+    return static_cast<__int128_t>((static_cast<__uint128_t>(upper64) << 64) | lower64);
+  }
+
+private:
+  // destination for the per-row overflow flags
+  bool *const overflows;
+
+  // the four 64-bit inputs, ordered from least significant (chunks0) to most (chunks3)
+  uint64_t const *const chunks0;
+  uint64_t const *const chunks1;
+  uint64_t const *const chunks2;
+  int64_t const *const chunks3;
+};
+
+} // anonymous namespace
+
+namespace cudf::jni {
+
+// Extract one 32-bit chunk (0 = least significant) from each 128-bit value of a column.
+std::unique_ptr<cudf::column> extract_chunk32(cudf::column_view const &in_col, cudf::data_type type,
+                                              int chunk_idx, rmm::cuda_stream_view stream) {
+  CUDF_EXPECTS(in_col.type().id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
+  CUDF_EXPECTS(chunk_idx >= 0 && chunk_idx < 4, "invalid chunk index");
+  CUDF_EXPECTS(type.id() == cudf::type_id::INT32 || type.id() == cudf::type_id::UINT32,
+               "not a 32-bit integer type");
+  auto const num_rows = in_col.size();
+  auto out_col = cudf::make_fixed_width_column(type, num_rows, copy_bitmask(in_col));
+  auto out_view = out_col->mutable_view();
+  // begin<T>() advances by the view offset in units of T, so resolve it as 128-bit rows first.
+  auto const in_begin = reinterpret_cast<int32_t const *>(in_col.begin<__int128_t>());
+  // Build an iterator for every fourth 32-bit value, i.e.: one "chunk" of a __int128_t value
+  thrust::transform_iterator transform_iter{thrust::counting_iterator{0},
+                                            [] __device__(auto i) { return i * 4; }};
+  thrust::permutation_iterator stride_iter{in_begin + chunk_idx, transform_iter};
+
+  thrust::copy(rmm::exec_policy(stream), stride_iter, stride_iter + num_rows,
+               out_view.data<int32_t>());
+  return out_col;
+}
+
+// Combine four 64-bit integer sum columns into one 128-bit column and flag rows that overflowed.
+std::unique_ptr<cudf::table> assemble128_from_sum(cudf::table_view const &chunks_table,
+                                                  cudf::data_type output_type,
+                                                  rmm::cuda_stream_view stream) {
+  CUDF_EXPECTS(output_type.id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
+  CUDF_EXPECTS(chunks_table.num_columns() == 4, "must be 4 column table");
+  auto const row_count = chunks_table.num_rows();
+  auto const least = chunks_table.column(0);
+  auto const lower_mid = chunks_table.column(1);
+  auto const upper_mid = chunks_table.column(2);
+  auto const most = chunks_table.column(3);
+  CUDF_EXPECTS(cudf::size_of(least.type()) == 8 && cudf::size_of(lower_mid.type()) == 8 &&
+                   cudf::size_of(upper_mid.type()) == 8 &&
+                   most.type().id() == cudf::type_id::INT64,
+               "chunks type mismatch");
+  auto overflow_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8},
+                                                    row_count, copy_bitmask(least));
+  auto value_col = cudf::make_fixed_width_column(output_type, row_count, copy_bitmask(least));
+  chunk_assembler const assembler(overflow_col->mutable_view().begin<bool>(),
+                                  least.begin<uint64_t>(), lower_mid.begin<uint64_t>(),
+                                  upper_mid.begin<uint64_t>(), most.begin<int64_t>());
+  auto const first_row = thrust::make_counting_iterator<cudf::size_type>(0);
+  thrust::transform(rmm::exec_policy(stream), first_row, first_row + row_count,
+                    value_col->mutable_view().begin<__int128_t>(), assembler);
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(overflow_col));
+  columns.push_back(std::move(value_col));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+} // namespace cudf::jni
diff --git a/java/src/main/native/src/aggregation128_utils.hpp b/java/src/main/native/src/aggregation128_utils.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/table/table.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::jni {
+
+/**
+ * @brief Extract a 32-bit integer column from a column of 128-bit values.
+ *
+ * Each output row holds the 32-bit chunk of the corresponding 128-bit input value selected by
+ * `chunk_idx`, where 0 is the least significant chunk and 3 is the most significant chunk.
+ *
+ * A null input row will result in a corresponding null output row.
+ *
+ * @param col       Column of 128-bit values
+ * @param dtype     Integer type to use for the output column (e.g.: UINT32 or INT32)
+ * @param chunk_idx Index of the 32-bit chunk to extract (0 through 3)
+ * @param stream    CUDA stream to use
+ * @return          A column containing the extracted 32-bit integer values
+ */
+std::unique_ptr<cudf::column>
+extract_chunk32(cudf::column_view const &col, cudf::data_type dtype, int chunk_idx,
+                rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+
+/**
+ * @brief Reassemble a 128-bit column from four 64-bit integer columns with overflow detection.
+ *
+ * The 128-bit value is reconstructed by overlapping the 64-bit values by 32-bits. The least
+ * significant 32-bits of the least significant 64-bit value are used directly as the least
+ * significant 32-bits of the final 128-bit value, and the remaining 32-bits are added to the next
+ * most significant 64-bit value. The lower 32-bits of that sum become the next most significant
+ * 32-bits in the final 128-bit value, and the remaining 32-bits are added to the next most
+ * significant 64-bit input value, and so on.
+ *
+ * A null input row will result in a corresponding null output row.
+ *
+ * @param chunks_table Table of four 64-bit integer columns with the columns ordered from least
+ *                     significant to most significant. The last column must be an INT64 column.
+ * @param output_type  The type to use for the resulting 128-bit value column
+ * @param stream       CUDA stream to use
+ * @return             Table containing a boolean column followed by a 128-bit value column of
+ *                     the requested type. The boolean value will be true if an overflow was
+ *                     detected for that row's value.
+ */
+std::unique_ptr<cudf::table>
+assemble128_from_sum(cudf::table_view const &chunks_table, cudf::data_type output_type,
+                     rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+
+} // namespace cudf::jni