Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Remove HASH_SERIAL_MURMUR3 / serial32BitMurmurHash3 #11383

Merged
merged 1 commit into from
Aug 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions cpp/benchmarks/hashing/hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,9 @@ static void BM_hash(benchmark::State& state, cudf::hash_id hid, contains_nulls h
// Convenience wrapper: forwards to H_BENCHMARK_DEFINE with a benchmark name built from
// concat(hid, _, n), followed by the hash id and the null-mode token.
// NOTE(review): H_BENCHMARK_DEFINE and concat are defined outside this view; presumably the
// name becomes e.g. HASH_MURMUR3_nulls and `n` selects a contains_nulls value - confirm.
#define HASH_BENCHMARK_DEFINE(hid, n) H_BENCHMARK_DEFINE(concat(hid, _, n), hid, n)

// One benchmark per hash id, registered with the `nulls` token.
HASH_BENCHMARK_DEFINE(HASH_MURMUR3, nulls)
HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3, nulls)
HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, nulls)
HASH_BENCHMARK_DEFINE(HASH_MD5, nulls)

// The same set of hash ids, registered with the `no_nulls` token.
HASH_BENCHMARK_DEFINE(HASH_MURMUR3, no_nulls)
HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3, no_nulls)
HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, no_nulls)
HASH_BENCHMARK_DEFINE(HASH_MD5, no_nulls)
7 changes: 0 additions & 7 deletions cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,6 @@ std::unique_ptr<column> spark_murmur_hash3_32(
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Declaration of the serial Murmur3 row-hash entry point (no definition in this header).
 *
 * @tparam hash_function Class template taking a single type parameter; it selects the
 * per-element hash implementation.
 * @param input Table to hash.
 * @param seed Hash seed; defaults to `cudf::DEFAULT_HASH_SEED`.
 * @param stream Stream argument; defaults to `cudf::default_stream_value`.
 * @param mr Device memory resource; defaults to `rmm::mr::get_current_device_resource()`.
 * @return Owning pointer to the resulting column.
 */
template <template <typename> class hash_function>
std::unique_ptr<column> serial_murmur_hash3_32(
table_view const& input,
uint32_t seed = cudf::DEFAULT_HASH_SEED,
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<column> md5_hash(
table_view const& input,
rmm::cuda_stream_view stream = cudf::default_stream_value,
Expand Down
9 changes: 4 additions & 5 deletions cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,10 @@ using hash_value_type = uint32_t; ///< Type of hash value
* @brief Identifies the hash function to be used
*/
enum class hash_id {
HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
HASH_MURMUR3, ///< Murmur3 hash function
HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function
HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function
HASH_MD5 ///< MD5 hash function
HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
HASH_MURMUR3, ///< Murmur3 hash function
HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function
HASH_MD5 ///< MD5 hash function
};

/**
Expand Down
39 changes: 0 additions & 39 deletions cpp/src/hash/hashing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,43 +52,6 @@ std::vector<column_view> to_leaf_columns(IterType iter_begin, IterType iter_end)

} // namespace

/**
 * @brief Produces one 32-bit value per input row by folding an element hasher over the columns
 * returned by `to_leaf_columns`, starting each row's running hash at `seed`.
 *
 * The result is an INT32 column with `input.num_rows()` rows and no null mask
 * (`mask_state::UNALLOCATED`).
 *
 * @tparam hash_function Class template handed to `element_hasher` as the per-element hash.
 * @param input Table whose rows are hashed.
 * @param seed Initial value of the running hash for every row.
 * @param stream Stream used for the output allocation, the device-view creation and the
 * `thrust::tabulate` execution policy.
 * @param mr Memory resource passed to `make_numeric_column` for the output column.
 * @return The newly created column of row hashes.
 */
template <template <typename> class hash_function>
std::unique_ptr<column> serial_murmur_hash3_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto output = make_numeric_column(
data_type(type_id::INT32), input.num_rows(), mask_state::UNALLOCATED, stream, mr);

// Nothing to hash: hand back the freshly allocated column without launching any device work.
if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }

// NOTE(review): to_leaf_columns is the file-local helper declared above; presumably it replaces
// nested columns by their leaf children - confirm against its definition.
table_view const leaf_table(to_leaf_columns(input.begin(), input.end()));
auto const device_input = table_device_view::create(leaf_table, stream);
auto output_view = output->mutable_view();

// Fill output[row_index] with the value computed by the outer lambda for that row.
thrust::tabulate(
rmm::exec_policy(stream),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
// The device view is captured by value; has_nulls(leaf_table) is evaluated once, when the
// closure is created, and reused for every row.
[device_input = *device_input, nulls = has_nulls(leaf_table), seed] __device__(auto row_index) {
// Fold over the columns of this row: the result for one column becomes the `hash` argument
// for the next one, with `seed` as the initial value.
return detail::accumulate(
device_input.begin(),
device_input.end(),
seed,
[row_index, nulls] __device__(auto hash, auto column) {
return cudf::type_dispatcher(
column.type(),
// NOTE(review): the running hash is passed twice to element_hasher - presumably once as
// the seed and once as the value to yield for a null element; confirm against
// element_hasher's constructor.
experimental::row::hash::element_hasher<hash_function, nullate::DYNAMIC>{
nullate::DYNAMIC{nulls}, hash, hash},
column,
row_index);
});
});

return output;
}

std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
uint32_t seed,
Expand All @@ -97,8 +60,6 @@ std::unique_ptr<column> hash(table_view const& input,
{
switch (hash_function) {
case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, seed, stream, mr);
case (hash_id::HASH_SERIAL_MURMUR3):
return serial_murmur_hash3_32<MurmurHash3_32>(input, seed, stream, mr);
case (hash_id::HASH_SPARK_MURMUR3): return spark_murmur_hash3_32(input, seed, stream, mr);
case (hash_id::HASH_MD5): return md5_hash(input, stream, mr);
default: CUDF_FAIL("Unsupported hash function.");
Expand Down
79 changes: 0 additions & 79 deletions cpp/tests/hashing/hash_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,6 @@ TEST_F(HashTest, MultiValueNulls)
EXPECT_EQ(input1.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input1.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3);

Expand Down Expand Up @@ -371,12 +365,6 @@ TYPED_TEST(HashTestTyped, Equality)
EXPECT_EQ(input.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3);

Expand All @@ -401,12 +389,6 @@ TYPED_TEST(HashTestTyped, EqualityNulls)
EXPECT_EQ(input1.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input1.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3);

Expand Down Expand Up @@ -445,14 +427,6 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_zero, verbosity);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, verbosity);

constexpr auto serial_hasher = cudf::hash_id::HASH_SERIAL_MURMUR3;
auto const serial_col = cudf::hash(table_col, serial_hasher, 0);
auto const serial_col_neg_zero = cudf::hash(table_col_neg_zero, serial_hasher);
auto const serial_col_neg_nan = cudf::hash(table_col_neg_nan, serial_hasher);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_zero, verbosity);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_nan, verbosity);

// Spark hash is sensitive to 0 and -0
constexpr auto spark_hasher = cudf::hash_id::HASH_SPARK_MURMUR3;
auto const spark_col = cudf::hash(table_col, spark_hasher, 0);
Expand All @@ -461,59 +435,6 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan);
}

// Test fixture for the HASH_SERIAL_MURMUR3 cases below; it adds no members of its own on top
// of cudf::test::BaseFixture.
class SerialMurmurHash3Test : public cudf::test::BaseFixture {
};

// Verifies HASH_SERIAL_MURMUR3 in three ways:
//  - seeded single-column inputs (strings, int32) match precomputed reference values;
//  - two tables that differ only in how "true" is encoded in the bool column hash identically;
//  - a struct column built from the same three columns hashes like the flat three-column table.
TEST_F(SerialMurmurHash3Test, MultiValueWithSeeds)
{
  using int_limits = std::numeric_limits<int32_t>;

  // Precomputed reference hashes (strings with seed 314, ints with seed 42).
  fixed_width_column_wrapper<int32_t> const expected_strings_hash(
    {1467149710, -680899318, -1620282500, 91106683, -1564993834});
  fixed_width_column_wrapper<int32_t> const expected_ints_hash(
    {933211791, 751823303, -1080202046, 723455942, 133916647});

  // Input columns.
  strings_column_wrapper const text_col({"",
                                         "The quick brown fox",
                                         "jumps over the lazy dog.",
                                         "All work and no play makes Jack a dull boy",
                                         "!\"#$%&\'()*+,-./]:;<=>?@[\\]^_`{|}~\ud720\ud721"});
  fixed_width_column_wrapper<int32_t> const int_col(
    {0, 100, -100, int_limits::min(), int_limits::max()});
  // Same truth values in both columns; the second one spells "true" as 1, 2 and 255.
  fixed_width_column_wrapper<bool> const bool_col_a({0, 1, 1, 1, 0});
  fixed_width_column_wrapper<bool> const bool_col_b({0, 1, 2, 255, 0});

  // Struct column whose children are copies of the first three input columns.
  std::vector<std::unique_ptr<cudf::column>> children;
  children.emplace_back(std::make_unique<cudf::column>(text_col));
  children.emplace_back(std::make_unique<cudf::column>(int_col));
  children.emplace_back(std::make_unique<cudf::column>(bool_col_a));
  structs_column_wrapper struct_col(std::move(children));

  auto const flat_table_a = cudf::table_view({text_col, int_col, bool_col_a});
  auto const flat_table_b = cudf::table_view({text_col, int_col, bool_col_b});

  constexpr auto serial_murmur = cudf::hash_id::HASH_SERIAL_MURMUR3;
  auto const text_hash   = cudf::hash(cudf::table_view({text_col}), serial_murmur, 314);
  auto const int_hash    = cudf::hash(cudf::table_view({int_col}), serial_murmur, 42);
  auto const flat_hash_a = cudf::hash(flat_table_a, serial_murmur, {});
  auto const flat_hash_b = cudf::hash(flat_table_b, serial_murmur, {});
  auto const struct_hash = cudf::hash(cudf::table_view({struct_col}), serial_murmur, {});

  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*text_hash, expected_strings_hash, verbosity);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*int_hash, expected_ints_hash, verbosity);
  EXPECT_EQ(flat_table_a.num_rows(), flat_hash_a->size());
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*flat_hash_a, *flat_hash_b, verbosity);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*struct_hash, *flat_hash_a, verbosity);
}

// Hashing a table that contains a LIST column with HASH_SERIAL_MURMUR3 must raise
// cudf::logic_error.
TEST_F(SerialMurmurHash3Test, ListThrows)
{
  constexpr auto serial_murmur = cudf::hash_id::HASH_SERIAL_MURMUR3;
  lists_column_wrapper<cudf::string_view> list_of_strings({{""}, {"abc"}, {"123"}});

  EXPECT_THROW(cudf::hash(cudf::table_view({list_of_strings}), serial_murmur, {}),
               cudf::logic_error);
}

class SparkMurmurHash3Test : public cudf::test::BaseFixture {
};

Expand Down
34 changes: 0 additions & 34 deletions java/src/main/java/ai/rapids/cudf/ColumnVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -725,40 +725,6 @@ public static ColumnVector md5Hash(ColumnView... columns) {
return new ColumnVector(hash(columnViews, HashType.HASH_MD5.getNativeId(), 0));
}

/**
 * Hash every row of the given columns with the serial murmur3 hash
 * ({@code HashType.HASH_SERIAL_MURMUR3}) and return the per-row results as a new vector.
 *
 * The per-column checks (non-null, equal row counts, no Duration type, no LIST type) are
 * Java {@code assert}s and therefore only run when assertions are enabled.
 *
 * @param seed integer seed handed to the native hash call
 * @param columns columns to hash; at least one is required and all must have the same row count
 * @return a new ColumnVector holding one 32-bit hash value per row
 * @throws IllegalArgumentException if {@code columns} is empty
 */
public static ColumnVector serial32BitMurmurHash3(int seed, ColumnView columns[]) {
  if (columns.length < 1) {
    throw new IllegalArgumentException("Murmur3 hashing requires at least 1 column of input");
  }
  final long[] nativeViews = new long[columns.length];
  final long expectedRows = columns[0].getRowCount();

  int slot = 0;
  for (ColumnView col : columns) {
    assert col != null : "Column vectors passed may not be null";
    assert col.getRowCount() == expectedRows : "Row count mismatch, all columns must be the same size";
    assert !col.getType().isDurationType() : "Unsupported column type Duration";
    assert !col.getType().equals(DType.LIST) : "List columns are not supported";
    nativeViews[slot] = col.getNativeView();
    slot++;
  }

  final int hashId = HashType.HASH_SERIAL_MURMUR3.getNativeId();
  return new ColumnVector(hash(nativeViews, hashId, seed));
}

/**
 * Convenience overload of {@link #serial32BitMurmurHash3(int, ColumnView[])} that uses a
 * seed of 0.
 *
 * @param columns columns to hash; all must have the same number of rows
 * @return a new ColumnVector holding one 32-bit hash value per row
 */
public static ColumnVector serial32BitMurmurHash3(ColumnView columns[]) {
  final int defaultSeed = 0;
  return serial32BitMurmurHash3(defaultSeed, columns);
}

/**
* Create a new vector containing spark's 32-bit murmur3 hash of each row in the table.
* Spark's murmur3 hash uses a different tail processing algorithm.
Expand Down
5 changes: 2 additions & 3 deletions java/src/main/java/ai/rapids/cudf/HashType.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,8 @@
public enum HashType {
IDENTITY(0),
MURMUR3(1),
HASH_SERIAL_MURMUR3(2),
HASH_SPARK_MURMUR3(3),
HASH_MD5(4);
HASH_SPARK_MURMUR3(2),
HASH_MD5(3);

private static final HashType[] HASH_TYPES = HashType.values();
final int nativeId;
Expand Down
97 changes: 0 additions & 97 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -519,103 +519,6 @@ void testMD5HashLists() {
assertColumnsAreEqual(expected, result);
}
}
@Test
void testSerial32BitMurmur3HashStrings() {
  // Single string column hashed with seed 42; the trailing null row is expected to hash to the
  // seed itself.
  try (ColumnVector input = ColumnVector.fromStrings(
           "a", "B\nc", "dE\"\u0100\t\u0101 \ud720\ud721\\Fg2\'",
           "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
               "in the MD5 hash function. This string needed to be longer.A 60 character string to " +
               "test MD5's message padding algorithm",
           "hiJ\ud720\ud721\ud720\ud721", null);
       ColumnVector actual = ColumnVector.serial32BitMurmurHash3(42, new ColumnVector[]{input});
       ColumnVector wanted = ColumnVector.fromBoxedInts(
           -1293573533, 1163854319, 1423943036, 1504480835, 1249086584, 42)) {
    assertColumnsAreEqual(wanted, actual);
  }
}

@Test
void testSerial32BitMurmur3HashInts() {
  // Two int columns with interleaved nulls, seed 42; row 3 is null in both columns and is
  // expected to hash to the seed.
  try (ColumnVector left = ColumnVector.fromBoxedInts(
           0, 100, null, null, Integer.MIN_VALUE, null);
       ColumnVector right = ColumnVector.fromBoxedInts(
           0, null, -100, null, null, Integer.MAX_VALUE);
       ColumnVector actual =
           ColumnVector.serial32BitMurmurHash3(42, new ColumnVector[]{left, right});
       ColumnVector wanted = ColumnVector.fromBoxedInts(
           59727262, 751823303, -1080202046, 42, 723455942, 133916647)) {
    assertColumnsAreEqual(wanted, actual);
  }
}

@Test
void testSerial32BitMurmur3HashDoubles() {
  // Uses the overload without a seed argument. The four NaN inputs are expected to share one
  // hash value, and the null row is expected to hash to 0.
  try (ColumnVector input = ColumnVector.fromBoxedDoubles(
           0.0, null, 100.0, -100.0, Double.MIN_NORMAL, Double.MAX_VALUE,
           POSITIVE_DOUBLE_NAN_UPPER_RANGE, POSITIVE_DOUBLE_NAN_LOWER_RANGE,
           NEGATIVE_DOUBLE_NAN_UPPER_RANGE, NEGATIVE_DOUBLE_NAN_LOWER_RANGE,
           Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY);
       ColumnVector actual = ColumnVector.serial32BitMurmurHash3(new ColumnVector[]{input});
       ColumnVector wanted = ColumnVector.fromBoxedInts(
           1669671676, 0, -544903190, -1831674681, 150502665, 474144502,
           1428788237, 1428788237, 1428788237, 1428788237,
           420913893, 1915664072)) {
    assertColumnsAreEqual(wanted, actual);
  }
}

@Test
void testSerialBitMurmur3HashFloats() {
  // Float column hashed with seed 411. The four NaN inputs are expected to share one hash
  // value, and the null row is expected to hash to the seed.
  try (ColumnVector input = ColumnVector.fromBoxedFloats(
           0f, 100f, -100f, Float.MIN_NORMAL, Float.MAX_VALUE, null,
           POSITIVE_FLOAT_NAN_LOWER_RANGE, POSITIVE_FLOAT_NAN_UPPER_RANGE,
           NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE,
           Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY);
       ColumnVector actual = ColumnVector.serial32BitMurmurHash3(411, new ColumnVector[]{input});
       ColumnVector wanted = ColumnVector.fromBoxedInts(
           -235179434, 1812056886, 2028471189, 1775092689, -1531511762, 411,
           -1053523253, -1053523253, -1053523253, -1053523253,
           -1526256646, 930080402)) {
    assertColumnsAreEqual(wanted, actual);
  }
}

@Test
void testSerial32BitMurmur3HashBools() {
  // Two boolean columns with nulls, seed 0; row 0 is null in both columns and is expected to
  // hash to 0.
  try (ColumnVector left = ColumnVector.fromBoxedBooleans(
           null, true, false, true, null, false);
       ColumnVector right = ColumnVector.fromBoxedBooleans(
           null, true, false, null, false, true);
       ColumnVector actual =
           ColumnVector.serial32BitMurmurHash3(0, new ColumnVector[]{left, right});
       ColumnVector wanted = ColumnVector.fromBoxedInts(
           0, 884701402, 1032769583, -463810133, 1364076727, -991270669)) {
    assertColumnsAreEqual(wanted, actual);
  }
}

@Test
void testSerial32BitMurmur3HashMixed() {
  // Five columns of different types hashed together with seed 1868; the last row is null in
  // every column and is expected to hash to the seed.
  try (ColumnVector stringCol = ColumnVector.fromStrings(
           "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
           "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
               "in the MD5 hash function. This string needed to be longer.",
           null, null);
       ColumnVector intCol = ColumnVector.fromBoxedInts(
           0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
       ColumnVector doubleCol = ColumnVector.fromBoxedDoubles(
           0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
       ColumnVector floatCol = ColumnVector.fromBoxedFloats(
           0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
       ColumnVector boolCol = ColumnVector.fromBoxedBooleans(
           true, false, null, false, true, null);
       ColumnVector actual = ColumnVector.serial32BitMurmurHash3(
           1868, new ColumnVector[]{stringCol, intCol, doubleCol, floatCol, boolCol});
       ColumnVector wanted = ColumnVector.fromBoxedInts(
           387200465, 1988790727, 774895031, 814731646, -1073686048, 1868)) {
    assertColumnsAreEqual(wanted, actual);
  }
}

@Test
void testSerial32BitMurmur3HashStruct() {
  // Hash a single STRUCT column whose children are the five typed columns below, with seed
  // 1868. The previous body passed the five columns directly (a copy of the "Mixed" test), so
  // the struct input path was never exercised.
  // The expected values are the same as for the flat five-column input: the C++ test
  // SerialMurmurHash3Test.MultiValueWithSeeds asserts that a struct column hashes like the
  // table made of its fields.
  try (ColumnVector strings = ColumnVector.fromStrings(
           "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
           "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
               "in the MD5 hash function. This string needed to be longer.",
           null, null);
       ColumnVector integers = ColumnVector.fromBoxedInts(
           0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
       ColumnVector doubles = ColumnVector.fromBoxedDoubles(
           0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
       ColumnVector floats = ColumnVector.fromBoxedFloats(
           0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
       ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
       // The struct column is its own resource and is closed together with the others.
       ColumnVector structs = ColumnVector.makeStruct(strings, integers, doubles, floats, bools);
       ColumnVector result = ColumnVector.serial32BitMurmurHash3(1868, new ColumnVector[]{structs});
       ColumnVector expected = ColumnVector.fromBoxedInts(
           387200465, 1988790727, 774895031, 814731646, -1073686048, 1868)) {
    assertColumnsAreEqual(expected, result);
  }
}

@Test
void testSpark32BitMurmur3HashStrings() {
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/cpp/hash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil:
# Cython view of the C++ `cudf::hash_id` enum; the quoted string after each name is the C++
# identifier emitted for that member.
ctypedef enum hash_id "cudf::hash_id":
HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY"
HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3"
HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3"
HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3"
HASH_MD5 "cudf::hash_id::HASH_MD5"

Expand Down