Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MD5 Python hash API #9390

Merged
merged 12 commits on Oct 12, 2021
14 changes: 10 additions & 4 deletions python/cudf/cudf/_lib/hash.pyx
Expand Up @@ -54,17 +54,23 @@ def hash_partition(source_table, object columns_to_hash,
)


def hash(source_table, object initial_hash_values=None, int seed=0):
cdef vector[uint32_t] c_initial_hash = initial_hash_values or []
def hash(source_table, str method, object initial_hash=None, int seed=0):
cdef vector[uint32_t] c_initial_hash = initial_hash or []
cdef table_view c_source_view = table_view_from_table(
source_table, ignore_index=True)

cdef unique_ptr[column] c_result
cdef libcudf_types.hash_id c_hash_function
if method == "murmur3":
c_hash_function = libcudf_types.hash_id.HASH_MURMUR3
elif method == "md5":
c_hash_function = libcudf_types.hash_id.HASH_MD5
else:
raise ValueError(f"Unsupported hash function: {method}")
with nogil:
c_result = move(
cpp_hash(
c_source_view,
libcudf_types.hash_id.HASH_MURMUR3,
c_hash_function,
c_initial_hash,
seed
)
Expand Down
13 changes: 11 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Expand Up @@ -5003,22 +5003,31 @@ def apply_chunks(
tpb=tpb,
)

def hash_columns(self, columns=None):
def hash_columns(self, columns=None, method="murmur3"):
"""Hash the given *columns* and return a Series of per-row hash values

Parameters
----------
columns : sequence of str; optional
Sequence of column names. If columns is *None* (unspecified),
all columns in the frame are used.
method : {'murmur3', 'md5'}, default 'murmur3'
Hash function to use:
* murmur3: MurmurHash3 hash function.
* md5: MD5 hash function.

Returns
-------
Series
Hash values for each row.
"""
if columns is None:
table_to_hash = self
else:
cols = [self[k]._column for k in columns]
bdice marked this conversation as resolved.
Show resolved Hide resolved
table_to_hash = Frame(data=dict(zip(columns, cols)))

return Series(table_to_hash._hash()).values
return Series(table_to_hash._hash(method=method))
bdice marked this conversation as resolved.
Show resolved Hide resolved

def partition_by_hash(self, columns, nparts, keep_index=True):
"""Partition the dataframe by the hashed value of data in *columns*.
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/frame.py
Expand Up @@ -623,8 +623,8 @@ def _gather(self, gather_map, keep_index=True, nullify=False):
result._index.names = self._index.names
return result

def _hash(self, initial_hash_values=None):
return libcudf.hash.hash(self, initial_hash_values)
def _hash(self, method, initial_hash=None):
return libcudf.hash.hash(self, method, initial_hash)

def _hash_partition(
self, columns_to_hash, num_partitions, keep_index=True
Expand Down
21 changes: 15 additions & 6 deletions python/cudf/cudf/core/series.py
Expand Up @@ -4095,13 +4095,20 @@ def floor(self):
"""
return self._unaryop("floor")

def hash_values(self):
def hash_values(self, method="murmur3"):
"""Compute the hash of values in this column.

Parameters
----------
method : {'murmur3', 'md5'}, default 'murmur3'
Hash function to use:
* murmur3: MurmurHash3 hash function.
* md5: MD5 hash function.

Returns
-------
cupy array
A cupy array with hash values.
Series
A Series with hash values.

Examples
--------
Expand All @@ -4112,10 +4119,10 @@ def hash_values(self):
1 120
2 30
dtype: int64
>>> series.hash_values()
>>> series.hash_values(method="murmur3")
0 -1930516747
1 422619251
2 -941520876
dtype: int32
"""
return Series(self._hash()).values
return Series(self._hash(method=method))
bdice marked this conversation as resolved.
Show resolved Hide resolved

def hash_encode(self, stop, use_name=False):
"""Encode column values as ints in [0, stop) using hash function.
Expand Down Expand Up @@ -4158,7 +4165,9 @@ def hash_encode(self, stop, use_name=False):
raise ValueError("stop must be a positive integer.")

initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None
hashed_values = Series(self._hash(initial_hash))
hashed_values = Series(
bdice marked this conversation as resolved.
Show resolved Hide resolved
self._hash(method="murmur3", initial_hash=initial_hash)
)

if hashed_values.has_nulls:
raise ValueError("Column must have no nulls.")
Expand Down
13 changes: 7 additions & 6 deletions python/cudf/cudf/tests/test_dataframe.py
Expand Up @@ -1103,27 +1103,28 @@ def test_assign():


@pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
def test_dataframe_hash_columns(nrows):
@pytest.mark.parametrize("method", ["murmur3", "md5"])
def test_dataframe_hash_columns(nrows, method):
bdice marked this conversation as resolved.
Show resolved Hide resolved
gdf = cudf.DataFrame()
data = np.asarray(range(nrows))
data[0] = data[-1] # make first and last the same
gdf["a"] = data
gdf["b"] = gdf.a + 100
out = gdf.hash_columns(["a", "b"])
assert isinstance(out, cupy.ndarray)
assert isinstance(out, cudf.Series)
assert len(out) == nrows
assert out.dtype == np.int32

# Check default
out_all = gdf.hash_columns()
np.testing.assert_array_equal(cupy.asnumpy(out), cupy.asnumpy(out_all))
assert_eq(out, out_all)

# Check single column
out_one = cupy.asnumpy(gdf.hash_columns(["a"]))
out_one = gdf.hash_columns(["a"], method=method)
# First matches last
assert out_one[0] == out_one[-1]
assert out_one.iloc[0] == out_one.iloc[-1]
# Equivalent to the cudf.Series.hash_values()
np.testing.assert_array_equal(cupy.asnumpy(gdf.a.hash_values()), out_one)
assert_eq(gdf["a"].hash_values(method=method), out_one)


@pytest.mark.parametrize("nrows", [3, 10, 100, 1000])
Expand Down