Skip to content

Commit

Permalink
Migrate lists/filtering to pylibcudf (#16184)
Browse files Browse the repository at this point in the history
Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16184
  • Loading branch information
Matt711 committed Jul 25, 2024
1 parent a33f520 commit 6486bb9
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 64 deletions.
46 changes: 7 additions & 39 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,9 @@
from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool
from libcpp.memory cimport make_shared, shared_ptr, unique_ptr
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
lists_column_view,
)
from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
distinct as cpp_distinct,
)
from cudf._lib.pylibcudf.libcudf.types cimport (
nan_equality,
null_equality,
null_order,
size_type,
)
from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type
from cudf._lib.utils cimport columns_from_pylibcudf_table

from cudf._lib import pylibcudf
Expand Down Expand Up @@ -47,31 +33,13 @@ def explode_outer(list source_columns, int explode_column_idx):

@acquire_spill_lock()
def distinct(Column col, bool nulls_equal, bool nans_all_equal):
"""
nulls_equal == True indicates that libcudf should treat any two nulls as
equal, and as unequal otherwise.
nans_all_equal == True indicates that libcudf should treat any two
elements from {-nan, +nan} as equal, and as unequal otherwise.
"""
cdef shared_ptr[lists_column_view] list_view = (
make_shared[lists_column_view](col.view())
)
cdef null_equality c_nulls_equal = (
null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
)
cdef nan_equality c_nans_equal = (
nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL
)

cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_distinct(list_view.get()[0],
c_nulls_equal,
c_nans_equal)
return Column.from_pylibcudf(
pylibcudf.lists.distinct(
col.to_pylibcudf(mode="read"),
nulls_equal,
nans_all_equal,
)
return Column.from_unique_ptr(move(c_result))
)


@acquire_spill_lock()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,13 @@ from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality

cdef extern from "cudf/lists/stream_compaction.hpp" \
namespace "cudf::lists" nogil:
cdef unique_ptr[column] apply_boolean_mask(
const lists_column_view& lists_column,
const lists_column_view& boolean_mask,
) except +

cdef unique_ptr[column] distinct(
const lists_column_view lists_column,
const lists_column_view& lists_column,
null_equality nulls_equal,
nan_equality nans_equal
) except +
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/lists.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,7 @@ cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*)
cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)

cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)

cpdef Column apply_boolean_mask(Column, Column)

cpdef Column distinct(Column, bool, bool)
71 changes: 71 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
sort_lists as cpp_sort_lists,
stable_sort_lists as cpp_stable_sort_lists,
)
from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
apply_boolean_mask as cpp_apply_boolean_mask,
distinct as cpp_distinct,
)
from cudf._lib.pylibcudf.libcudf.table.table cimport table
from cudf._lib.pylibcudf.libcudf.types cimport (
nan_equality,
Expand Down Expand Up @@ -614,3 +618,70 @@ cpdef Column union_distinct(
c_nans_equal,
))
return Column.from_libcudf(move(c_result))


cpdef Column apply_boolean_mask(Column input, Column boolean_mask):
    """Filter the elements of each list row using a boolean mask.

    For details, see :cpp:func:`apply_boolean_mask`.

    Parameters
    ----------
    input : Column
        The input lists column.
    boolean_mask : Column
        The boolean mask; it is viewed as a lists column as well.

    Returns
    -------
    Column
        A Column of filtered elements based upon the boolean mask.
    """
    # Build both list views while the GIL is still held; only the libcudf
    # call itself runs inside the nogil section.
    cdef ListColumnView lists = input.list_view()
    cdef ListColumnView mask = boolean_mask.list_view()
    cdef unique_ptr[column] c_filtered

    with nogil:
        c_filtered = move(
            cpp_apply_boolean_mask(lists.view(), mask.view())
        )

    return Column.from_libcudf(move(c_filtered))


cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
    """Create a new list column without duplicate elements in each list.

    For details, see :cpp:func:`distinct`.

    Parameters
    ----------
    input : Column
        The input column.
    nulls_equal : bool
        If true, null elements are considered equal. Otherwise, unequal.
    nans_equal : bool
        If true, libcudf will treat nan elements from {-nan, +nan}
        as equal. Otherwise, unequal.

    Returns
    -------
    Column
        A new list column without duplicate elements in each list.
    """
    cdef ListColumnView lists = input.list_view()
    cdef null_equality c_nulls_equal
    cdef nan_equality c_nans_equal
    cdef unique_ptr[column] c_deduped

    # Translate the Python-level flags into the libcudf policy enums.
    if nulls_equal:
        c_nulls_equal = null_equality.EQUAL
    else:
        c_nulls_equal = null_equality.UNEQUAL

    if nans_equal:
        c_nans_equal = nan_equality.ALL_EQUAL
    else:
        c_nans_equal = nan_equality.UNEQUAL

    with nogil:
        c_deduped = move(
            cpp_distinct(lists.view(), c_nulls_equal, c_nans_equal)
        )

    return Column.from_libcudf(move(c_deduped))
94 changes: 70 additions & 24 deletions python/cudf/cudf/pylibcudf_tests/test_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,26 @@ def test_data():
return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]]


@pytest.fixture
def list_column():
    """Provide four rows of integer lists as plain Python data."""
    rows = [[0, 1], [2], [5], [6, 7]]
    return rows


@pytest.fixture
def scalar():
    """Provide a pyarrow scalar holding the integer 1."""
    value = pa.scalar(1)
    return value


@pytest.fixture
def column():
def search_key_column():
return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32())


@pytest.fixture
def bool_column():
    """Provide a pyarrow array whose rows are lists of booleans."""
    mask_rows = [[False, True], [True], [True], [True, True]]
    return pa.array(mask_rows)


@pytest.fixture
def set_lists_column():
lhs = [[np.nan, np.nan, 2, 1, 2], [1, 2, 3], None, [4, None, 5]]
Expand Down Expand Up @@ -72,8 +82,7 @@ def test_concatenate_list_elements(test_data, dropna, expected):
assert_column_eq(expect, res)


def test_contains_scalar(test_data, scalar):
list_column = test_data[0][0]
def test_contains_scalar(list_column, scalar):
arr = pa.array(list_column)

plc_column = plc.interop.from_arrow(arr)
Expand All @@ -85,17 +94,17 @@ def test_contains_scalar(test_data, scalar):
assert_column_eq(expect, res)


def test_contains_list_column(test_data):
list_column1 = test_data[0][0]
list_column2 = [1, 3, 5, 1]
def test_contains_list_column(list_column, search_key_column):
list_column1 = list_column
list_column2, _ = search_key_column
arr1 = pa.array(list_column1)
arr2 = pa.array(list_column2)

plc_column1 = plc.interop.from_arrow(arr1)
plc_column2 = plc.interop.from_arrow(arr2)
res = plc.lists.contains(plc_column1, plc_column2)

expect = pa.array([True, False, True, False])
expect = pa.array([False, True, True, True])

assert_column_eq(expect, res)

Expand Down Expand Up @@ -123,8 +132,7 @@ def test_contains_nulls(list_column, expected):
assert_column_eq(expect, res)


def test_index_of_scalar(test_data, scalar):
list_column = test_data[0][0]
def test_index_of_scalar(list_column, scalar):
arr = pa.array(list_column)

plc_column = plc.interop.from_arrow(arr)
Expand All @@ -136,21 +144,19 @@ def test_index_of_scalar(test_data, scalar):
assert_column_eq(expect, res)


def test_index_of_list_column(test_data, column):
list_column = test_data[0][0]
def test_index_of_list_column(list_column, search_key_column):
arr1 = pa.array(list_column)
arr2, expect = column
arr2, expect = search_key_column
plc_column1 = plc.interop.from_arrow(arr1)
plc_column2 = plc.interop.from_arrow(arr2)
res = plc.lists.index_of(plc_column1, plc_column2, True)

expect = pa.array(column[1], type=pa.int32())
expect = pa.array(search_key_column[1], type=pa.int32())

assert_column_eq(expect, res)


def test_reverse(test_data):
list_column = test_data[0][0]
def test_reverse(list_column):
arr = pa.array(list_column)
plc_column = plc.interop.from_arrow(arr)

Expand All @@ -162,8 +168,7 @@ def test_reverse(test_data):


def test_segmented_gather(test_data):
list_column1 = test_data[0][0]
list_column2 = test_data[0][1]
list_column1, list_column2 = test_data[0]

plc_column1 = plc.interop.from_arrow(pa.array(list_column1))
plc_column2 = plc.interop.from_arrow(pa.array(list_column2))
Expand All @@ -175,19 +180,17 @@ def test_segmented_gather(test_data):
assert_column_eq(expect, res)


def test_extract_list_element_scalar(test_data):
arr = pa.array(test_data[0][0])
plc_column = plc.interop.from_arrow(arr)
def test_extract_list_element_scalar(list_column):
plc_column = plc.interop.from_arrow(pa.array(list_column))

res = plc.lists.extract_list_element(plc_column, 0)
expect = pa.compute.list_element(test_data[0][0], 0)
expect = pa.compute.list_element(list_column, 0)

assert_column_eq(expect, res)


def test_extract_list_element_column(test_data):
arr = pa.array(test_data[0][0])
plc_column = plc.interop.from_arrow(arr)
def test_extract_list_element_column(list_column):
plc_column = plc.interop.from_arrow(pa.array(list_column))
indices = plc.interop.from_arrow(pa.array([0, 1, -4, -1]))

res = plc.lists.extract_list_element(plc_column, indices)
Expand Down Expand Up @@ -343,3 +346,46 @@ def test_set_operations(
else:
expect = pa.array(expected)
assert_column_eq(expect, res)


@pytest.mark.parametrize(
    # Column order matches the positional order of
    # ``plc.lists.distinct(input, nulls_equal, nans_equal)``.
    "nulls_equal,nans_equal,expected",
    [
        # Nulls and NaNs are both deduplicated.
        (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]),
        # Nulls are kept distinct, NaNs are deduplicated.
        (
            False,
            True,
            [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]],
        ),
        # Nulls are deduplicated, NaNs are kept distinct.
        (
            True,
            False,
            [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]],
        ),
        # Neither nulls nor NaNs are deduplicated.
        (
            False,
            False,
            [
                [np.nan, np.nan, 0, 1, 2, 3],
                [3, 1, 2],
                None,
                [4, None, None, 5],
            ],
        ),
    ],
)
def test_distinct(nulls_equal, nans_equal, expected):
    """Check ``plc.lists.distinct`` for every nulls/NaNs equality combination.

    The parameter names previously had ``nans_equal`` and ``nulls_equal``
    swapped, and the call passed them in that swapped order too, so the test
    passed while mislabelling which flag produced which expectation. The
    values are unchanged; only the labels and the call order are corrected.
    """
    # Local input data: contains duplicate NaNs, a duplicate 2, a null row
    # and duplicate null elements so each flag has something to act on.
    data = [
        [np.nan, np.nan, 0, 1, 2, 3, 2],
        [3, 1, 2],
        None,
        [4, None, None, 5],
    ]
    plc_column = plc.interop.from_arrow(pa.array(data))

    res = plc.lists.distinct(plc_column, nulls_equal, nans_equal)

    expect = pa.array(expected)

    assert_column_eq(expect, res)

0 comments on commit 6486bb9

Please sign in to comment.