Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid DataFrame conversion in MultiIndex.from_pandas #14470

Merged
merged 12 commits into from
Nov 28, 2023
46 changes: 18 additions & 28 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from pandas._config import get_option

import cudf
from cudf import _lib as libcudf
from cudf._typing import DataFrameOrSeries
from cudf.api.extensions import no_default
from cudf.api.types import is_integer, is_list_like, is_object_dtype
Expand Down Expand Up @@ -141,7 +140,9 @@ def __init__(
if copy:
if isinstance(codes, cudf.DataFrame):
codes = codes.copy(deep=True)
if len(levels) > 0 and isinstance(levels[0], cudf.Series):
if len(levels) > 0 and isinstance(
levels[0], (cudf.Index, cudf.Series)
):
levels = [level.copy(deep=True) for level in levels]

if not isinstance(codes, cudf.DataFrame):
Expand All @@ -158,7 +159,7 @@ def __init__(
"codes and is inconsistent!"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, I am not sure that codes can be np.int64 it should probably be size_type_dtype. But let us not fix that here.

)

levels = [cudf.Series(level) for level in levels]
levels = [cudf.Index(level) for level in levels]

if len(levels) != len(codes._data):
raise ValueError(
Expand All @@ -178,18 +179,12 @@ def __init__(
)

source_data = {}
for i, (column_name, col) in enumerate(codes._data.items()):
if -1 in col:
level = cudf.DataFrame(
{column_name: [None] + list(levels[i])},
index=range(-1, len(levels[i])),
)
else:
level = cudf.DataFrame({column_name: levels[i]})

source_data[column_name] = libcudf.copying.gather(
[level._data[column_name]], col
)[0]
for (column_name, col), level in zip(codes._data.items(), levels):
result_col = level._column.take(col)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On line 175 we already do partial bounds-checking. The treatment of -1 is AFAICT undocumented pandas behaviour. That is codes do not do normal indexing but instead -1 is for "missing value".

But let us move all of the validation into one place:

import cudf._lib as libcudf
from cudf._lib.types import size_type_dtype
import numpy as np

lo, hi = libcudf.reduce.minmax(code)
if lo.value < -1 or hi.value > len(level) - 1:
    raise ValueError(...)
if lo.value == -1:
    # Now we can gather and insert null automatically
    code[code == -1] = np.iinfo(size_type_dtype).min
result_col, = libcudf.copying.gather([level], code, nullify=True)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice find. Was able to consolidate the codes validate in this loop

mask = col == -1
if mask.any():
result_col[mask] = cudf.NA
source_data[column_name] = result_col

super().__init__(source_data)
self._levels = levels
Expand Down Expand Up @@ -481,7 +476,8 @@ def copy(

mi = MultiIndex._from_data(self._data.copy(deep=deep))
if self._levels is not None:
mi._levels = [s.copy(deep) for s in self._levels]
# TODO: why is deep=True not being respected here
mi._levels = [idx.copy(deep) for idx in self._levels]
wence- marked this conversation as resolved.
Show resolved Hide resolved
if self._codes is not None:
mi._codes = self._codes.copy(deep)
if names is not None:
Expand Down Expand Up @@ -1625,19 +1621,13 @@ def from_pandas(cls, multiindex, nan_as_null=no_default):
nan_as_null = (
False if cudf.get_option("mode.pandas_compatible") else None
)

# if `multiindex` has two or more levels that
# have the same name, then `multiindex.to_frame()`
# results in a DataFrame containing only one of those
# levels. Thus, set `names` to some tuple of unique values
# and then call `multiindex.to_frame(name=names)`,
# which preserves all levels of `multiindex`.
names = tuple(range(len(multiindex.names)))

df = cudf.DataFrame.from_pandas(
multiindex.to_frame(index=False, name=names), nan_as_null
levels = [
cudf.Series(level, nan_as_null=nan_as_null)
for level in multiindex.levels
]
return cls(
levels=levels, codes=multiindex.codes, names=multiindex.names
)
return cls.from_frame(df, names=multiindex.names)

@cached_property # type: ignore
@_cudf_nvtx_annotate
Expand Down
23 changes: 20 additions & 3 deletions python/cudf/cudf/tests/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,14 +846,14 @@ def test_multiindex_copy_deep(data, copy_on_write, deep):

# Assert ._levels identity
lptrs = [
lv._data._data[None].base_data.get_ptr(mode="read")
next(iter(lv._data._data.values())).base_data.get_ptr(mode="read")
for lv in mi1._levels
]
# TODO: lv._data._data had {True: column}. Why was the key True?
rptrs = [
lv._data._data[None].base_data.get_ptr(mode="read")
next(iter(lv._data._data.values())).base_data.get_ptr(mode="read")
for lv in mi2._levels
]

assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))

# Assert ._codes identity
Expand Down Expand Up @@ -2079,3 +2079,20 @@ def test_multiindex_eq_other_multiindex():
result = idx == idx
expected = np.array([True, True])
assert_eq(result, expected)


@pytest.mark.parametrize(
"midx",
[
cudf.MultiIndex.from_product([[0, 1], [1, 0]]),
cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)]),
# TODO: Implement from_arrays like pandas
# cudf.MultiIndex.from_arrays([0, 0, 1, 1], [1, 0, 1, 0]),
cudf.MultiIndex(
levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]]
),
],
)
wence- marked this conversation as resolved.
Show resolved Hide resolved
def test_multindex_constructor_levels_always_indexes(midx):
assert_eq(midx.levels[0], cudf.Index([0, 1]))
assert_eq(midx.levels[1], cudf.Index([0, 1]))
Loading