Skip to content

Commit

Permalink
Avoid DataFrame conversion in MultiIndex.from_pandas (#14470)
Browse files Browse the repository at this point in the history
This uncovered a bug where `MultiIndex.levels` could be a list of Series instead of a list of Index.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #14470
  • Loading branch information
mroeschke committed Nov 28, 2023
1 parent 5e58e71 commit d65a2af
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 41 deletions.
63 changes: 29 additions & 34 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
from pandas._config import get_option

import cudf
from cudf import _lib as libcudf
import cudf._lib as libcudf
from cudf._lib.types import size_type_dtype
from cudf._typing import DataFrameOrSeries
from cudf.api.extensions import no_default
from cudf.api.types import is_integer, is_list_like, is_object_dtype
Expand Down Expand Up @@ -141,7 +142,9 @@ def __init__(
if copy:
if isinstance(codes, cudf.DataFrame):
codes = codes.copy(deep=True)
if len(levels) > 0 and isinstance(levels[0], cudf.Series):
if len(levels) > 0 and isinstance(
levels[0], (cudf.Index, cudf.Series)
):
levels = [level.copy(deep=True) for level in levels]

if not isinstance(codes, cudf.DataFrame):
Expand All @@ -158,7 +161,7 @@ def __init__(
"codes and is inconsistent!"
)

levels = [cudf.Series(level) for level in levels]
levels = [cudf.Index(level) for level in levels]

if len(levels) != len(codes._data):
raise ValueError(
Expand All @@ -170,26 +173,24 @@ def __init__(
"MultiIndex length of codes does not match "
"and is inconsistent!"
)
for level, code in zip(levels, codes._data.columns):
if code.max() > len(level) - 1:
raise ValueError(
"MultiIndex code %d contains value %d larger "
"than maximum level size at this position"
)

source_data = {}
for i, (column_name, col) in enumerate(codes._data.items()):
if -1 in col:
level = cudf.DataFrame(
{column_name: [None] + list(levels[i])},
index=range(-1, len(levels[i])),
)
else:
level = cudf.DataFrame({column_name: levels[i]})

source_data[column_name] = libcudf.copying.gather(
[level._data[column_name]], col
)[0]
for (column_name, code), level in zip(codes._data.items(), levels):
if len(code):
lo, hi = libcudf.reduce.minmax(code)
if lo.value < -1 or hi.value > len(level) - 1:
raise ValueError(
f"Codes must be -1 <= codes <= {len(level) - 1}"
)
if lo.value == -1:
# Now we can gather and insert null automatically
code[code == -1] = np.iinfo(size_type_dtype).min
result_col = libcudf.copying.gather(
[level._column], code, nullify=True
)
source_data[column_name] = result_col[0]._with_type_metadata(
level.dtype
)

super().__init__(source_data)
self._levels = levels
Expand Down Expand Up @@ -481,7 +482,7 @@ def copy(

mi = MultiIndex._from_data(self._data.copy(deep=deep))
if self._levels is not None:
mi._levels = [s.copy(deep) for s in self._levels]
mi._levels = [idx.copy(deep=deep) for idx in self._levels]
if self._codes is not None:
mi._codes = self._codes.copy(deep)
if names is not None:
Expand Down Expand Up @@ -1625,19 +1626,13 @@ def from_pandas(cls, multiindex, nan_as_null=no_default):
nan_as_null = (
False if cudf.get_option("mode.pandas_compatible") else None
)

# if `multiindex` has two or more levels that
# have the same name, then `multiindex.to_frame()`
# results in a DataFrame containing only one of those
# levels. Thus, set `names` to some tuple of unique values
# and then call `multiindex.to_frame(name=names)`,
# which preserves all levels of `multiindex`.
names = tuple(range(len(multiindex.names)))

df = cudf.DataFrame.from_pandas(
multiindex.to_frame(index=False, name=names), nan_as_null
levels = [
cudf.Index.from_pandas(level, nan_as_null=nan_as_null)
for level in multiindex.levels
]
return cls(
levels=levels, codes=multiindex.codes, names=multiindex.names
)
return cls.from_frame(df, names=multiindex.names)

@cached_property # type: ignore
@_cudf_nvtx_annotate
Expand Down
17 changes: 10 additions & 7 deletions python/cudf/cudf/tests/test_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,19 +256,22 @@ def test_dropna_index(data, dtype):

@pytest.mark.parametrize("data", [[[1, None, 2], [None, None, 2]]])
@pytest.mark.parametrize("how", ["all", "any"])
def test_dropna_multiindex(data, how):
def test_dropna_multiindex(data, how, request):
pi = pd.MultiIndex.from_arrays(data)
gi = cudf.from_pandas(pi)

expect = pi.dropna(how)
got = gi.dropna(how)

with pytest.raises(AssertionError, match="different"):
# pandas-gh44792. Pandas infers the dtypes as (int64, int64), though
# int64 doesn't really store null/nans. The dtype propagates to the
# result of dropna. cuDF infers the dtypes as (float, float), which
# differs from pandas.
assert_eq(expect, got)
if how == "all" and "data0" in request.node.callspec.id:
request.applymarker(
pytest.mark.xfail(
reason="pandas NA value np.nan results in float type. "
"cuDF correctly retains int type "
"(https://github.com/pandas-dev/pandas/issues/44792)"
)
)
assert_eq(expect, got)


@pytest.mark.parametrize(
Expand Down
33 changes: 33 additions & 0 deletions python/cudf/cudf/tests/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -2079,3 +2079,36 @@ def test_multiindex_eq_other_multiindex():
result = idx == idx
expected = np.array([True, True])
assert_eq(result, expected)


@pytest.fixture(
params=[
"from_product",
"from_tuples",
pytest.param(
"from_arrays",
marks=pytest.mark.xfail(
reason="TODO: from_arrays is not implemented"
),
),
"init",
]
)
def midx(request):
if request.param == "from_product":
return cudf.MultiIndex.from_product([[0, 1], [1, 0]])
elif request.param == "from_tuples":
return cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)])
elif request.param == "from_arrays":
return cudf.MultiIndex.from_arrays([0, 0, 1, 1], [1, 0, 1, 0])
elif request.param == "init":
return cudf.MultiIndex(
levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]]
)
else:
raise NotImplementedError(f"{request.param} not implemented")


def test_multindex_constructor_levels_always_indexes(midx):
assert_eq(midx.levels[0], cudf.Index([0, 1]))
assert_eq(midx.levels[1], cudf.Index([0, 1]))

0 comments on commit d65a2af

Please sign in to comment.