Avoid DataFrame conversion in MultiIndex.from_pandas (#14470)

This uncovered a bug where `MultiIndex.levels` could be a list of Series instead of a list of Index. Authors: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: #14470
rapidsai · Nov 28, 2023 · d65a2af · d65a2af
1 parent 5e58e71
commit d65a2af
Show file tree

Hide file tree

Showing 3 changed files with 72 additions and 41 deletions.
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -18,7 +18,8 @@
 from pandas._config import get_option
 
 import cudf
-from cudf import _lib as libcudf
+import cudf._lib as libcudf
+from cudf._lib.types import size_type_dtype
 from cudf._typing import DataFrameOrSeries
 from cudf.api.extensions import no_default
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
@@ -141,7 +142,9 @@ def __init__(
         if copy:
             if isinstance(codes, cudf.DataFrame):
                 codes = codes.copy(deep=True)
-            if len(levels) > 0 and isinstance(levels[0], cudf.Series):
+            if len(levels) > 0 and isinstance(
+                levels[0], (cudf.Index, cudf.Series)
+            ):
                 levels = [level.copy(deep=True) for level in levels]
 
         if not isinstance(codes, cudf.DataFrame):
@@ -158,7 +161,7 @@ def __init__(
                     "codes and is inconsistent!"
                 )
 
-        levels = [cudf.Series(level) for level in levels]
+        levels = [cudf.Index(level) for level in levels]
 
         if len(levels) != len(codes._data):
             raise ValueError(
@@ -170,26 +173,24 @@ def __init__(
                 "MultiIndex length of codes does not match "
                 "and is inconsistent!"
             )
-        for level, code in zip(levels, codes._data.columns):
-            if code.max() > len(level) - 1:
-                raise ValueError(
-                    "MultiIndex code %d contains value %d larger "
-                    "than maximum level size at this position"
-                )
 
         source_data = {}
-        for i, (column_name, col) in enumerate(codes._data.items()):
-            if -1 in col:
-                level = cudf.DataFrame(
-                    {column_name: [None] + list(levels[i])},
-                    index=range(-1, len(levels[i])),
-                )
-            else:
-                level = cudf.DataFrame({column_name: levels[i]})
-
-            source_data[column_name] = libcudf.copying.gather(
-                [level._data[column_name]], col
-            )[0]
+        for (column_name, code), level in zip(codes._data.items(), levels):
+            if len(code):
+                lo, hi = libcudf.reduce.minmax(code)
+                if lo.value < -1 or hi.value > len(level) - 1:
+                    raise ValueError(
+                        f"Codes must be -1 <= codes <= {len(level) - 1}"
+                    )
+                if lo.value == -1:
+                    # Now we can gather and insert null automatically
+                    code[code == -1] = np.iinfo(size_type_dtype).min
+            result_col = libcudf.copying.gather(
+                [level._column], code, nullify=True
+            )
+            source_data[column_name] = result_col[0]._with_type_metadata(
+                level.dtype
+            )
 
         super().__init__(source_data)
         self._levels = levels
@@ -481,7 +482,7 @@ def copy(
 
         mi = MultiIndex._from_data(self._data.copy(deep=deep))
         if self._levels is not None:
-            mi._levels = [s.copy(deep) for s in self._levels]
+            mi._levels = [idx.copy(deep=deep) for idx in self._levels]
         if self._codes is not None:
             mi._codes = self._codes.copy(deep)
         if names is not None:
@@ -1625,19 +1626,13 @@ def from_pandas(cls, multiindex, nan_as_null=no_default):
             nan_as_null = (
                 False if cudf.get_option("mode.pandas_compatible") else None
             )
-
-        # if `multiindex` has two or more levels that
-        # have the same name, then `multiindex.to_frame()`
-        # results in a DataFrame containing only one of those
-        # levels. Thus, set `names` to some tuple of unique values
-        # and then call `multiindex.to_frame(name=names)`,
-        # which preserves all levels of `multiindex`.
-        names = tuple(range(len(multiindex.names)))
-
-        df = cudf.DataFrame.from_pandas(
-            multiindex.to_frame(index=False, name=names), nan_as_null
+        levels = [
+            cudf.Index.from_pandas(level, nan_as_null=nan_as_null)
+            for level in multiindex.levels
+        ]
+        return cls(
+            levels=levels, codes=multiindex.codes, names=multiindex.names
         )
-        return cls.from_frame(df, names=multiindex.names)
 
     @cached_property  # type: ignore
     @_cudf_nvtx_annotate

diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
@@ -256,19 +256,22 @@ def test_dropna_index(data, dtype):
 
 @pytest.mark.parametrize("data", [[[1, None, 2], [None, None, 2]]])
 @pytest.mark.parametrize("how", ["all", "any"])
-def test_dropna_multiindex(data, how):
+def test_dropna_multiindex(data, how, request):
     pi = pd.MultiIndex.from_arrays(data)
     gi = cudf.from_pandas(pi)
 
     expect = pi.dropna(how)
     got = gi.dropna(how)
 
-    with pytest.raises(AssertionError, match="different"):
-        # pandas-gh44792. Pandas infers the dtypes as (int64, int64), though
-        # int64 doesn't really store null/nans. The dtype propagates to the
-        # result of dropna. cuDF infers the dtypes as (float, float), which
-        # differs from pandas.
-        assert_eq(expect, got)
+    if how == "all" and "data0" in request.node.callspec.id:
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="pandas NA value np.nan results in float type. "
+                "cuDF correctly retains int type "
+                "(https://github.com/pandas-dev/pandas/issues/44792)"
+            )
+        )
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize(

diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
@@ -2079,3 +2079,36 @@ def test_multiindex_eq_other_multiindex():
     result = idx == idx
     expected = np.array([True, True])
     assert_eq(result, expected)
+
+
+@pytest.fixture(
+    params=[
+        "from_product",
+        "from_tuples",
+        pytest.param(
+            "from_arrays",
+            marks=pytest.mark.xfail(
+                reason="TODO: from_arrays is not implemented"
+            ),
+        ),
+        "init",
+    ]
+)
+def midx(request):
+    if request.param == "from_product":
+        return cudf.MultiIndex.from_product([[0, 1], [1, 0]])
+    elif request.param == "from_tuples":
+        return cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)])
+    elif request.param == "from_arrays":
+        return cudf.MultiIndex.from_arrays([0, 0, 1, 1], [1, 0, 1, 0])
+    elif request.param == "init":
+        return cudf.MultiIndex(
+            levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]]
+        )
+    else:
+        raise NotImplementedError(f"{request.param} not implemented")
+
+
+def test_multindex_constructor_levels_always_indexes(midx):
+    assert_eq(midx.levels[0], cudf.Index([0, 1]))
+    assert_eq(midx.levels[1], cudf.Index([0, 1]))