diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py
new file mode 100644
index 00000000000..42ef18ac0c2
--- /dev/null
+++ b/asv_bench/benchmarks/pandas.py
@@ -0,0 +1,24 @@
+import numpy as np
+import pandas as pd
+
+from . import parameterized
+
+
+class MultiIndexSeries:
+    def setup(self, dtype, subset):
+        data = np.random.rand(100000).astype(dtype)
+        index = pd.MultiIndex.from_product(
+            [
+                list("abcdefhijk"),
+                list("abcdefhijk"),
+                pd.date_range(start="2000-01-01", periods=1000, freq="B"),
+            ]
+        )
+        series = pd.Series(data, index)
+        if subset:
+            series = series[::3]
+        self.series = series
+
+    @parameterized(["dtype", "subset"], ([int, float], [True, False]))
+    def time_to_xarray(self, dtype, subset):
+        self.series.to_xarray()
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index e4223f2b4e0..5dc39da5a06 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -49,7 +49,10 @@ Enhancements
   For orthogonal linear- and nearest-neighbor interpolation, we do 1d-interpolation
   sequentially rather than interpolating in multidimensional space. (:issue:`2223`)
   By `Keisuke Fujii <https://github.com/fujiisoup>`_.
-- :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
+- Major performance improvement for :py:meth:`Dataset.from_dataframe` when the
+  dataframe has a MultiIndex (:pull:`4184`).
+  By `Stephan Hoyer <https://github.com/shoyer>`_.
+- :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
   coordinate attributes (:pull:`4103`).
   By `Oriol Abril <https://github.com/OriolAbril>`_.
 
 New Features
@@ -133,8 +136,9 @@ Bug fixes
   By `Deepak Cherian <https://github.com/dcherian>`_.
 - ``ValueError`` is raised when ``fill_value`` is not a scalar in :py:meth:`full_like`. (:issue:`3977`)
   By `Huite Bootsma <https://github.com/Huite>`_.
-- Fix wrong order in converting a ``pd.Series`` with a MultiIndex to ``DataArray``. (:issue:`3951`)
-  By `Keisuke Fujii <https://github.com/fujiisoup>`_.
+- Fix wrong order in converting a ``pd.Series`` with a MultiIndex to ``DataArray``.
+  (:issue:`3951`, :issue:`4186`)
+  By `Keisuke Fujii <https://github.com/fujiisoup>`_ and `Stephan Hoyer <https://github.com/shoyer>`_.
 - Fix renaming of coords when one or more stacked coords is not in sorted order during
   stack+groupby+apply operations. (:issue:`3287`, :pull:`3906`)
   By `Spencer Hill <https://github.com/spencerahill>`_
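
Note on the benchmark: MultiIndexSeries times exactly the path this PR
optimizes, pandas.Series.to_xarray() on a 100,000-element Series over a
10 x 10 x 1000 MultiIndex, both dense (subset=False) and with missing
combinations (subset=True keeps every third row). To take the same
measurement outside the ASV harness, a minimal standalone sketch (the
timeit wrapper is ours; the setup mirrors the benchmark above):

    import timeit

    import numpy as np
    import pandas as pd

    # Mirror the ASV setup: note the letter lists skip "g", so each has
    # 10 entries and the full index has 10 * 10 * 1000 = 100,000 rows.
    index = pd.MultiIndex.from_product(
        [
            list("abcdefhijk"),
            list("abcdefhijk"),
            pd.date_range(start="2000-01-01", periods=1000, freq="B"),
        ]
    )
    series = pd.Series(np.random.rand(100000), index)
    subset = series[::3]  # dropping rows forces missing values in the result

    # Requires xarray to be installed; to_xarray() is the call under test.
    for label, s in [("full", series), ("subset", subset)]:
        t = timeit.timeit(s.to_xarray, number=5) / 5
        print(f"{label}: {t:.3f} s per call")
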
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index b46b1d6dce0..5bfddaa710b 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -4543,11 +4543,10 @@ def to_dataframe(self):
         return self._to_dataframe(self.dims)
 
     def _set_sparse_data_from_dataframe(
-        self, dataframe: pd.DataFrame, dims: tuple
+        self, idx: pd.Index, arrays: List[Tuple[Hashable, np.ndarray]], dims: tuple
     ) -> None:
         from sparse import COO
 
-        idx = dataframe.index
         if isinstance(idx, pd.MultiIndex):
             coords = np.stack([np.asarray(code) for code in idx.codes], axis=0)
             is_sorted = idx.is_lexsorted()
@@ -4557,11 +4556,7 @@ def _set_sparse_data_from_dataframe(
             is_sorted = True
             shape = (idx.size,)
 
-        for name, series in dataframe.items():
-            # Cast to a NumPy array first, in case the Series is a pandas
-            # Extension array (which doesn't have a valid NumPy dtype)
-            values = np.asarray(series)
-
+        for name, values in arrays:
             # In virtually all real use cases, the sparse array will now have
             # missing values and needs a fill_value. For consistency, don't
             # special case the rare exceptions (e.g., dtype=int without a
@@ -4580,18 +4575,36 @@ def _set_sparse_data_from_dataframe(
             self[name] = (dims, data)
 
     def _set_numpy_data_from_dataframe(
-        self, dataframe: pd.DataFrame, dims: tuple
+        self, idx: pd.Index, arrays: List[Tuple[Hashable, np.ndarray]], dims: tuple
     ) -> None:
-        idx = dataframe.index
-        if isinstance(idx, pd.MultiIndex):
-            # expand the DataFrame to include the product of all levels
-            full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
-            dataframe = dataframe.reindex(full_idx)
-            shape = tuple(lev.size for lev in idx.levels)
-        else:
-            shape = (idx.size,)
-        for name, series in dataframe.items():
-            data = np.asarray(series).reshape(shape)
+        if not isinstance(idx, pd.MultiIndex):
+            for name, values in arrays:
+                self[name] = (dims, values)
+            return
+
+        shape = tuple(lev.size for lev in idx.levels)
+        indexer = tuple(idx.codes)
+
+        # We already verified that the MultiIndex has all unique values, so
+        # there are missing values if and only if the size of output arrays is
+        # larger than the index.
+        missing_values = np.prod(shape) > idx.shape[0]
+
+        for name, values in arrays:
+            # NumPy indexing is much faster than using DataFrame.reindex() to
+            # fill in missing values:
+            # https://stackoverflow.com/a/35049899/809705
+            if missing_values:
+                dtype, fill_value = dtypes.maybe_promote(values.dtype)
+                data = np.full(shape, fill_value, dtype)
+            else:
+                # If there are no missing values, keep the existing dtype
+                # instead of promoting to support NA, e.g., keep integer
+                # columns as integers.
+                # TODO: consider removing this special case, which doesn't
+                # exist for sparse=True.
+                data = np.zeros(shape, values.dtype)
+            data[indexer] = values
             self[name] = (dims, data)
 
     @classmethod
@@ -4631,7 +4644,19 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Datas
         if not dataframe.columns.is_unique:
             raise ValueError("cannot convert DataFrame with non-unique columns")
 
-        idx, dataframe = remove_unused_levels_categories(dataframe.index, dataframe)
+        idx = remove_unused_levels_categories(dataframe.index)
+
+        if isinstance(idx, pd.MultiIndex) and not idx.is_unique:
+            raise ValueError(
+                "cannot convert a DataFrame with a non-unique MultiIndex into xarray"
+            )
+
+        # Cast to a NumPy array first, in case the Series is a pandas Extension
+        # array (which doesn't have a valid NumPy dtype)
+        # TODO: allow users to control how this casting happens, e.g., by
+        # forwarding arguments to pandas.Series.to_numpy?
+        arrays = [(k, np.asarray(v)) for k, v in dataframe.items()]
+
         obj = cls()
 
         if isinstance(idx, pd.MultiIndex):
@@ -4647,9 +4672,9 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Datas
             obj[index_name] = (dims, idx)
 
         if sparse:
-            obj._set_sparse_data_from_dataframe(dataframe, dims)
+            obj._set_sparse_data_from_dataframe(idx, arrays, dims)
         else:
-            obj._set_numpy_data_from_dataframe(dataframe, dims)
+            obj._set_numpy_data_from_dataframe(idx, arrays, dims)
         return obj
 
     def to_dask_dataframe(self, dim_order=None, set_index=False):
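
The heart of the speedup in _set_numpy_data_from_dataframe: instead of
reindexing the DataFrame against the full product of MultiIndex levels,
the MultiIndex codes are used directly as integer indices into the dense
output array. A minimal standalone illustration of the trick (variable
names are ours; np.nan stands in for xarray's dtypes.maybe_promote()
fill-value logic):

    import numpy as np
    import pandas as pd

    # A unique MultiIndex covering 4 of the 2 x 3 possible combinations.
    full = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]], names=["x", "y"])
    idx = full[:4]
    values = np.arange(4)

    shape = tuple(lev.size for lev in idx.levels)  # (2, 3)
    indexer = tuple(idx.codes)  # per-level positions: ([0, 0, 0, 1], [0, 1, 2, 0])

    # With a unique MultiIndex, cells are missing iff the dense array is
    # larger than the index; promote to float so NaN can mark the holes.
    if np.prod(shape) > idx.shape[0]:
        data = np.full(shape, np.nan)
    else:
        data = np.zeros(shape, values.dtype)
    data[indexer] = values  # one vectorized scatter, no DataFrame.reindex()

    print(data)
    # [[ 0.  1.  2.]
    #  [ 3. nan nan]]

This is the same shape of example as the df3 case in the new
test_from_dataframe_multiindex test below.
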
diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
index a4a5fa2c466..6b7220fdfd4 100644
--- a/xarray/core/indexes.py
+++ b/xarray/core/indexes.py
@@ -9,7 +9,7 @@
 from .variable import Variable
 
 
-def remove_unused_levels_categories(index, dataframe=None):
+def remove_unused_levels_categories(index: pd.Index) -> pd.Index:
     """
     Remove unused levels from MultiIndex and unused categories from CategoricalIndex
     """
@@ -25,14 +25,15 @@ def remove_unused_levels_categories(index, dataframe=None):
                 else:
                     level = level[index.codes[i]]
                 levels.append(level)
+            # TODO: calling from_arrays() reorders MultiIndex levels. It would
+            # be best to avoid this, if possible, e.g., by using
+            # MultiIndex.remove_unused_levels() (which does not reorder) on the
+            # part of the MultiIndex that is not categorical, or by fixing this
+            # upstream in pandas.
             index = pd.MultiIndex.from_arrays(levels, names=index.names)
     elif isinstance(index, pd.CategoricalIndex):
         index = index.remove_unused_categories()
-
-    if dataframe is None:
-        return index
-    dataframe = dataframe.set_index(index)
-    return dataframe.index, dataframe
+    return index
 
 
 class Indexes(collections.abc.Mapping):
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 0c4082a553e..62e2dd5c4f2 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -4013,6 +4013,49 @@ def test_to_and_from_empty_dataframe(self):
         assert len(actual) == 0
         assert expected.equals(actual)
 
+    def test_from_dataframe_multiindex(self):
+        index = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]], names=["x", "y"])
+        df = pd.DataFrame({"z": np.arange(6)}, index=index)
+
+        expected = Dataset(
+            {"z": (("x", "y"), [[0, 1, 2], [3, 4, 5]])},
+            coords={"x": ["a", "b"], "y": [1, 2, 3]},
+        )
+        actual = Dataset.from_dataframe(df)
+        assert_identical(actual, expected)
+
+        df2 = df.iloc[[3, 2, 1, 0, 4, 5], :]
+        actual = Dataset.from_dataframe(df2)
+        assert_identical(actual, expected)
+
+        df3 = df.iloc[:4, :]
+        expected3 = Dataset(
+            {"z": (("x", "y"), [[0, 1, 2], [3, np.nan, np.nan]])},
+            coords={"x": ["a", "b"], "y": [1, 2, 3]},
+        )
+        actual = Dataset.from_dataframe(df3)
+        assert_identical(actual, expected3)
+
+        df_nonunique = df.iloc[[0, 0], :]
+        with raises_regex(ValueError, "non-unique MultiIndex"):
+            Dataset.from_dataframe(df_nonunique)
+
+    def test_from_dataframe_unsorted_levels(self):
+        # regression test for GH-4186
+        index = pd.MultiIndex(
+            levels=[["b", "a"], ["foo"]], codes=[[0, 1], [0, 0]], names=["lev1", "lev2"]
+        )
+        df = pd.DataFrame({"c1": [0, 2], "c2": [1, 3]}, index=index)
+        expected = Dataset(
+            {
+                "c1": (("lev1", "lev2"), [[0], [2]]),
+                "c2": (("lev1", "lev2"), [[1], [3]]),
+            },
+            coords={"lev1": ["b", "a"], "lev2": ["foo"]},
+        )
+        actual = Dataset.from_dataframe(df)
+        assert_identical(actual, expected)
+
     def test_from_dataframe_non_unique_columns(self):
         # regression test for GH449
         df = pd.DataFrame(np.zeros((2, 2)))
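
On the TODO added in remove_unused_levels_categories: rebuilding the
index with MultiIndex.from_arrays() re-derives each level, which sorts
its values, while MultiIndex.remove_unused_levels() preserves the stored
level order. A small sketch of the difference, reusing the unsorted
levels from test_from_dataframe_unsorted_levels above (the sketch itself
is ours):

    import pandas as pd

    # Levels stored in non-sorted order: "b" before "a".
    mi = pd.MultiIndex(
        levels=[["b", "a"], ["foo"]], codes=[[0, 1], [0, 0]], names=["lev1", "lev2"]
    )

    # remove_unused_levels() keeps the stored order of the levels.
    print(mi.remove_unused_levels().levels[0])  # Index(['b', 'a'], ...)

    # from_arrays() re-factorizes the values, so the level comes back
    # sorted; from_dataframe must not assume this sorted order, which is
    # what the GH-4186 regression test above guards against.
    rebuilt = pd.MultiIndex.from_arrays(
        [mi.get_level_values(0), mi.get_level_values(1)], names=mi.names
    )
    print(rebuilt.levels[0])  # Index(['a', 'b'], ...)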