pydata · dcherian · Apr 18, 2024 · Feb 2, 2024 · Feb 2, 2024 · Feb 5, 2024
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -39,6 +39,10 @@ New Features
   This is currently limited to the linear interpolation method (`method='linear'`).
   (:issue:`7377`, :pull:`8684`) By `Marco Wolsza <https://github.com/maawoo>`_.
 
+- Xarray now makes a best attempt not to coerce :py:class:`pandas.api.extensions.ExtensionArray` to a numpy array
+  by supporting 1D ``ExtensionArray`` objects internally where possible.  Thus, :py:class:`Dataset` objects initialized with a ``pd.Categorical``,
+  for example, will retain the object.  However, one cannot do operations that are not possible on the ``ExtensionArray`` then, such as broadcasting.
+
 Breaking changes
 ~~~~~~~~~~~~~~~~
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -30,7 +30,7 @@ dependencies = [
 
 [project.optional-dependencies]
 accel = ["scipy", "bottleneck", "numbagg", "flox", "opt_einsum"]
-complete = ["xarray[accel,io,parallel,viz,dev]"]
+complete = ["xarray[accel,io,parallel,viz,dev,extension-arrays]"]
 dev = [
   "hypothesis",
   "pre-commit",
@@ -45,6 +45,7 @@ dev = [
 io = ["netCDF4", "h5netcdf", "scipy", 'pydap; python_version<"3.10"', "zarr", "fsspec", "cftime", "pooch"]
 parallel = ["dask[complete]"]
 viz = ["matplotlib", "seaborn", "nc-time-axis"]
+extension-arrays = ["plum-dispatch"]
 
 [project.urls]
 Documentation = "https://docs.xarray.dev"

diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py
@@ -5,6 +5,7 @@
 from functools import partial
 
 import numpy as np
+from pandas.api.types import is_extension_array_dtype
 
 from xarray.coding.variables import (
     VariableCoder,
@@ -27,11 +28,10 @@ def create_vlen_dtype(element_type):
 
 
 def check_vlen_dtype(dtype):
-    if dtype.kind != "O" or dtype.metadata is None:
+    if is_extension_array_dtype(dtype) or dtype.kind != "O" or dtype.metadata is None:
         return None
-    else:
-        # check xarray (element_type) as well as h5py (vlen)
-        return dtype.metadata.get("element_type", dtype.metadata.get("vlen"))
+    # check xarray (element_type) as well as h5py (vlen)
+    return dtype.metadata.get("element_type", dtype.metadata.get("vlen"))
 
 
 def is_unicode_dtype(dtype):

diff --git a/xarray/coding/times.py b/xarray/coding/times.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 import pandas as pd
+from pandas.api.types import is_extension_array_dtype
 from pandas.errors import OutOfBoundsDatetime, OutOfBoundsTimedelta
 
 from xarray.coding.variables import (
@@ -967,9 +968,10 @@ def __init__(self, use_cftime: bool | None = None) -> None:
         self.use_cftime = use_cftime
 
     def encode(self, variable: Variable, name: T_Name = None) -> Variable:
-        if np.issubdtype(
-            variable.data.dtype, np.datetime64
-        ) or contains_cftime_datetimes(variable):
+        if (not is_extension_array_dtype(variable.data)) and (
+            np.issubdtype(variable.data.dtype, np.datetime64)
+            or contains_cftime_datetimes(variable)
+        ):
             dims, data, attrs, encoding = unpack_for_encoding(variable)
 
             units = encoding.pop("units", None)
@@ -1007,7 +1009,9 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
 
 class CFTimedeltaCoder(VariableCoder):
     def encode(self, variable: Variable, name: T_Name = None) -> Variable:
-        if np.issubdtype(variable.data.dtype, np.timedelta64):
+        if (not is_extension_array_dtype(variable.data)) and np.issubdtype(
+            variable.data.dtype, np.timedelta64
+        ):
             dims, data, attrs, encoding = unpack_for_encoding(variable)
 
             data, units = encode_cf_timedelta(

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 import pandas as pd
+from pandas.api.types import is_extension_array_dtype
 
 from xarray.core import dtypes, duck_array_ops, indexing
 from xarray.core.parallelcompat import get_chunked_array_type
@@ -250,7 +251,6 @@ class CFMaskCoder(VariableCoder):
     def encode(self, variable: Variable, name: T_Name = None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
-        dtype = np.dtype(encoding.get("dtype", data.dtype))
         fv = encoding.get("_FillValue")
         mv = encoding.get("missing_value")
 
@@ -268,6 +268,8 @@ def encode(self, variable: Variable, name: T_Name = None):
         # special case DateTime to properly handle NaT
         is_time_like = _is_time_like(attrs.get("units"))
 
+        dtype = np.dtype(encoding.get("dtype", data.dtype))
+
         if fv_exists:
             # Ensure _FillValue is cast to same dtype as data's
             encoding["_FillValue"] = dtype.type(fv)
@@ -472,16 +474,18 @@ class DefaultFillvalueCoder(VariableCoder):
 
     def encode(self, variable: Variable, name: T_Name = None) -> Variable:
         dims, data, attrs, encoding = unpack_for_encoding(variable)
+        has_no_fill = "_FillValue" not in attrs and "_FillValue" not in encoding
         # make NaN the fill value for float types
-        if (
-            "_FillValue" not in attrs
-            and "_FillValue" not in encoding
-            and np.issubdtype(variable.dtype, np.floating)
-        ):
+        if is_extension_array_dtype(data):
+            if not has_no_fill:
+                raise ValueError(
+                    "Found _FillValue encoding or attr on extension array."
+                )
+            return variable
+        if has_no_fill and np.issubdtype(variable.dtype, np.floating):
             attrs["_FillValue"] = variable.dtype.type(np.nan)
             return Variable(dims, data, attrs, encoding, fastpath=True)
-        else:
-            return variable
+        return variable
 
     def decode(self, variable: Variable, name: T_Name = None) -> Variable:
         raise NotImplementedError()

diff --git a/xarray/conventions.py b/xarray/conventions.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+from pandas.api.types import is_extension_array_dtype
 
 from xarray.coding import strings, times, variables
 from xarray.coding.variables import SerializationWarning, pop_to
@@ -114,7 +115,10 @@ def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable:
         dims, data, attrs, encoding = variables.unpack_for_encoding(var)
 
         # leave vlen dtypes unchanged
-        if strings.check_vlen_dtype(data.dtype) is not None:
+        if (
+            is_extension_array_dtype(data)
+            or strings.check_vlen_dtype(data.dtype) is not None
+        ):
             return var
 
         if is_duck_dask_array(data):
@@ -356,9 +360,9 @@ def _update_bounds_encoding(variables: T_Variables) -> None:
         attrs = v.attrs
         encoding = v.encoding
         has_date_units = "units" in encoding and "since" in encoding["units"]
-        is_datetime_type = np.issubdtype(
-            v.dtype, np.datetime64
-        ) or contains_cftime_datetimes(v)
+        is_datetime_type = (not is_extension_array_dtype(v)) and (
+            contains_cftime_datetimes(v) or np.issubdtype(v.dtype, np.datetime64)
+        )
 
         if (
             is_datetime_type

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -24,6 +24,7 @@
 from typing import IO, TYPE_CHECKING, Any, Callable, Generic, Literal, cast, overload
 
 import numpy as np
+from pandas.api.types import is_extension_array_dtype
 
 # remove once numpy 2.0 is the oldest supported version
 try:
@@ -6835,6 +6836,14 @@ def reduce(
                     # that don't have the reduce dims: PR5393
                     not reduce_dims
                     or not numeric_only
-                    or np.issubdtype(var.dtype, np.number)
-                    or (var.dtype == np.bool_)
+                    # Extension dtypes are not numpy dtypes: keep them away
+                    # from np.issubdtype and treat them as non-numeric here.
+                    or (
+                        not is_extension_array_dtype(var.dtype)
+                        and (
+                            np.issubdtype(var.dtype, np.number)
+                            or (var.dtype == np.bool_)
+                        )
+                    )
                 ):
@@ -7149,13 +7151,36 @@ def to_pandas(self) -> pd.Series | pd.DataFrame:
         )
 
     def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
-        columns = [k for k in self.variables if k not in self.dims]
+        # Extension-array variables are joined on separately below rather than
+        # broadcast; remember the original order so the result's columns keep it.
+        columns_in_order = [k for k in self.variables if k not in self.dims]
+        columns = [
+            k
+            for k in columns_in_order
+            if not is_extension_array_dtype(self.variables[k].data)
+        ]
+        extension_array_columns = [
+            k
+            for k in columns_in_order
+            if is_extension_array_dtype(self.variables[k].data)
+        ]
         data = [
             self._variables[k].set_dims(ordered_dims).values.reshape(-1)
             for k in columns
         ]
         index = self.coords.to_index([*ordered_dims])
-        return pd.DataFrame(dict(zip(columns, data)), index=index)
+        broadcasted_df = pd.DataFrame(dict(zip(columns, data)), index=index)
+        for extension_array_column in extension_array_columns:
+            # The join is keyed on the variable's first dimension.
+            variable = self.variables[extension_array_column]
+            dim = variable.dims[0]
+            extension_array_df = pd.DataFrame(
+                {extension_array_column: variable.data.array},
+                index=self[dim].data,
+            )
+            extension_array_df.index.name = dim
+            broadcasted_df = broadcasted_df.join(extension_array_df)
+        return broadcasted_df[columns_in_order]
 
     def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame:
         """Convert this dataset into a pandas.DataFrame.
@@ -7301,11 +7323,14 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self:
                 "cannot convert a DataFrame with a non-unique MultiIndex into xarray"
             )
 
-        # Cast to a NumPy array first, in case the Series is a pandas Extension
-        # array (which doesn't have a valid NumPy dtype)
-        # TODO: allow users to control how this casting happens, e.g., by
-        # forwarding arguments to pandas.Series.to_numpy?
-        arrays = [(k, np.asarray(v)) for k, v in dataframe.items()]
+        arrays = [
+            (k, np.asarray(v))
+            for k, v in dataframe.items()
+            if not is_extension_array_dtype(v)
+        ]
+        extension_arrays = [
+            (k, v) for k, v in dataframe.items() if is_extension_array_dtype(v)
+        ]
 
         indexes: dict[Hashable, Index] = {}
         index_vars: dict[Hashable, Variable] = {}
@@ -7319,6 +7344,8 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self:
                 xr_idx = PandasIndex(lev, dim)
                 indexes[dim] = xr_idx
                 index_vars.update(xr_idx.create_variables())
+            arrays += [(k, np.asarray(v)) for k, v in extension_arrays]
+            extension_arrays = []
         else:
             index_name = idx.name if idx.name is not None else "index"
             dims = (index_name,)
@@ -7332,6 +7359,8 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self:
             obj._set_sparse_data_from_dataframe(idx, arrays, dims)
         else:
             obj._set_numpy_data_from_dataframe(idx, arrays, dims)
+        for name, extension_array in extension_arrays:
+            obj[name] = (dims, extension_array)
         return obj
 
     def to_dask_dataframe(