Skip to content

Error when fitting using nullable integer data with categorical transformer #1036

@rwedge

Description

@rwedge

Environment Details

Please indicate the following details about the environment in which you found the bug:

  • RDT version: 1.18.1
  • Python version: 3.13
  • Operating System: macOS

Error Description

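When fitting a categorical transformer (for example `UniformEncoder`) on a column with a nullable integer dtype (`Int64`), as in the reproduction below, RDT raises `TypeError: values must be a 1D list-like`. The error originates in `fill_nan_with_none`, where `data.fillna(sentinel)` is called on the nullable integer data.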

Steps to reproduce

import pandas as pd

from rdt.transformers import UniformEncoder

df = pd.DataFrame({'example': [1, 2, 3, None]}, dtype='Int64')
encoder = UniformEncoder()
encoder.fit(data=df, column='example')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/internals/blocks.py:2401, in ExtensionBlock.fillna(self, value, limit, inplace, downcast, using_cow, already_warned)
   2400 try:
-> 2401     new_values = self.values.fillna(
   2402         value=value, method=None, limit=limit, copy=copy
   2403     )
   2404 except TypeError:
   2405     # 3rd party EA that has not implemented copy keyword yet

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/arrays/masked.py:267, in BaseMaskedArray.fillna(self, value, method, limit, copy)
    266             new_values = self[:]
--> 267         new_values[mask] = value
    268 else:

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/arrays/masked.py:320, in BaseMaskedArray.__setitem__(self, key, value)
    318     return
--> 320 value, mask = self._coerce_to_array(value, dtype=self.dtype)
    322 self._data[key] = value

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/arrays/numeric.py:272, in NumericArray._coerce_to_array(cls, value, dtype, copy)
    271 default_dtype = dtype_cls._default_np_dtype
--> 272 values, mask, _, _ = _coerce_to_data_and_mask(
    273     value, dtype, copy, dtype_cls, default_dtype
    274 )
    275 return values, mask

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/arrays/numeric.py:184, in _coerce_to_data_and_mask(values, dtype, copy, dtype_cls, default_dtype)
    183 if values.ndim != 1:
--> 184     raise TypeError("values must be a 1D list-like")
    186 if mask is None:

TypeError: values must be a 1D list-like

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
Cell In[18], line 7
      5 df = pd.DataFrame({'example': [1, 2, 3, None]}, dtype='Int64')
      6 encoder = UniformEncoder()
----> 7 encoder.fit(data=df, column='example')

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/rdt/transformers/base.py:57, in random_state.<locals>.wrapper(self, *args, **kwargs)
     55 method_name = function.__name__
     56 with set_random_states(self.random_states, method_name, self.set_random_state):
---> 57     return function(self, *args, **kwargs)

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/rdt/transformers/base.py:390, in BaseTransformer.fit(self, data, column)
    388 self._set_seed(data)
    389 columns_data = self._get_columns_data(data, self.columns)
--> 390 self._fit(columns_data)
    391 self._build_output_columns(data)

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/rdt/transformers/categorical.py:129, in UniformEncoder._fit(self, data)
    119 """Fit the transformer to the data.
    120 
    121 Compute the frequencies of each category and use them
   (...)    126         Data to fit the transformer to.
    127 """
    128 self.dtype = data.dtypes
--> 129 data = fill_nan_with_none(data)
    130 labels = pd.unique(data)
    131 labels = self._order_categories(labels)

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/rdt/transformers/utils.py:202, in fill_nan_with_none(data)
    199 if isinstance(data, pd.DataFrame):
    200     return data.apply(_fill_nan_with_none_series)
--> 202 return _fill_nan_with_none_series(data)

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/rdt/transformers/utils.py:186, in _fill_nan_with_none_series(data)
    183     data = data.fillna(sentinel).replace({sentinel: None})
    184     return pd.Series(pd.Categorical(data, categories=dtype.categories), index=data.index)
--> 186 return data.fillna(sentinel).replace({sentinel: None})

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/generic.py:7368, in NDFrame.fillna(self, value, method, axis, inplace, limit, downcast)
   7361     else:
   7362         raise TypeError(
   7363             '"value" parameter must be a scalar, dict '
   7364             "or Series, but you passed a "
   7365             f'"{type(value).__name__}"'
   7366         )
-> 7368     new_data = self._mgr.fillna(
   7369         value=value, limit=limit, inplace=inplace, downcast=downcast
   7370     )
   7372 elif isinstance(value, (dict, ABCSeries)):
   7373     if axis == 1:

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/internals/base.py:186, in DataManager.fillna(self, value, limit, inplace, downcast)
    182 if limit is not None:
    183     # Do this validation even if we go through one of the no-op paths
    184     limit = libalgos.validate_limit(None, limit=limit)
--> 186 return self.apply_with_block(
    187     "fillna",
    188     value=value,
    189     limit=limit,
    190     inplace=inplace,
    191     downcast=downcast,
    192     using_cow=using_copy_on_write(),
    193     already_warned=_AlreadyWarned(),
    194 )

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/internals/managers.py:363, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
    361         applied = b.apply(f, **kwargs)
    362     else:
--> 363         applied = getattr(b, f)(**kwargs)
    364     result_blocks = extend_blocks(applied, result_blocks)
    366 out = type(self).from_blocks(result_blocks, self.axes)

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/internals/blocks.py:2407, in ExtensionBlock.fillna(self, value, limit, inplace, downcast, using_cow, already_warned)
   2404 except TypeError:
   2405     # 3rd party EA that has not implemented copy keyword yet
   2406     refs = None
-> 2407     new_values = self.values.fillna(value=value, method=None, limit=limit)
   2408     # issue the warning *after* retrying, in case the TypeError
   2409     #  was caused by an invalid fill_value
   2410     warnings.warn(
   2411         # GH#53278
   2412         "ExtensionArray.fillna added a 'copy' keyword in pandas "
   (...)   2418         stacklevel=find_stack_level(),
   2419     )

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/arrays/masked.py:267, in BaseMaskedArray.fillna(self, value, method, limit, copy)
    265         else:
    266             new_values = self[:]
--> 267         new_values[mask] = value
    268 else:
    269     if copy:

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/arrays/masked.py:320, in BaseMaskedArray.__setitem__(self, key, value)
    317         self._mask[key] = False
    318     return
--> 320 value, mask = self._coerce_to_array(value, dtype=self.dtype)
    322 self._data[key] = value
    323 self._mask[key] = mask

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/arrays/numeric.py:272, in NumericArray._coerce_to_array(cls, value, dtype, copy)
    270 dtype_cls = cls._dtype_cls
    271 default_dtype = dtype_cls._default_np_dtype
--> 272 values, mask, _, _ = _coerce_to_data_and_mask(
    273     value, dtype, copy, dtype_cls, default_dtype
    274 )
    275 return values, mask

File ~/miniconda3/envs/sdvclean/lib/python3.13/site-packages/pandas/core/arrays/numeric.py:184, in _coerce_to_data_and_mask(values, dtype, copy, dtype_cls, default_dtype)
    181     raise TypeError(f"{values.dtype} cannot be converted to {name}")
    183 if values.ndim != 1:
--> 184     raise TypeError("values must be a 1D list-like")
    186 if mask is None:
    187     if values.dtype.kind in "iu":
    188         # fastpath

TypeError: values must be a 1D list-like

Metadata

Metadata

Assignees

Labels

bug: Something isn't working

Type

No type

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions