Skip to content

Commit

Permalink
fix[python, rust]: address categorical-related edge cases with init from empty Arrow arrays (chunked or otherwise) (#4918)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Sep 21, 2022
1 parent cc688ea commit 9012df6
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 10 deletions.
6 changes: 3 additions & 3 deletions polars/polars-core/src/series/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ impl Series {
#[cfg(feature = "dtype-categorical")]
ArrowDataType::Dictionary(key_type, value_type, _) => {
use arrow::datatypes::IntegerType;
// don't spuriously call this. This triggers a read on mmaped data
// don't spuriously call this; triggers a read on mmapped data
let arr = if chunks.len() > 1 {
let chunks = chunks.iter().map(|arr| &**arr).collect::<Vec<_>>();
arrow::compute::concatenate::concatenate(&chunks)?
Expand All @@ -204,10 +204,10 @@ impl Series {

if !matches!(
value_type.as_ref(),
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Null
) {
return Err(PolarsError::ComputeError(
"polars only support dictionaries with string like values".into(),
"polars only supports dictionaries with string-like values".into(),
));
}

Expand Down
20 changes: 17 additions & 3 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,22 @@ def series_to_pyseries(name: str, values: pli.Series) -> PySeries:
def arrow_to_pyseries(name: str, values: pa.Array, rechunk: bool = True) -> PySeries:
"""Construct a PySeries from an Arrow array."""
array = coerce_arrow(values)
if hasattr(array, "num_chunks"):

# special handling of empty categorical arrays
if (
len(array) == 0
and isinstance(array.type, pa.DictionaryType)
and array.type.value_type
in (
pa.utf8(),
pa.large_utf8(),
)
):
pys = pli.Series(name, [], dtype=Categorical)._s

elif not hasattr(array, "num_chunks"):
pys = PySeries.from_arrow(name, array)
else:
if array.num_chunks > 1:
it = array.iterchunks()
pys = PySeries.from_arrow(name, next(it))
Expand All @@ -98,8 +113,7 @@ def arrow_to_pyseries(name: str, values: pa.Array, rechunk: bool = True) -> PySe
if rechunk:
pys.rechunk(in_place=True)

return pys
return PySeries.from_arrow(name, array)
return pys


def numpy_to_pyseries(
Expand Down
24 changes: 20 additions & 4 deletions py-polars/tests/unit/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,16 +387,32 @@ def test_arrow() -> None:
out = a.to_arrow()
assert out == pa.array([1, 2, 3, None])

a = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
s = pl.Series("a", a)
assert s.dtype == pl.Categorical

s = cast(
pl.Series,
pl.from_arrow(pa.array([["foo"], ["foo", "bar"]], pa.list_(pa.utf8()))),
)
assert s.dtype == pl.List

# categorical dtype tests (including various forms of empty pyarrow array)
with pl.StringCache():
arr0 = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
assert_series_equal(
pl.Series("arr", ["foo", "bar"], pl.Categorical), pl.Series("arr", arr0)
)
arr1 = pa.array(["xxx", "xxx", None, "yyy"]).dictionary_encode()
arr2 = pa.array([]).dictionary_encode()
arr3 = pa.chunked_array([], arr1.type)
arr4 = pa.array([], arr1.type)

assert_series_equal(
pl.Series("arr", ["xxx", "xxx", None, "yyy"], dtype=pl.Categorical),
pl.Series("arr", arr1),
)
for arr in (arr2, arr3, arr4):
assert_series_equal(
pl.Series("arr", [], dtype=pl.Categorical), pl.Series("arr", arr)
)


def test_view() -> None:
a = pl.Series("a", [1.0, 2.0, 3.0])
Expand Down

0 comments on commit 9012df6

Please sign in to comment.