Skip to content

Commit

Permalink
python fix combine chunk of small integer arrow dictionaries
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 3, 2021
1 parent e7ec5a1 commit b410546
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
13 changes: 13 additions & 0 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,5 +468,18 @@ def coerce_arrow(array: "pa.Array") -> "pa.Array":
chunks.append(pli.Series._from_arrow("", arr).to_arrow())
array = pa.chunked_array(chunks)

# dictionaries with int8/uint8/int16/uint16/int32 keys often cannot be combined,
# so cast the keys up front to the uint32 that polars uses
elif pa.types.is_dictionary(array.type) and (
pa.types.is_int8(array.type.index_type)
or pa.types.is_uint8(array.type.index_type)
or pa.types.is_int16(array.type.index_type)
or pa.types.is_uint16(array.type.index_type)
or pa.types.is_int32(array.type.index_type)
):
array = pa.compute.cast(
array, pa.dictionary(pa.uint32(), pa.large_string())
)

array = array.combine_chunks()
return array
19 changes: 19 additions & 0 deletions py-polars/tests/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,22 @@ def test_from_pandas_nan_to_none() -> None:
assert [np.isnan(val) for val in out_false[1:]]
with pytest.raises(ArrowInvalid, match="Could not convert"):
pl.from_pandas(df, nan_to_none=False)


def test_upcast_pyarrow_dicts() -> None:
    """Check that int8-keyed arrow dictionary columns load via ``pl.from_arrow``.

    Builds 128 single-row tables, each holding one distinct string in a
    dictionary-encoded column with int8 keys, concatenates them into one
    table and verifies that the converted result has 128 rows and 1 column.
    """
    # 1752
    int8_dict_type = pa.dictionary(pa.int8(), pa.string())
    single_row_tables = [
        pa.table({"col_name": pa.array(["value_" + str(idx)], int8_dict_type)})
        for idx in range(128)
    ]

    combined = pa.concat_tables(single_row_tables, promote=True)
    frame = pl.from_arrow(combined)
    assert frame.shape == (128, 1)

0 comments on commit b410546

Please sign in to comment.