Skip to content

Commit

Permalink
fix bug in conversion of FFI of nested utf8 to large-utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 28, 2021
1 parent c6611bc commit ea2829e
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 4 deletions.
38 changes: 34 additions & 4 deletions polars/polars-core/src/series/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use crate::chunked_array::builder::get_list_builder;
use crate::chunked_array::cast::cast_chunks;
use crate::prelude::*;
use arrow::compute::cast;
use arrow::compute::cast::utf8_to_large_utf8;
use std::convert::TryFrom;

pub trait NamedFrom<T, Phantom: ?Sized> {
Expand Down Expand Up @@ -77,6 +78,29 @@ impl<T: AsRef<[Series]>> NamedFrom<T, ListType> for Series {
}
}

/// Rewrites a large-list chunk whose inner values are `Utf8` so that the
/// inner values become `LargeUtf8`.
///
/// * `arr` - a list chunk; when `fld` is `Utf8` it must be a `ListArray<i64>`
///   whose values are a `Utf8Array<i32>`.
/// * `fld` - the inner (child) field of the list type. Its name is reused
///   for the rebuilt child field.
///
/// For every other inner data type a clone of `arr` is returned unchanged.
///
/// # Panics
///
/// Panics when `fld` is `Utf8` but `arr` is not a `ListArray<i64>`, or its
/// values are not a `Utf8Array<i32>`.
fn convert_list_inner(arr: &ArrayRef, fld: &ArrowField) -> ArrayRef {
    match fld.data_type() {
        // if inner type is Utf8, we need to convert that to large utf8
        ArrowDataType::Utf8 => {
            let list = arr
                .as_any()
                .downcast_ref::<ListArray<i64>>()
                .expect("list chunk with Utf8 inner type should be a ListArray<i64>");
            let small_values = list
                .values()
                .as_any()
                .downcast_ref::<Utf8Array<i32>>()
                .expect("inner values of a Utf8 list should be a Utf8Array<i32>");
            let large_values = utf8_to_large_utf8(small_values);

            // The list offsets are already i64 (the array is a
            // `ListArray<i64>`), so they are reused as-is instead of being
            // cast and re-collected element by element.
            let offsets = list.offsets().clone();

            Arc::new(LargeListArray::from_data(
                ArrowDataType::LargeList(
                    ArrowField::new(fld.name(), ArrowDataType::LargeUtf8, true).into(),
                ),
                offsets,
                Arc::new(large_values),
                list.validity().cloned(),
            ))
        }
        _ => arr.clone(),
    }
}

// TODO: add types
impl std::convert::TryFrom<(&str, Vec<ArrayRef>)> for Series {
type Error = PolarsError;
Expand Down Expand Up @@ -110,9 +134,11 @@ impl std::convert::TryFrom<(&str, Vec<ArrayRef>)> for Series {
let chunks = chunks
.iter()
.map(|arr| {
cast::cast(arr.as_ref(), &ArrowDataType::LargeList(fld.clone()))
.unwrap()
.into()
let arr: ArrayRef =
cast::cast(arr.as_ref(), &ArrowDataType::LargeList(fld.clone()))
.unwrap()
.into();
convert_list_inner(&arr, fld)
})
.collect();
Ok(ListChunked::new_from_chunks(name, chunks).into_series())
Expand Down Expand Up @@ -184,7 +210,11 @@ impl std::convert::TryFrom<(&str, Vec<ArrayRef>)> for Series {
TimeUnit::Nanosecond => s,
})
}
ArrowDataType::LargeList(_) => {
ArrowDataType::LargeList(fld) => {
let chunks = chunks
.iter()
.map(|arr| convert_list_inner(arr, fld))
.collect();
Ok(ListChunked::new_from_chunks(name, chunks).into_series())
}
ArrowDataType::Null => {
Expand Down
10 changes: 10 additions & 0 deletions py-polars/tests/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,13 @@ def test_from_pandas_null():
out = pl.DataFrame(test_df)
assert out.dtypes == [pl.Float64]
assert out["0"][0] is None


def test_from_pandas_nested_list():
    """Converting a pandas frame with a column of string lists must not panic."""
    # this panicked in https://github.com/pola-rs/polars/issues/1615
    columns = {
        "a": [1, 2, 3, 4],
        "b": [["x", "y"], ["x", "y", "z"], ["x"], ["x", "y"]],
    }
    converted = pl.from_pandas(pd.DataFrame(columns))
    # Printing renders the frame, nested list column included.
    print(converted)
    n_rows, n_cols = 4, 2
    assert converted.shape == (n_rows, n_cols)

0 comments on commit ea2829e

Please sign in to comment.