Skip to content

Commit

Permalink
fix[python]: improve from_dicts nested null inference (#4440)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Aug 16, 2022
1 parent 4686f8b commit fdfb001
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 6 deletions.
30 changes: 24 additions & 6 deletions polars/polars-core/src/frame/row.rs
Original file line number Diff line number Diff line change
Expand Up @@ -241,11 +241,22 @@ pub fn coerce_data_type<A: Borrow<DataType>>(datatypes: &[A]) -> DataType {
get_supertype(lhs, rhs).unwrap_or(Utf8)
}

/// Reports whether `av` carries no concrete value at any level visible here.
///
/// A value counts as "nested null" when it is:
/// - `AnyValue::Null` itself,
/// - an `AnyValue::List` whose null count equals its length (an empty list
///   therefore also qualifies), or
/// - with the `dtype-struct` feature enabled, an `AnyValue::Struct` whose
///   field values are all nested null, checked recursively.
///
/// Every other variant yields `false`.
fn is_nested_null(av: &AnyValue) -> bool {
    match av {
        AnyValue::Null => true,
        AnyValue::List(series) => {
            let nulls = series.null_count();
            nulls == series.len()
        }
        #[cfg(feature = "dtype-struct")]
        AnyValue::Struct(fields, _) => {
            // "all fields are null" expressed as "no field is non-null";
            // short-circuits on the first non-null field.
            !fields.iter().any(|field| !is_nested_null(field))
        }
        _ => false,
    }
}

/// Infer schema from rows.
pub fn rows_to_schema(rows: &[Row], infer_schema_length: Option<usize>) -> Schema {
// no of rows to use to infer dtype
let max_infer = infer_schema_length.unwrap_or(rows.len());
let mut schema: Schema = (&rows[0]).into();

// the first row that has no nulls will be used to infer the schema.
// if there is a null, we check the next row and see if we can update the schema

Expand All @@ -255,19 +266,26 @@ pub fn rows_to_schema(rows: &[Row], infer_schema_length: Option<usize>) -> Schem
.iter_dtypes()
.enumerate()
.filter_map(|(i, dtype)| {
if matches!(dtype, DataType::Null) {
Some(i)
} else {
None
// double check struct and list types
// nested null values can be wrongly inferred by front ends
match dtype {
DataType::Null | DataType::List(_) => Some(i),
#[cfg(feature = "dtype-struct")]
DataType::Struct(_) => Some(i),
_ => None,
}
})
.collect();
if nulls.is_empty() {
break;
} else {
for i in nulls {
let dtype = (&row.0[i]).into();
schema.coerce_by_index(i, dtype).unwrap();
let val = &row.0[i];

if !is_nested_null(val) {
let dtype = val.into();
schema.coerce_by_index(i, dtype).unwrap();
}
}
}
}
Expand Down
6 changes: 6 additions & 0 deletions py-polars/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,9 @@ def test_with_context() -> None:

with pytest.raises(pl.ComputeError):
(df_a.with_context(df_b.lazy()).select(["a", "c"])).collect()


def test_from_dicst_nested_nulls() -> None:
    """A first row holding an all-null list must round-trip alongside a later integer list."""
    rows = [{"a": [None, None]}, {"a": [1, 2]}]
    expected = {"a": [[None, None], [1, 2]]}

    result = pl.from_dicts(rows).to_dict(False)

    assert result == expected

0 comments on commit fdfb001

Please sign in to comment.