Skip to content

Commit

Permalink
fix(python): infer missing columns in from_dicts (#5183)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 13, 2022
1 parent c30d0e4 commit aab7eb1
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 18 deletions.
42 changes: 25 additions & 17 deletions py-polars/src/conversion.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::collections::BTreeSet;
use std::fmt::{Display, Formatter};
use std::hash::{Hash, Hasher};

Expand Down Expand Up @@ -749,35 +750,42 @@ impl<'a, T: NativeType + FromPyObject<'a>> FromPyObject<'a> for Wrap<Vec<T>> {
}
}

pub(crate) fn dicts_to_rows(records: &PyAny) -> PyResult<(Vec<Row>, Vec<String>)> {
pub(crate) fn dicts_to_rows(
records: &PyAny,
infer_schema_len: usize,
) -> PyResult<(Vec<Row>, Vec<String>)> {
let (dicts, len) = get_pyseq(records)?;
let mut rows = Vec::with_capacity(len);

let mut iter = dicts.iter()?;
let d = iter.next().unwrap()?;
let d = d.downcast::<PyDict>()?;
let vals = d.values();
let keys_first = d.keys().extract::<Vec<String>>()?;
let row = vals.extract::<Wrap<Row>>()?.0;
rows.push(row);
let mut key_names = BTreeSet::new();
for d in dicts.iter()?.take(infer_schema_len) {
let d = d?;
let d = d.downcast::<PyDict>()?;
let keys = d.keys();

let keys = d.keys();
let width = keys.len();
for name in keys {
let name = name.extract::<String>()?;
key_names.insert(name);
}
}

let mut rows = Vec::with_capacity(len);

for d in iter {
for d in dicts.iter()? {
let d = d?;
let d = d.downcast::<PyDict>()?;

let mut row = Vec::with_capacity(width);
let mut row = Vec::with_capacity(key_names.len());

for k in keys {
let val = d.get_item(k).unwrap();
let val = val.extract::<Wrap<AnyValue>>()?.0;
for k in key_names.iter() {
let val = match d.get_item(k) {
None => AnyValue::Null,
Some(val) => val.extract::<Wrap<AnyValue>>()?.0,
};
row.push(val)
}
rows.push(Row(row))
}
Ok((rows, keys_first))
Ok((rows, key_names.into_iter().collect()))
}

#[cfg(feature = "asof_join")]
Expand Down
2 changes: 1 addition & 1 deletion py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ impl PyDataFrame {

#[staticmethod]
pub fn read_dicts(dicts: &PyAny, infer_schema_length: Option<usize>) -> PyResult<Self> {
let (rows, names) = dicts_to_rows(dicts)?;
let (rows, names) = dicts_to_rows(dicts, infer_schema_length.unwrap_or(1))?;
let mut pydf = Self::finish_from_rows(rows, infer_schema_length)?;
pydf.df
.set_column_names(&names)
Expand Down
9 changes: 9 additions & 0 deletions py-polars/tests/unit/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,3 +446,12 @@ def test_u64_lit_5031() -> None:
df = pl.DataFrame({"foo": [1, 2, 3]}).with_column(pl.col("foo").cast(pl.UInt64))
assert df.filter(pl.col("foo") < (1 << 64) - 20).shape == (3, 1)
assert df["foo"].to_list() == [1, 2, 3]


def test_from_dicts_missing_columns() -> None:
    """Records with differing keys produce one column per key, null-filled."""
    # Each record carries a key the other one lacks.
    records = [{"a": 1}, {"b": 2}]
    expected = {"a": [1, None], "b": [None, 2]}

    result = pl.from_dicts(records).to_dict(False)
    assert result == expected

0 comments on commit aab7eb1

Please sign in to comment.