Skip to content

Commit

Permalink
python improve from_dict performance (#3315)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed May 6, 2022
1 parent 1f0ccec commit 078fec7
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 22 deletions.
10 changes: 1 addition & 9 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,7 @@ def from_dict(
└─────┴─────┘
"""
# To deal with structs, we have to modify the data, but we don't want to modify
# `data` directly. Thus we create a separate dict, and only do so for the
# fields that need this, to save memory
data_struct = dict()
for col_name, value in data.items():
if isinstance(value, dict):
data_struct[col_name] = from_dict(value).to_struct(col_name)

return DataFrame._from_dict(data=dict(data, **data_struct), columns=columns) # type: ignore
return DataFrame._from_dict(data=data, columns=columns) # type: ignore


def from_records(
Expand Down
29 changes: 17 additions & 12 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,18 +374,23 @@ def dict_to_pydf(
"""
Construct a PyDataFrame from a dictionary of sequences.
"""
columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())
if not data and dtypes:
data_series = [
pli.Series(name, [], dtypes.get(name)).inner() for name in columns
]
else:
data_series = [
pli.Series(name, values, dtypes.get(name)).inner()
for name, values in data.items()
]
data_series = _handle_columns_arg(data_series, columns=columns)
return PyDataFrame(data_series)
if columns is not None:
# the columns arg may also set the dtype of the series
columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())

if not data and dtypes:
data_series = [
pli.Series(name, [], dtypes.get(name)).inner() for name in columns
]
else:
data_series = [
pli.Series(name, values, dtypes.get(name)).inner()
for name, values in data.items()
]
data_series = _handle_columns_arg(data_series, columns=columns)
return PyDataFrame(data_series)
# fast path
return PyDataFrame.read_dict(data)


def numpy_to_pydf(
Expand Down
28 changes: 27 additions & 1 deletion py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use numpy::IntoPyArray;
use pyo3::types::{PyList, PyTuple};
use pyo3::types::{PyDict, PyList, PyTuple};
use pyo3::{exceptions::PyRuntimeError, prelude::*};
use std::io::{BufReader, BufWriter, Cursor, Read};

Expand All @@ -18,6 +18,7 @@ use crate::{
arrow_interop,
error::PyPolarsErr,
file::{get_either_file, get_file_like, EitherRustPythonFile},
py_modules,
series::{to_pyseries_collection, to_series_collection, PySeries},
};
use polars::frame::row::{rows_to_schema, Row};
Expand Down Expand Up @@ -374,6 +375,31 @@ impl PyDataFrame {
Ok(pydf)
}

/// Build a `PyDataFrame` from a Python dictionary, one column per entry.
///
/// Each key is extracted as a `&str` and used as the column name. A value
/// that is itself a `dict` is converted recursively and collapsed into a
/// single struct column named after its key. Any other value is passed,
/// together with the name, to the Python object held in
/// `py_modules::SERIES`, and the Rust series is taken from the `_s`
/// attribute of the result.
///
/// # Errors
///
/// Returns the Python error if a key cannot be extracted as a string, if the
/// `SERIES` call or the `_s` lookup/extraction fails, or (converted through
/// `PyPolarsErr`) if `DataFrame::new` rejects the collected columns.
#[staticmethod]
pub fn read_dict(py: Python, dict: &PyDict) -> PyResult<Self> {
    let mut columns = Vec::with_capacity(dict.len());

    for (key, value) in dict.iter() {
        let name = key.extract::<&str>()?;

        let column = if value.is_instance_of::<PyDict>()? {
            // Nested mapping: build a frame from it first, then fold that
            // frame into one struct column carrying the outer key's name.
            let nested = Self::read_dict(py, value.extract::<&PyDict>()?)?;
            nested.df.into_struct(name).into_series()
        } else {
            // Everything else goes through the Python-side constructor; the
            // wrapped `PySeries` is read back from its `_s` attribute.
            let wrapped = py_modules::SERIES
                .call1(py, (name, value))?
                .getattr(py, "_s")?
                .extract::<PySeries>(py)?;
            wrapped.series
        };

        columns.push(column);
    }

    let frame = DataFrame::new(columns).map_err(PyPolarsErr::from)?;
    Ok(frame.into())
}

pub fn to_csv(
&mut self,
py: Python,
Expand Down
3 changes: 3 additions & 0 deletions py-polars/src/py_modules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ pub(crate) static POLARS: Lazy<PyObject> =

/// Lazily-initialised handle to the `utils` attribute of `POLARS`.
///
/// The lookup runs once, on first access, while holding the GIL.
/// NOTE(review): `POLARS` is presumably the imported `polars` Python module —
/// confirm against its definition.
///
/// # Panics
///
/// Panics on first access if the `utils` attribute cannot be retrieved.
pub(crate) static UTILS: Lazy<PyObject> = Lazy::new(|| {
    Python::with_gil(|py| {
        POLARS
            .getattr(py, "utils")
            .expect("failed to look up attribute `utils` on POLARS")
    })
});

/// Lazily-initialised handle to the `Series` attribute of `POLARS`.
///
/// The lookup runs once, on first access, while holding the GIL.
/// NOTE(review): `POLARS` is presumably the imported `polars` Python module,
/// making this the Python `Series` class — confirm against its definition.
///
/// # Panics
///
/// Panics on first access if the `Series` attribute cannot be retrieved.
pub(crate) static SERIES: Lazy<PyObject> = Lazy::new(|| {
    Python::with_gil(|py| {
        POLARS
            .getattr(py, "Series")
            .expect("failed to look up attribute `Series` on POLARS")
    })
});

0 comments on commit 078fec7

Please sign in to comment.