python improve from_dict performance (#3315)

pola-rs · May 6, 2022 · 078fec7 · 078fec7
1 parent 1f0ccec
commit 078fec7
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 22 deletions.
diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py
@@ -65,15 +65,7 @@ def from_dict(
     └─────┴─────┘
 
     """
-    # To deal with structs, we have to modify the data, but we dont want to modify
-    # `data` directly. Thus we create a separate dict, and only do so for the
-    # for the fields that need this, to save memory
-    data_struct = dict()
-    for col_name, value in data.items():
-        if isinstance(value, dict):
-            data_struct[col_name] = from_dict(value).to_struct(col_name)
-
-    return DataFrame._from_dict(data=dict(data, **data_struct), columns=columns)  # type: ignore
+    return DataFrame._from_dict(data=data, columns=columns)  # type: ignore
 
 
 def from_records(

diff --git a/py-polars/polars/internals/construction.py b/py-polars/polars/internals/construction.py
@@ -374,18 +374,23 @@ def dict_to_pydf(
     """
     Construct a PyDataFrame from a dictionary of sequences.
     """
-    columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())
-    if not data and dtypes:
-        data_series = [
-            pli.Series(name, [], dtypes.get(name)).inner() for name in columns
-        ]
-    else:
-        data_series = [
-            pli.Series(name, values, dtypes.get(name)).inner()
-            for name, values in data.items()
-        ]
-    data_series = _handle_columns_arg(data_series, columns=columns)
-    return PyDataFrame(data_series)
+    if columns is not None:
+        # the columns arg may also set the dtype of the series
+        columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())
+
+        if not data and dtypes:
+            data_series = [
+                pli.Series(name, [], dtypes.get(name)).inner() for name in columns
+            ]
+        else:
+            data_series = [
+                pli.Series(name, values, dtypes.get(name)).inner()
+                for name, values in data.items()
+            ]
+        data_series = _handle_columns_arg(data_series, columns=columns)
+        return PyDataFrame(data_series)
+    # fast path
+    return PyDataFrame.read_dict(data)
 
 
 def numpy_to_pydf(

diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs
@@ -1,5 +1,5 @@
 use numpy::IntoPyArray;
-use pyo3::types::{PyList, PyTuple};
+use pyo3::types::{PyDict, PyList, PyTuple};
 use pyo3::{exceptions::PyRuntimeError, prelude::*};
 use std::io::{BufReader, BufWriter, Cursor, Read};
 
@@ -18,6 +18,7 @@ use crate::{
     arrow_interop,
     error::PyPolarsErr,
     file::{get_either_file, get_file_like, EitherRustPythonFile},
+    py_modules,
     series::{to_pyseries_collection, to_series_collection, PySeries},
 };
 use polars::frame::row::{rows_to_schema, Row};
@@ -374,6 +375,31 @@ impl PyDataFrame {
         Ok(pydf)
     }
 
+    #[staticmethod]
+    pub fn read_dict(py: Python, dict: &PyDict) -> PyResult<Self> { // builds one column per dict entry
+        let cols = dict
+            .into_iter()
+            .map(|(key, val)| {
+                let name = key.extract::<&str>()?; // keys that do not extract as &str return an error here
+
+                let s = if val.is_instance_of::<PyDict>()? { // nested dict value => struct column
+                    let df = Self::read_dict(py, val.extract::<&PyDict>()?)?; // recurse on the inner dict
+                    df.df.into_struct(name).into_series() // inner frame becomes a struct series named `name`
+                } else {
+                    let obj = py_modules::SERIES.call1(py, (name, val))?; // call the SERIES Python object with (name, val)
+
+                    let pyseries_obj = obj.getattr(py, "_s")?; // `_s` attribute is extracted below as a PySeries
+                    let pyseries = pyseries_obj.extract::<PySeries>(py)?;
+                    pyseries.series
+                };
+                Ok(s)
+            })
+            .collect::<PyResult<Vec<_>>>()?; // the first Err produced by any entry is returned to the caller
+
+        let df = DataFrame::new(cols).map_err(PyPolarsErr::from)?; // DataFrame::new errors are converted via PyPolarsErr
+        Ok(df.into())
+    }
+
     pub fn to_csv(
         &mut self,
         py: Python,

diff --git a/py-polars/src/py_modules.rs b/py-polars/src/py_modules.rs
@@ -6,3 +6,6 @@ pub(crate) static POLARS: Lazy<PyObject> =
 
 pub(crate) static UTILS: Lazy<PyObject> =
     Lazy::new(|| Python::with_gil(|py| POLARS.getattr(py, "utils").unwrap()));
+
+pub(crate) static SERIES: Lazy<PyObject> =
+    Lazy::new(|| Python::with_gil(|py| POLARS.getattr(py, "Series").unwrap())); // unwrap panics if POLARS has no "Series" attribute