python create df from_rows
ritchie46 committed Jun 7, 2021
1 parent 569035d commit 3ce4097
Showing 8 changed files with 160 additions and 6 deletions.
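For orientation, a minimal usage sketch of the API this commit adds, pieced together from the diffs and the test below. The auto-generated `column_N` names come from the `Schema` change in `row.rs`; the `columns` getter is assumed to behave as the setter used in `frame.py` suggests.

```python
import polars as pl

# Each inner sequence is one row; dtypes are inferred from the Python values
# (see the FromPyObject impl in conversion.rs below).
df = pl.from_rows([[1, 2, "foo"], [2, 3, "bar"]])

# Without explicit names, columns are auto-named column_0, column_1, ...
assert df.columns == ["column_0", "column_1", "column_2"]

# Names can also be supplied up front.
df = pl.from_rows([[1, 2, "foo"], [2, 3, "bar"]], column_names=["a", "b", "c"])
assert df.columns == ["a", "b", "c"]
```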
38 changes: 34 additions & 4 deletions polars/polars-core/src/frame/row.rs
@@ -1,6 +1,6 @@
use crate::prelude::*;
use itertools::Itertools;
-use std::fmt::Debug;
+use std::fmt::{Debug, Formatter};

#[derive(Debug, Clone, PartialEq)]
pub struct Row<'a>(pub Vec<AnyValue<'a>>);
@@ -59,7 +59,15 @@ impl DataFrame {
}
Ok(())
})?;
-let v = buffers.into_iter().map(|b| b.into_series()).collect();
+let v = buffers
+    .into_iter()
+    .zip(schema.fields())
+    .map(|(b, fld)| {
+        let mut s = b.into_series();
+        s.rename(fld.name());
+        s
+    })
+    .collect();
DataFrame::new(v)
}

@@ -130,9 +138,10 @@ impl From<&Row<'_>> for Schema {
let fields = row
.0
.iter()
-.map(|av| {
+.enumerate()
+.map(|(i, av)| {
let field: Field = av.into();
-field
+Field::new(format!("column_{}", i).as_ref(), field.data_type().clone())
})
.collect();

@@ -156,6 +165,27 @@ pub(crate) enum Buffer {
Utf8(Utf8ChunkedBuilder),
}

impl Debug for Buffer {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
use Buffer::*;
match self {
Boolean(_) => f.write_str("boolean"),
Int32(_) => f.write_str("i32"),
Int64(_) => f.write_str("i64"),
UInt32(_) => f.write_str("u32"),
#[cfg(feature = "dtype-u64")]
UInt64(_) => f.write_str("u64"),
#[cfg(feature = "dtype-date32")]
Date32(_) => f.write_str("date32"),
#[cfg(feature = "dtype-date64")]
Date64(_) => f.write_str("date64"),
Float32(_) => f.write_str("f32"),
Float64(_) => f.write_str("f64"),
Utf8(_) => f.write_str("utf8"),
}
}
}

impl Buffer {
fn add(&mut self, val: AnyValue) -> Result<()> {
use Buffer::*;
1 change: 1 addition & 0 deletions py-polars/Cargo.toml
@@ -48,6 +48,7 @@ features = [
"pretty_fmt",
"performant",
"dtype-full",
"rows",
"private"
]

34 changes: 34 additions & 0 deletions py-polars/polars/frame.py
@@ -117,6 +117,40 @@ def _from_pydf(df: "PyDataFrame") -> "DataFrame":
self._df = df
return self

@staticmethod
def from_rows(
rows: "Sequence[Sequence[Any]]",
column_names: "Optional[List[str]]" = None,
column_name_mapping: "Optional[Dict[int, str]]" = None,
) -> "DataFrame":
"""
Create a DataFrame from rows. This should only be used as a last resort, as this is more expensive than
creating from columnar data.
Parameters
----------
rows
rows
column_names
column names to use for the DataFrame
column_name_mapping
map column index to a new name:
Example:
```python
column_mapping: {0: "first_column, 3: "fourth column"}
```
"""
self = DataFrame.__new__(DataFrame)
self._df = PyDataFrame.read_rows(rows)
if column_names is not None:
self.columns = column_names
if column_name_mapping is not None:
for i, name in column_name_mapping.items():
s = self[:, i]
s.rename(name, in_place=True)
self.replace_at_idx(i, s)
return self

@staticmethod
def read_csv(
file: Union[str, TextIO],
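A short sketch of how the two naming arguments of the `from_rows` method above compose: `column_names` is assigned first, then `column_name_mapping` renames individual indices on top of it. The combined call below is illustrative only (it is not one of the tests in this commit).

```python
import polars as pl

df = pl.DataFrame.from_rows(
    [[1, 2, "foo"], [2, 3, "bar"]],
    column_names=["a", "b", "c"],
    column_name_mapping={1: "b_renamed"},
)

# The full name list is applied first; index 1 is then renamed.
assert df.columns == ["a", "b_renamed", "c"]
```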
27 changes: 26 additions & 1 deletion py-polars/polars/functions.py
@@ -1,4 +1,4 @@
-from typing import Union, TextIO, Optional, List, BinaryIO
+from typing import Union, TextIO, Optional, List, BinaryIO, Sequence, Any
from io import StringIO, BytesIO
import numpy as np
from pathlib import Path
@@ -496,3 +496,28 @@ def read_json(
Path to a file or a file like object.
"""
return DataFrame.read_json(source)


def from_rows(
rows: "Sequence[Sequence[Any]]",
column_names: "Optional[List[str]]" = None,
column_name_mapping: "Optional[Dict[int, str]]" = None,
) -> "DataFrame":
"""
Create a DataFrame from rows. This should only be used as a last resort, as this is more expensive than
creating from columnar data.
Parameters
----------
rows
rows
column_names
column names to use for the DataFrame
column_name_mapping
map column index to a new name:
Example:
```python
column_mapping: {0: "first_column, 3: "fourth column"}
```
"""
return DataFrame.from_rows(rows, column_names, column_name_mapping)
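The module-level `from_rows` above is a thin wrapper; a small sketch of the equivalence it provides, using the same arguments as the test added in this commit:

```python
import polars as pl

rows = [[1, 2, "foo"], [2, 3, "bar"]]

# pl.from_rows forwards its arguments to DataFrame.from_rows unchanged.
a = pl.from_rows(rows, column_name_mapping={1: "foo"})
b = pl.DataFrame.from_rows(rows, column_name_mapping={1: "foo"})
assert a.frame_equal(b)
```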
3 changes: 2 additions & 1 deletion py-polars/src/arrow_interop/to_rust.rs
@@ -7,6 +7,7 @@ use polars_core::utils::arrow::{
record_batch::RecordBatch,
};
use pyo3::prelude::*;
use crate::error::PyPolarsEr;

pub fn array_to_rust(obj: &PyAny) -> PyResult<ArrayRef> {
// prepare a pointer to receive the Array struct
@@ -23,7 +24,7 @@ pub fn array_to_rust(obj: &PyAny) -> PyResult<ArrayRef> {
}

pub fn to_rust_rb(rb: &[&PyAny]) -> PyResult<Vec<RecordBatch>> {
-let schema = rb[0].getattr("schema")?;
+let schema = rb.get(0).ok_or_else(|| PyPolarsEr::Other("empty table".into()))?.getattr("schema")?;
let names = schema.getattr("names")?.extract::<Vec<String>>()?;

let arrays = rb
37 changes: 37 additions & 0 deletions py-polars/src/conversion.rs
@@ -7,7 +7,10 @@ use pyo3::types::PySequence;
use pyo3::{PyAny, PyResult};
use std::any::Any;
use std::fmt::{Display, Formatter};
use crate::error::PyPolarsEr;
use polars::frame::row::Row;

#[repr(transparent)]
pub struct Wrap<T>(pub T);

impl<T> Clone for Wrap<T>
@@ -18,6 +21,11 @@ where
Wrap(self.0.clone())
}
}
impl<T> From<T> for Wrap<T> {
fn from(t: T) -> Self {
Wrap(t)
}
}

fn get_pyseq(obj: &PyAny) -> PyResult<(&PySequence, usize)> {
let seq = <PySequence as PyTryFrom>::try_from(obj)?;
@@ -114,6 +122,35 @@ impl ToPyObject for Wrap<AnyValue<'_>> {
}
}

impl<'s> FromPyObject<'s> for Wrap<AnyValue<'s>> {
fn extract(ob: &'s PyAny) -> PyResult<Self> {
if let Ok(v) = ob.extract::<i64>() {
Ok(AnyValue::Int64(v).into())
} else if let Ok(v) = ob.extract::<f64>() {
Ok(AnyValue::Float64(v).into())
} else if let Ok(v) = ob.extract::<&'s str>() {
Ok(AnyValue::Utf8(v).into())
} else if let Ok(v) = ob.extract::<bool>() {
Ok(AnyValue::Boolean(v).into())

} else if let Ok(res) = ob.call_method0("timestamp") {
// `timestamp()` returns seconds since the epoch; Date64 stores milliseconds
let v = res.extract::<f64>()? as i64;
Ok(AnyValue::Date64(v * 1000).into())
} else {
Err(PyErr::from(PyPolarsEr::Other(format!("row type not supported {:?}", ob))))
}
}
}

impl<'s> FromPyObject<'s> for Wrap<Row<'s>> {
fn extract(ob: &'s PyAny) -> PyResult<Self> {
let vals = ob.extract::<Vec<Wrap<AnyValue<'s>>>>()?;
// Safety: Wrap is #[repr(transparent)], so Vec<Wrap<AnyValue>> has the same layout as Vec<AnyValue>.
let vals: Vec<AnyValue> = unsafe { std::mem::transmute(vals) };
Ok(Wrap(Row(vals)))
}
}

#[derive(Clone, Debug)]
pub struct ObjectValue {
pub inner: PyObject,
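The `FromPyObject` impl for `Wrap<AnyValue>` above drives dtype inference: the chain tries `int`, `float`, `str`, `bool`, and finally any object with a `.timestamp()` method, which becomes `Date64` with the seconds multiplied by 1000. A sketch of the observable effect; it assumes `pl.Float64` and `pl.Utf8` are exposed the same way as the `pl.Int64`/`pl.Date64` used in the test at the bottom.

```python
from datetime import datetime

import polars as pl

# One row with an int, a float, a string, and a datetime.
df = pl.from_rows([[1, 1.5, "a", datetime.fromtimestamp(100)]])

# int -> Int64, float -> Float64, str -> Utf8, datetime -> Date64 (milliseconds).
assert df.dtypes == [pl.Int64, pl.Float64, pl.Utf8, pl.Date64]
```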
12 changes: 12 additions & 0 deletions py-polars/src/dataframe.rs
@@ -23,6 +23,7 @@ use crate::{
file::{get_either_file, get_file_like, EitherRustPythonFile},
series::{to_pyseries_collection, to_series_collection, PySeries},
};
use polars::frame::row::Row;

#[pyclass]
#[repr(transparent)]
@@ -186,6 +187,17 @@ impl PyDataFrame {
Ok(Self::from(df))
}

// exposed as `read_rows` because naming this method `from_rows` somehow did not work
#[staticmethod]
pub fn read_rows(rows: Vec<Wrap<Row>>) -> PyResult<Self> {
// Safety: Wrap is #[repr(transparent)], so Vec<Wrap<Row>> has the same layout as Vec<Row>.
let rows: Vec<Row> = unsafe { std::mem::transmute(rows) };
let df = DataFrame::from_rows(&rows)
.map_err(PyPolarsEr::from)?;
Ok(df.into())
}

pub fn to_csv(&mut self, py_f: PyObject, has_headers: bool, delimiter: u8) -> PyResult<()> {
let mut buf = get_file_like(py_f, true)?;
CsvWriter::new(&mut buf)
14 changes: 14 additions & 0 deletions py-polars/tests/test_df.py
@@ -8,6 +8,7 @@
import pyarrow as pa
import polars as pl
import pandas as pd
from datetime import datetime

from utils import get_complete_df

@@ -674,3 +675,16 @@ def test_to_json():
s = df.to_json(to_string=True)
out = pl.read_json(s)
assert df.frame_equal(out, null_equal=True)


def test_from_rows():
df = pl.from_rows([[1, 2, "foo"], [2, 3, "bar"]], column_name_mapping={1: "foo"})
assert df.frame_equal(
pl.DataFrame({"column_0": [1, 2], "foo": [2, 3], "column_2": ["foo", "bar"]})
)

df = pl.from_rows(
[[1, datetime.fromtimestamp(100)], [2, datetime.fromtimestamp(2398754908)]],
column_name_mapping={1: "foo"},
)
assert df.dtypes == [pl.Int64, pl.Date64]
