python create df from_rows
ritchie46 committed Jun 7, 2021
1 parent 569035d commit 3ce4097
Showing 8 changed files with 160 additions and 6 deletions.
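For orientation, a minimal usage sketch of the API this commit adds, pieced together from the diffs and the test below. The auto-generated `column_N` names come from the `Schema` change in `row.rs`; the `columns` getter is assumed to behave as the setter used in `frame.py` suggests.

```python
import polars as pl

# Each inner sequence is one row; dtypes are inferred from the Python values
# (see the FromPyObject impl in conversion.rs below).
df = pl.from_rows([[1, 2, "foo"], [2, 3, "bar"]])

# Without explicit names, columns are auto-named column_0, column_1, ...
assert df.columns == ["column_0", "column_1", "column_2"]

# Names can also be supplied up front.
df = pl.from_rows([[1, 2, "foo"], [2, 3, "bar"]], column_names=["a", "b", "c"])
assert df.columns == ["a", "b", "c"]
```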
38 changes: 34 additions & 4 deletions polars/polars-core/src/frame/row.rs
@@ -1,6 +1,6 @@
use crate::prelude::*;
use itertools::Itertools;
-use std::fmt::Debug;
+use std::fmt::{Debug, Formatter};

#[derive(Debug, Clone, PartialEq)]
pub struct Row<'a>(pub Vec<AnyValue<'a>>);
@@ -59,7 +59,15 @@ impl DataFrame {
}
Ok(())
})?;
-let v = buffers.into_iter().map(|b| b.into_series()).collect();
+let v = buffers
+    .into_iter()
+    .zip(schema.fields())
+    .map(|(b, fld)| {
+        let mut s = b.into_series();
+        s.rename(fld.name());
+        s
+    })
+    .collect();
DataFrame::new(v)
}

@@ -130,9 +138,10 @@ impl From<&Row<'_>> for Schema {
let fields = row
.0
.iter()
-.map(|av| {
+.enumerate()
+.map(|(i, av)| {
let field: Field = av.into();
-field
+Field::new(format!("column_{}", i).as_ref(), field.data_type().clone())
})
.collect();

@@ -156,6 +165,27 @@ pub(crate) enum Buffer {
Utf8(Utf8ChunkedBuilder),
}

impl Debug for Buffer {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
use Buffer::*;
match self {
Boolean(_) => f.write_str("boolean"),
Int32(_) => f.write_str("i32"),
Int64(_) => f.write_str("i64"),
UInt32(_) => f.write_str("u32"),
#[cfg(feature = "dtype-u64")]
UInt64(_) => f.write_str("u64"),
#[cfg(feature = "dtype-date32")]
Date32(_) => f.write_str("date32"),
#[cfg(feature = "dtype-date64")]
Date64(_) => f.write_str("date64"),
Float32(_) => f.write_str("f32"),
Float64(_) => f.write_str("f64"),
Utf8(_) => f.write_str("utf8"),
}
}
}

impl Buffer {
fn add(&mut self, val: AnyValue) -> Result<()> {
use Buffer::*;
1 change: 1 addition & 0 deletions py-polars/Cargo.toml
@@ -48,6 +48,7 @@ features = [
"pretty_fmt",
"performant",
"dtype-full",
"rows",
"private"
]

34 changes: 34 additions & 0 deletions py-polars/polars/frame.py
@@ -117,6 +117,40 @@ def _from_pydf(df: "PyDataFrame") -> "DataFrame":
self._df = df
return self

@staticmethod
def from_rows(
rows: "Sequence[Sequence[Any]]",
column_names: "Optional[List[str]]" = None,
column_name_mapping: "Optional[Dict[int, str]]" = None,
) -> "DataFrame":
"""
Create a DataFrame from rows. This should only be used as a last resort, as this is more expensive than
creating from columnar data.
Parameters
----------
rows
rows
column_names
column names to use for the DataFrame
column_name_mapping
map column index to a new name:
Example:
```python
column_mapping: {0: "first_column, 3: "fourth column"}
```
"""
self = DataFrame.__new__(DataFrame)
self._df = PyDataFrame.read_rows(rows)
if column_names is not None:
self.columns = column_names
if column_name_mapping is not None:
for i, name in column_name_mapping.items():
s = self[:, i]
s.rename(name, in_place=True)
self.replace_at_idx(i, s)
return self

@staticmethod
def read_csv(
file: Union[str, TextIO],
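A short sketch of how the two naming arguments of the `from_rows` method above compose: `column_names` is assigned first, then `column_name_mapping` renames individual indices on top of it. The combined call below is illustrative only (it is not one of the tests in this commit).

```python
import polars as pl

df = pl.DataFrame.from_rows(
    [[1, 2, "foo"], [2, 3, "bar"]],
    column_names=["a", "b", "c"],
    column_name_mapping={1: "b_renamed"},
)

# The full name list is applied first; index 1 is then renamed.
assert df.columns == ["a", "b_renamed", "c"]
```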
27 changes: 26 additions & 1 deletion py-polars/polars/functions.py
@@ -1,4 +1,4 @@
-from typing import Union, TextIO, Optional, List, BinaryIO
+from typing import Union, TextIO, Optional, List, BinaryIO, Sequence, Any
from io import StringIO, BytesIO
import numpy as np
from pathlib import Path
@@ -496,3 +496,28 @@ def read_json(
Path to a file or a file like object.
"""
return DataFrame.read_json(source)


def from_rows(
rows: "Sequence[Sequence[Any]]",
column_names: "Optional[List[str]]" = None,
column_name_mapping: "Optional[Dict[int, str]]" = None,
) -> "DataFrame":
"""
Create a DataFrame from rows. This should only be used as a last resort, as this is more expensive than
creating from columnar data.
Parameters
----------
rows
rows
column_names
column names to use for the DataFrame
column_name_mapping
map column index to a new name:
Example:
```python
column_mapping: {0: "first_column, 3: "fourth column"}
```
"""
return DataFrame.from_rows(rows, column_names, column_name_mapping)
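The module-level `from_rows` above is a thin wrapper; a small sketch of the equivalence it provides, using the same arguments as the test added in this commit:

```python
import polars as pl

rows = [[1, 2, "foo"], [2, 3, "bar"]]

# pl.from_rows forwards its arguments to DataFrame.from_rows unchanged.
a = pl.from_rows(rows, column_name_mapping={1: "foo"})
b = pl.DataFrame.from_rows(rows, column_name_mapping={1: "foo"})
assert a.frame_equal(b)
```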
3 changes: 2 additions & 1 deletion py-polars/src/arrow_interop/to_rust.rs
@@ -7,6 +7,7 @@ use polars_core::utils::arrow::{
record_batch::RecordBatch,
};
use pyo3::prelude::*;
use crate::error::PyPolarsEr;

pub fn array_to_rust(obj: &PyAny) -> PyResult<ArrayRef> {
// prepare a pointer to receive the Array struct
@@ -23,7 +24,7 @@ pub fn array_to_rust(obj: &PyAny) -> PyResult<ArrayRef> {
}

pub fn to_rust_rb(rb: &[&PyAny]) -> PyResult<Vec<RecordBatch>> {
-let schema = rb[0].getattr("schema")?;
+let schema = rb.get(0).ok_or_else(|| PyPolarsEr::Other("empty table".into()))?.getattr("schema")?;
let names = schema.getattr("names")?.extract::<Vec<String>>()?;

let arrays = rb
37 changes: 37 additions & 0 deletions py-polars/src/conversion.rs
@@ -7,7 +7,10 @@ use pyo3::types::PySequence;
use pyo3::{PyAny, PyResult};
use std::any::Any;
use std::fmt::{Display, Formatter};
use crate::error::PyPolarsEr;
use polars::frame::row::Row;

#[repr(transparent)]
pub struct Wrap<T>(pub T);

impl<T> Clone for Wrap<T>
@@ -18,6 +21,11 @@ where
Wrap(self.0.clone())
}
}
impl<T> From<T> for Wrap<T> {
fn from(t: T) -> Self {
Wrap(t)
}
}

fn get_pyseq(obj: &PyAny) -> PyResult<(&PySequence, usize)> {
let seq = <PySequence as PyTryFrom>::try_from(obj)?;
@@ -114,6 +122,35 @@ impl ToPyObject for Wrap<AnyValue<'_>> {
}
}

impl<'s> FromPyObject<'s> for Wrap<AnyValue<'s>> {
fn extract(ob: &'s PyAny) -> PyResult<Self> {
if let Ok(v) = ob.extract::<i64>() {
Ok(AnyValue::Int64(v).into())
} else if let Ok(v) = ob.extract::<f64>() {
Ok(AnyValue::Float64(v).into())
} else if let Ok(v) = ob.extract::<&'s str>() {
Ok(AnyValue::Utf8(v).into())
} else if let Ok(v) = ob.extract::<bool>() {
Ok(AnyValue::Boolean(v).into())

} else if let Ok(res) = ob.call_method0("timestamp") {
// `timestamp()` returns seconds since the epoch; Date64 stores milliseconds
let v = res.extract::<f64>()? as i64;
Ok(AnyValue::Date64(v * 1000).into())
} else {
Err(PyErr::from(PyPolarsEr::Other(format!("row type not supported {:?}", ob))))
}
}
}

impl<'s> FromPyObject<'s> for Wrap<Row<'s>> {
fn extract(ob: &'s PyAny) -> PyResult<Self> {
let vals = ob.extract::<Vec<Wrap<AnyValue<'s>>>>()?;
// Safety: Wrap is #[repr(transparent)], so Vec<Wrap<AnyValue>> has the same layout as Vec<AnyValue>.
let vals: Vec<AnyValue> = unsafe { std::mem::transmute(vals) };
Ok(Wrap(Row(vals)))
}
}

#[derive(Clone, Debug)]
pub struct ObjectValue {
pub inner: PyObject,
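The `FromPyObject` impl for `Wrap<AnyValue>` above drives dtype inference: the chain tries `int`, `float`, `str`, `bool`, and finally any object with a `.timestamp()` method, which becomes `Date64` with the seconds multiplied by 1000. A sketch of the observable effect; it assumes `pl.Float64` and `pl.Utf8` are exposed the same way as the `pl.Int64`/`pl.Date64` used in the test at the bottom.

```python
from datetime import datetime

import polars as pl

# One row with an int, a float, a string, and a datetime.
df = pl.from_rows([[1, 1.5, "a", datetime.fromtimestamp(100)]])

# int -> Int64, float -> Float64, str -> Utf8, datetime -> Date64 (milliseconds).
assert df.dtypes == [pl.Int64, pl.Float64, pl.Utf8, pl.Date64]
```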
12 changes: 12 additions & 0 deletions py-polars/src/dataframe.rs
@@ -23,6 +23,7 @@ use crate::{
file::{get_either_file, get_file_like, EitherRustPythonFile},
series::{to_pyseries_collection, to_series_collection, PySeries},
};
use polars::frame::row::Row;

#[pyclass]
#[repr(transparent)]
@@ -186,6 +187,17 @@ impl PyDataFrame {
Ok(Self::from(df))
}

// exposed as `read_rows` because naming this method `from_rows` somehow did not work
#[staticmethod]
pub fn read_rows(rows: Vec<Wrap<Row>>) -> PyResult<Self> {
// Safety: Wrap is #[repr(transparent)], so Vec<Wrap<Row>> has the same layout as Vec<Row>.
let rows: Vec<Row> = unsafe { std::mem::transmute(rows) };
let df = DataFrame::from_rows(&rows)
.map_err(PyPolarsEr::from)?;
Ok(df.into())
}

pub fn to_csv(&mut self, py_f: PyObject, has_headers: bool, delimiter: u8) -> PyResult<()> {
let mut buf = get_file_like(py_f, true)?;
CsvWriter::new(&mut buf)
14 changes: 14 additions & 0 deletions py-polars/tests/test_df.py
@@ -8,6 +8,7 @@
import pyarrow as pa
import polars as pl
import pandas as pd
from datetime import datetime

from utils import get_complete_df

@@ -674,3 +675,16 @@ def test_to_json():
s = df.to_json(to_string=True)
out = pl.read_json(s)
assert df.frame_equal(out, null_equal=True)


def test_from_rows():
df = pl.from_rows([[1, 2, "foo"], [2, 3, "bar"]], column_name_mapping={1: "foo"})
assert df.frame_equal(
pl.DataFrame({"column_0": [1, 2], "foo": [2, 3], "column_2": ["foo", "bar"]})
)

df = pl.from_rows(
[[1, datetime.fromtimestamp(100)], [2, datetime.fromtimestamp(2398754908)]],
column_name_mapping={1: "foo"},
)
assert df.dtypes == [pl.Int64, pl.Date64]
