Skip to content

Commit

Permalink
reduced pyarrow dependency: series dtype list from sequence
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 9, 2021
1 parent 21e7155 commit 60c7a28
Show file tree
Hide file tree
Showing 7 changed files with 206 additions and 18 deletions.
40 changes: 40 additions & 0 deletions polars/polars-core/src/chunked_array/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,20 @@ where
self.builder.try_push_valid().unwrap();
}

/// Appends from an iterator over values
#[inline]
pub fn append_iter<I: Iterator<Item = Option<T::Native>> + TrustedLen>(&mut self, iter: I) {
let values = self.builder.mut_values();

if iter.size_hint().0 == 0 {
self.fast_explode = false;
}
// Safety
// trusted len, trust the type system
unsafe { values.extend_trusted_len_unchecked(iter) };
self.builder.try_push_valid().unwrap();
}

pub fn append_null(&mut self) {
self.builder.push_null();
}
Expand Down Expand Up @@ -485,6 +499,19 @@ impl ListUtf8ChunkedBuilder {
fast_explode: true,
}
}

#[inline]
pub fn append_iter<'a, I: Iterator<Item = Option<&'a str>> + TrustedLen>(&mut self, iter: I) {
let values = self.builder.mut_values();

if iter.size_hint().0 == 0 {
self.fast_explode = false;
}
// Safety
// trusted len, trust the type system
unsafe { values.extend_trusted_len_unchecked(iter) };
self.builder.try_push_valid().unwrap();
}
}

impl ListBuilderTrait for ListUtf8ChunkedBuilder {
Expand Down Expand Up @@ -536,6 +563,19 @@ impl ListBooleanChunkedBuilder {
fast_explode: true,
}
}

#[inline]
pub fn append_iter<I: Iterator<Item = Option<bool>> + TrustedLen>(&mut self, iter: I) {
let values = self.builder.mut_values();

if iter.size_hint().0 == 0 {
self.fast_explode = false;
}
// Safety
// trusted len, trust the type system
unsafe { values.extend_trusted_len_unchecked(iter) };
self.builder.try_push_valid().unwrap();
}
}

impl ListBuilderTrait for ListBooleanChunkedBuilder {
Expand Down
17 changes: 17 additions & 0 deletions py-polars/polars/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,13 @@ def numpy_type_to_constructor(dtype: Type[np.dtype]) -> Callable[..., "PySeries"
bool: PySeries.new_opt_bool,
}

_PY_TYPE_TO_DTYPE = {
float: Float64,
int: Int64,
str: Utf8,
bool: Boolean,
}


def py_type_to_constructor(dtype: Type[Any]) -> Callable[..., "PySeries"]:
"""
Expand Down Expand Up @@ -308,3 +315,13 @@ def py_type_to_arrow_type(dtype: Type[Any]) -> "pa.lib.DataType":
return _PY_TYPE_TO_ARROW_TYPE[dtype]
except KeyError:
raise ValueError(f"Cannot parse dtype {dtype} into Arrow dtype.")


def py_type_to_polars_type(dtype: Type[Any]) -> "Type[DataType]":
"""
Convert a Python dtype to a Polars dtype.
"""
try:
return _PY_TYPE_TO_DTYPE[dtype]
except KeyError:
raise ValueError(f"Cannot parse dtype {dtype} into Polars dtype.")
9 changes: 6 additions & 3 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
polars_type_to_constructor,
py_type_to_arrow_type,
py_type_to_constructor,
py_type_to_polars_type,
)

try:
Expand Down Expand Up @@ -128,9 +129,11 @@ def sequence_to_pyseries(
nested_dtype = type(nested_value) if value is not None else float

if not _PYARROW_AVAILABLE:
raise ImportError(
f"'pyarrow' is required for converting a Sequence of {nested_dtype} to a PySeries."
)
dtype = py_type_to_polars_type(nested_dtype)
return PySeries.new_list(name, values, dtype)
# raise ImportError(
# f"'pyarrow' is required for converting a Sequence of {nested_dtype} to a PySeries."
# )

try:
nested_arrow_dtype = py_type_to_arrow_type(nested_dtype)
Expand Down
12 changes: 8 additions & 4 deletions py-polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ pub mod datatypes;
pub mod error;
pub mod file;
pub mod lazy;
mod list_construction;
pub mod npy;
pub mod prelude;
pub mod series;
Expand Down Expand Up @@ -180,10 +181,13 @@ fn ipc_schema(py: Python, py_f: PyObject) -> PyResult<PyObject> {
fn collect_all(lfs: Vec<PyLazyFrame>) -> PyResult<Vec<PyDataFrame>> {
use polars_core::utils::rayon::prelude::*;
let out = polars_core::POOL.install(|| {
lfs.par_iter().map(|lf| {
let df = lf.ldf.clone().collect()?;
Ok(PyDataFrame::new(df))
}).collect::<polars_core::error::Result<Vec<_>>>().map_err(PyPolarsEr::from)
lfs.par_iter()
.map(|lf| {
let df = lf.ldf.clone().collect()?;
Ok(PyDataFrame::new(df))
})
.collect::<polars_core::error::Result<Vec<_>>>()
.map_err(PyPolarsEr::from)
});

Ok(out?)
Expand Down
99 changes: 99 additions & 0 deletions py-polars/src/list_construction.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
use crate::conversion::get_pyseq;
use crate::utils::str_to_polarstype;
use polars::prelude::*;
use polars_core::utils::CustomIterTools;
use pyo3::{PyAny, PyResult};

pub fn py_seq_to_list(name: &str, seq: &PyAny, dtype: &PyAny) -> PyResult<Series> {
let str_repr = dtype.str().unwrap().to_str().unwrap();
let dtype = str_to_polarstype(str_repr);

let (seq, len) = get_pyseq(seq)?;
let s = match dtype {
DataType::Int64 => {
let mut builder = ListPrimitiveChunkedBuilder::<Int64Type>::new(name, len, len * 5);
for sub_seq in seq.iter()? {
let sub_seq = sub_seq?;
let (sub_seq, len) = get_pyseq(sub_seq)?;
let iter = sub_seq
.iter()?
.map(|v| {
let v = v.unwrap();
if v.is_none() {
None
} else {
Some(v.extract::<i64>().unwrap())
}
})
.trust_my_length(len);
builder.append_iter(iter)
}
builder.finish().into_series()
}
DataType::Float64 => {
let mut builder = ListPrimitiveChunkedBuilder::<Float64Type>::new(name, len, len * 5);
for sub_seq in seq.iter()? {
let sub_seq = sub_seq?;
let (sub_seq, len) = get_pyseq(sub_seq)?;
let iter = sub_seq
.iter()?
.map(|v| {
let v = v.unwrap();
if v.is_none() {
None
} else {
Some(v.extract::<f64>().unwrap())
}
})
.trust_my_length(len);
builder.append_iter(iter)
}
builder.finish().into_series()
}
DataType::Boolean => {
let mut builder = ListBooleanChunkedBuilder::new(name, len, len * 5);
for sub_seq in seq.iter()? {
let sub_seq = sub_seq?;
let (sub_seq, len) = get_pyseq(sub_seq)?;
let iter = sub_seq
.iter()?
.map(|v| {
let v = v.unwrap();
if v.is_none() {
None
} else {
Some(v.extract::<bool>().unwrap())
}
})
.trust_my_length(len);
builder.append_iter(iter)
}
builder.finish().into_series()
}
DataType::Utf8 => {
let mut builder = ListUtf8ChunkedBuilder::new(name, len, len * 5);
for sub_seq in seq.iter()? {
let sub_seq = sub_seq?;
let (sub_seq, len) = get_pyseq(sub_seq)?;
let iter = sub_seq
.iter()?
.map(|v| {
let v = v.unwrap();
if v.is_none() {
None
} else {
Some(v.extract::<&str>().unwrap())
}
})
.trust_my_length(len);
builder.append_iter(iter)
}
builder.finish().into_series()
}
dt => {
panic!("cannot create list array from {:?}", dt);
}
};

Ok(s)
}
29 changes: 18 additions & 11 deletions py-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@ use crate::arrow_interop::to_rust::array_to_rust;
use crate::dataframe::PyDataFrame;
use crate::datatypes::PyDataType;
use crate::error::PyPolarsEr;
use crate::list_construction::py_seq_to_list;
use crate::utils::{downsample_str_to_rule, reinterpret, str_to_polarstype};
use crate::{arrow_interop, npy::aligned_array, prelude::*};
use numpy::PyArray1;
use polars_core::utils::CustomIterTools;
use pyo3::types::{PyBytes, PyList, PyTuple};
use pyo3::{exceptions::PyRuntimeError, prelude::*, Python};
use polars_core::utils::CustomIterTools;

#[pyclass]
#[repr(transparent)]
Expand Down Expand Up @@ -215,6 +216,10 @@ impl PySeries {
Ok(series.into())
}

#[staticmethod]
pub fn new_list(name: &str, seq: &PyAny, dtype: &PyAny) -> PyResult<Self> {
py_seq_to_list(name, seq, dtype).map(|s| s.into())
}

/// Should only be called for Series with null types.
/// This will cast to floats so that `None = np.nan`
Expand All @@ -223,22 +228,24 @@ impl PySeries {
if s.bit_repr_is_large() {
let s = s.cast(&DataType::Float64).unwrap();
let ca = s.f64().unwrap();
let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|opt_v| {
match opt_v {
let np_arr = PyArray1::from_iter(
py,
ca.into_iter().map(|opt_v| match opt_v {
Some(v) => v,
None => f64::NAN
}
}));
None => f64::NAN,
}),
);
np_arr.into_py(py)
} else {
let s = s.cast(&DataType::Float32).unwrap();
let ca = s.f32().unwrap();
let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|opt_v| {
match opt_v {
let np_arr = PyArray1::from_iter(
py,
ca.into_iter().map(|opt_v| match opt_v {
Some(v) => v,
None => f32::NAN
}
}));
None => f32::NAN,
}),
);
np_arr.into_py(py)
}
}
Expand Down
18 changes: 18 additions & 0 deletions py-polars/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,3 +672,21 @@ def test_to_numpy():
a.to_numpy() == np.array([1, 2, 3])
a = pl.Series("a", [1, 2, None])
a.to_numpy() == np.array([1.0, 2.0, np.nan])


def test_from_sequences():
# test int, str, bool, flt
values = [
[[1], [None, 3]],
[["foo"], [None, "bar"]],
[[True], [None, False]],
[[1.0], [None, 3.0]],
]

for vals in values:
pl.internals.construction._PYARROW_AVAILABLE = False
a = pl.Series("a", vals)
pl.internals.construction._PYARROW_AVAILABLE = True
b = pl.Series("a", vals)
assert a.series_equal(b, null_equal=True)
assert a.to_list() == vals

0 comments on commit 60c7a28

Please sign in to comment.