improve conversion to ndarray/numpy
ritchie46 committed Jan 19, 2022
1 parent f91f0a4 commit b02bb21
Showing 7 changed files with 122 additions and 16 deletions.
3 changes: 1 addition & 2 deletions polars/polars-arrow/src/kernels/set.rs
@@ -1,7 +1,6 @@
 use crate::array::default_arrays::FromData;
 use crate::error::{PolarsError, Result};
 use crate::kernels::BinaryMaskedSliceIterator;
-use crate::prelude::PolarsArray;
 use crate::trusted_len::PushUnchecked;
 use arrow::array::*;
 use arrow::{datatypes::DataType, types::NativeType};
@@ -14,7 +13,7 @@ where
     T: NativeType,
 {
     let values = array.values();
-    if !array.has_validity() {
+    if array.null_count() == 0 {
         return array.clone();
     }
11 changes: 11 additions & 0 deletions polars/polars-core/src/chunked_array/float.rs
@@ -1,6 +1,7 @@
 use crate::prelude::*;
 use num::Float;
 use polars_arrow::kernels::float::*;
+use polars_arrow::kernels::set::set_at_nulls;
 
 impl<T> ChunkedArray<T>
 where
@@ -19,4 +20,14 @@ where
     pub fn is_infinite(&self) -> BooleanChunked {
         self.apply_kernel_cast(is_infinite)
     }
+
+    #[must_use]
+    /// Convert missing values to `NaN` values.
+    pub fn none_to_nan(&self) -> Self {
+        let chunks = self
+            .downcast_iter()
+            .map(|arr| Arc::new(set_at_nulls(arr, T::Native::nan())) as ArrayRef)
+            .collect();
+        ChunkedArray::new_from_chunks(self.name(), chunks)
+    }
 }
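A quick illustration of the new helper (not part of the commit): `none_to_nan` maps `set_at_nulls` over every chunk, so the missing slots are filled with `NaN` and the resulting column reports a null count of zero, which is exactly what the reworked `to_ndarray` below relies on. A minimal sketch, assuming the chunked array is built via its `FromIterator<Option<f64>>` impl:

use polars_core::prelude::*;

fn main() {
    // A Float64 column with one missing value.
    let ca: Float64Chunked = vec![Some(1.0), None, Some(3.0)].into_iter().collect();
    assert_eq!(ca.null_count(), 1);

    // After none_to_nan the null slot holds a NaN payload and the column
    // no longer reports any nulls, so downstream code can take fast paths.
    let filled = ca.none_to_nan();
    assert_eq!(filled.null_count(), 0);
    assert!(filled.get(1).unwrap().is_nan());
}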
70 changes: 57 additions & 13 deletions polars/polars-core/src/chunked_array/ndarray.rs
@@ -1,5 +1,6 @@
 use crate::prelude::*;
 use ndarray::prelude::*;
+use rayon::prelude::*;
 
 impl<T> ChunkedArray<T>
 where
@@ -77,6 +78,8 @@ impl DataFrame {
     /// `DataFrame` to be non-null and numeric. They will be casted to the same data type
     /// (if they aren't already).
     ///
+    /// For floating point data we implicitly convert `None` to `NaN` without failure.
+    ///
     /// ```rust
     /// use polars_core::prelude::*;
     /// let a = UInt32Chunked::new("a", &[1, 2, 3]).into_series();
@@ -97,24 +100,65 @@ impl DataFrame
     where
         N: PolarsNumericType,
     {
-        let mut ndarr = Array2::zeros(self.shape());
-        for (col_idx, series) in self.get_columns().iter().enumerate() {
-            if series.null_count() != 0 {
+        let columns = self
+            .get_columns()
+            .par_iter()
+            .map(|s| {
+                let s = s.cast(&N::get_dtype())?;
+                let s = match s.dtype() {
+                    DataType::Float32 => {
+                        let ca = s.f32().unwrap();
+                        ca.none_to_nan().into_series()
+                    }
+                    DataType::Float64 => {
+                        let ca = s.f64().unwrap();
+                        ca.none_to_nan().into_series()
+                    }
+                    _ => s,
+                };
+                Ok(s.rechunk())
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        let shape = self.shape();
+        let height = self.height();
+        let mut membuf = Vec::with_capacity(shape.0 * shape.1);
+        let ptr = membuf.as_ptr() as usize;
+
+        columns.par_iter().enumerate().map(|(col_idx, s)| {
+            if s.null_count() != 0 {
                 return Err(PolarsError::HasNullValues(
-                    "Creation of ndarray with null values is not supported.".into(),
+                    "Creation of ndarray with null values is not supported. Consider using floats and NaNs".into(),
                 ));
             }
 
-            // this is an Arc clone if already of type N
-            let series = series.cast(&N::get_dtype())?;
-            let ca = series.unpack::<N>()?;
+            let s = s.cast(&N::get_dtype())?;
+            let ca = s.unpack::<N>()?;
+            let vals = ca.cont_slice().unwrap();
 
-            ca.into_no_null_iter()
-                .enumerate()
-                .for_each(|(row_idx, val)| {
-                    ndarr[[row_idx, col_idx]] = val;
-                })
-        }
-        Ok(ndarr)
+            // Safety:
+            // we get parallel access to the vector
+            // but we make sure that we don't get aliased access by offsetting the column indices + length
+            unsafe {
+                let offset_ptr = (ptr as *mut N::Native).add(col_idx * height);
+                // Safety:
+                // this is uninitialized memory, so we must never read from this data
+                // copy_from_slice does not read
+                let buf = std::slice::from_raw_parts_mut(offset_ptr, height);
+                buf.copy_from_slice(vals)
+            }
+
+            Ok(())
+        }).collect::<Result<Vec<_>>>()?;
+
+        // Safety:
+        // we have written all data, so we can now safely set length
+        unsafe {
+            membuf.set_len(shape.0 * shape.1);
+        }
+        let ndarr = Array2::from_shape_vec((shape.1, shape.0), membuf).unwrap();
+        Ok(ndarr.reversed_axes())
     }
 }
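The core trick in the hunk above: every column is rechunked (hence contiguous), then written in parallel into a single uninitialized buffer, with each rayon task offset by `col_idx * height` so no two writes alias; `set_len` runs only after all columns are copied, and the column-major buffer is reshaped as `(cols, rows)` and transposed. A self-contained sketch of the same technique outside Polars, using plain `Vec<f64>` columns for illustration:

use ndarray::Array2;
use rayon::prelude::*;

/// Fill a rows x cols matrix from column vectors in parallel: each rayon
/// task writes one column into a disjoint window of an uninitialized buffer.
fn columns_to_array2(columns: &[Vec<f64>], height: usize) -> Array2<f64> {
    let width = columns.len();
    let mut membuf: Vec<f64> = Vec::with_capacity(width * height);
    // Smuggle the pointer across threads as usize (raw pointers are not Send).
    let ptr = membuf.as_ptr() as usize;

    columns.par_iter().enumerate().for_each(|(col_idx, vals)| {
        assert_eq!(vals.len(), height);
        // Safety: each task writes only the disjoint window starting at
        // col_idx * height, and the memory is never read before being written.
        unsafe {
            let offset_ptr = (ptr as *mut f64).add(col_idx * height);
            let buf = std::slice::from_raw_parts_mut(offset_ptr, height);
            buf.copy_from_slice(vals);
        }
    });

    // Safety: every element was written above, so the length is now valid.
    unsafe { membuf.set_len(width * height) };

    // The buffer is column-major; build a (cols, rows) array and transpose.
    Array2::from_shape_vec((width, height), membuf)
        .unwrap()
        .reversed_axes()
}

fn main() {
    let cols = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]];
    let arr = columns_to_array2(&cols, 3);
    assert_eq!(arr.shape(), &[3, 2]);
    assert_eq!(arr[[1, 1]], 5.0);
}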

1 change: 1 addition & 0 deletions py-polars/Cargo.lock


1 change: 1 addition & 0 deletions py-polars/Cargo.toml
@@ -89,6 +89,7 @@ features = [
     "json",
     "string_encoding",
     "product",
+    "ndarray",
 ]
 
 # [patch.crates-io]
8 changes: 7 additions & 1 deletion py-polars/polars/internals/frame.py
@@ -1152,7 +1152,13 @@ def to_numpy(self) -> np.ndarray:
         <class 'numpy.ndarray'>
         """
-        return np.vstack([self.to_series(i).to_numpy() for i in range(self.width)]).T
+        out = self._df.to_numpy()
+        if out is None:
+            return np.vstack(
+                [self.to_series(i).to_numpy() for i in range(self.width)]
+            ).T
+        else:
+            return out
 
     def __getstate__(self):  # type: ignore
         return self.get_columns()
44 changes: 44 additions & 0 deletions py-polars/src/dataframe.rs
@@ -1,3 +1,4 @@
+use numpy::IntoPyArray;
 use pyo3::types::{PyList, PyTuple};
 use pyo3::{exceptions::PyRuntimeError, prelude::*};
 
@@ -22,6 +23,7 @@ use crate::{
 use polars::frame::row::{rows_to_schema, Row};
 use polars_core::frame::groupby::PivotAgg;
 use polars_core::prelude::QuantileInterpolOptions;
+use polars_core::utils::get_supertype;
 
 #[pyclass]
 #[repr(transparent)]
@@ -352,6 +354,48 @@ impl PyDataFrame {
             .into_py(py)
     }
 
+    pub fn to_numpy(&self, py: Python) -> Option<PyObject> {
+        let mut st = DataType::Int8;
+        for s in self.df.iter() {
+            let dt_i = s.dtype();
+            st = get_supertype(&st, dt_i).ok()?;
+        }
+
+        match st {
+            DataType::UInt32 => self
+                .df
+                .to_ndarray::<UInt32Type>()
+                .ok()
+                .map(|arr| arr.into_pyarray(py).into_py(py)),
+            DataType::UInt64 => self
+                .df
+                .to_ndarray::<UInt64Type>()
+                .ok()
+                .map(|arr| arr.into_pyarray(py).into_py(py)),
+            DataType::Int32 => self
+                .df
+                .to_ndarray::<Int32Type>()
+                .ok()
+                .map(|arr| arr.into_pyarray(py).into_py(py)),
+            DataType::Int64 => self
+                .df
+                .to_ndarray::<Int64Type>()
+                .ok()
+                .map(|arr| arr.into_pyarray(py).into_py(py)),
+            DataType::Float32 => self
+                .df
+                .to_ndarray::<Float32Type>()
+                .ok()
+                .map(|arr| arr.into_pyarray(py).into_py(py)),
+            DataType::Float64 => self
+                .df
+                .to_ndarray::<Float64Type>()
+                .ok()
+                .map(|arr| arr.into_pyarray(py).into_py(py)),
+            _ => None,
+        }
+    }
+
     #[cfg(feature = "parquet")]
     pub fn to_parquet(&self, py_f: PyObject, compression: &str, statistics: bool) -> PyResult<()> {
         let compression = match compression {
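On the Rust side, `to_numpy` first folds `get_supertype` over the column dtypes, seeded with `Int8`, and dispatches to `to_ndarray` only when the fold lands on a supported numeric type; any failure yields `None`, which the Python wrapper above treats as a signal to fall back to the per-series `vstack` path. A small sketch of that promotion step, assuming the usual numeric promotion rules of `get_supertype`:

use polars_core::prelude::*;
use polars_core::utils::get_supertype;

fn main() -> Result<()> {
    // Mixing an integer column with a float column promotes the whole
    // frame to Float64, mirroring the dtype fold in PyDataFrame::to_numpy.
    let dtypes = [DataType::Int32, DataType::Float64];
    let mut st = DataType::Int8;
    for dt in &dtypes {
        st = get_supertype(&st, dt)?;
    }
    assert_eq!(st, DataType::Float64);
    Ok(())
}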
