Skip to content

Commit

Permalink
use chunkset in may_apply on DataFrame docs
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Aug 30, 2020
1 parent d0be04c commit da80534
Show file tree
Hide file tree
Showing 9 changed files with 488 additions and 343 deletions.
363 changes: 186 additions & 177 deletions examples/10_minutes_to_polars.ipynb

Large diffs are not rendered by default.

5 changes: 0 additions & 5 deletions polars/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,11 +413,6 @@ where
}
}

/// Wrap as Series
pub fn into_series(self) -> Series {
Series::from_chunked_array(self)
}

/// Downcast generic `ChunkedArray<T>` to u8.
pub fn u8(self) -> Result<UInt8Chunked> {
match T::get_data_type() {
Expand Down
13 changes: 6 additions & 7 deletions polars/src/chunked_array/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use std::ops::{Add, Div};
/// Create a `ChunkedArray` with new values by index or by boolean mask.
/// Note that these operations clone data. This is however the only way we can modify at mask or
/// index level as the underlying Arrow arrays are immutable.
pub trait ChunkSet<'a, V> {
pub trait ChunkSet<'a, A, B> {
/// Set the values at indexes `idx` to some optional value `Option<T>`.
///
/// # Example
Expand All @@ -24,7 +24,7 @@ pub trait ChunkSet<'a, V> {
///
/// assert_eq!(Vec::from(&new), &[Some(10), Some(10), Some(3)]);
/// ```
fn set_at_idx<T: AsTakeIndex>(&'a self, idx: &T, opt_value: Option<V>) -> Result<Self>
fn set_at_idx<T: AsTakeIndex>(&'a self, idx: &T, opt_value: Option<A>) -> Result<Self>
where
Self: Sized;

Expand All @@ -42,7 +42,7 @@ pub trait ChunkSet<'a, V> {
fn set_at_idx_with<T: AsTakeIndex, F>(&'a self, idx: &T, f: F) -> Result<Self>
where
Self: Sized,
F: Fn(Option<V>) -> Option<V>;
F: Fn(Option<A>) -> Option<B>;
/// Set the values where the mask evaluates to `true` to some optional value `Option<T>`.
///
/// # Example
Expand All @@ -54,7 +54,7 @@ pub trait ChunkSet<'a, V> {
/// let new = ca.set(&mask, Some(5)).unwrap();
/// assert_eq!(Vec::from(&new), &[Some(1), Some(5), Some(3)]);
/// ```
fn set(&'a self, mask: &BooleanChunked, opt_value: Option<V>) -> Result<Self>
fn set(&'a self, mask: &BooleanChunked, opt_value: Option<A>) -> Result<Self>
where
Self: Sized;

Expand All @@ -74,7 +74,7 @@ pub trait ChunkSet<'a, V> {
fn set_with<F>(&'a self, mask: &BooleanChunked, f: F) -> Result<Self>
where
Self: Sized,
F: Fn(Option<V>) -> Option<V>;
F: Fn(Option<A>) -> Option<B>;
}

/// Cast `ChunkedArray<T>` to `ChunkedArray<N>`
Expand Down Expand Up @@ -121,8 +121,7 @@ pub trait ChunkAgg<T> {
/// use polars::prelude::*;
/// fn filter_all_ones(df: &DataFrame) -> Result<DataFrame> {
/// let mask = df
/// .column("column_a")
/// .ok_or(PolarsError::NotFound)?
/// .column("column_a")?
/// .eq(1);
///
/// df.filter(&mask)
Expand Down
241 changes: 147 additions & 94 deletions polars/src/chunked_array/set.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,56 @@
use crate::prelude::*;

impl<'a, T> ChunkSet<'a, T::Native> for ChunkedArray<T>
/// Builds a new array in which the values at the positions yielded by
/// `$idx.as_take_iter()` are replaced by `$f(old_value)`; every other value is
/// copied through unchanged.
///
/// Arguments (all identifiers in scope at the call site):
/// * `$self`    - the array; must provide `len()` and `into_iter()`.
/// * `$builder` - a mutable builder providing `append_option` and `finish`.
/// * `$idx`     - the index source; must provide `as_take_iter()`.
/// * `$f`       - callable mapping the old optional value to the new one.
///
/// The macro expands to a block evaluating to `Ok(<finished builder>)` and
/// performs an early `return Err(PolarsError::OutOfBounds)` from the enclosing
/// function when an index is `>= $self.len()`.
///
/// The array is walked in a single forward pass, so indexes only take effect
/// when they are yielded in strictly ascending order.
macro_rules! impl_set_at_idx_with {
    ($self:ident, $builder:ident, $idx:ident, $f:ident) => {{
        let len = $self.len();
        let mut ca_iter = $self.into_iter().enumerate();

        for current_idx in $idx.as_take_iter() {
            // Valid positions are 0..len, so `len` itself is already out of bounds.
            if current_idx >= len {
                return Err(PolarsError::OutOfBounds);
            }
            // Copy values through until the target position is reached, then
            // replace it. `by_ref` keeps the iterator alive for the next index.
            for (cnt_idx, opt_val) in ca_iter.by_ref() {
                if cnt_idx == current_idx {
                    $builder.append_option($f(opt_val));
                    break;
                }
                $builder.append_option(opt_val);
            }
        }
        // The last index is usually not the last position: copy the remainder.
        for (_, opt_val) in ca_iter {
            $builder.append_option(opt_val);
        }

        Ok($builder.finish())
    }};
}

/// Guard used by the mask-based setters: compares `$self.len()` with
/// `$mask.len()` and makes the enclosing function return
/// `Err(PolarsError::ShapeMisMatch)` when they differ.
///
/// Expands to a block of type `()`, so it is meant to be used as a statement.
macro_rules! check_bounds {
    ($self:ident, $mask:ident) => {{
        let (n_values, n_mask) = ($self.len(), $mask.len());
        if n_values != n_mask {
            return Err(PolarsError::ShapeMisMatch);
        }
    }};
}

/// Pairs every value of `$self` with the corresponding entry of `$mask` and
/// collects the result: where the mask entry is `Some(true)` the value is
/// replaced by `$f(value)`, otherwise (mask `Some(false)` or `None`) the value
/// is kept as is.
///
/// The collection type is inferred from the call site.
macro_rules! impl_set_with {
    ($self:ident, $mask:ident, $f:ident) => {{
        let masked_values = $self.into_iter().zip($mask);
        masked_values
            .map(|(opt_val, opt_mask)| match opt_mask {
                Some(true) => $f(opt_val),
                Some(false) | None => opt_val,
            })
            .collect()
    }};
}

impl<'a, T> ChunkSet<'a, T::Native, T::Native> for ChunkedArray<T>
where
T: PolarsNumericType,
T::Native: Copy,
Expand All @@ -9,6 +59,15 @@ where
self.set_at_idx_with(idx, |_| value)
}

fn set_at_idx_with<I: AsTakeIndex, F>(&'a self, idx: &I, f: F) -> Result<Self>
where
F: Fn(Option<T::Native>) -> Option<T::Native>,
{
// TODO: implement fast path
let mut builder = PrimitiveChunkedBuilder::<T>::new(self.name(), self.len());
impl_set_at_idx_with!(self, builder, idx, f)
}

fn set(&'a self, mask: &BooleanChunked, value: Option<T::Native>) -> Result<Self> {
self.set_with(mask, |_| value)
}
Expand All @@ -17,10 +76,7 @@ where
where
F: Fn(Option<T::Native>) -> Option<T::Native>,
{
if self.len() != mask.len() {
return Err(PolarsError::ShapeMisMatch);
}

check_bounds!(self, mask);
// TODO: could make faster by also checking the mask for a fast path.
let mut ca: ChunkedArray<T> = match self.cont_slice() {
// fast path
Expand All @@ -34,127 +90,124 @@ where
})
.collect(),
// slower path
Err(_) => self
.into_iter()
.zip(mask)
.map(|(opt_val, opt_mask)| match opt_mask {
None => opt_val,
Some(true) => f(opt_val),
Some(false) => f(opt_val),
})
.collect(),
Err(_) => impl_set_with!(self, mask, f),
};

ca.rename(self.name());
Ok(ca)
}
}

fn set_at_idx_with<I: AsTakeIndex, F>(&'a self, idx: &I, f: F) -> Result<Self>
/// `ChunkSet` for boolean arrays: both the values read and the values written
/// are plain `bool`s.
impl<'a> ChunkSet<'a, bool, bool> for BooleanChunked {
    /// Returns a new array in which every position yielded by `idx` holds
    /// `opt_value`.
    ///
    /// Delegates to [`set_at_idx_with`](Self::set_at_idx_with) with a closure
    /// that ignores the previous value.
    fn set_at_idx<T: AsTakeIndex>(&'a self, idx: &T, opt_value: Option<bool>) -> Result<Self>
    where
        Self: Sized,
    {
        self.set_at_idx_with(idx, |_| opt_value)
    }

    /// Returns a new array in which every position yielded by `idx` holds
    /// `f(previous_value)`.
    ///
    /// The traversal and the bounds check live in `impl_set_at_idx_with!`,
    /// which makes this function return `PolarsError::OutOfBounds` for an
    /// invalid index.
    fn set_at_idx_with<T: AsTakeIndex, F>(&'a self, idx: &T, f: F) -> Result<Self>
    where
        Self: Sized,
        F: Fn(Option<bool>) -> Option<bool>,
    {
        let mut builder = BooleanChunkedBuilder::new(self.name(), self.len());
        impl_set_at_idx_with!(self, builder, idx, f)
    }

    /// Returns a new array in which every position where `mask` is `true`
    /// holds `opt_value`.
    ///
    /// Delegates to [`set_with`](Self::set_with) with a closure that ignores
    /// the previous value.
    fn set(&'a self, mask: &BooleanChunked, opt_value: Option<bool>) -> Result<Self>
    where
        Self: Sized,
    {
        self.set_with(mask, |_| opt_value)
    }

    /// Returns a new array in which every position where `mask` is `true`
    /// holds `f(previous_value)`; positions where the mask is `false` or null
    /// keep their value.
    ///
    /// Returns `PolarsError::ShapeMisMatch` (via `check_bounds!`) when `mask`
    /// and `self` differ in length. The result is renamed to `self.name()`.
    fn set_with<F>(&'a self, mask: &BooleanChunked, f: F) -> Result<Self>
    where
        Self: Sized,
        F: Fn(Option<bool>) -> Option<bool>,
    {
        check_bounds!(self, mask);
        let mut ca: BooleanChunked = impl_set_with!(self, mask, f);
        ca.rename(self.name());
        Ok(ca)
    }
}

impl<'a> ChunkSet<'a, &'a str, String> for Utf8Chunked {
fn set_at_idx<T: AsTakeIndex>(&'a self, idx: &T, opt_value: Option<&'a str>) -> Result<Self>
where
Self: Sized,
{
// TODO: implement fast path
let mut idx_iter = idx.as_take_iter();
let mut ca_iter = self.into_iter().enumerate();

let mut builder = PrimitiveChunkedBuilder::<T>::new(self.name(), self.len());
let mut builder = Utf8ChunkedBuilder::new(self.name(), self.len());

while let Some(current_idx) = idx_iter.next() {
if current_idx > self.len() {
return Err(PolarsError::OutOfBounds);
}
while let Some((cnt_idx, opt_val)) = ca_iter.next() {
while let Some((cnt_idx, opt_val_self)) = ca_iter.next() {
if cnt_idx == current_idx {
builder.append_option(f(opt_val));
builder.append_option(opt_value);
break;
} else {
builder.append_option(opt_val);
builder.append_option(opt_val_self);
}
}
}
// the last idx is probably not the last value so we finish the iterator
while let Some((_, opt_val)) = ca_iter.next() {
builder.append_option(opt_val);
while let Some((_, opt_val_self)) = ca_iter.next() {
builder.append_option(opt_val_self);
}

let ca = builder.finish();
Ok(ca)
}
}

macro_rules! impl_chunkset {
($value_type:ty, $ca_type:ident, $builder:ident) => {
impl<'a> ChunkSet<'a, $value_type> for $ca_type {
fn set_with<F>(&'a self, mask: &BooleanChunked, f: F) -> Result<Self>
where
F: Fn(Option<$value_type>) -> Option<$value_type>,
{
if self.len() != mask.len() {
return Err(PolarsError::ShapeMisMatch);
}

let mut ca: $ca_type = self
.into_iter()
.zip(mask)
.map(|(opt_val, opt_mask)| match opt_mask {
None => opt_val,
Some(true) => f(opt_val),
Some(false) => opt_val,
})
.collect();

ca.rename(self.name());
Ok(ca)
}
fn set_at_idx_with<I: AsTakeIndex, F>(&'a self, idx: &I, f: F) -> Result<Self>
where
F: Fn(Option<$value_type>) -> Option<$value_type>,
{
let mut idx_iter = idx.as_take_iter();
let mut ca_iter = self.into_iter().enumerate();

let mut builder = $builder::new(self.name(), self.len());

while let Some(current_idx) = idx_iter.next() {
if current_idx > self.len() {
return Err(PolarsError::OutOfBounds);
}
while let Some((cnt_idx, opt_val)) = ca_iter.next() {
if cnt_idx == current_idx {
builder.append_option(f(opt_val));
break;
} else {
builder.append_option(opt_val);
}
}
}
// the last idx is probably not the last value so we finish the iterator
while let Some((_, opt_val)) = ca_iter.next() {
builder.append_option(opt_val);
}

let ca = builder.finish();
Ok(ca)
}
fn set_at_idx_with<T: AsTakeIndex, F>(&'a self, idx: &T, f: F) -> Result<Self>
where
Self: Sized,
F: Fn(Option<&'a str>) -> Option<String>,
{
let mut builder = Utf8ChunkedBuilder::new(self.name(), self.len());
impl_set_at_idx_with!(self, builder, idx, f)
}

fn set_at_idx<I: AsTakeIndex>(
&'a self,
idx: &I,
value: Option<$value_type>,
) -> Result<Self> {
self.set_at_idx_with(idx, |_| value)
}
fn set(&'a self, mask: &BooleanChunked, opt_value: Option<&'a str>) -> Result<Self>
where
Self: Sized,
{
check_bounds!(self, mask);
let mut builder = Utf8ChunkedBuilder::new(self.name(), self.len());
self.into_iter()
.zip(mask)
.for_each(|(opt_val_self, opt_mask)| match opt_mask {
None => builder.append_option(opt_val_self),
Some(true) => builder.append_option(opt_value),
Some(false) => builder.append_option(opt_val_self),
});
Ok(builder.finish())
}

fn set(&'a self, mask: &BooleanChunked, value: Option<$value_type>) -> Result<Self> {
self.set_with(mask, |_| value)
}
}
};
fn set_with<F>(&'a self, mask: &BooleanChunked, f: F) -> Result<Self>
where
Self: Sized,
F: Fn(Option<&'a str>) -> Option<String>,
{
check_bounds!(self, mask);
let mut builder = Utf8ChunkedBuilder::new(self.name(), self.len());
self.into_iter()
.zip(mask)
.for_each(|(opt_val, opt_mask)| match opt_mask {
None => builder.append_option(opt_val),
Some(true) => builder.append_option(f(opt_val)),
Some(false) => builder.append_option(opt_val),
});
Ok(builder.finish())
}
}

impl_chunkset!(&'a str, Utf8Chunked, Utf8ChunkedBuilder);
impl_chunkset!(bool, BooleanChunked, BooleanChunkedBuilder);

#[cfg(test)]
mod test {
use crate::prelude::*;
Expand Down
6 changes: 6 additions & 0 deletions polars/src/doc/changelog/v0_5.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
//! # Changelog v0.5
//!
//! * `DataFrame.column` returns `Result<_>` **breaking change**.
//! * Define an idiomatic way to do in-place operations on a `DataFrame` with `apply`, `may_apply` and `ChunkSet`.
//! * `ChunkSet` Trait.
//!
9 changes: 3 additions & 6 deletions polars/src/frame/group_by.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,8 @@ impl DataFrame {
/// }
/// ```
pub fn groupby(&self, by: &str) -> Result<GroupBy> {
let groups = if let Some(s) = self.column(by) {
s.group_tuples()
} else {
return Err(PolarsError::NotFound);
};
let s = self.column(by)?;
let groups = s.group_tuples();

Ok(GroupBy {
df: self,
Expand Down Expand Up @@ -309,7 +306,7 @@ impl<'a> GroupBy<'a> {
};

let keys = self.keys();
let agg_col = self.df.column(name).ok_or(PolarsError::NotFound)?;
let agg_col = self.df.column(name)?;
Ok((name, keys, agg_col))
}

Expand Down

0 comments on commit da80534

Please sign in to comment.