Skip to content

Commit

Permalink
groupby formatting function; closes #100
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 3, 2020
1 parent fd3822a commit ccea7b4
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 24 deletions.
85 changes: 73 additions & 12 deletions polars/src/frame/group_by.rs
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
let (mut cols, agg_cols) = self.prepare_agg()?;

for agg_col in agg_cols {
let new_name = format!["{}_mean", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Mean);
let opt_agg = apply_method_all_series!(agg_col, agg_mean, &self.groups);
if let Some(mut agg) = opt_agg {
agg.rename(&new_name);
Expand Down Expand Up @@ -703,7 +703,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
let (mut cols, agg_cols) = self.prepare_agg()?;

for agg_col in agg_cols {
let new_name = format!["{}_sum", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Sum);
let opt_agg = apply_method_all_series!(agg_col, agg_sum, &self.groups);
if let Some(mut agg) = opt_agg {
agg.rename(&new_name);
Expand Down Expand Up @@ -741,7 +741,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
pub fn min(&self) -> Result<DataFrame> {
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = format!["{}_min", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Min);
let opt_agg = apply_method_all_series!(agg_col, agg_min, &self.groups);
if let Some(mut agg) = opt_agg {
agg.rename(&new_name);
Expand Down Expand Up @@ -779,7 +779,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
pub fn max(&self) -> Result<DataFrame> {
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = format!["{}_max", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Max);
let opt_agg = apply_method_all_series!(agg_col, agg_max, &self.groups);
if let Some(mut agg) = opt_agg {
agg.rename(&new_name);
Expand Down Expand Up @@ -817,7 +817,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
pub fn first(&self) -> Result<DataFrame> {
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = format!["{}_first", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::First);
let mut agg = apply_method_all_series!(agg_col, agg_first, &self.groups);
agg.rename(&new_name);
cols.push(agg);
Expand Down Expand Up @@ -853,7 +853,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
pub fn last(&self) -> Result<DataFrame> {
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = format!["{}_last", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Last);
let mut agg = apply_method_all_series!(agg_col, agg_last, &self.groups);
agg.rename(&new_name);
cols.push(agg);
Expand Down Expand Up @@ -889,7 +889,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
pub fn n_unique(&self) -> Result<DataFrame> {
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = format!["{}_n_unique", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::NUnique);
let opt_agg = apply_method_all_series!(agg_col, agg_n_unique, &self.groups);
if let Some(mut agg) = opt_agg {
agg.rename(&new_name);
Expand Down Expand Up @@ -917,7 +917,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
}
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = format!["{}_quantile_{:.2}", agg_col.name(), quantile];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Quantile(quantile));
let opt_agg = apply_method_all_series!(agg_col, agg_quantile, &self.groups, quantile);
if let Some(mut agg) = opt_agg {
agg.rename(&new_name);
Expand All @@ -940,7 +940,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
pub fn median(&self) -> Result<DataFrame> {
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = format!["{}_median", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Median);
let opt_agg = apply_method_all_series!(agg_col, agg_median, &self.groups);
if let Some(mut agg) = opt_agg {
agg.rename(&new_name);
Expand Down Expand Up @@ -978,7 +978,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
pub fn count(&self) -> Result<DataFrame> {
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = format!["{}_count", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Count);
let mut builder = PrimitiveChunkedBuilder::new(&new_name, self.groups.len());
for (_first, idx) in &self.groups {
builder.append_value(idx.len() as u32);
Expand All @@ -991,17 +991,43 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
}

/// Get the groupby group indexes.
///
/// # Example
///
/// ```rust
/// # use polars::prelude::*;
/// fn example(df: DataFrame) -> Result<DataFrame> {
/// df.groupby("date")?.groups()
/// }
/// ```
/// Returns:
///
/// ```text
/// +--------------+------------+
/// | date | groups |
/// | --- | --- |
/// | date32(days) | list [u32] |
/// +==============+============+
/// | 2020-08-23 | "[3]" |
/// +--------------+------------+
/// | 2020-08-22 | "[2, 4]" |
/// +--------------+------------+
/// | 2020-08-21 | "[0, 1]" |
/// +--------------+------------+
/// ```
pub fn groups(&self) -> Result<DataFrame> {
let mut cols = self.keys();

let column: LargeListChunked = self
let mut column: LargeListChunked = self
.groups
.iter()
.map(|(_first, idx)| {
let ca: Xob<UInt32Chunked> = idx.into_iter().map(|&v| v as u32).collect();
ca.into_inner().into_series()
})
.collect();
let new_name = fmt_groupby_column("", GroupByMethod::Groups);
column.rename(&new_name);
cols.push(column.into_series());
cols.shrink_to_fit();
DataFrame::new(cols)
Expand Down Expand Up @@ -1172,7 +1198,7 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {

let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = format!["{}_agg_list", agg_col.name()];
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::List);
let mut agg =
match_arrow_data_type_apply_macro!(agg_col.dtype(), impl_gb, impl_gb_utf8, agg_col);
agg.rename(&new_name);
Expand Down Expand Up @@ -1263,6 +1289,40 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
}
}

/// Tag identifying which group-by operation produced a column.
///
/// It is consumed by [`fmt_groupby_column`] to pick the name of the
/// resulting column.
pub(crate) enum GroupByMethod {
    /// Yields the `_min` suffix.
    Min,
    /// Yields the `_max` suffix.
    Max,
    /// Yields the `_median` suffix.
    Median,
    /// Yields the `_mean` suffix.
    Mean,
    /// Yields the `_first` suffix.
    First,
    /// Yields the `_last` suffix.
    Last,
    /// Yields the `_sum` suffix.
    Sum,
    /// Yields the fixed column name `groups`.
    Groups,
    /// Yields the `_n_unique` suffix.
    NUnique,
    /// Yields the `_quantile_<q>` suffix, where `<q>` is the carried value
    /// printed with two decimals.
    Quantile(f64),
    /// Yields the `_count` suffix.
    Count,
    /// Yields the `_agg_list` suffix.
    List,
}

/// Build the output column name for a group-by result.
///
/// Shared by the eager and the lazy code paths so both name grouped columns
/// the same way.
///
/// * `name` - name of the column the operation was applied to.
/// * `method` - the operation; selects the suffix appended to `name`.
///
/// Returns `"<name>_<suffix>"` for every method except
/// [`GroupByMethod::Groups`], which ignores `name` and always returns
/// `"groups"`.
pub(crate) fn fmt_groupby_column(name: &str, method: GroupByMethod) -> String {
    match method {
        // The only variant whose result does not depend on `name`.
        GroupByMethod::Groups => String::from("groups"),
        // The only variant carrying data; the quantile is rendered with two decimals.
        GroupByMethod::Quantile(q) => format!("{}_quantile_{:.2}", name, q),
        GroupByMethod::Min => format!("{}_min", name),
        GroupByMethod::Max => format!("{}_max", name),
        GroupByMethod::Median => format!("{}_median", name),
        GroupByMethod::Mean => format!("{}_mean", name),
        GroupByMethod::Sum => format!("{}_sum", name),
        GroupByMethod::First => format!("{}_first", name),
        GroupByMethod::Last => format!("{}_last", name),
        GroupByMethod::NUnique => format!("{}_n_unique", name),
        GroupByMethod::Count => format!("{}_count", name),
        GroupByMethod::List => format!("{}_agg_list", name),
    }
}

/// Intermediate structure when a `pivot` operation is applied.
/// See [the pivot method for more information.](../group_by/struct.GroupBy.html#method.pivot)
pub struct Pivot<'df, 'selection_str> {
Expand Down Expand Up @@ -1660,6 +1720,7 @@ mod test {
.agg(&[("temp", &["n_unique", "sum", "min"])])
.unwrap()
);
println!("{:?}", df.groupby("date").unwrap().groups().unwrap());
}

#[test]
Expand Down
83 changes: 71 additions & 12 deletions polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::lazy::utils::get_supertype;
use crate::frame::group_by::{fmt_groupby_column, GroupByMethod};
use crate::lazy::utils::{get_supertype, rename_field};
use crate::{lazy::prelude::*, prelude::*};
use arrow::datatypes::{Field, Schema};
use std::fmt;
Expand Down Expand Up @@ -26,12 +27,24 @@ pub enum Expr {
expr: Box<Expr>,
reverse: bool,
},
AggMin(Box<Expr>), // ScalarFunction {
// name: String,
// args: Vec<Expr>,
// return_type: ArrowDataType,
// },
// Wildcard
AggMin(Box<Expr>),
AggMax(Box<Expr>),
AggMedian(Box<Expr>),
AggNUnique(Box<Expr>),
AggFirst(Box<Expr>),
AggLast(Box<Expr>),
AggMean(Box<Expr>),
AggQuantile {
expr: Box<Expr>,
quantile: f64,
},
AggSum(Box<Expr>),
AggGroups(Box<Expr>), // ScalarFunction {
// name: String,
// args: Vec<Expr>,
// return_type: ArrowDataType,
// },
// Wildcard
}

impl Expr {
Expand Down Expand Up @@ -90,11 +103,53 @@ impl Expr {
Sort { expr, .. } => expr.to_field(schema),
AggMin(expr) => {
let field = expr.to_field(schema)?;
Ok(Field::new(
&format!("{}_min", field.name()),
field.data_type().clone(),
field.is_nullable(),
))
let new_name = fmt_groupby_column(field.name(), GroupByMethod::Min);
Ok(rename_field(&field, &new_name))
}
AggMax(expr) => {
let field = expr.to_field(schema)?;
let new_name = fmt_groupby_column(field.name(), GroupByMethod::Max);
Ok(rename_field(&field, &new_name))
}
AggMedian(expr) => {
let field = expr.to_field(schema)?;
let new_name = fmt_groupby_column(field.name(), GroupByMethod::Median);
Ok(rename_field(&field, &new_name))
}
AggMean(expr) => {
let field = expr.to_field(schema)?;
let new_name = fmt_groupby_column(field.name(), GroupByMethod::Mean);
Ok(rename_field(&field, &new_name))
}
AggFirst(expr) => {
let field = expr.to_field(schema)?;
let new_name = fmt_groupby_column(field.name(), GroupByMethod::First);
Ok(rename_field(&field, &new_name))
}
AggLast(expr) => {
let field = expr.to_field(schema)?;
let new_name = fmt_groupby_column(field.name(), GroupByMethod::Last);
Ok(rename_field(&field, &new_name))
}
AggNUnique(expr) => {
let field = expr.to_field(schema)?;
let new_name = fmt_groupby_column(field.name(), GroupByMethod::NUnique);
Ok(rename_field(&field, &new_name))
}
AggSum(expr) => {
let field = expr.to_field(schema)?;
let new_name = fmt_groupby_column(field.name(), GroupByMethod::Sum);
Ok(rename_field(&field, &new_name))
}
AggGroups(expr) => {
let field = expr.to_field(schema)?;
let new_name = fmt_groupby_column(field.name(), GroupByMethod::Groups);
Ok(rename_field(&field, &new_name))
}
AggQuantile { expr, quantile } => {
let field = expr.to_field(schema)?;
let new_name = fmt_groupby_column(field.name(), GroupByMethod::Quantile(*quantile));
Ok(rename_field(&field, &new_name))
}
}
}
Expand Down Expand Up @@ -191,6 +246,7 @@ pub fn col(name: &str) -> Expr {
}

/// Implemented by values that can be turned into an [`Expr`] via [`Literal::lit`].
pub trait Literal {
/// Consume `self` and return it as a [Literal](Expr::Literal) expression.
fn lit(self) -> Expr;
}

Expand Down Expand Up @@ -233,14 +289,17 @@ pub fn lit<L: Literal>(t: L) -> Expr {
t.lit()
}

/// Wrap `expr` in a [Not](Expr::Not) expression node.
///
/// The operand is moved onto the heap because the variant stores a boxed
/// child expression.
pub fn not(expr: Expr) -> Expr {
    let operand = Box::new(expr);
    Expr::Not(operand)
}

/// Wrap `expr` in an [IsNull](Expr::IsNull) expression node.
///
/// The operand is moved onto the heap because the variant stores a boxed
/// child expression.
pub fn is_null(expr: Expr) -> Expr {
    Expr::IsNull(Box::new(expr))
}

/// Wrap `expr` in an [IsNotNull](Expr::IsNotNull) expression node.
///
/// The operand is moved onto the heap because the variant stores a boxed
/// child expression.
pub fn is_not_null(expr: Expr) -> Expr {
    let operand = Box::new(expr);
    Expr::IsNotNull(operand)
}
4 changes: 4 additions & 0 deletions polars/src/lazy/utils.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
use crate::{lazy::prelude::*, prelude::*};
use std::rc::Rc;

/// Create a new `Field` named `name` that keeps the data type and
/// nullability of `field`.
///
/// `field` itself is left untouched; its data type is cloned into the result.
pub(crate) fn rename_field(field: &Field, name: &str) -> Field {
    let data_type = field.data_type().clone();
    let nullable = field.is_nullable();
    Field::new(name, data_type, nullable)
}

// unpack alias(col) to name of the root column
pub(crate) fn expr_to_root_column(expr: &Expr) -> Result<Rc<String>> {
match expr {
Expand Down

0 comments on commit ccea7b4

Please sign in to comment.