Skip to content

Commit

Permalink
add lower and higher interpolation to quantiles (#2001)
Browse files Browse the repository at this point in the history
  • Loading branch information
marcvanheerden committed Dec 13, 2021
1 parent ac0cefa commit fb0b9f6
Show file tree
Hide file tree
Showing 33 changed files with 869 additions and 119 deletions.
542 changes: 520 additions & 22 deletions polars/polars-core/src/chunked_array/ops/aggregate.rs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion polars/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ pub trait ChunkAgg<T> {

/// Aggregate a given quantile of the ChunkedArray.
/// Returns `None` if the array is empty or only contains null values.
fn quantile(&self, _quantile: f64) -> Result<Option<T>> {
fn quantile(&self, _quantile: f64, _interpol: QuantileInterpolOptions) -> Result<Option<T>> {
Ok(None)
}
}
Expand Down
16 changes: 13 additions & 3 deletions polars/polars-core/src/frame/groupby/aggregations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,12 @@ impl<T: PolarsObject> AggList for ObjectChunked<T> {
}

pub(crate) trait AggQuantile {
fn agg_quantile(&self, _groups: &[(u32, Vec<u32>)], _quantile: f64) -> Option<Series> {
fn agg_quantile(
&self,
_groups: &[(u32, Vec<u32>)],
_quantile: f64,
_interpol: QuantileInterpolOptions,
) -> Option<Series> {
None
}

Expand All @@ -699,14 +704,19 @@ where
+ arrow::compute::aggregate::SimdOrd<T::Native>,
ChunkedArray<T>: IntoSeries,
{
fn agg_quantile(&self, groups: &[(u32, Vec<u32>)], quantile: f64) -> Option<Series> {
fn agg_quantile(
&self,
groups: &[(u32, Vec<u32>)],
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Option<Series> {
agg_helper::<T, _>(groups, |(_first, idx)| {
if idx.is_empty() {
return None;
}

let group_vals = unsafe { self.take_unchecked(idx.iter().map(|i| *i as usize).into()) };
group_vals.quantile(quantile).unwrap()
group_vals.quantile(quantile, interpol).unwrap()
})
}

Expand Down
17 changes: 9 additions & 8 deletions polars/polars-core/src/frame/groupby/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -848,19 +848,20 @@ impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
/// ```rust
/// # use polars_core::prelude::*;
/// fn example(df: DataFrame) -> Result<DataFrame> {
/// df.groupby("date")?.select("temp").quantile(0.2)
/// }
/// df.groupby("date")?.select("temp").quantile(0.2, QuantileInterpolOptions::default())
/// } //TODO: update this
/// ```
pub fn quantile(&self, quantile: f64) -> Result<DataFrame> {
pub fn quantile(&self, quantile: f64, interpol: QuantileInterpolOptions) -> Result<DataFrame> {
if !(0.0..=1.0).contains(&quantile) {
return Err(PolarsError::ComputeError(
"quantile should be within 0.0 and 1.0".into(),
));
}
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Quantile(quantile));
let opt_agg = agg_col.agg_quantile(&self.groups, quantile);
let new_name =
fmt_groupby_column(agg_col.name(), GroupByMethod::Quantile(quantile, interpol));
let opt_agg = agg_col.agg_quantile(&self.groups, quantile, interpol);
if let Some(mut agg) = opt_agg {
agg.rename(&new_name);
cols.push(agg.into_series());
Expand Down Expand Up @@ -1214,7 +1215,7 @@ pub enum GroupByMethod {
Sum,
Groups,
NUnique,
Quantile(f64),
Quantile(f64, QuantileInterpolOptions),
Count,
List,
Std,
Expand All @@ -1236,7 +1237,7 @@ pub fn fmt_groupby_column(name: &str, method: GroupByMethod) -> String {
NUnique => format!("{}_n_unique", name),
Count => format!("{}_count", name),
List => format!("{}_agg_list", name),
Quantile(quantile) => format!("{}_quantile_{:.2}", name, quantile),
Quantile(quantile, _interpol) => format!("{}_quantile_{:.2}", name, quantile),
Std => format!("{}_agg_std", name),
Var => format!("{}_agg_var", name),
}
Expand Down Expand Up @@ -1335,7 +1336,7 @@ mod test {
df.groupby("date")
.unwrap()
.select("temp")
.quantile(0.2)
.quantile(0.2, QuantileInterpolOptions::default())
.unwrap()
);
println!(
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2257,11 +2257,11 @@ impl DataFrame {
}

/// Aggregate the columns to their quantile values.
pub fn quantile(&self, quantile: f64) -> Result<Self> {
pub fn quantile(&self, quantile: f64, interpol: QuantileInterpolOptions) -> Result<Self> {
let columns = POOL.install(|| {
self.columns
.par_iter()
.map(|s| s.quantile_as_series(quantile))
.map(|s| s.quantile_as_series(quantile, interpol))
.collect::<Result<Vec<_>>>()
})?;
Ok(DataFrame::new_no_checks(columns))
Expand Down
17 changes: 13 additions & 4 deletions polars/polars-core/src/series/implementations/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,13 @@ impl private::PrivateSeries for SeriesWrap<BooleanChunked> {
self.0.agg_list(groups)
}

fn agg_quantile(&self, groups: &[(u32, Vec<u32>)], quantile: f64) -> Option<Series> {
self.0.agg_quantile(groups, quantile)
fn agg_quantile(
&self,
groups: &[(u32, Vec<u32>)],
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Option<Series> {
self.0.agg_quantile(groups, quantile, interpol)
}

fn agg_median(&self, groups: &[(u32, Vec<u32>)]) -> Option<Series> {
Expand Down Expand Up @@ -400,8 +405,12 @@ impl SeriesTrait for SeriesWrap<BooleanChunked> {
fn std_as_series(&self) -> Series {
VarAggSeries::std_as_series(&self.0)
}
fn quantile_as_series(&self, quantile: f64) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile)
fn quantile_as_series(
&self,
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile, interpol)
}

fn fmt_list(&self) -> String {
Expand Down
6 changes: 5 additions & 1 deletion polars/polars-core/src/series/implementations/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,11 @@ impl SeriesTrait for SeriesWrap<CategoricalChunked> {
fn std_as_series(&self) -> Series {
CategoricalChunked::full_null(self.name(), 1).into_series()
}
fn quantile_as_series(&self, _quantile: f64) -> Result<Series> {
fn quantile_as_series(
&self,
_quantile: f64,
_interpol: QuantileInterpolOptions,
) -> Result<Series> {
Ok(CategoricalChunked::full_null(self.name(), 1).into_series())
}

Expand Down
15 changes: 12 additions & 3 deletions polars/polars-core/src/series/implementations/dates_time.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,14 @@ macro_rules! impl_dyn_series {
})
}

fn agg_quantile(&self, groups: &[(u32, Vec<u32>)], quantile: f64) -> Option<Series> {
fn agg_quantile(
&self,
groups: &[(u32, Vec<u32>)],
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Option<Series> {
self.0
.agg_quantile(groups, quantile)
.agg_quantile(groups, quantile, interpol)
.map(|s| s.$into_logical().into_series())
}

Expand Down Expand Up @@ -573,7 +578,11 @@ macro_rules! impl_dyn_series {
.unwrap()
.into()
}
fn quantile_as_series(&self, _quantile: f64) -> Result<Series> {
fn quantile_as_series(
&self,
_quantile: f64,
_interpol: QuantileInterpolOptions,
) -> Result<Series> {
Ok(Int32Chunked::full_null(self.name(), 1)
.cast(self.dtype())
.unwrap()
Expand Down
17 changes: 13 additions & 4 deletions polars/polars-core/src/series/implementations/floats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,13 @@ macro_rules! impl_dyn_series {
self.0.agg_list(groups)
}

fn agg_quantile(&self, groups: &[(u32, Vec<u32>)], quantile: f64) -> Option<Series> {
self.0.agg_quantile(groups, quantile)
fn agg_quantile(
&self,
groups: &[(u32, Vec<u32>)],
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Option<Series> {
self.0.agg_quantile(groups, quantile, interpol)
}

fn agg_median(&self, groups: &[(u32, Vec<u32>)]) -> Option<Series> {
Expand Down Expand Up @@ -540,8 +545,12 @@ macro_rules! impl_dyn_series {
fn std_as_series(&self) -> Series {
VarAggSeries::std_as_series(&self.0)
}
fn quantile_as_series(&self, quantile: f64) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile)
fn quantile_as_series(
&self,
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile, interpol)
}

fn fmt_list(&self) -> String {
Expand Down
8 changes: 6 additions & 2 deletions polars/polars-core/src/series/implementations/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,8 +293,12 @@ impl SeriesTrait for SeriesWrap<ListChunked> {
fn std_as_series(&self) -> Series {
VarAggSeries::std_as_series(&self.0)
}
fn quantile_as_series(&self, quantile: f64) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile)
fn quantile_as_series(
&self,
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile, interpol)
}

fn fmt_list(&self) -> String {
Expand Down
17 changes: 13 additions & 4 deletions polars/polars-core/src/series/implementations/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,13 @@ macro_rules! impl_dyn_series {
self.0.agg_list(groups)
}

fn agg_quantile(&self, groups: &[(u32, Vec<u32>)], quantile: f64) -> Option<Series> {
self.0.agg_quantile(groups, quantile)
fn agg_quantile(
&self,
groups: &[(u32, Vec<u32>)],
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Option<Series> {
self.0.agg_quantile(groups, quantile, interpol)
}

fn agg_median(&self, groups: &[(u32, Vec<u32>)]) -> Option<Series> {
Expand Down Expand Up @@ -705,8 +710,12 @@ macro_rules! impl_dyn_series {
fn std_as_series(&self) -> Series {
VarAggSeries::std_as_series(&self.0)
}
fn quantile_as_series(&self, quantile: f64) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile)
fn quantile_as_series(
&self,
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile, interpol)
}

fn fmt_list(&self) -> String {
Expand Down
8 changes: 6 additions & 2 deletions polars/polars-core/src/series/implementations/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -387,8 +387,12 @@ impl SeriesTrait for SeriesWrap<Utf8Chunked> {
fn std_as_series(&self) -> Series {
VarAggSeries::std_as_series(&self.0)
}
fn quantile_as_series(&self, quantile: f64) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile)
fn quantile_as_series(
&self,
quantile: f64,
interpol: QuantileInterpolOptions,
) -> Result<Series> {
ChunkAggSeries::quantile_as_series(&self.0, quantile, interpol)
}

fn fmt_list(&self) -> String {
Expand Down
13 changes: 11 additions & 2 deletions polars/polars-core/src/series/series_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,12 @@ pub(crate) mod private {
fn agg_list(&self, _groups: &[(u32, Vec<u32>)]) -> Option<Series> {
None
}
fn agg_quantile(&self, _groups: &[(u32, Vec<u32>)], _quantile: f64) -> Option<Series> {
fn agg_quantile(
&self,
_groups: &[(u32, Vec<u32>)],
_quantile: f64,
_interpol: QuantileInterpolOptions,
) -> Option<Series> {
None
}
fn agg_median(&self, _groups: &[(u32, Vec<u32>)]) -> Option<Series> {
Expand Down Expand Up @@ -814,7 +819,11 @@ pub trait SeriesTrait:
invalid_operation_panic!(self)
}
/// Get the quantile of the ChunkedArray as a new Series of length 1.
fn quantile_as_series(&self, _quantile: f64) -> Result<Series> {
fn quantile_as_series(
&self,
_quantile: f64,
_interpol: QuantileInterpolOptions,
) -> Result<Series> {
invalid_operation_panic!(self)
}

Expand Down
13 changes: 9 additions & 4 deletions polars/polars-lazy/src/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,11 @@ pub enum AggExpr {
Mean(Box<Expr>),
List(Box<Expr>),
Count(Box<Expr>),
Quantile { expr: Box<Expr>, quantile: f64 },
Quantile {
expr: Box<Expr>,
quantile: f64,
interpol: QuantileInterpolOptions,
},
Sum(Box<Expr>),
AggGroups(Box<Expr>),
Std(Box<Expr>),
Expand Down Expand Up @@ -724,10 +728,11 @@ impl Expr {
}

/// Compute the quantile per group.
pub fn quantile(self, quantile: f64) -> Self {
pub fn quantile(self, quantile: f64, interpol: QuantileInterpolOptions) -> Self {
AggExpr::Quantile {
expr: Box::new(self),
quantile,
interpol,
}
.into()
}
Expand Down Expand Up @@ -1902,8 +1907,8 @@ pub fn median(name: &str) -> Expr {
}

/// Find a specific quantile of all the values in this Expression.
pub fn quantile(name: &str, quantile: f64) -> Expr {
col(name).quantile(quantile)
pub fn quantile(name: &str, quantile: f64, interpol: QuantileInterpolOptions) -> Expr {
col(name).quantile(quantile, interpol)
}

/// Apply a closure on the two columns that are evaluated from `Expr` a and `Expr` b.
Expand Down
8 changes: 4 additions & 4 deletions polars/polars-lazy/src/frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,7 @@ impl LazyFrame {
/// .agg([
/// col("rain").min(),
/// col("rain").sum(),
/// col("rain").quantile(0.5).alias("median_rain"),
/// col("rain").quantile(0.5, QuantileInterpolOptions::Nearest).alias("median_rain"),
/// ])
/// .sort("date", false)
/// }
Expand Down Expand Up @@ -988,8 +988,8 @@ impl LazyFrame {
}

/// Aggregate all the columns as their quantile values.
pub fn quantile(self, quantile: f64) -> LazyFrame {
self.select_local(vec![col("*").quantile(quantile)])
pub fn quantile(self, quantile: f64, interpol: QuantileInterpolOptions) -> LazyFrame {
self.select_local(vec![col("*").quantile(quantile, interpol)])
}

/// Aggregate all the columns as their standard deviation values.
Expand Down Expand Up @@ -1144,7 +1144,7 @@ impl LazyGroupBy {
/// .agg([
/// col("rain").min(),
/// col("rain").sum(),
/// col("rain").quantile(0.5).alias("median_rain"),
/// col("rain").quantile(0.5, QuantileInterpolOptions::Nearest).alias("median_rain"),
/// ])
/// .sort("date", false)
/// }
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-lazy/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
//! .agg([
//! col("rain").min(),
//! col("rain").sum(),
//! col("rain").quantile(0.5).alias("median_rain"),
//! col("rain").quantile(0.5, QuantileInterpolOptions::Nearest).alias("median_rain"),
//! ])
//! .sort("date", false)
//! .collect()
Expand Down

0 comments on commit fb0b9f6

Please sign in to comment.