Skip to content

Commit

Permalink
Allowed non-vec in DF expressions. (#1477)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Oct 2, 2021
1 parent 2a5d08f commit 898d99e
Show file tree
Hide file tree
Showing 21 changed files with 199 additions and 231 deletions.
53 changes: 21 additions & 32 deletions polars/benches/groupby.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ fn q1(c: &mut Criterion) {
b.iter(|| {
DATA.clone()
.lazy()
.groupby(vec![col("id1")])
.agg(vec![col("v1").sum()])
.groupby([col("id1")])
.agg([col("v1").sum()])
.collect()
.unwrap();
})
Expand All @@ -42,8 +42,8 @@ fn q2(c: &mut Criterion) {
b.iter(|| {
DATA.clone()
.lazy()
.groupby(vec![col("id1"), col("id2")])
.agg(vec![col("v1").sum()])
.groupby([col("id1"), col("id2")])
.agg([col("v1").sum()])
.collect()
.unwrap();
})
Expand All @@ -55,8 +55,8 @@ fn q3(c: &mut Criterion) {
b.iter(|| {
DATA.clone()
.lazy()
.groupby(vec![col("id3")])
.agg(vec![col("v1").sum(), col("v3").mean()])
.groupby([col("id3")])
.agg([col("v1").sum(), col("v3").mean()])
.collect()
.unwrap();
})
Expand All @@ -68,8 +68,8 @@ fn q4(c: &mut Criterion) {
b.iter(|| {
DATA.clone()
.lazy()
.groupby(vec![col("id4")])
.agg(vec![col("v1").mean(), col("v2").mean(), col("v3").mean()])
.groupby([col("id4")])
.agg([col("v1").mean(), col("v2").mean(), col("v3").mean()])
.collect()
.unwrap();
})
Expand All @@ -81,8 +81,8 @@ fn q5(c: &mut Criterion) {
b.iter(|| {
DATA.clone()
.lazy()
.groupby(vec![col("id6")])
.agg(vec![col("v1").sum(), col("v2").sum(), col("v3").sum()])
.groupby([col("id6")])
.agg([col("v1").sum(), col("v2").sum(), col("v3").sum()])
.collect()
.unwrap();
})
Expand All @@ -94,8 +94,8 @@ fn q6(c: &mut Criterion) {
b.iter(|| {
DATA.clone()
.lazy()
.groupby(vec![col("id4"), col("id5")])
.agg(vec![
.groupby([col("id4"), col("id5")])
.agg([
col("v3").median().alias("v3_median"),
col("v3").std().alias("v3_std"),
])
Expand All @@ -110,15 +110,9 @@ fn q7(c: &mut Criterion) {
b.iter(|| {
DATA.clone()
.lazy()
.groupby(vec![col("id3")])
.agg(vec![
col("v1").max().alias("v1"),
col("v2").min().alias("v2"),
])
.select(vec![
col("id3"),
(col("v1") - col("v2")).alias("range_v1_v2"),
])
.groupby([col("id3")])
.agg([col("v1").max().alias("v1"), col("v2").min().alias("v2")])
.select([col("id3"), (col("v1") - col("v2")).alias("range_v1_v2")])
.collect()
.unwrap();
})
Expand All @@ -133,8 +127,8 @@ fn q8(c: &mut Criterion) {
// todo! accept slice of str
.drop_nulls(Some(vec![col("v3")]))
.sort("v3", true)
.groupby(vec![col("id6")])
.agg(vec![col("v3").head(Some(2)).alias("v3_top_2")])
.groupby([col("id6")])
.agg([col("v3").head(Some(2)).alias("v3_top_2")])
.explode(&[col("v3_top_2")])
.collect()
.unwrap();
Expand All @@ -148,10 +142,8 @@ fn q9(c: &mut Criterion) {
DATA.clone()
.lazy()
.drop_nulls(Some(vec![col("v1"), col("v2")]))
.groupby(vec![col("id2"), col("id4")])
.agg(vec![pearson_corr(col("v1"), col("v2"))
.alias("r2")
.pow(2.0)])
.groupby([col("id2"), col("id4")])
.agg([pearson_corr(col("v1"), col("v2")).alias("r2").pow(2.0)])
.collect()
.unwrap();
})
Expand All @@ -163,18 +155,15 @@ fn q10(c: &mut Criterion) {
b.iter(|| {
DATA.clone()
.lazy()
.groupby(vec![
.groupby([
col("id1"),
col("id2"),
col("id3"),
col("id4"),
col("id5"),
col("id6"),
])
.agg(vec![
col("v3").sum().alias("v3"),
col("v1").count().alias("v1"),
])
.agg([col("v3").sum().alias("v3"), col("v1").count().alias("v1")])
.collect()
.unwrap();
})
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-core/src/frame/groupby/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1488,7 +1488,7 @@ mod test {
vec![1, 2, 3, 4, 4, 4, 2, 1, 1],
vec![1, 2, 3, 4, 4, 4],
] {
let ca = UInt32Chunked::new_from_slice("", &slice);
let ca = UInt32Chunked::new_from_slice("", slice);
let split = split_ca(&ca, 4).unwrap();

let a = groupby(ca.into_iter()).into_iter().sorted().collect_vec();
Expand Down
4 changes: 1 addition & 3 deletions polars/polars-core/src/frame/hash_join/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1470,7 +1470,7 @@ mod test {
#[test]
#[cfg_attr(miri, ignore)]
fn test_join_multiple_columns() {
let (df_a, df_b) = get_dfs();
let (mut df_a, mut df_b) = get_dfs();

// First do a hack with concatenated string dummy column
let mut s = df_a
Expand All @@ -1483,7 +1483,6 @@ mod test {
+ df_a.column("b").unwrap().utf8().unwrap();
s.rename("dummy");

let mut df_a = df_a.clone();
df_a.with_column(s).unwrap();
let mut s = df_b
.column("foo")
Expand All @@ -1494,7 +1493,6 @@ mod test {
.unwrap()
+ df_b.column("bar").unwrap().utf8().unwrap();
s.rename("dummy");
let mut df_b = df_b.clone();
df_b.with_column(s).unwrap();

let joined = df_a.left_join(&df_b, "dummy", "dummy").unwrap();
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-core/src/frame/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
///
/// &str => df.select("my-column"),
/// (&str, &str) => df.select(("col_1", "col_2")),
/// Vec<&str> => df.select(vec!["col_a", "col_b"]),
/// Vec<&str> => df.select(["col_a", "col_b"]),
pub trait Selection<'a, S> {
fn to_selection_vec(self) -> Vec<&'a str>;

Expand Down
4 changes: 2 additions & 2 deletions polars/polars-core/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ mod test {
fn test_pearson_corr() {
let a = Series::new("a", &[1.0f32, 2.0]);
let b = Series::new("b", &[1.0f32, 2.0]);
assert!((cov(&a.f32().unwrap(), &b.f32().unwrap()).unwrap() - 0.5).abs() < 0.001);
assert!((pearson_corr(&a.f32().unwrap(), &b.f32().unwrap()).unwrap() - 1.0).abs() < 0.001);
assert!((cov(a.f32().unwrap(), b.f32().unwrap()).unwrap() - 0.5).abs() < 0.001);
assert!((pearson_corr(a.f32().unwrap(), b.f32().unwrap()).unwrap() - 1.0).abs() < 0.001);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -878,7 +878,7 @@ id090,id048,id0000067778,24,2,51862,4,9,
.with_columns(Some(
schema
.fields()
.into_iter()
.iter()
.map(|s| s.name().to_string())
.collect(),
))
Expand Down
6 changes: 3 additions & 3 deletions polars/polars-lazy/src/datafusion/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ mod test {

let out = df
.lazy()
.groupby(vec![col("a")])
.agg(vec![col("b").mean()])
.select(vec![col("a"), col("b_mean")])
.groupby([col("a")])
.agg([col("b").mean()])
.select([col("a"), col("b_mean")])
.sort("a", false)
.ooc()?;

Expand Down
8 changes: 4 additions & 4 deletions polars/polars-lazy/src/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,7 @@ impl Expr {
/// .lazy()
/// .select(&[
/// col("groups"),
/// sum("values").over(vec![col("groups")]),
/// sum("values").over([col("groups")]),
/// ])
/// .collect()?;
/// dbg!(&out);
Expand Down Expand Up @@ -1101,10 +1101,10 @@ impl Expr {
/// │ 1 ┆ 16 │
/// ╰────────┴────────╯
/// ```
pub fn over(self, partition_by: Vec<Expr>) -> Self {
pub fn over<E: AsRef<[Expr]>>(self, partition_by: E) -> Self {
Expr::Window {
function: Box::new(self),
partition_by,
partition_by: partition_by.as_ref().to_vec(),
order_by: None,
options: WindowOptions { explode: false },
}
Expand Down Expand Up @@ -1353,7 +1353,7 @@ impl Expr {
/// use polars_lazy::prelude::*;
///
/// fn example(df: LazyFrame) -> LazyFrame {
/// df.select(vec![
/// df.select([
/// even though the alias yields a different column name,
/// // `keep_name` will make sure that the original column name is used
/// col("*").alias("foo").keep_name()
Expand Down
31 changes: 15 additions & 16 deletions polars/polars-lazy/src/frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -631,11 +631,10 @@ impl LazyFrame {
/// use polars_lazy::prelude::*;
///
/// fn example(df: DataFrame) -> Result<DataFrame> {
/// df.lazy()
/// .groupby(vec![col("foo")])
/// .agg(vec!(col("bar").sum(),
/// col("ham").mean().alias("avg_ham")))
/// .collect()
/// df.lazy()
/// .groupby([col("foo")])
/// .agg([col("bar").sum(), col("ham").mean().alias("avg_ham")])
/// .collect()
/// }
/// ```
pub fn collect(self) -> Result<DataFrame> {
Expand Down Expand Up @@ -735,32 +734,32 @@ impl LazyFrame {
///
/// fn example(df: DataFrame) -> LazyFrame {
/// df.lazy()
/// .groupby(vec![col("date")])
/// .agg(vec![
/// .groupby([col("date")])
/// .agg([
/// col("rain").min(),
/// col("rain").sum(),
/// col("rain").quantile(0.5).alias("median_rain"),
/// ])
/// .sort("date", false)
/// }
/// ```
pub fn groupby(self, by: Vec<Expr>) -> LazyGroupBy {
pub fn groupby<E: AsRef<[Expr]>>(self, by: E) -> LazyGroupBy {
let opt_state = self.get_opt_state();
LazyGroupBy {
logical_plan: self.logical_plan,
opt_state,
keys: by,
keys: by.as_ref().to_vec(),
maintain_order: false,
}
}

/// Similar to groupby, but order of the DataFrame is maintained.
pub fn stable_groupby(self, by: Vec<Expr>) -> LazyGroupBy {
pub fn stable_groupby<E: AsRef<[Expr]>>(self, by: E) -> LazyGroupBy {
let opt_state = self.get_opt_state();
LazyGroupBy {
logical_plan: self.logical_plan,
opt_state,
keys: by,
keys: by.as_ref().to_vec(),
maintain_order: true,
}
}
Expand Down Expand Up @@ -1056,16 +1055,16 @@ impl LazyGroupBy {
///
/// fn example(df: DataFrame) -> LazyFrame {
/// df.lazy()
/// .groupby(vec![col("date")])
/// .agg(vec![
/// .groupby([col("date")])
/// .agg([
/// col("rain").min(),
/// col("rain").sum(),
/// col("rain").quantile(0.5).alias("median_rain"),
/// ])
/// .sort("date", false)
/// }
/// ```
pub fn agg(self, aggs: Vec<Expr>) -> LazyFrame {
pub fn agg<E: AsRef<[Expr]>>(self, aggs: E) -> LazyFrame {
let lp = LogicalPlanBuilder::from(self.logical_plan)
.groupby(Arc::new(self.keys), aggs, None, self.maintain_order)
.build();
Expand All @@ -1081,7 +1080,7 @@ impl LazyGroupBy {
.flatten()
.collect::<Vec<_>>();

self.agg(vec![col("*").exclude(&keys).head(n).list().keep_name()])
self.agg([col("*").exclude(&keys).head(n).list().keep_name()])
.explode(vec![col("*").exclude(&keys)])
}

Expand All @@ -1094,7 +1093,7 @@ impl LazyGroupBy {
.flatten()
.collect::<Vec<_>>();

self.agg(vec![col("*").exclude(&keys).tail(n).list().keep_name()])
self.agg([col("*").exclude(&keys).tail(n).list().keep_name()])
.explode(vec![col("*").exclude(&keys)])
}

Expand Down
4 changes: 2 additions & 2 deletions polars/polars-lazy/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,14 @@ pub fn pearson_corr(a: Expr, b: Expr) -> Expr {
/// That means that the first `Series` will be used to determine the ordering
/// until duplicates are found. Once duplicates are found, the next `Series` will
/// be used and so on.
pub fn argsort_by(by: Vec<Expr>, reverse: &[bool]) -> Expr {
pub fn argsort_by<E: AsRef<[Expr]>>(by: E, reverse: &[bool]) -> Expr {
let reverse = reverse.to_vec();
let function = NoEq::new(Arc::new(move |by: &mut [Series]| {
polars_core::functions::argsort_by(by, &reverse).map(|ca| ca.into_series())
}) as Arc<dyn SeriesUdf>);

Expr::Function {
input: by,
input: by.as_ref().to_vec(),
function,
output_type: GetOutput::from_type(DataType::UInt32),
options: FunctionOptions {
Expand Down
8 changes: 4 additions & 4 deletions polars/polars-lazy/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@
//! )?;
//!
//! df.lazy()
//! .groupby(vec![col("date")])
//! .agg(vec![
//! .groupby([col("date")])
//! .agg([
//! col("rain").min(),
//! col("rain").sum(),
//! col("rain").quantile(0.5).alias("median_rain"),
Expand Down Expand Up @@ -162,7 +162,7 @@
//! .filter(
//! col("a").lt(lit(2))
//! )
//! .groupby(vec![col("b")])
//! .groupby([col("b")])
//! .agg(
//! vec![col("b").first(), col("c").first()]
//! )
Expand All @@ -178,7 +178,7 @@
//!
//! fn aggregate_all_columns(df_a: DataFrame) -> LazyFrame {
//! df_a.lazy()
//! .groupby(vec![col("b")])
//! .groupby([col("b")])
//! .agg(
//! vec![col("*").first()]
//! )
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-lazy/src/logical_plan/iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,8 @@ mod test {
let (root, lp_arena, _expr_arena) = df
.lazy()
.sort("a", false)
.groupby(vec![col("a")])
.agg(vec![col("a").first()])
.groupby([col("a")])
.agg([col("a").first()])
.logical_plan
.into_alp();

Expand Down

0 comments on commit 898d99e

Please sign in to comment.