Skip to content

Commit

Permalink
fix bug in predicate pushdown
Browse files Browse the repository at this point in the history
Very simple predicates were not
pushed down to the scan level

Fixed and added proper tests.
Will add more.
  • Loading branch information
ritchie46 committed Nov 26, 2021
1 parent 81a0d21 commit 96ee42f
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 69 deletions.
37 changes: 2 additions & 35 deletions polars/polars-lazy/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,40 +201,7 @@ pub mod logical_plan;
pub mod physical_plan;
#[cfg(feature = "compile")]
pub mod prelude;
#[cfg(test)]
mod tests;
#[cfg(feature = "compile")]
pub(crate) mod utils;

#[cfg(test)]
mod test;

#[cfg(test)]
mod tests {
use polars_core::prelude::*;
use polars_io::prelude::*;
use std::io::Cursor;

// physical plan see: datafusion/physical_plan/planner.rs.html#61-63

pub(crate) fn get_df() -> DataFrame {
let s = r#"
"sepal.length","sepal.width","petal.length","petal.width","variety"
5.1,3.5,1.4,.2,"Setosa"
4.9,3,1.4,.2,"Setosa"
4.7,3.2,1.3,.2,"Setosa"
4.6,3.1,1.5,.2,"Setosa"
5,3.6,1.4,.2,"Setosa"
5.4,3.9,1.7,.4,"Setosa"
4.6,3.4,1.4,.3,"Setosa"
"#;

let file = Cursor::new(s);

let df = CsvReader::new(file)
// we also check if infer schema ignores errors
.infer_schema(Some(3))
.has_header(true)
.finish()
.unwrap();
df
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ impl OptimizationRule for ReplaceDropNulls {
mod test {
use super::*;
use crate::prelude::stack_opt::OptimizationRule;
use crate::test::fruits_cars;
use crate::tests::fruits_cars;
use crate::utils::test::optimize_lp;

#[test]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ mod test {
let expr = node_to_exp(root, &expr_arena);
assert_eq!(
format!("{:?}", &expr),
format!("{:?}", &lit(true).and(predicate_expr))
format!("{:?}", predicate_expr.and(lit(true)))
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ pub(super) fn insert_and_combine_predicate(
.or_insert_with(|| arena.add(AExpr::Literal(LiteralValue::Boolean(true))));

let node = arena.add(AExpr::BinaryExpr {
left: *existing_predicate,
left: predicate,
op: Operator::And,
right: predicate,
right: *existing_predicate,
});

*existing_predicate = node;
Expand Down Expand Up @@ -192,9 +192,6 @@ where
} else {
expr_depth > 1
};
let output_field = e
.to_field(input_schema, Context::Default, expr_arena)
.unwrap();

// remove predicates that cannot be done on the input above
let to_local = acc_predicates
Expand All @@ -204,7 +201,7 @@ where
if check_input_node(*kv.1, input_schema, expr_arena)
// if this predicate not equals a column that is a computation
// it is ok
&& &**kv.0 != output_field.name() && !is_computation
&& !is_computation
{
None
} else {
Expand Down
56 changes: 56 additions & 0 deletions polars/polars-lazy/src/tests/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
mod predicate_pushdown;
mod queries;
use polars_core::prelude::*;
use polars_io::prelude::*;
use std::io::Cursor;

use crate::functions::{argsort_by, pearson_corr};
use crate::logical_plan::iterator::ArenaLpIter;
use crate::logical_plan::optimizer::simplify_expr::SimplifyExprRule;
use crate::logical_plan::optimizer::stack_opt::{OptimizationRule, StackOptimizer};
use crate::prelude::*;
use polars_core::chunked_array::builder::get_list_builder;
#[cfg(feature = "temporal")]
use polars_core::utils::chrono::{NaiveDate, NaiveDateTime, NaiveTime};
use polars_core::{df, prelude::*};
use std::iter::FromIterator;

fn scan_foods_csv() -> LazyFrame {
let path = "../../examples/aggregate_multiple_files_in_chunks/datasets/foods1.csv";
LazyCsvReader::new(path.to_string()).finish().unwrap()
}

pub(crate) fn fruits_cars() -> DataFrame {
df!(
"A"=> [1, 2, 3, 4, 5],
"fruits"=> ["banana", "banana", "apple", "apple", "banana"],
"B"=> [5, 4, 3, 2, 1],
"cars"=> ["beetle", "audi", "beetle", "beetle", "beetle"]
)
.unwrap()
}

// physical plan see: datafusion/physical_plan/planner.rs.html#61-63

pub(crate) fn get_df() -> DataFrame {
let s = r#"
"sepal.length","sepal.width","petal.length","petal.width","variety"
5.1,3.5,1.4,.2,"Setosa"
4.9,3,1.4,.2,"Setosa"
4.7,3.2,1.3,.2,"Setosa"
4.6,3.1,1.5,.2,"Setosa"
5,3.6,1.4,.2,"Setosa"
5.4,3.9,1.7,.4,"Setosa"
4.6,3.4,1.4,.3,"Setosa"
"#;

let file = Cursor::new(s);

let df = CsvReader::new(file)
// we also check if infer schema ignores errors
.infer_schema(Some(3))
.has_header(true)
.finish()
.unwrap();
df
}
38 changes: 38 additions & 0 deletions polars/polars-lazy/src/tests/predicate_pushdown.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
use super::*;

fn projection_at_scan(lp_arena: &Arena<ALogicalPlan>, lp: Node) -> bool {
(&lp_arena).iter(lp).all(|(_, lp)| {
if let ALogicalPlan::DataFrameScan { selection, .. } = lp {
selection.is_some()
} else {
true
}
})
}

#[test]
fn test_pred_pd_1() -> Result<()> {
let df = fruits_cars();

let mut expr_arena = Arena::with_capacity(16);
let mut lp_arena = Arena::with_capacity(8);
let lp = df
.clone()
.lazy()
.select([col("A"), col("B")])
.filter(col("A").gt(lit(1)))
.optimize(&mut lp_arena, &mut expr_arena)?;

assert!(projection_at_scan(&lp_arena, lp));

// check if we understand that we can unwrap the alias
let lp = df
.lazy()
.select([col("A").alias("C"), col("B")])
.filter(col("C").gt(lit(1)))
.optimize(&mut lp_arena, &mut expr_arena)?;

assert!(projection_at_scan(&lp_arena, lp));

Ok(())
}
Original file line number Diff line number Diff line change
@@ -1,29 +1,4 @@
use crate::functions::{argsort_by, pearson_corr};
use crate::logical_plan::optimizer::stack_opt::{OptimizationRule, StackOptimizer};
use crate::tests::get_df;
#[cfg(feature = "temporal")]
use polars_core::utils::chrono::{NaiveDate, NaiveDateTime, NaiveTime};
use polars_core::{df, prelude::*};

use crate::logical_plan::optimizer::simplify_expr::SimplifyExprRule;
use crate::prelude::*;
use polars_core::chunked_array::builder::get_list_builder;
use std::iter::FromIterator;

fn scan_foods_csv() -> LazyFrame {
let path = "../../examples/aggregate_multiple_files_in_chunks/datasets/foods1.csv";
LazyCsvReader::new(path.to_string()).finish().unwrap()
}

pub(crate) fn fruits_cars() -> DataFrame {
df!(
"A"=> [1, 2, 3, 4, 5],
"fruits"=> ["banana", "banana", "apple", "apple", "banana"],
"B"=> [5, 4, 3, 2, 1],
"cars"=> ["beetle", "audi", "beetle", "beetle", "beetle"]
)
.unwrap()
}
use super::*;

#[test]
fn test_lazy_ternary() {
Expand Down

0 comments on commit 96ee42f

Please sign in to comment.