Skip to content

Commit

Permalink
add regex to col('..').exclude('..')
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 10, 2021
1 parent 0cef635 commit 0db4585
Show file tree
Hide file tree
Showing 5 changed files with 270 additions and 223 deletions.
4 changes: 3 additions & 1 deletion polars/polars-lazy/src/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1390,7 +1390,9 @@ impl Expr {
}
}

/// Exclude a column from a wildcard selection
/// Exclude a column from a wildcard/regex selection.
///
/// You may also use regexes in the exclude as long as they start with `^` and end with `$`.
///
/// # Example
///
Expand Down
223 changes: 2 additions & 221 deletions polars/polars-lazy/src/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ pub(crate) mod alp;
pub(crate) mod conversion;
pub(crate) mod iterator;
pub(crate) mod optimizer;
mod projection;
use projection::*;

// Will be set/unset in the fetch operation to communicate overwriting the number of rows to scan.
thread_local! {pub(crate) static FETCH_ROWS: Cell<Option<usize>> = Cell::new(None)}
Expand Down Expand Up @@ -677,227 +679,6 @@ impl LogicalPlan {
}
}

/// Replaces every `Expr::Wildcard` found in `expr` with `Expr::Column(column_name)`.
///
/// Every `Expr::Exclude` node is replaced by its own input (rewritten the same way),
/// which removes the exclude wrapper from the expression chain.
fn replace_wildcard_with_column(mut expr: Expr, column_name: Arc<String>) -> Expr {
    expr.mutate().apply(|node| {
        // Work out the substitute first so the shared borrow of `node` has ended
        // before the node is overwritten.
        let substitute = match &*node {
            Expr::Wildcard => Some(Expr::Column(column_name.clone())),
            Expr::Exclude(inner, _) => Some(replace_wildcard_with_column(
                (**inner).clone(),
                column_name.clone(),
            )),
            _ => None,
        };
        if let Some(new_node) = substitute {
            *node = new_node;
        }
        // always keep iterating all inputs
        true
    });
    expr
}

/// Turns a top-level `Expr::KeepName` or `Expr::SufPreFix` into an `Expr::Alias`.
///
/// * `KeepName` is aliased to the first root column name of the wrapped expression.
/// * `SufPreFix` is aliased to the single root column name with `value` appended
///   (suffix) or prepended (prefix).
///
/// Expressions that contain neither variant are returned unchanged.
///
/// # Panics
///
/// Panics when one of the two variants occurs somewhere in `expr` but is not the
/// outermost expression, when a `KeepName` expression has no root column, or when
/// `get_single_root` fails for a `SufPreFix` expression.
fn rewrite_keep_name_and_sufprefix(expr: Expr) -> Expr {
    let needs_rewrite = has_expr(&expr, |e| {
        matches!(e, Expr::KeepName(_) | Expr::SufPreFix { .. })
    });
    if !needs_rewrite {
        return expr;
    }

    match expr {
        Expr::KeepName(inner) => {
            let root_names = expr_to_root_column_names(&inner);
            let alias = root_names
                .first()
                .expect("expected root column to keep expression name")
                .clone();
            Expr::Alias(inner, alias)
        }
        Expr::SufPreFix {
            is_suffix,
            value,
            expr: inner,
        } => {
            let root = get_single_root(&inner).unwrap();
            let alias = if is_suffix {
                format!("{}{}", root, value)
            } else {
                format!("{}{}", value, root)
            };
            Expr::Alias(inner, Arc::new(alias))
        }
        _ => panic!("`keep_name`, `suffix`, `prefix` should be last expression"),
    }
}

/// Takes an expression with a wildcard root (`col("*")`) and copies it once for every
/// column in `schema` whose name is not listed in `exclude`.
///
/// Each copy has its wildcard replaced by the concrete column, is passed through
/// `rewrite_keep_name_and_sufprefix`, and is appended to `result` in schema order.
fn replace_wilcard(expr: &Expr, result: &mut Vec<Expr>, exclude: &[Arc<String>], schema: &Schema) {
    let expanded = schema
        .fields()
        .iter()
        .map(|field| field.name())
        .filter(|name| {
            exclude
                .iter()
                .all(|excluded| excluded.as_str() != name.as_str())
        })
        .map(|name| {
            let with_column = replace_wildcard_with_column(expr.clone(), Arc::new(name.clone()));
            rewrite_keep_name_and_sufprefix(with_column)
        });
    result.extend(expanded);
}

/// Expands a regex column selection into one expression per matching schema column.
///
/// With `pattern == None` the root column names of `expr` are inspected: only when
/// there is exactly one root and its name starts with `^` and ends with `$` is it
/// treated as a regex. Otherwise `expr` is pushed to `result` as-is (after
/// `rewrite_keep_name_and_sufprefix`).
///
/// With `pattern == Some(..)` every schema field whose name matches the pattern yields
/// a copy of `expr` in which `Expr::Column` is replaced by that field's name.
///
/// # Panics
///
/// Panics if `pattern` is not a valid regular expression.
#[cfg(feature = "regex")]
fn replace_regex(expr: &Expr, result: &mut Vec<Expr>, schema: &Schema, pattern: Option<&str>) {
    let pattern = match pattern {
        Some(pattern) => pattern,
        None => {
            let roots = expr_to_root_column_names(expr);
            // only in simple expressions (a single root, no binary expression)
            // do we pattern match regex columns
            if roots.len() == 1 {
                let candidate = &**roots[0];
                if candidate.starts_with('^') && candidate.ends_with('$') {
                    return replace_regex(expr, result, schema, Some(candidate));
                }
            }
            result.push(rewrite_keep_name_and_sufprefix(expr.clone()));
            return;
        }
    };

    let re = regex::Regex::new(pattern)
        .unwrap_or_else(|_| panic!("invalid regular expression in column: {}", pattern));

    for field in schema.fields() {
        let name = field.name();
        if !re.is_match(name) {
            continue;
        }

        let mut expanded = expr.clone();
        expanded.mutate().apply(|node| {
            if matches!(&*node, Expr::Column(_)) {
                *node = Expr::Column(Arc::new(name.clone()));
                // the column node was substituted; report `false` for this node
                false
            } else {
                true
            }
        });

        result.push(rewrite_keep_name_and_sufprefix(expanded));
    }
}

/// Expands `columns(["A", "B"])..` into `col("A")..`, `col("B")..`.
///
/// For every entry of `names`, a copy of `expr` is made in which each `Expr::Columns`
/// node is replaced by `Expr::Column` of that name. The copy is then passed through
/// `rewrite_keep_name_and_sufprefix` and appended to `result`.
fn expand_columns(expr: &Expr, result: &mut Vec<Expr>, names: &[String]) {
    result.extend(names.iter().map(|name| {
        let mut expanded = expr.clone();
        expanded.mutate().apply(|node| {
            if matches!(&*node, Expr::Columns(_)) {
                *node = Expr::Column(Arc::new(name.clone()));
            }
            // always keep iterating all inputs
            true
        });
        rewrite_keep_name_and_sufprefix(expanded)
    }));
}

/// Rewrites the projection expressions in `exprs` against `schema`, expanding
/// multi-column selectors into concrete expressions, and returns the resulting list.
///
/// Per input expression:
/// * if it contains an `Expr::Columns`, it is expanded with `expand_columns`;
/// * if it contains a wildcard, the names of all `Expr::Exclude` nodes are collected and
///   - a wildcard count is rewritten to count the first schema column (aliased
///     `"count"` unless it already is an alias),
///   - a function with `input_wildcard_expansion` gets its inputs expanded in place,
///   - anything else is expanded with `replace_wilcard`;
/// * otherwise (and only if no `Expr::Columns` was found) it goes through `replace_regex`
///   when the `regex` feature is enabled, or is pushed after
///   `rewrite_keep_name_and_sufprefix` when it is not.
///
/// NOTE(review): the previous comment stated that a single `col(*)` is left untouched
/// ("no selection is the same as select all"); that shortcut is not visible in this
/// function — confirm whether it is handled by a caller.
///
/// # Panics
///
/// The wildcard-count path unwraps `schema.field(0)` and `rename_expr_root_name`;
/// presumably this panics for a schema without fields — TODO confirm.
pub(crate) fn rewrite_projections(exprs: Vec<Expr>, schema: &Schema) -> Vec<Expr> {
// Upper-bound guess: one slot per input expression plus one per schema column.
let mut result = Vec::with_capacity(exprs.len() + schema.fields().len());

for mut expr in exprs {
// in case of multiple cols, we still want to check wildcard for function input,
// but in case of no wildcard, we don't want this expr pushed to results.
let mut push_current = true;
// has multiple column names
if let Some(e) = expr.into_iter().find(|e| matches!(e, Expr::Columns(_))) {
if let Expr::Columns(names) = e {
expand_columns(&expr, &mut result, names)
}
// The expanded copies were already pushed by `expand_columns`.
push_current = false;
}

if has_wildcard(&expr) {
// keep track of column excluded from the wildcard
let mut exclude = vec![];
(&expr).into_iter().for_each(|e| {
if let Expr::Exclude(_, names) = e {
exclude.extend_from_slice(names)
}
});

// if count wildcard. count one column
if has_expr(&expr, |e| matches!(e, Expr::Agg(AggExpr::Count(_)))) {
// Use the first schema column as the root of the count expression.
let new_name = Arc::new(schema.field(0).unwrap().name().clone());
let expr = rename_expr_root_name(&expr, new_name).unwrap();

// Keep an existing alias; otherwise name the output "count".
let expr = if let Expr::Alias(_, _) = &expr {
expr
} else {
Expr::Alias(Box::new(expr), Arc::new("count".to_string()))
};
result.push(expr);

continue;
}
// this path prepares the wildcard as input for the Function Expr
if has_expr(
&expr,
|e| matches!(e, Expr::Function { options, .. } if options.input_wildcard_expansion),
) {
expr.mutate().apply(|e| {
if let Expr::Function { input, .. } = e {
let mut new_inputs = Vec::with_capacity(input.len());

// Expand each function input on its own: wildcard inputs become one
// expression per non-excluded column, other inputs are regex-expanded
// (or cloned unchanged when the `regex` feature is disabled).
input.iter_mut().for_each(|e| {
if has_wildcard(e) {
replace_wilcard(e, &mut new_inputs, &exclude, schema)
} else {
#[cfg(feature = "regex")]
{
replace_regex(e, &mut new_inputs, schema, None)
}
#[cfg(not(feature = "regex"))]
{
new_inputs.push(e.clone())
}
}
});

// Swap the expanded inputs into the function node.
*input = new_inputs;
false
} else {
true
}
});
// The function expression itself is kept as a single output expression.
result.push(expr);
continue;
}
// Plain wildcard: one output expression per non-excluded schema column.
replace_wilcard(&expr, &mut result, &exclude, schema);
} else {
#[allow(clippy::collapsible_else_if)]
if push_current {
#[cfg(feature = "regex")]
{
replace_regex(&expr, &mut result, schema, None)
}
#[cfg(not(feature = "regex"))]
{
let expr = rewrite_keep_name_and_sufprefix(expr);
result.push(expr)
}
}
};
}
result
}

pub struct LogicalPlanBuilder(LogicalPlan);

impl LogicalPlan {
Expand Down

0 comments on commit 0db4585

Please sign in to comment.