Skip to content

Commit

Permalink
lazy select columns by regex
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Aug 5, 2021
1 parent ca4fae0 commit 3330fb0
Show file tree
Hide file tree
Showing 10 changed files with 173 additions and 3 deletions.
1 change: 1 addition & 0 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ mode = ["polars-core/mode", "polars-lazy/mode"]
take_opt_iter = ["polars-core/take_opt_iter"]
extract_jsonpath = ["polars-core/extract_jsonpath", "polars-core/strings"]
groupby_list = ["polars-core/groupby_list"]
lazy_regex = ["polars-lazy/regex"]

# don't use this
private = []
Expand Down
1 change: 1 addition & 0 deletions polars/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ private = []
ahash = "0.7"
rayon = "1.5"
itertools = "0.10"
regex = {version = "1.4", optional = true}

polars-io = {version = "0.14.8", path = "../polars-io", features = ["lazy", "csv-file"], default-features=false}
polars-core = {version = "0.14.8", path = "../polars-core", features = ["lazy", "private", "zip_with"], default-features=false}
Expand Down
22 changes: 22 additions & 0 deletions polars/polars-lazy/src/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,28 @@ impl Expr {
}

/// Create a Column Expression based on a column name.
///
/// # Arguments
///
/// * `name` - A string slice that holds the name of the column
///
/// # Examples
///
/// ```ignore
/// // select a column name
/// col("foo")
/// ```
///
/// ```ignore
/// // select all columns by using a wildcard
/// col("*")
/// ```
///
/// ```ignore
/// // select specific columns by writing a regular expression that starts with `^` and ends with `$`
/// // (only available if the `regex` feature is activated)
/// col("^foo.*$")
/// ```
pub fn col(name: &str) -> Expr {
match name {
"*" => Expr::Wildcard,
Expand Down
49 changes: 47 additions & 2 deletions polars/polars-lazy/src/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -719,10 +719,34 @@ fn replace_wilcard(expr: &Expr, result: &mut Vec<Expr>, exclude: &[Arc<String>],
}
}

/// Expand a regex column selection into one expression per matching schema field.
///
/// `pattern` is compiled as a regular expression and tested against the name of
/// every field in `schema`. For each field whose name matches, a copy of `expr`
/// is made in which `Expr::Column` nodes are replaced by a column referring to
/// that field, and the resulting expression is appended to `result`.
///
/// # Panics
///
/// Panics when `pattern` is not a valid regular expression.
#[cfg(feature = "regex")]
fn replace_regex(expr: &Expr, result: &mut Vec<Expr>, schema: &Schema, pattern: &str) {
    let matcher = match regex::Regex::new(pattern) {
        Ok(compiled) => compiled,
        Err(_) => panic!("invalid regular expression in column: {}", pattern),
    };

    for field in schema.fields() {
        let name = field.name();
        if !matcher.is_match(name) {
            continue;
        }

        // Work on a private copy so the caller's expression stays untouched.
        let mut substituted = expr.clone();
        substituted.mutate().apply(|node| {
            if matches!(node, Expr::Column(_)) {
                // Swap the regex "column" for the concrete field name.
                *node = Expr::Column(Arc::new(name.clone()));
                // NOTE(review): `false` presumably tells `apply` not to descend further - confirm.
                false
            } else {
                true
            }
        });

        result.push(rewrite_keep_name(substituted));
    }
}

/// In case of single col(*) -> do nothing, no selection is the same as select all
/// In other cases replace the wildcard with an expression with all columns
fn rewrite_projections(exprs: Vec<Expr>, schema: &Schema) -> Vec<Expr> {
let mut result = Vec::with_capacity(exprs.len() + schema.fields().len());

for mut expr in exprs {
if has_wildcard(&expr) {
// keep track of column excluded from the wildcard
Expand Down Expand Up @@ -775,8 +799,29 @@ fn rewrite_projections(exprs: Vec<Expr>, schema: &Schema) -> Vec<Expr> {
}
replace_wilcard(&expr, &mut result, &exclude, schema);
} else {
let expr = rewrite_keep_name(expr);
result.push(expr)
#[cfg(feature = "regex")]
{
// regex columns are only pattern matched in simple expressions
// (a single root column, so no binary expressions)
let roots = expr_to_root_column_names(&expr);
if roots.len() == 1 {
let name = &**roots[0];
if name.starts_with('^') && name.ends_with('$') {
replace_regex(&expr, &mut result, schema, name)
} else {
let expr = rewrite_keep_name(expr);
result.push(expr)
}
} else {
let expr = rewrite_keep_name(expr);
result.push(expr)
}
}
#[cfg(not(feature = "regex"))]
{
let expr = rewrite_keep_name(expr);
result.push(expr)
}
};
}
result
Expand Down
15 changes: 15 additions & 0 deletions polars/polars-lazy/src/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1380,3 +1380,18 @@ fn test_exclude() -> Result<()> {
assert_eq!(out.get_column_names(), &["a", "c"]);
Ok(())
}

/// Selecting with `col("^a.*o.*$")` must keep exactly the columns whose names
/// match the regular expression, in schema order.
#[test]
#[cfg(feature = "regex")]
fn test_regex_selection() -> Result<()> {
    let frame = df![
        "anton" => [1, 2, 3],
        "arnold schwars" => [1, 2, 3],
        "annie" => [1, 2, 3]
    ]?;

    // "annie" contains no `o`, so it must be dropped by the selection.
    let selection = vec![col("^a.*o.*$")];
    let selected = frame.lazy().select(selection).collect()?;

    let expected = ["anton", "arnold schwars"];
    assert_eq!(selected.get_column_names(), &expected);
    Ok(())
}
1 change: 1 addition & 0 deletions polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
//! The opt-in features are (not including dtype features):
//!
//! * `lazy` - Lazy API
//! - `lazy_regex` - Use regexes in [column selection](crate::lazy::dsl::col)
//! * `random` - Generate arrays with randomly sampled values
//! * `ndarray`- Convert from `DataFrame` to `ndarray`
//! * `temporal` - Conversions between [Chrono](https://docs.rs/chrono/) and Polars for temporal data types
Expand Down
1 change: 1 addition & 0 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ features = [
"reinterpret",
"decompress",
"mode",
"extract_jsonpath"
"extract_jsonpath",
"lazy_regex"
]

#[patch.crates-io]
Expand Down
70 changes: 70 additions & 0 deletions py-polars/polars/lazy/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,76 @@
def col(name: str) -> "pl.Expr":
    """
    A column in a DataFrame.

    Can be used to select:

    * a single column by name
    * all columns by using a wildcard `"*"`
    * columns by regular expression if the regex starts with `^` and ends with `$`

    Parameters
    ----------
    name
        A string that holds the name of the column, the wildcard `"*"`,
        or a regular expression delimited by `^` and `$`.

    Examples
    --------
    >>> df = pl.DataFrame({
    ...     "ham": [1, 2, 3],
    ...     "hamburger": [11, 22, 33],
    ...     "foo": [3, 2, 1]})
    >>> df.select(col("foo"))
    shape: (3, 1)
    ╭─────╮
    │ foo │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 3   │
    ├╌╌╌╌╌┤
    │ 2   │
    ├╌╌╌╌╌┤
    │ 1   │
    ╰─────╯
    >>> df.select(col("*"))
    shape: (3, 3)
    ╭─────┬───────────┬─────╮
    │ ham ┆ hamburger ┆ foo │
    │ --- ┆ ---       ┆ --- │
    │ i64 ┆ i64       ┆ i64 │
    ╞═════╪═══════════╪═════╡
    │ 1   ┆ 11        ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 22        ┆ 2   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 33        ┆ 1   │
    ╰─────┴───────────┴─────╯
    >>> df.select(col("^ham.*$"))
    shape: (3, 2)
    ╭─────┬───────────╮
    │ ham ┆ hamburger │
    │ --- ┆ ---       │
    │ i64 ┆ i64       │
    ╞═════╪═══════════╡
    │ 1   ┆ 11        │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2   ┆ 22        │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
    │ 3   ┆ 33        │
    ╰─────┴───────────╯
    >>> df.select(col("*").exclude("ham"))
    shape: (3, 2)
    ╭───────────┬─────╮
    │ hamburger ┆ foo │
    │ ---       ┆ --- │
    │ i64       ┆ i64 │
    ╞═══════════╪═════╡
    │ 11        ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ 22        ┆ 2   │
    ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ 33        ┆ 1   │
    ╰───────────┴─────╯
    """
    # Build the native column expression first, then wrap it in the Python Expr type.
    native_expr = pycol(name)
    return pl.lazy.expr.wrap_expr(native_expr)

Expand Down
13 changes: 13 additions & 0 deletions py-polars/tests/test_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,3 +338,16 @@ def test_lazy_columns():
).lazy()

assert df.select(["a", "c"]).columns == ["a", "c"]


def test_regex_selection():
    """Selecting with a `^...$` pattern keeps only the matching columns, in frame order."""
    data = {"foo": [1], "fooey": [1], "foobar": [1], "bar": [1]}
    lazy_frame = pl.DataFrame(data).lazy()

    # "bar" does not start with "foo", so it must be excluded.
    selected = lazy_frame.select([col("^foo.*$")])

    assert selected.columns == ["foo", "fooey", "foobar"]

0 comments on commit 3330fb0

Please sign in to comment.