Skip to content

Commit

Permalink
exclude by dtype
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 12, 2021
1 parent 66d3c8c commit ac0cefa
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 16 deletions.
19 changes: 17 additions & 2 deletions polars/polars-lazy/src/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ pub enum Expr {
output_field: NoEq<Arc<dyn BinaryUdfOutputField>>,
},
/// Can be used in a select statement to exclude a column from selection
Exclude(Box<Expr>, Vec<Arc<str>>),
Exclude(Box<Expr>, Vec<Excluded>),
/// Set root name as Alias
KeepName(Box<Expr>),
SufPreFix {
Expand All @@ -351,6 +351,12 @@ pub enum Expr {
},
}

/// A single exclusion criterion carried by `Expr::Exclude`.
#[derive(Debug, Clone, PartialEq)]
pub enum Excluded {
/// Exclude by column name.
Name(Arc<str>),
/// Exclude by data type.
Dtype(DataType),
}

impl Expr {
/// Get Field result of the expression. The schema is the input data.
pub(crate) fn to_field(&self, schema: &Schema, ctxt: Context) -> Result<Field> {
Expand Down Expand Up @@ -1476,7 +1482,16 @@ impl Expr {
let v = columns
.to_selection_vec()
.iter()
.map(|s| Arc::from(*s))
.map(|s| Excluded::Name(Arc::from(*s)))
.collect();
Expr::Exclude(Box::new(self), v)
}

pub fn exclude_dtype<D: AsRef<[DataType]>>(self, dtypes: D) -> Expr {
let v = dtypes
.as_ref()
.iter()
.map(|dt| Excluded::Dtype(dt.clone()))
.collect();
Expr::Exclude(Box::new(self), v)
}
Expand Down
39 changes: 31 additions & 8 deletions polars/polars-lazy/src/logical_plan/projection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,27 +158,50 @@ fn expand_dtypes(expr: &Expr, result: &mut Vec<Expr>, schema: &Schema, dtypes: &
fn prepare_excluded(expr: &Expr, schema: &Schema) -> Vec<Arc<str>> {
let mut exclude = vec![];
expr.into_iter().for_each(|e| {
if let Expr::Exclude(_, names) = e {
if let Expr::Exclude(_, to_exclude) = e {
#[cfg(feature = "regex")]
{
// instead of matching the names for regex patterns
// and expanding the matches in the schema we
// reuse the `replace_regex` function. This is a bit
// slower but DRY.
let mut buf = vec![];
for name in names {
let e = Expr::Column(name.clone());
replace_regex(&e, &mut buf, schema);
for col in buf.drain(..) {
if let Expr::Column(name) = col {
exclude.push(name)
for to_exclude_single in to_exclude {
match to_exclude_single {
Excluded::Name(name) => {
let e = Expr::Column(name.clone());
replace_regex(&e, &mut buf, schema);
for col in buf.drain(..) {
if let Expr::Column(name) = col {
exclude.push(name)
}
}
}
Excluded::Dtype(dt) => {
for fld in schema.fields() {
if fld.data_type() == dt {
exclude.push(Arc::from(fld.name().as_ref()))
}
}
}
}
}
}

#[cfg(not(feature = "regex"))]
{
exclude.extend_from_slice(names)
for to_exclude_single in to_exclude {
match to_exclude_single {
Excluded::Name(name) => exclude.push(name.clone()),
Excluded::Dtype(dt) => {
    // Compare by value. `matches!(fld.data_type(), dt)` would treat `dt` as a
    // fresh catch-all binding in the pattern and therefore exclude every
    // column of the schema, regardless of its data type.
    for fld in schema.fields() {
        if fld.data_type() == dt {
            exclude.push(Arc::from(fld.name().as_ref()))
        }
    }
}
}
}
}
}
});
Expand Down
29 changes: 25 additions & 4 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,10 @@ def alias(self, name: str) -> "Expr":
"""
return wrap_expr(self._pyexpr.alias(name))

def exclude(self, columns: Union[str, tp.List[str]]) -> "Expr":
def exclude(
self,
columns: Union[str, tp.List[str], Type[DataType], Sequence[Type[DataType]]],
) -> "Expr":
"""
Exclude certain columns from a wildcard/regex selection.
Expand All @@ -292,7 +295,11 @@ def exclude(self, columns: Union[str, tp.List[str]]) -> "Expr":
Parameters
----------
columns
Column(s) to exclude from selection
Column(s) to exclude from selection.
This can be:
- a column name, or multiple names
- a regular expression starting with `^` and ending with `$`
- a dtype or multiple dtypes
Examples
--------
Expand Down Expand Up @@ -335,8 +342,19 @@ def exclude(self, columns: Union[str, tp.List[str]]) -> "Expr":
"""
if isinstance(columns, str):
columns = [columns]
return wrap_expr(self._pyexpr.exclude(columns))
columns = [columns] # type: ignore
return wrap_expr(self._pyexpr.exclude(columns))
elif not isinstance(columns, list) and issubclass(columns, DataType): # type: ignore
columns = [columns] # type: ignore
return wrap_expr(self._pyexpr.exclude_dtype(columns))

if not all([isinstance(a, str) or issubclass(a, DataType) for a in columns]): # type: ignore
raise ValueError("input should be all string or all DataType")

if isinstance(columns[0], str): # type: ignore
return wrap_expr(self._pyexpr.exclude(columns))
else:
return wrap_expr(self._pyexpr.exclude_dtype(columns))

def keep_name(self) -> "Expr":
"""
Expand Down Expand Up @@ -2408,6 +2426,9 @@ def __init__(self, expr: Expr):

def buckets(self, interval: timedelta) -> Expr:
"""
.. warning::
This API is experimental and will likely change.
Divide the date/datetime range into buckets.
Data will be sorted by this operation.
Expand Down
6 changes: 6 additions & 0 deletions py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2259,6 +2259,10 @@ def groupby(

def downsample(self, by: Union[str, tp.List[str]], rule: str, n: int) -> "GroupBy":
"""
.. deprecated:: 0.11.0
Use `buckets` expression instead
Start a downsampling groupby operation.
Parameters
Expand Down Expand Up @@ -3786,6 +3790,8 @@ def apply(self, f: Callable[[DataFrame], DataFrame]) -> DataFrame:
"""
Apply a function over the groups as a sub-DataFrame.
Beware, this is slow.
Parameters
----------
f
Expand Down
29 changes: 29 additions & 0 deletions py-polars/src/conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,35 @@ impl ToPyObject for Wrap<DataType> {
}
}

impl FromPyObject<'_> for Wrap<DataType> {
fn extract(ob: &PyAny) -> PyResult<Self> {
let dtype = match ob.repr().unwrap().to_str().unwrap() {
"<class 'polars.datatypes.UInt8'>" => DataType::UInt8,
"<class 'polars.datatypes.UInt16'>" => DataType::UInt16,
"<class 'polars.datatypes.UInt32'>" => DataType::UInt32,
"<class 'polars.datatypes.UInt64'>" => DataType::UInt64,
"<class 'polars.datatypes.Int8'>" => DataType::Int8,
"<class 'polars.datatypes.Int16'>" => DataType::Int16,
"<class 'polars.datatypes.Int32'>" => DataType::Int32,
"<class 'polars.datatypes.Int64'>" => DataType::Int64,
"<class 'polars.datatypes.Utf8'>" => DataType::Utf8,
"<class 'polars.datatypes.List'>" => DataType::List(Box::new(DataType::Boolean)),
"<class 'polars.datatypes.Boolean'>" => DataType::Boolean,
"<class 'polars.datatypes.Categorical'>" => DataType::Categorical,
"<class 'polars.datatypes.Date'>" => DataType::Date,
"<class 'polars.datatypes.Datetime'>" => DataType::Datetime,
"<class 'polars.datatypes.Float32'>" => DataType::Float32,
"<class 'polars.datatypes.Float64'>" => DataType::Float64,
"<class 'polars.datatypes.Object'>" => DataType::Object("unknown"),
dt => panic!(
"{} not expected in python dtype to rust dtype conversion",
dt
),
};
Ok(Wrap(dtype))
}
}

impl ToPyObject for Wrap<AnyValue<'_>> {
fn to_object(&self, py: Python) -> PyObject {
self.clone().into_py(py)
Expand Down
8 changes: 7 additions & 1 deletion py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::apply::*;
use crate::conversion::str_to_null_behavior;
use crate::conversion::{str_to_null_behavior, Wrap};
use crate::lazy::map_single;
use crate::lazy::utils::py_exprs_to_exprs;
use crate::prelude::{parse_strategy, str_to_rankmethod};
Expand Down Expand Up @@ -703,6 +703,12 @@ impl PyExpr {
/// Exclude the given columns from this expression's selection.
pub fn exclude(&self, columns: Vec<String>) -> PyExpr {
    let expr = self.inner.clone();
    expr.exclude(&columns).into()
}
/// Exclude columns of the given data types from this expression's selection.
pub fn exclude_dtype(&self, dtypes: Vec<Wrap<DataType>>) -> PyExpr {
    // Unwrap element by element instead of transmuting the whole `Vec`: the
    // standard library does not guarantee that `Vec<Wrap<T>>` and `Vec<T>`
    // share a layout, even when `Wrap` itself is transparent.
    // NOTE(review): relies on `Wrap`'s tuple field being visible from this module.
    let dtypes: Vec<DataType> = dtypes.into_iter().map(|dt| dt.0).collect();
    self.inner.clone().exclude_dtype(&dtypes).into()
}
/// Call `interpolate` on a clone of the wrapped expression.
pub fn interpolate(&self) -> PyExpr {
    let expr = self.inner.clone();
    expr.interpolate().into()
}
Expand Down
4 changes: 3 additions & 1 deletion py-polars/tests/test_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,9 +528,11 @@ def test_regex_selection() -> None:


# Checks that `exclude` drops columns selected by name or by dtype.
def test_exclude_selection() -> None:
df = pl.DataFrame({"a": [1], "b": [1], "c": [1]}).lazy()
df = pl.DataFrame({"a": [1], "b": [1], "c": [True]}).lazy()

# exclude by column name
assert df.select([pl.exclude("a")]).columns == ["b", "c"]
# exclude by a single dtype: "c" is the only boolean column
assert df.select(pl.all().exclude(pl.Boolean)).columns == ["a", "b"]
# exclude by a list of dtypes
assert df.select(pl.all().exclude([pl.Boolean])).columns == ["a", "b"]


def test_col_series_selection() -> None:
Expand Down

0 comments on commit ac0cefa

Please sign in to comment.