exclude by dtype

pola-rs · Dec 12, 2021 · ac0cefa · ac0cefa
1 parent 66d3c8c
commit ac0cefa
Show file tree

Hide file tree

Showing 7 changed files with 118 additions and 16 deletions.
diff --git a/polars/polars-lazy/src/dsl.rs b/polars/polars-lazy/src/dsl.rs
@@ -341,7 +341,7 @@ pub enum Expr {
         output_field: NoEq<Arc<dyn BinaryUdfOutputField>>,
     },
     /// Can be used in a select statement to exclude a column from selection
-    Exclude(Box<Expr>, Vec<Arc<str>>),
+    Exclude(Box<Expr>, Vec<Excluded>),
     /// Set root name as Alias
     KeepName(Box<Expr>),
     SufPreFix {
@@ -351,6 +351,12 @@ pub enum Expr {
     },
 }
 
+/// A single exclusion criterion stored in `Expr::Exclude`.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Excluded {
+    /// Exclude by column name.
+    Name(Arc<str>),
+    /// Exclude every column whose data type equals the given one.
+    Dtype(DataType),
+}
+
 impl Expr {
     /// Get Field result of the expression. The schema is the input data.
     pub(crate) fn to_field(&self, schema: &Schema, ctxt: Context) -> Result<Field> {
@@ -1476,7 +1482,16 @@ impl Expr {
         let v = columns
             .to_selection_vec()
             .iter()
-            .map(|s| Arc::from(*s))
+            .map(|s| Excluded::Name(Arc::from(*s)))
+            .collect();
+        Expr::Exclude(Box::new(self), v)
+    }
+
+    /// Exclude columns from the selection by data type.
+    ///
+    /// Wraps `self` in an `Expr::Exclude` holding one `Excluded::Dtype` entry per
+    /// element of `dtypes`.
+    pub fn exclude_dtype<D: AsRef<[DataType]>>(self, dtypes: D) -> Expr {
+        let v = dtypes
+            .as_ref()
+            .iter()
+            .cloned()
+            .map(Excluded::Dtype)
             .collect();
         Expr::Exclude(Box::new(self), v)
     }

diff --git a/polars/polars-lazy/src/logical_plan/projection.rs b/polars/polars-lazy/src/logical_plan/projection.rs
@@ -158,27 +158,50 @@ fn expand_dtypes(expr: &Expr, result: &mut Vec<Expr>, schema: &Schema, dtypes: &
 fn prepare_excluded(expr: &Expr, schema: &Schema) -> Vec<Arc<str>> {
     let mut exclude = vec![];
     expr.into_iter().for_each(|e| {
-        if let Expr::Exclude(_, names) = e {
+        if let Expr::Exclude(_, to_exclude) = e {
             #[cfg(feature = "regex")]
             {
                 // instead of matching the names for regex patterns
                 // and expanding the matches in the schema we
                 // reuse the `replace_regex` function. This is a bit
                 // slower but DRY.
                 let mut buf = vec![];
-                for name in names {
-                    let e = Expr::Column(name.clone());
-                    replace_regex(&e, &mut buf, schema);
-                    for col in buf.drain(..) {
-                        if let Expr::Column(name) = col {
-                            exclude.push(name)
+                for to_exclude_single in to_exclude {
+                    match to_exclude_single {
+                        Excluded::Name(name) => {
+                            let e = Expr::Column(name.clone());
+                            replace_regex(&e, &mut buf, schema);
+                            for col in buf.drain(..) {
+                                if let Expr::Column(name) = col {
+                                    exclude.push(name)
+                                }
+                            }
+                        }
+                        Excluded::Dtype(dt) => {
+                            for fld in schema.fields() {
+                                if fld.data_type() == dt {
+                                    exclude.push(Arc::from(fld.name().as_ref()))
+                                }
+                            }
                         }
                     }
                 }
             }
+
             #[cfg(not(feature = "regex"))]
             {
-                exclude.extend_from_slice(names)
+                // Without regex support names are taken literally; dtypes are resolved
+                // against the schema into the names of the matching columns.
+                for to_exclude_single in to_exclude {
+                    match to_exclude_single {
+                        Excluded::Name(name) => exclude.push(name.clone()),
+                        Excluded::Dtype(dt) => {
+                            for fld in schema.fields() {
+                                // Compare by value. `matches!(fld.data_type(), dt)` would
+                                // treat `dt` as a new catch-all binding and exclude
+                                // every column in the schema.
+                                if fld.data_type() == dt {
+                                    exclude.push(Arc::from(fld.name().as_ref()))
+                                }
+                            }
+                        }
+                    }
+                }
             }
         }
     });

diff --git a/py-polars/polars/internals/expr.py b/py-polars/polars/internals/expr.py
@@ -283,7 +283,10 @@ def alias(self, name: str) -> "Expr":
         """
         return wrap_expr(self._pyexpr.alias(name))
 
-    def exclude(self, columns: Union[str, tp.List[str]]) -> "Expr":
+    def exclude(
+        self,
+        columns: Union[str, tp.List[str], Type[DataType], Sequence[Type[DataType]]],
+    ) -> "Expr":
         """
         Exclude certain columns from a wildcard/regex selection.
 
@@ -292,7 +295,11 @@ def exclude(self, columns: Union[str, tp.List[str]]) -> "Expr":
         Parameters
         ----------
         columns
-            Column(s) to exclude from selection
+            Column(s) to exclude from selection.
+            This can be:
+                - a column name, or multiple names
+                - a regular expression starting with `^` and ending with `$`
+                - a dtype or multiple dtypes
 
         Examples
         --------
@@ -335,8 +342,19 @@ def exclude(self, columns: Union[str, tp.List[str]]) -> "Expr":
 
         """
         if isinstance(columns, str):
-            columns = [columns]
-        return wrap_expr(self._pyexpr.exclude(columns))
+            # a single column name (or regex pattern)
+            return wrap_expr(self._pyexpr.exclude([columns]))
+        if isinstance(columns, type) and issubclass(columns, DataType):
+            # a single dtype class
+            return wrap_expr(self._pyexpr.exclude_dtype([columns]))
+
+        # From here on `columns` must be a sequence. It has to be homogeneous because
+        # names and dtypes are dispatched to different backend methods.
+        columns = list(columns)  # type: ignore
+        if all(isinstance(a, str) for a in columns):
+            return wrap_expr(self._pyexpr.exclude(columns))
+        # `isinstance(a, type)` guards `issubclass`, which raises TypeError on non-classes
+        if all(isinstance(a, type) and issubclass(a, DataType) for a in columns):
+            return wrap_expr(self._pyexpr.exclude_dtype(columns))
+        raise ValueError("input should be all string or all DataType")
 
     def keep_name(self) -> "Expr":
         """
@@ -2408,6 +2426,9 @@ def __init__(self, expr: Expr):
 
     def buckets(self, interval: timedelta) -> Expr:
         """
+        .. warning::
+            This API is experimental and will likely change.
+
         Divide the date/ datetime range into buckets.
         Data will be sorted by this operation.
 

diff --git a/py-polars/polars/internals/frame.py b/py-polars/polars/internals/frame.py
@@ -2259,6 +2259,10 @@ def groupby(
 
     def downsample(self, by: Union[str, tp.List[str]], rule: str, n: int) -> "GroupBy":
         """
+
+        .. deprecated:: 0.11.0
+            Use `buckets` expression instead
+
         Start a downsampling groupby operation.
 
         Parameters
@@ -3786,6 +3790,8 @@ def apply(self, f: Callable[[DataFrame], DataFrame]) -> DataFrame:
         """
         Apply a function over the groups as a sub-DataFrame.
 
+        Beware, this is slow.
+
         Parameters
         ----------
         f

diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs
@@ -204,6 +204,35 @@ impl ToPyObject for Wrap<DataType> {
     }
 }
 
+/// Converts a Python-side polars dtype class into the Rust `DataType`.
+///
+/// The class is identified by its `repr()` string. `List` always maps to
+/// `List(Boolean)` and `Object` to `Object("unknown")`: the repr carries no
+/// inner-type information.
+///
+/// # Errors
+/// Returns the Python error raised by `repr()`/string decoding, or a `TypeError`
+/// when the object is not one of the known dtype classes.
+impl FromPyObject<'_> for Wrap<DataType> {
+    fn extract(ob: &PyAny) -> PyResult<Self> {
+        // Both calls are fallible on arbitrary Python objects; propagate the error
+        // to the caller instead of unwinding across the FFI boundary.
+        let dtype = match ob.repr()?.to_str()? {
+            "<class 'polars.datatypes.UInt8'>" => DataType::UInt8,
+            "<class 'polars.datatypes.UInt16'>" => DataType::UInt16,
+            "<class 'polars.datatypes.UInt32'>" => DataType::UInt32,
+            "<class 'polars.datatypes.UInt64'>" => DataType::UInt64,
+            "<class 'polars.datatypes.Int8'>" => DataType::Int8,
+            "<class 'polars.datatypes.Int16'>" => DataType::Int16,
+            "<class 'polars.datatypes.Int32'>" => DataType::Int32,
+            "<class 'polars.datatypes.Int64'>" => DataType::Int64,
+            "<class 'polars.datatypes.Utf8'>" => DataType::Utf8,
+            "<class 'polars.datatypes.List'>" => DataType::List(Box::new(DataType::Boolean)),
+            "<class 'polars.datatypes.Boolean'>" => DataType::Boolean,
+            "<class 'polars.datatypes.Categorical'>" => DataType::Categorical,
+            "<class 'polars.datatypes.Date'>" => DataType::Date,
+            "<class 'polars.datatypes.Datetime'>" => DataType::Datetime,
+            "<class 'polars.datatypes.Float32'>" => DataType::Float32,
+            "<class 'polars.datatypes.Float64'>" => DataType::Float64,
+            "<class 'polars.datatypes.Object'>" => DataType::Object("unknown"),
+            dt => {
+                // Unknown input is a caller error, not a broken invariant.
+                return Err(pyo3::exceptions::PyTypeError::new_err(format!(
+                    "{} not expected in python dtype to rust dtype conversion",
+                    dt
+                )))
+            }
+        };
+        Ok(Wrap(dtype))
+    }
+}
+
 impl ToPyObject for Wrap<AnyValue<'_>> {
     fn to_object(&self, py: Python) -> PyObject {
         self.clone().into_py(py)

diff --git a/py-polars/src/lazy/dsl.rs b/py-polars/src/lazy/dsl.rs
@@ -1,5 +1,5 @@
 use super::apply::*;
-use crate::conversion::str_to_null_behavior;
+use crate::conversion::{str_to_null_behavior, Wrap};
 use crate::lazy::map_single;
 use crate::lazy::utils::py_exprs_to_exprs;
 use crate::prelude::{parse_strategy, str_to_rankmethod};
@@ -703,6 +703,12 @@ impl PyExpr {
     pub fn exclude(&self, columns: Vec<String>) -> PyExpr {
         self.inner.clone().exclude(&columns).into()
     }
+    /// Exclude columns of the given dtypes from the selection of this expression.
+    pub fn exclude_dtype(&self, dtypes: Vec<Wrap<DataType>>) -> PyExpr {
+        // Unwrap element-wise instead of transmuting the whole `Vec`: the layout of
+        // `Vec<T>` is unspecified, so `Vec<Wrap<T>>` -> `Vec<T>` through `transmute`
+        // is not guaranteed sound even though `Wrap` is a transparent newtype.
+        let dtypes: Vec<DataType> = dtypes.into_iter().map(|wrapped| wrapped.0).collect();
+        self.inner.clone().exclude_dtype(&dtypes).into()
+    }
     pub fn interpolate(&self) -> PyExpr {
         self.inner.clone().interpolate().into()
     }

diff --git a/py-polars/tests/test_lazy.py b/py-polars/tests/test_lazy.py
@@ -528,9 +528,11 @@ def test_regex_selection() -> None:
 
 
 def test_exclude_selection() -> None:
-    df = pl.DataFrame({"a": [1], "b": [1], "c": [1]}).lazy()
+    # "c" holds a bool while "a" and "b" hold ints, so dtype exclusion has a column to drop
+    df = pl.DataFrame({"a": [1], "b": [1], "c": [True]}).lazy()
 
+    # exclusion by column name
     assert df.select([pl.exclude("a")]).columns == ["b", "c"]
+    # exclusion by a single dtype and by a list of dtypes
+    assert df.select(pl.all().exclude(pl.Boolean)).columns == ["a", "b"]
+    assert df.select(pl.all().exclude([pl.Boolean])).columns == ["a", "b"]
 
 
 def test_col_series_selection() -> None: