fix sort by expressions

pola-rs · Jan 19, 2022 · a309e4b · a309e4b
1 parent d07fdc1
commit a309e4b
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 37 deletions.
diff --git a/polars/polars-core/src/frame/mod.rs b/polars/polars-core/src/frame/mod.rs
@@ -1236,15 +1236,8 @@ impl DataFrame {
     /// assert_eq!(df["Hydrogen"], sv[1]);
     /// # Ok::<(), PolarsError>(())
     /// ```
-    pub fn select_series<I, S>(&self, selection: I) -> Result<Vec<Series>>
-    where
-        I: IntoIterator<Item = S>,
-        S: AsRef<str>,
-    {
-        let cols = selection
-            .into_iter()
-            .map(|s| s.as_ref().to_string())
-            .collect::<Vec<_>>();
+    pub fn select_series(&self, selection: impl IntoVec<String>) -> Result<Vec<Series>> {
+        let cols = selection.into_vec();
         self.select_series_impl(&cols)
     }
 
@@ -1580,21 +1573,25 @@ impl DataFrame {
     }
 
     /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
-    fn sort_impl(&self, by_column: Vec<String>, reverse: Vec<bool>) -> Result<Self> {
+    #[cfg(feature = "private")]
+    pub fn sort_impl(&self, by_column: Vec<Series>, reverse: Vec<bool>) -> Result<Self> {
+        // note that the by_column argument also contains evaluated expression from polars-lazy
+        // that may not even be present in this dataframe.
+
+        // therefore when we try to set the first columns as sorted, we ignore the error
+        // as expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i.
         let first_reverse = reverse[0];
-        let first_by_column = &by_column[0];
+        let first_by_column = by_column[0].name().to_string();
         let take = match by_column.len() {
             1 => {
-                let s = self.column(&by_column[0])?;
+                let s = &by_column[0];
                 s.argsort(reverse[0])
             }
             _ => {
                 #[cfg(feature = "sort_multiple")]
                 {
-                    let columns = self.select_series(&by_column)?;
-
-                    let (first, columns, reverse) = prepare_argsort(columns, reverse)?;
-                    first.argsort_multiple(&columns, &reverse)?
+                    let (first, by_column, reverse) = prepare_argsort(by_column, reverse)?;
+                    first.argsort_multiple(&by_column, &reverse)?
                 }
                 #[cfg(not(feature = "sort_multiple"))]
                 {
@@ -1610,13 +1607,14 @@ impl DataFrame {
             unsafe { self.take_unchecked(&take) }
         };
         // Mark the first sort column as sorted
-        df.apply(first_by_column, |s| {
+        // if the column did not exists it is ok, because we sorted by an expression
+        // not present in the dataframe
+        let _ = df.apply(&first_by_column, |s| {
             let mut s = s.clone();
             let inner = s.get_inner_mut();
             inner.set_sorted(first_reverse);
             s
-        })
-        .expect("column is present");
+        });
         Ok(df)
     }
 
@@ -1640,7 +1638,7 @@ impl DataFrame {
         reverse: impl IntoVec<bool>,
     ) -> Result<Self> {
         // we do this heap allocation and dispatch to reduce monomorphization bloat
-        let by_column = by_column.into_vec();
+        let by_column = self.select_series(by_column)?;
         let reverse = reverse.into_vec();
         self.sort_impl(by_column, reverse)
     }

diff --git a/polars/polars-lazy/src/physical_plan/executors/sort.rs b/polars/polars-lazy/src/physical_plan/executors/sort.rs
@@ -10,27 +10,25 @@ pub(crate) struct SortExec {
 
 impl Executor for SortExec {
     fn execute(&mut self, state: &ExecutionState) -> Result<DataFrame> {
-        let mut df = self.input.execute(state)?;
+        let df = self.input.execute(state)?;
 
         let by_columns = self
             .by_column
             .iter()
-            .map(|e| e.evaluate(&df, state))
+            .enumerate()
+            .map(|(i, e)| {
+                let mut s = e.evaluate(&df, state)?;
+                // polars core will try to set the sorted columns as sorted
+                // this should only be done with simple col("foo") expressions
+                // therefore we rename more complex expressions so that
+                // polars core does not match these
+                if !matches!(e.as_expression(), Expr::Column(_)) {
+                    s.rename(&format!("_POLARS_SORT_BY_{}", i));
+                }
+                Ok(s)
+            })
             .collect::<Result<Vec<_>>>()?;
-        let mut column_names = Vec::with_capacity(by_columns.len());
-        // replace the columns in the DataFrame with the expressions
-        // for col("foo") this is redundant
-        // for col("foo").reverse() this is not
-        for column in by_columns {
-            let name = column.name();
-            column_names.push(name.to_string());
-            // if error, expression create a new named column and we must add it to the DataFrame
-            // if ok, we have replaced the column with the expression eval
-            if df.apply(name, |_| column.clone()).is_err() {
-                df.hstack(&[column])?;
-            }
-        }
-
-        df.sort(&column_names, std::mem::take(&mut self.reverse))
+        let reverse = std::mem::take(&mut self.reverse);
+        df.sort_impl(by_columns, reverse)
     }
 }
diff --git a/py-polars/tests/test_sort.py b/py-polars/tests/test_sort.py
@@ -62,3 +62,11 @@ def test_sort_in_place() -> None:
     expected = [1, 2, 3, 4, 5]
     assert result == expected
     assert ret is None
+
+
+def test_sort_by_exprs() -> None:
+    # make sure that the expression does not overwrite columns in the dataframe
+    df = pl.DataFrame({"a": [1, 2, -1, -2]})
+    out = df.sort(pl.col("a").abs()).to_series()
+
+    assert out.to_list() == [1, -1, 2, -2]