Skip to content

Commit

Permalink
fix sort by expressions
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jan 19, 2022
1 parent d07fdc1 commit a309e4b
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 37 deletions.
38 changes: 18 additions & 20 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1236,15 +1236,8 @@ impl DataFrame {
/// assert_eq!(df["Hydrogen"], sv[1]);
/// # Ok::<(), PolarsError>(())
/// ```
pub fn select_series<I, S>(&self, selection: I) -> Result<Vec<Series>>
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let cols = selection
.into_iter()
.map(|s| s.as_ref().to_string())
.collect::<Vec<_>>();
pub fn select_series(&self, selection: impl IntoVec<String>) -> Result<Vec<Series>> {
let cols = selection.into_vec();
self.select_series_impl(&cols)
}

Expand Down Expand Up @@ -1580,21 +1573,25 @@ impl DataFrame {
}

/// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
fn sort_impl(&self, by_column: Vec<String>, reverse: Vec<bool>) -> Result<Self> {
#[cfg(feature = "private")]
pub fn sort_impl(&self, by_column: Vec<Series>, reverse: Vec<bool>) -> Result<Self> {
// note that the by_column argument also contains evaluated expression from polars-lazy
// that may not even be present in this dataframe.

// therefore when we try to set the first columns as sorted, we ignore the error
// as expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i.
let first_reverse = reverse[0];
let first_by_column = &by_column[0];
let first_by_column = by_column[0].name().to_string();
let take = match by_column.len() {
1 => {
let s = self.column(&by_column[0])?;
let s = &by_column[0];
s.argsort(reverse[0])
}
_ => {
#[cfg(feature = "sort_multiple")]
{
let columns = self.select_series(&by_column)?;

let (first, columns, reverse) = prepare_argsort(columns, reverse)?;
first.argsort_multiple(&columns, &reverse)?
let (first, by_column, reverse) = prepare_argsort(by_column, reverse)?;
first.argsort_multiple(&by_column, &reverse)?
}
#[cfg(not(feature = "sort_multiple"))]
{
Expand All @@ -1610,13 +1607,14 @@ impl DataFrame {
unsafe { self.take_unchecked(&take) }
};
// Mark the first sort column as sorted
df.apply(first_by_column, |s| {
// if the column did not exists it is ok, because we sorted by an expression
// not present in the dataframe
let _ = df.apply(&first_by_column, |s| {
let mut s = s.clone();
let inner = s.get_inner_mut();
inner.set_sorted(first_reverse);
s
})
.expect("column is present");
});
Ok(df)
}

Expand All @@ -1640,7 +1638,7 @@ impl DataFrame {
reverse: impl IntoVec<bool>,
) -> Result<Self> {
// we do this heap allocation and dispatch to reduce monomorphization bloat
let by_column = by_column.into_vec();
let by_column = self.select_series(by_column)?;
let reverse = reverse.into_vec();
self.sort_impl(by_column, reverse)
}
Expand Down
32 changes: 15 additions & 17 deletions polars/polars-lazy/src/physical_plan/executors/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,25 @@ pub(crate) struct SortExec {

impl Executor for SortExec {
fn execute(&mut self, state: &ExecutionState) -> Result<DataFrame> {
let mut df = self.input.execute(state)?;
let df = self.input.execute(state)?;

let by_columns = self
.by_column
.iter()
.map(|e| e.evaluate(&df, state))
.enumerate()
.map(|(i, e)| {
let mut s = e.evaluate(&df, state)?;
// polars core will try to set the sorted columns as sorted
// this should only be done with simple col("foo") expressions
// therefore we rename more complex expressions so that
// polars core does not match these
if !matches!(e.as_expression(), Expr::Column(_)) {
s.rename(&format!("_POLARS_SORT_BY_{}", i));
}
Ok(s)
})
.collect::<Result<Vec<_>>>()?;
let mut column_names = Vec::with_capacity(by_columns.len());
// replace the columns in the DataFrame with the expressions
// for col("foo") this is redundant
// for col("foo").reverse() this is not
for column in by_columns {
let name = column.name();
column_names.push(name.to_string());
// if error, expression create a new named column and we must add it to the DataFrame
// if ok, we have replaced the column with the expression eval
if df.apply(name, |_| column.clone()).is_err() {
df.hstack(&[column])?;
}
}

df.sort(&column_names, std::mem::take(&mut self.reverse))
let reverse = std::mem::take(&mut self.reverse);
df.sort_impl(by_columns, reverse)
}
}
8 changes: 8 additions & 0 deletions py-polars/tests/test_sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,11 @@ def test_sort_in_place() -> None:
expected = [1, 2, 3, 4, 5]
assert result == expected
assert ret is None


def test_sort_by_exprs() -> None:
# make sure that the expression does not overwrite columns in the dataframe
df = pl.DataFrame({"a": [1, 2, -1, -2]})
out = df.sort(pl.col("a").abs()).to_series()

assert out.to_list() == [1, -1, 2, -2]

0 comments on commit a309e4b

Please sign in to comment.