Skip to content

Commit

Permalink
fix accidentally slow cross join (#3980)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jul 11, 2022
1 parent 3c30eeb commit 5f5acb2
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 7 deletions.
23 changes: 18 additions & 5 deletions polars/polars-core/src/frame/cross_join.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::prelude::*;
use crate::utils::{concat_df, CustomIterTools, NoNull};
use crate::series::IsSorted;
use crate::utils::{concat_df_unchecked, CustomIterTools, NoNull};
use crate::POOL;

impl DataFrame {
Expand All @@ -12,22 +13,34 @@ impl DataFrame {
// the left side has the Nth row combined with every row from right.
// So let's say we have the following no. of rows
// left: 3
// right: .as_slice()4
// right: 4
//
// left take idx: 000011112222
// right take idx: 012301230123

let create_left_df = || {
let take_left: NoNull<IdxCa> =
let mut take_left: NoNull<IdxCa> =
(0..total_rows).map(|i| i / n_rows_right).collect_trusted();
take_left.set_sorted2(IsSorted::Ascending);
// Safety:
// take left is in bounds
unsafe { self.take_unchecked(&take_left.into_inner()) }
};

let create_right_df = || {
let iter = (0..n_rows_left).map(|_| other);
concat_df(iter).unwrap()
// concatenation of dataframes is very expensive if we need to make the series mutable
// many times, these are atomic operations
// so we choose a different strategy at > 100 rows (arbitrarily small number)
if n_rows_left > 100 {
let take_right: NoNull<IdxCa> =
(0..total_rows).map(|i| i % n_rows_right).collect_trusted();
// Safety:
// take right is in bounds
unsafe { self.take_unchecked(&take_right.into_inner()) }
} else {
let iter = (0..n_rows_left).map(|_| other);
concat_df_unchecked(iter)
}
};

let (l_df, r_df) = POOL.install(|| rayon::join(create_left_df, create_right_df));
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use crate::prelude::*;
use crate::utils::{get_supertype, split_ca, split_df, NoNull};

#[cfg(feature = "describe")]
use crate::utils::concat_df;
use crate::utils::concat_df_unchecked;

#[cfg(feature = "dataframe_arithmetic")]
mod arithmetic;
Expand Down Expand Up @@ -2395,7 +2395,7 @@ impl DataFrame {
tmp.push(describe_cast(&self.max()));
headers.push("max".to_string());

let mut summary = concat_df(&tmp).expect("unable to create dataframe");
let mut summary = concat_df_unchecked(&tmp);

summary
.insert_at_idx(0, Series::new("describe", headers))
Expand Down
13 changes: 13 additions & 0 deletions polars/polars-core/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -823,6 +823,19 @@ where
Ok(acc_df)
}

/// Concat the DataFrames to a single DataFrame.
pub fn concat_df_unchecked<'a, I>(dfs: I) -> DataFrame
where
I: IntoIterator<Item = &'a DataFrame>,
{
let mut iter = dfs.into_iter();
let mut acc_df = iter.next().unwrap().clone();
for df in iter {
acc_df.vstack_mut_unchecked(df);
}
acc_df
}

pub fn accumulate_dataframes_horizontal(dfs: Vec<DataFrame>) -> Result<DataFrame> {
let mut iter = dfs.into_iter();
let mut acc_df = iter.next().unwrap();
Expand Down

0 comments on commit 5f5acb2

Please sign in to comment.