Skip to content

Commit

Permalink
Make sure that multiple-key joins are done on physically typed keys
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Nov 7, 2021
1 parent 25d5f98 commit b888b20
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 38 deletions.
17 changes: 11 additions & 6 deletions polars/polars-core/src/frame/hash_join/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use unsafe_unwrap::UnsafeUnwrap;
#[cfg(feature = "private")]
pub use self::multiple_keys::private_left_join_multiple_keys;
use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
use crate::utils::series::to_physical;

/// If Categorical types are created without a global string cache or under
/// a different global string cache the mapping will be incorrect.
Expand Down Expand Up @@ -1113,12 +1114,16 @@ impl DataFrame {
self.height()
}
}
// Make sure that the join keys don't have logical types.
// We don't overwrite the originally selected columns, as those might be used to create a column in the new df.
let selected_left_physical = to_physical(&selected_left);
let selected_right_physical = to_physical(&selected_right);

// multiple keys
match how {
JoinType::Inner => {
let left = DataFrame::new_no_checks(selected_left);
let right = DataFrame::new_no_checks(selected_right.clone());
let left = DataFrame::new_no_checks(selected_left_physical);
let right = DataFrame::new_no_checks(selected_right_physical);
let (left, right, swap) = det_hash_prone_order!(left, right);
let join_tuples = inner_join_multiple_keys(&left, &right, swap);

Expand All @@ -1134,8 +1139,8 @@ impl DataFrame {
self.finish_join(df_left, df_right, suffix)
}
JoinType::Left => {
let left = DataFrame::new_no_checks(selected_left);
let right = DataFrame::new_no_checks(selected_right.clone());
let left = DataFrame::new_no_checks(selected_left_physical);
let right = DataFrame::new_no_checks(selected_right_physical);
let join_tuples = left_join_multiple_keys(&left, &right);

let (df_left, df_right) = POOL.join(
Expand All @@ -1152,8 +1157,8 @@ impl DataFrame {
self.finish_join(df_left, df_right, suffix)
}
JoinType::Outer => {
let left = DataFrame::new_no_checks(selected_left.clone());
let right = DataFrame::new_no_checks(selected_right.clone());
let left = DataFrame::new_no_checks(selected_left_physical);
let right = DataFrame::new_no_checks(selected_right_physical);

let (left, right, swap) = det_hash_prone_order!(left, right);
let opt_join_tuples = outer_join_multiple_keys(&left, &right, swap);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
pub(crate) mod series;

use crate::prelude::*;
use crate::POOL;
pub use arrow;
Expand Down
7 changes: 7 additions & 0 deletions polars/polars-core/src/utils/series.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
use crate::prelude::*;

/// Build a new vector holding the physical representation of each input series.
///
/// Every element of `s` is passed through `to_physical_repr()` and the result is
/// turned into an owned `Series`; the output keeps the input order and length.
/// The input slice itself is left untouched.
pub(crate) fn to_physical(s: &[Series]) -> Vec<Series> {
    // One output element per input series, so size the vector up front.
    let mut physical = Vec::with_capacity(s.len());
    for series in s {
        physical.push(series.to_physical_repr().into_owned());
    }
    physical
}
46 changes: 14 additions & 32 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,45 +478,27 @@ def test_join():

def test_joins_dispatch():
# this just flexes the dispatch a bit
dfa = pl.DataFrame(
{
"a": ["a", "b", "c"],
"b": [1, 2, 3],
"date": ["2021-01-01", "2021-01-02", "2021-01-03"],
"datetime": [13241324, 12341256, 12341234],
}
).with_columns(
[pl.col("date").str.strptime(pl.Date), pl.col("datetime").cast(pl.Datetime)]
)
dfa = pl.concat([dfa, dfa], rechunk=False)

dfb = pl.DataFrame(
# don't change the data of this dataframe, this triggered:
# https://github.com/pola-rs/polars/issues/1688
dfa = pl.DataFrame(
{
"a": ["a", "b", "c"],
"b": [1, 2, 3],
"date": ["2021-01-01", "2021-01-02", "2021-01-03"],
"datetime": [13241324, 12341256, 12341234],
"a": ["a", "b", "c", "a"],
"b": [1, 2, 3, 1],
"date": ["2021-01-01", "2021-01-02", "2021-01-03", "2021-01-01"],
"datetime": [13241324, 12341256, 12341234, 13241324],
}
).with_columns(
[pl.col("date").str.strptime(pl.Date), pl.col("datetime").cast(pl.Datetime)]
)

dfa.join(dfb, on=["a", "b", "date", "datetime"], how="left")
dfa.join(dfb, on=["a", "b", "date", "datetime"], how="inner")
dfa.join(dfb, on=["a", "b", "date", "datetime"], how="outer")
dfa.join(dfb, on=["date", "datetime"], how="left")
dfa.join(dfb, on=["date", "datetime"], how="inner")
dfa.join(dfb, on=["date", "datetime"], how="outer")
dfa.join(dfb, on=["date", "datetime", "a"], how="left")
dfa.join(dfb, on=["date", "datetime", "a"], how="inner")
dfa.join(dfb, on=["date", "a"], how="outer")
dfa.join(dfb, on=["date", "a"], how="left")
dfa.join(dfb, on=["date", "a"], how="inner")
dfa.join(dfb, on=["date", "a"], how="outer")
dfa.join(dfb, on=["date"], how="outer")
dfa.join(dfb, on=["date"], how="left")
dfa.join(dfb, on=["date"], how="inner")
dfa.join(dfb, on=["date"], how="outer")
for how in ["left", "inner", "outer"]:
dfa.join(dfa, on=["a", "b", "date", "datetime"], how=how)
dfa.join(dfa, on=["date", "datetime"], how=how)
dfa.join(dfa, on=["date", "datetime", "a"], how=how)
dfa.join(dfa, on=["date", "a"], how=how)
dfa.join(dfa, on=["a", "datetime"], how=how)
dfa.join(dfa, on=["date"], how=how)


def test_hstack():
Expand Down

0 comments on commit b888b20

Please sign in to comment.