Skip to content

Commit

Permalink
fix NaN in joins and init integration tests (#2644)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Feb 14, 2022
1 parent 5de6940 commit b4a9ea2
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
- name: Run tests
run: |
export RUSTFLAGS="-C debuginfo=0"
cd polars && make test
cd polars && make test && make integration-tests
- name: Run miri
run: |
cd polars
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test-windows.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ jobs:
override: true
- name: Run tests
run: |
cd polars && make test
cd polars && make test && make integration-tests
5 changes: 5 additions & 0 deletions polars/Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
BASE ?= master

.PHONY: fmt check check-features clippy clippy-default test test-doc integration-tests

fmt:
cargo fmt --all
$(MAKE) -C .. fmt_toml
Expand Down Expand Up @@ -36,6 +38,9 @@ test:
-- \
--test-threads=2

integration-tests:
cargo t --all-features --test it

miri:
# not tested on all features because miri does not support SIMD
# some tests are also filtered, because miri cannot deal with the rayon threadpool
Expand Down
6 changes: 3 additions & 3 deletions polars/polars-core/src/frame/hash_join/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use unsafe_unwrap::UnsafeUnwrap;
#[cfg(feature = "private")]
pub use self::multiple_keys::private_left_join_multiple_keys;
use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
use crate::utils::series::to_physical;
use crate::utils::series::to_physical_and_bit_repr;

/// If Categorical types are created without a global string cache or under
/// a different global string cache the mapping will be incorrect.
Expand Down Expand Up @@ -1117,8 +1117,8 @@ impl DataFrame {
}
// make sure that we don't have logical types.
// we don't overwrite the original selected as that might be used to create a column in the new df
let selected_left_physical = to_physical(&selected_left);
let selected_right_physical = to_physical(&selected_right);
let selected_left_physical = to_physical_and_bit_repr(&selected_left);
let selected_right_physical = to_physical_and_bit_repr(&selected_right);

// multiple keys
match how {
Expand Down
6 changes: 3 additions & 3 deletions polars/polars-core/src/frame/hash_join/multiple_keys.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::frame::hash_join::{
get_hash_tbl_threaded_join_mut_partitioned, get_hash_tbl_threaded_join_partitioned,
};
use crate::prelude::*;
use crate::utils::series::to_physical;
use crate::utils::series::to_physical_and_bit_repr;
use crate::utils::{set_partition_size, split_df};
use crate::vector_hasher::{df_rows_to_hashes_threaded, this_partition, IdBuildHasher, IdxHash};
use crate::POOL;
Expand Down Expand Up @@ -239,8 +239,8 @@ pub(crate) fn inner_join_multiple_keys(

#[cfg(feature = "private")]
pub fn private_left_join_multiple_keys(a: &DataFrame, b: &DataFrame) -> Vec<(u32, Option<u32>)> {
let a = DataFrame::new_no_checks(to_physical(a.get_columns()));
let b = DataFrame::new_no_checks(to_physical(b.get_columns()));
let a = DataFrame::new_no_checks(to_physical_and_bit_repr(a.get_columns()));
let b = DataFrame::new_no_checks(to_physical_and_bit_repr(b.get_columns()));
left_join_multiple_keys(&a, &b)
}

Expand Down
15 changes: 13 additions & 2 deletions polars/polars-core/src/utils/series.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
use crate::prelude::*;

pub(crate) fn to_physical(s: &[Series]) -> Vec<Series> {
/// Transform to physical type and coerce floating point and similar sized integer to a bit representation
/// to reduce compiler bloat
pub(crate) fn to_physical_and_bit_repr(s: &[Series]) -> Vec<Series> {
s.iter()
.map(|s| s.to_physical_repr().into_owned())
.map(|s| {
let physical = s.to_physical_repr();
match physical.dtype() {
DataType::Int64 => physical.bit_repr_large().into_series(),
DataType::Int32 => physical.bit_repr_small().into_series(),
DataType::Float32 => physical.bit_repr_small().into_series(),
DataType::Float64 => physical.bit_repr_large().into_series(),
_ => physical.into_owned(),
}
})
.collect()
}
30 changes: 30 additions & 0 deletions polars/tests/it/joins.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use polars::prelude::*;

#[test]
fn join_nans_outer() -> Result<()> {
let df1 = df! {
"w" => [Some(2.5), None, Some(f64::NAN), None, Some(2.5), Some(f64::NAN), None, Some(3.0)],
"t" => [Some("xl"), Some("xl"), Some("xl"), Some("xl"), Some("xl"), Some("xl"), Some("xl"), Some("l")],
"c" => [Some(10), Some(5), Some(3), Some(2), Some(9), Some(4), Some(11), Some(3)],
}?
.lazy();
let a1 = df1
.clone()
.groupby(vec![col("w").alias("w"), col("t").alias("t")])
.agg(vec![col("c").sum().alias("c_sum")]);
let a2 = df1
.groupby(vec![col("w").alias("w"), col("t").alias("t")])
.agg(vec![col("c").max().alias("c_max")]);

let res = a1
.join_builder()
.with(a2)
.left_on(vec![col("w").alias("w"), col("t").alias("t")])
.right_on(vec![col("w").alias("w"), col("t").alias("t")])
.how(JoinType::Outer)
.finish()
.collect()?;

assert_eq!(res.shape(), (4, 4));
Ok(())
}
1 change: 1 addition & 0 deletions polars/tests/it/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
mod joins;

0 comments on commit b4a9ea2

Please sign in to comment.