Skip to content

Commit

Permalink
Improve Left join on chunked data (#3177)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Apr 21, 2022
1 parent 99817f3 commit 4e83b40
Show file tree
Hide file tree
Showing 50 changed files with 1,560 additions and 687 deletions.
7 changes: 3 additions & 4 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ default = [
"zip_with",
"csv-file",
"temporal",
"performant",
"fmt",
"dtype-slim",
]
Expand Down Expand Up @@ -52,9 +51,8 @@ avro = ["polars-io", "polars-io/avro"]
# support for arrows csv file parsing
csv-file = ["polars-io", "polars-io/csv-file", "polars-lazy/csv-file"]

# ~40% faster chunkedarray creation, but may lead to unexpected panic if iterator incorrectly sets a size_hint
# that fits a TrustedLen iterator.
performant = ["polars-core/performant"]
# slower builds
performant = ["polars-core/performant", "chunked_ids"]

# Dataframe formatting.
fmt = ["polars-core/fmt"]
Expand Down Expand Up @@ -107,6 +105,7 @@ log = ["polars-core/log", "polars-lazy/log"]
partition_by = ["polars-core/partition_by"]
semi_anti_join = ["polars-core/semi_anti_join"]
list_eval = ["polars-lazy/list_eval"]
chunked_ids = ["polars-core/chunked_ids", "polars-lazy/chunked_ids"]

test = [
"lazy",
Expand Down
103 changes: 103 additions & 0 deletions polars/polars-arrow/src/array/get.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
use crate::is_valid::IsValid;
use arrow::{
array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array},
types::NativeType,
};

pub trait ArrowGetItem {
type Item;

fn get(&self, item: usize) -> Option<Self::Item>;

/// # Safety
/// Get item. It is the callers resposibility that the `item < self.len()`
unsafe fn get_unchecked(&self, item: usize) -> Option<Self::Item>;
}

impl<T: NativeType> ArrowGetItem for PrimitiveArray<T> {
type Item = T;

#[inline]
fn get(&self, item: usize) -> Option<Self::Item> {
if item >= self.len() {
None
} else {
unsafe { self.get_unchecked(item) }
}
}

#[inline]
unsafe fn get_unchecked(&self, item: usize) -> Option<Self::Item> {
if self.is_null_unchecked(item) {
None
} else {
Some(self.value_unchecked(item))
}
}
}

impl ArrowGetItem for BooleanArray {
type Item = bool;

#[inline]
fn get(&self, item: usize) -> Option<Self::Item> {
if item >= self.len() {
None
} else {
unsafe { self.get_unchecked(item) }
}
}

#[inline]
unsafe fn get_unchecked(&self, item: usize) -> Option<Self::Item> {
if self.is_null_unchecked(item) {
None
} else {
Some(self.value_unchecked(item))
}
}
}

impl<'a> ArrowGetItem for &'a Utf8Array<i64> {
type Item = &'a str;

#[inline]
fn get(&self, item: usize) -> Option<Self::Item> {
if item >= self.len() {
None
} else {
unsafe { self.get_unchecked(item) }
}
}

#[inline]
unsafe fn get_unchecked(&self, item: usize) -> Option<Self::Item> {
if self.is_null_unchecked(item) {
None
} else {
Some(self.value_unchecked(item))
}
}
}

impl ArrowGetItem for ListArray<i64> {
type Item = Box<dyn Array>;

#[inline]
fn get(&self, item: usize) -> Option<Self::Item> {
if item >= self.len() {
None
} else {
unsafe { self.get_unchecked(item) }
}
}

#[inline]
unsafe fn get_unchecked(&self, item: usize) -> Option<Self::Item> {
if self.is_null_unchecked(item) {
None
} else {
Some(self.value_unchecked(item))
}
}
}
3 changes: 3 additions & 0 deletions polars/polars-arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ use std::sync::Arc;
use crate::utils::CustomIterTools;

pub mod default_arrays;
mod get;
pub mod list;

pub use get::ArrowGetItem;

pub trait ValueSize {
/// Useful for a Utf8 or a List to get underlying value size.
/// During a rechunk this is handy
Expand Down
2 changes: 2 additions & 0 deletions polars/polars-arrow/src/trusted_len/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ use std::slice::Iter;
/// length of the iterator must be correct
pub unsafe trait TrustedLen: Iterator {}

unsafe impl<T> TrustedLen for &mut dyn TrustedLen<Item = T> {}

unsafe impl<T> TrustedLen for Iter<'_, T> {}

unsafe impl<B, I: TrustedLen, T: FnMut(I::Item) -> B> TrustedLen for std::iter::Map<I, T> {}
Expand Down
7 changes: 4 additions & 3 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ avx512 = []
docs = []
temporal = ["regex", "chrono"]
random = ["rand", "rand_distr"]
default = ["docs", "temporal", "performant", "private"]
default = ["docs", "temporal", "private"]
lazy = ["sort_multiple"]

# ~40% faster collect, needed until trustedlength iter stabilizes
Expand Down Expand Up @@ -71,13 +71,14 @@ moment = []
diagonal_concat = []
horizontal_concat = []
abs = []
ewma = ["polars-utils"]
ewma = []
dataframe_arithmetic = []
product = []
unique_counts = []
log = []
partition_by = []
semi_anti_join = []
chunked_ids = []

dynamic_groupby = ["dtype-datetime", "dtype-date"]

Expand Down Expand Up @@ -156,7 +157,7 @@ lazy_static = "1.4"
ndarray = { version = "0.15", optional = true, default_features = false }
num = "^0.4"
polars-arrow = { version = "0.20.0", path = "../polars-arrow", features = ["compute"] }
polars-utils = { version = "0.20.0", path = "../polars-utils", optional = true }
polars-utils = { version = "0.20.0", path = "../polars-utils" }
rand = { version = "0.8", optional = true, features = ["small_rng", "std"] }
rand_distr = { version = "0.4", optional = true }
rayon = "1.5"
Expand Down
13 changes: 13 additions & 0 deletions polars/polars-core/src/chunked_array/builder/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,19 @@ impl<'a> AnonymousListBuilder<'a> {
}
}

pub fn append_opt_array(&mut self, opt_s: Option<&'a dyn Array>) {
match opt_s {
Some(s) => self.append_array(s),
None => {
self.append_null();
}
}
}

pub fn append_array(&mut self, arr: &'a dyn Array) {
self.builder.push(arr)
}

pub fn append_null(&mut self) {
self.builder.push_null();
}
Expand Down
9 changes: 9 additions & 0 deletions polars/polars-core/src/chunked_array/object/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,15 @@ where
pub unsafe fn is_null_unchecked(&self, i: usize) -> bool {
!self.is_valid_unchecked(i)
}

#[inline]
pub(crate) unsafe fn get_unchecked(&self, item: usize) -> Option<&T> {
if self.is_null_unchecked(item) {
None
} else {
Some(self.value_unchecked(item))
}
}
}

impl<T> Array for ObjectArray<T>
Expand Down
3 changes: 3 additions & 0 deletions polars/polars-core/src/chunked_array/ops/take/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,13 @@ use std::borrow::Cow;
pub use take_random::*;
pub use traits::*;

mod take_chunked;
mod take_every;
pub(crate) mod take_random;
pub(crate) mod take_single;
mod traits;
#[cfg(feature = "chunked_ids")]
pub(crate) use take_chunked::*;

macro_rules! take_iter_n_chunks {
($ca:expr, $indices:expr) => {{
Expand Down

0 comments on commit 4e83b40

Please sign in to comment.