Skip to content

Commit

Permalink
hoist swap check out of join tuples and redo performance benches
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Sep 2, 2020
1 parent 45ce8bd commit a88404f
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 36 deletions.
Binary file modified pandas_cmp/img/groupby10_.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified pandas_cmp/img/join_80_000.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion pandas_cmp/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ fn read_df(f: &File) -> DataFrame {
.expect("dataframe");

// for groupby we need to cast a column to a string
if let Some(s) = df.select_mut("str") {
if let Ok(s) = df.column("str") {
let s = s
.i64()
.expect("i64")
Expand Down
85 changes: 50 additions & 35 deletions polars/src/frame/hash_join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,14 +113,22 @@ where
let hash_tbl = prepare_hashed_relation(b);

// Next we probe the other relation in the hash table
a.enumerate().for_each(|(idx_a, key)| {
if let Some(indexes_b) = hash_tbl.get(&key) {
let tuples = indexes_b
.iter()
.map(|&idx_b| if swap { (idx_b, idx_a) } else { (idx_a, idx_b) });
results.extend(tuples)
}
});
// The probe loop is duplicated on purpose: checking `swap` once out here keeps the branch out of the per-tuple closure.
if swap {
a.enumerate().for_each(|(idx_a, key)| {
if let Some(indexes_b) = hash_tbl.get(&key) {
let tuples = indexes_b.iter().map(|&idx_b| (idx_b, idx_a));
results.extend(tuples)
}
});
} else {
a.enumerate().for_each(|(idx_a, key)| {
if let Some(indexes_b) = hash_tbl.get(&key) {
let tuples = indexes_b.iter().map(|&idx_b| (idx_a, idx_b));
results.extend(tuples)
}
});
}
results
}

Expand Down Expand Up @@ -172,36 +180,43 @@ where
// probe the hash table.
// Note: indexes from b that are never matched are emitted as (None, Some(idx_b)).
// Therefore we remove every matched key from the hash table; whatever remains afterwards is joined from the right.
a.enumerate().for_each(|(idx_a, key)| {
match hash_tbl.remove(&key) {
// left and right matches
Some(indexes_b) => results.extend(indexes_b.iter().map(|&idx_b| {
if swap {
(Some(idx_b), Some(idx_a))
} else {
(Some(idx_a), Some(idx_b))

// The probe loop is duplicated on purpose: checking `swap` once out here keeps the branch out of the per-tuple closure.
if swap {
a.enumerate().for_each(|(idx_a, key)| {
match hash_tbl.remove(&key) {
// left and right matches
Some(indexes_b) => {
results.extend(indexes_b.iter().map(|&idx_b| (Some(idx_b), Some(idx_a))))
}
// no match in the hash table: only the value from `a` exists; with `swap` it goes in the second slot and the first is null
None => {
results.insert((None, Some(idx_a)));
}
})),
// only left values, right = null
None => {
results.insert(if swap {
(None, Some(idx_a))
} else {
(Some(idx_a), None)
});
}
}
});
hash_tbl.iter().for_each(|(_k, indexes_b)| {
// remaining joined values from the right table
results.extend(indexes_b.iter().map(|&idx_b| {
if swap {
(Some(idx_b), None)
} else {
(None, Some(idx_b))
});
hash_tbl.iter().for_each(|(_k, indexes_b)| {
// remaining unmatched values still in the hash table; with `swap` they go in the first slot and the second is null
results.extend(indexes_b.iter().map(|&idx_b| (Some(idx_b), None)))
});
} else {
a.enumerate().for_each(|(idx_a, key)| {
match hash_tbl.remove(&key) {
// left and right matches
Some(indexes_b) => {
results.extend(indexes_b.iter().map(|&idx_b| (Some(idx_a), Some(idx_b))))
}
// only left values, right = null
None => {
results.insert((Some(idx_a), None));
}
}
}))
});
});
hash_tbl.iter().for_each(|(_k, indexes_b)| {
// remaining joined values from the right table
results.extend(indexes_b.iter().map(|&idx_b| (None, Some(idx_b))))
});
};

results
}
Expand Down

0 comments on commit a88404f

Please sign in to comment.