Skip to content

Commit

Permalink
add horizontal concat (#2173)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 26, 2021
1 parent 601824d commit f7a9aa3
Show file tree
Hide file tree
Showing 10 changed files with 110 additions and 23 deletions.
3 changes: 3 additions & 0 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ moment = ["polars-core/moment", "polars-lazy/moment"]
arange = ["polars-lazy/arange"]
true_div = ["polars-lazy/true_div"]
diagonal_concat = ["polars-core/diagonal_concat"]
horizontal_concat = ["polars-core/horizontal_concat"]
abs = ["polars-core/abs", "polars-lazy/abs"]
dynamic_groupby = ["polars-core/dynamic_groupby", "polars-lazy/dynamic_groupby"]
ewma = ["polars-core/ewma", "polars-lazy/ewma"]
Expand Down Expand Up @@ -166,6 +167,8 @@ docs-selection = [
"rank",
"list",
"arange",
"diagonal_concat",
"horizontal_concat",
"abs",
]

Expand Down
2 changes: 2 additions & 0 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ rank = []
diff = []
moment = []
diagonal_concat = []
horizontal_concat = []
abs = []
ewma = ["polars-utils"]

Expand Down Expand Up @@ -122,6 +123,7 @@ docs-selection = [
"rank",
"list",
"diagonal_concat",
"horizontal_concat",
"abs",
]

Expand Down
42 changes: 42 additions & 0 deletions polars/polars-core/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,51 @@ pub fn concat_str(s: &[Series], delimiter: &str) -> Result<Utf8Chunked> {
Ok(builder.finish())
}

/// Concat [`DataFrame`]s horizontally.
///
/// The columns of every frame in `dfs` are stacked, in order, onto a clone of
/// the first frame. When the frames do not all have the same height, the
/// shorter ones are first extended with null values up to the height of the
/// tallest frame.
///
/// # Errors
///
/// * Returns a `ComputeError` when `dfs` is empty.
/// * Propagates the error when a column cannot be extended with null values.
/// * Propagates any error returned by `hstack_mut` while stacking the columns.
#[cfg(feature = "horizontal_concat")]
#[cfg_attr(docsrs, doc(cfg(feature = "horizontal_concat")))]
pub fn hor_concat_df(dfs: &[DataFrame]) -> Result<DataFrame> {
    let max_len = dfs
        .iter()
        .map(|df| df.height())
        .max()
        .ok_or_else(|| PolarsError::ComputeError("cannot concat empty dataframes".into()))?;

    // Declared outside the branch so the padded copies outlive the slice that
    // borrows from them below.
    let owned_df;

    // Only clone and pad when at least one frame is shorter than the tallest one.
    let dfs = if dfs.iter().all(|df| df.height() == max_len) {
        dfs
    } else {
        owned_df = dfs
            .iter()
            .cloned()
            .map(|mut df| -> Result<DataFrame> {
                let diff = max_len - df.height();
                if diff != 0 {
                    for s in df.columns.iter_mut() {
                        // Propagate a failed extension instead of panicking.
                        *s = s.extend(AnyValue::Null, diff)?;
                    }
                }
                Ok(df)
            })
            .collect::<Result<Vec<_>>>()?;
        owned_df.as_slice()
    };

    // `dfs` is non-empty here: `max()` above already returned an error otherwise.
    let mut first_df = dfs[0].clone();

    for df in &dfs[1..] {
        first_df.hstack_mut(df.get_columns())?;
    }
    Ok(first_df)
}

/// Concat `[DataFrame]`s diagonally.
#[cfg(feature = "diagonal_concat")]
#[cfg_attr(docsrs, doc(cfg(feature = "diagonal_concat")))]
/// Concat diagonally thereby combining different schemas.
pub fn diag_concat_df(dfs: &[DataFrame]) -> Result<DataFrame> {
let upper_bound_width = dfs.iter().map(|df| df.width()).sum();
let mut column_names = AHashSet::with_capacity(upper_bound_width);
Expand Down
1 change: 1 addition & 0 deletions polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@
//! - `groupby_list` - Allow groupby operation on keys of type List.
//! - `row_hash` - Utility to hash DataFrame rows to UInt64Chunked
//! - `diagonal_concat` - Concat diagonally thereby combining different schemas.
//! - `horizontal_concat` - Concat horizontally and extend with null values if lengths don't match
//! * `Series` operations:
//! - `is_in` - [Check for membership in `Series`](crate::chunked_array::ops::IsIn)
//! - `zip_with` - [Zip two Series/ ChunkedArrays](crate::chunked_array::ops::ChunkZip)
Expand Down
1 change: 1 addition & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ features = [
"true_div",
"dtype-categorical",
"diagonal_concat",
"horizontal_concat",
"abs",
"ewma",
]
Expand Down
11 changes: 8 additions & 3 deletions py-polars/polars/internals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from polars.polars import concat_series as _concat_series
from polars.polars import py_date_range as _py_date_range
from polars.polars import py_diag_concat_df as _diag_concat_df
from polars.polars import py_hor_concat_df as _hor_concat_df

_DOCUMENTING = False
except ImportError: # pragma: no cover
Expand Down Expand Up @@ -66,9 +67,11 @@ def concat(
how
Only used if the items are DataFrames.
On of {"vertical", "diagonal"}.
Vertical: Applies multiple `vstack` operations.
Diagonal: Finds a union between the column schemas and fills missing column values with null.
One of {"vertical", "diagonal", "horizontal"}.
- Vertical: Applies multiple `vstack` operations.
- Diagonal: Finds a union between the column schemas and fills missing column values with null.
- Horizontal: Stacks Series horizontally and fills with nulls if the lengths don't match.
Examples
--------
Expand Down Expand Up @@ -96,6 +99,8 @@ def concat(
out = pli.wrap_df(_concat_df(items))
elif how == "diagonal":
out = pli.wrap_df(_diag_concat_df(items))
elif how == "horizontal":
out = pli.wrap_df(_hor_concat_df(items))
else:
raise ValueError(
f"how should be one of {'vertical', 'diagonal'}, got {how}"
Expand Down
1 change: 0 additions & 1 deletion py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use crate::lazy::utils::py_exprs_to_exprs;
use crate::prelude::{parse_strategy, str_to_rankmethod};
use crate::series::PySeries;
use crate::utils::{reinterpret, str_to_polarstype};
use crate::PyPolarsEr::Any;
use polars::lazy::dsl;
use polars::lazy::dsl::Operator;
use polars::prelude::*;
Expand Down
19 changes: 18 additions & 1 deletion py-polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use crate::error::PyPolarsEr;
use crate::file::get_either_file;
use crate::prelude::{ClosedWindow, DataType, Duration, PyDataType};
use mimalloc::MiMalloc;
use polars::functions::diag_concat_df;
use polars::functions::{diag_concat_df, hor_concat_df};
use polars_core::export::arrow::io::ipc::read::read_file_metadata;
use polars_core::prelude::IntoSeries;
use pyo3::types::PyDict;
Expand Down Expand Up @@ -224,6 +224,22 @@ fn py_diag_concat_df(dfs: &PyAny) -> PyResult<PyDataFrame> {
Ok(df.into())
}

/// Python entry point for horizontal concatenation.
///
/// Reads `dfs` as a Python sequence, converts every item with `get_df`, and
/// hands the resulting frames to `hor_concat_df`. The first item that fails to
/// convert aborts the call with its Python error; a polars error from the
/// concatenation is converted through `PyPolarsEr`.
#[pyfunction]
fn py_hor_concat_df(dfs: &PyAny) -> PyResult<PyDataFrame> {
    let (seq, _len) = get_pyseq(dfs)?;

    let mut frames = Vec::new();
    for item in seq.iter()? {
        frames.push(get_df(item?)?);
    }

    let stacked = hor_concat_df(&frames).map_err(PyPolarsEr::from)?;
    Ok(stacked.into())
}

#[pyfunction]
fn concat_series(series: &PyAny) -> PyResult<PySeries> {
let (seq, _len) = get_pyseq(series)?;
Expand Down Expand Up @@ -332,6 +348,7 @@ fn polars(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(spearman_rank_corr)).unwrap();
m.add_wrapped(wrap_pyfunction!(map_mul)).unwrap();
m.add_wrapped(wrap_pyfunction!(py_diag_concat_df)).unwrap();
m.add_wrapped(wrap_pyfunction!(py_hor_concat_df)).unwrap();
m.add_wrapped(wrap_pyfunction!(py_datetime)).unwrap();
m.add_wrapped(wrap_pyfunction!(py_date_range)).unwrap();
Ok(())
Expand Down
18 changes: 0 additions & 18 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -1401,24 +1401,6 @@ def test_filter_with_all_expansion() -> None:
assert out.shape == (2, 3)


# Checks `pl.concat(..., how="diagonal")`: the result carries the union of the
# input columns, and rows coming from a frame that lacks a column hold nulls there.
def test_diag_concat() -> None:
a = pl.DataFrame({"a": [1, 2]})
b = pl.DataFrame({"b": ["a", "b"], "c": [1, 2]})
c = pl.DataFrame({"a": [5, 7], "c": [1, 2], "d": [1, 2]})

out = pl.concat([a, b, c], how="diagonal")
# Two rows per input frame, in input order.
expected = pl.DataFrame(
{
"a": [1, 2, None, None, 5, 7],
"b": [None, None, "a", "b", None, None],
"c": [None, None, 1, 2, 1, 2],
"d": [None, None, None, None, 1, 2],
}
)

# The comparison is made with `null_equal=True`.
assert out.frame_equal(expected, null_equal=True)


def test_transpose() -> None:
df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
expected = pl.DataFrame(
Expand Down
35 changes: 35 additions & 0 deletions py-polars/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,38 @@ def test_date_datetime() -> None:
print(out)
assert out["date"].series_equal(df["day"].rename("date"))
assert out["h2"].series_equal(df["hour"].rename("h2"))


def test_diag_concat() -> None:
    """Diagonal concat yields the union of all columns, null-filled where absent."""
    frames = [
        pl.DataFrame({"a": [1, 2]}),
        pl.DataFrame({"b": ["a", "b"], "c": [1, 2]}),
        pl.DataFrame({"a": [5, 7], "c": [1, 2], "d": [1, 2]}),
    ]

    result = pl.concat(frames, how="diagonal")

    # Each input frame contributes two rows; columns it lacks are null in those rows.
    wanted = pl.DataFrame(
        {
            "a": [1, 2, None, None, 5, 7],
            "b": [None, None, "a", "b", None, None],
            "c": [None, None, 1, 2, 1, 2],
            "d": [None, None, None, None, 1, 2],
        }
    )

    assert result.frame_equal(wanted, null_equal=True)


def test_concat_horizontal() -> None:
    """Horizontal concat places columns side by side and null-pads the shorter frame."""
    short = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    tall = pl.DataFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "e": [1, 2, 1, 2]})

    result = pl.concat([short, tall], how="horizontal")

    # The two-row frame is padded with nulls up to the four rows of the taller one.
    wanted = pl.DataFrame(
        {
            "a": ["a", "b", None, None],
            "b": [1, 2, None, None],
            "c": [5, 7, 8, 9],
            "d": [1, 2, 1, 2],
            "e": [1, 2, 1, 2],
        }
    )
    assert result.frame_equal(wanted)

0 comments on commit f7a9aa3

Please sign in to comment.