Skip to content

Commit

Permalink
fix(rust, python): early error on duplicate names in streaming groupby (
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Nov 26, 2022
1 parent cc83dd2 commit 86a4a4c
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 6 deletions.
8 changes: 4 additions & 4 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ pub struct DataFrame {
pub(crate) columns: Vec<Series>,
}

fn duplicate_err(name: &str) -> PolarsResult<()> {
pub fn _duplicate_err(name: &str) -> PolarsResult<()> {
Err(PolarsError::Duplicate(
format!("Column with name: '{}' has more than one occurrences", name).into(),
))
Expand Down Expand Up @@ -254,7 +254,7 @@ impl DataFrame {
let name = s.name();

if names.contains(name) {
duplicate_err(name)?
_duplicate_err(name)?
}

names.insert(name);
Expand Down Expand Up @@ -282,7 +282,7 @@ impl DataFrame {
let name = series.name().to_string();

if names.contains(&name) {
duplicate_err(&name)?
_duplicate_err(&name)?
}

series_cols.push(series);
Expand Down Expand Up @@ -1425,7 +1425,7 @@ impl DataFrame {
let mut names = PlHashSet::with_capacity(cols.len());
for name in cols {
if !names.insert(name.as_str()) {
duplicate_err(name)?
_duplicate_err(name)?
}
}
}
Expand Down
15 changes: 15 additions & 0 deletions polars/polars-lazy/polars-plan/src/logical_plan/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::ops::Deref;
use std::path::PathBuf;
use std::sync::Mutex;

use polars_core::frame::_duplicate_err;
use polars_core::frame::explode::MeltArgs;
use polars_core::prelude::*;
use polars_core::utils::try_get_supertype;
Expand Down Expand Up @@ -426,6 +427,20 @@ impl LogicalPlanBuilder {
);
schema.merge(other);

if schema.len() < keys.len() + aggs.len() {
let check_names = || {
let mut names = PlHashSet::with_capacity(schema.len());
for expr in aggs.iter().chain(keys.iter()) {
let name = expr_output_name(expr)?;
if !names.insert(name.clone()) {
return _duplicate_err(name.as_ref());
}
}
Ok(())
};
try_delayed!(check_names(), &self.0, into)
}

#[cfg(feature = "dynamic_groupby")]
{
let index_columns = &[
Expand Down
2 changes: 0 additions & 2 deletions polars/polars-lazy/polars-plan/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use polars_core::prelude::*;

use crate::logical_plan::iterator::ArenaExprIter;
use crate::logical_plan::Context;
#[cfg(feature = "meta")]
use crate::prelude::names::COUNT;
use crate::prelude::*;

Expand Down Expand Up @@ -136,7 +135,6 @@ pub fn has_null(current_expr: &Expr) -> bool {
}

/// output name of expr
#[cfg(feature = "meta")]
pub(crate) fn expr_output_name(expr: &Expr) -> PolarsResult<Arc<str>> {
for e in expr {
match e {
Expand Down
19 changes: 19 additions & 0 deletions py-polars/tests/unit/test_streaming.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datetime import date

import numpy as np
import pytest

import polars as pl

Expand Down Expand Up @@ -71,6 +72,24 @@ def test_streaming_groupby_types() -> None:
"date_last": [date(2022, 1, 1)],
}

with pytest.raises(pl.DuplicateError):
(
df.lazy()
.groupby("person_id")
.agg(
[
pl.col("person_name").first().alias("str_first"),
pl.col("person_name").last().alias("str_last"),
pl.col("person_name").mean().alias("str_mean"),
pl.col("person_name").sum().alias("str_sum"),
pl.col("bool").first().alias("bool_first"),
pl.col("bool").last().alias("bool_first"),
]
)
.select(pl.all().exclude("person_id"))
.collect(streaming=True)
)


def test_streaming_non_streaming_gb() -> None:
n = 100
Expand Down

0 comments on commit 86a4a4c

Please sign in to comment.