Skip to content

Commit

Permalink
Struct creations/append/extend stricter schema (#3454)
Browse files Browse the repository at this point in the history
* struct::new check duplicate name and add duplicate exception to python

* struct append/extend check field names
  • Loading branch information
ritchie46 committed May 21, 2022
1 parent 326a3cb commit e3c7973
Show file tree
Hide file tree
Showing 9 changed files with 105 additions and 3 deletions.
9 changes: 9 additions & 0 deletions polars/polars-core/src/chunked_array/logical/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ fn fields_to_struct_array(fields: &[Series]) -> (ArrayRef, Vec<Series>) {

impl StructChunked {
pub fn new(name: &str, fields: &[Series]) -> Result<Self> {
let mut names = PlHashSet::with_capacity(fields.len());
for s in fields {
let name = s.name();
if !names.insert(name) {
return Err(PolarsError::Duplicate(
format!("multiple fields with name '{name}' found").into(),
));
}
}
if !fields.iter().map(|s| s.len()).all_equal() {
Err(PolarsError::ShapeMisMatch(
"expected all fields to have equal length".into(),
Expand Down
10 changes: 10 additions & 0 deletions polars/polars-core/src/series/implementations/struct_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ impl SeriesTrait for SeriesWrap<StructChunked> {
let offset = self.chunks().len();

for (lhs, rhs) in self.0.fields_mut().iter_mut().zip(other.fields()) {
let lhs_name = lhs.name();
let rhs_name = rhs.name();
if lhs_name != rhs_name {
return Err(PolarsError::SchemaMisMatch(format!("cannot append field with name: {rhs_name} to struct with field name: {lhs_name}, please check your schema").into()));
}
lhs.append(rhs)?;
}
self.0.update_chunks(offset);
Expand All @@ -124,6 +129,11 @@ impl SeriesTrait for SeriesWrap<StructChunked> {
let other = other.struct_()?;

for (lhs, rhs) in self.0.fields_mut().iter_mut().zip(other.fields()) {
let lhs_name = lhs.name();
let rhs_name = rhs.name();
if lhs_name != rhs_name {
return Err(PolarsError::SchemaMisMatch(format!("cannot extend field with name: {rhs_name} to struct with field name: {lhs_name}, please check your schema").into()));
}
lhs.extend(rhs)?;
}
self.0.update_chunks(0);
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ These functions can be used as expression and sometimes also in eager contexts.

select
col
element
count
list
std
Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def version() -> str:
from polars.exceptions import (
ArrowError,
ComputeError,
DuplicateError,
NoDataError,
NotFoundError,
SchemaError,
Expand Down
5 changes: 5 additions & 0 deletions py-polars/polars/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from polars.polars import (
ArrowError,
ComputeError,
DuplicateError,
NoDataError,
NotFoundError,
SchemaError,
Expand Down Expand Up @@ -29,6 +30,9 @@ class SchemaError(Exception): # type: ignore
class ShapeError(Exception): # type: ignore
    """
    Pure-Python definition of the ``ShapeError`` exception.

    NOTE(review): presumably a stand-in used when the compiled ``polars.polars``
    module cannot be imported, and presumably raised for shape/length
    mismatches -- confirm both against the surrounding file.
    """

    pass

class DuplicateError(Exception): # type: ignore
    """
    Exception raised when a duplicated name is encountered.

    The native side raises it, for instance, when a struct is built from
    several fields that share the same name.

    NOTE(review): this pure-Python definition presumably only acts as a
    stand-in when the class cannot be imported from ``polars.polars`` --
    confirm against the surrounding import logic.
    """

    pass


__all__ = [
"ArrowError",
Expand All @@ -37,4 +41,5 @@ class ShapeError(Exception): # type: ignore
"NotFoundError",
"SchemaError",
"ShapeError",
"DuplicateError",
]
24 changes: 24 additions & 0 deletions py-polars/polars/internals/lazy_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,31 @@ def col(
def element() -> "pli.Expr":
    """
    Alias for an element being evaluated in an `eval` expression.

    Implemented as a column expression with an empty name (``col("")``).

    Examples
    --------
    A horizontal rank computation by taking the elements of a list

    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
    >>> df.with_column(
    ...     pl.concat_list(["a", "b"]).arr.eval(pl.element().rank()).alias("rank")
    ... )
    shape: (3, 3)
    ┌─────┬─────┬────────────┐
    │ a   ┆ b   ┆ rank       │
    │ --- ┆ --- ┆ ---        │
    │ i64 ┆ i64 ┆ list [f32] │
    ╞═════╪═════╪════════════╡
    │ 1   ┆ 4   ┆ [1.0, 2.0] │
    ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 8   ┆ 5   ┆ [2.0, 1.0] │
    ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 3   ┆ 2   ┆ [2.0, 1.0] │
    └─────┴─────┴────────────┘

    """

    return col("")


Expand Down
3 changes: 2 additions & 1 deletion py-polars/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ impl std::convert::From<PyPolarsErr> for PyErr {
PolarsError::Io(err) => PyIOError::new_err(err.to_string()),
PolarsError::InvalidOperation(err) => PyValueError::new_err(err.to_string()),
PolarsError::ArrowError(err) => ArrowErrorException::new_err(format!("{:?}", err)),
_ => default(),
PolarsError::Duplicate(err) => DuplicateError::new_err(err.to_string()),
},
Arrow(err) => ArrowErrorException::new_err(format!("{:?}", err)),
_ => default(),
Expand All @@ -59,3 +59,4 @@ create_exception!(exceptions, NoDataError, PyException);
create_exception!(exceptions, ArrowErrorException, PyException);
create_exception!(exceptions, ShapeError, PyException);
create_exception!(exceptions, SchemaError, PyException);
create_exception!(exceptions, DuplicateError, PyException);
5 changes: 4 additions & 1 deletion py-polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ pub mod utils;

use crate::conversion::{get_df, get_lf, get_pyseq, get_series, Wrap};
use crate::error::{
ArrowErrorException, ComputeError, NoDataError, NotFoundError, PyPolarsErr, SchemaError,
ArrowErrorException, ComputeError, DuplicateError, NoDataError, NotFoundError, PyPolarsErr,
SchemaError,
};
use crate::file::get_either_file;
use crate::prelude::{ClosedWindow, DataType, DatetimeArgs, Duration, DurationArgs, PyDataType};
Expand Down Expand Up @@ -437,6 +438,8 @@ fn polars(py: Python, m: &PyModule) -> PyResult<()> {
m.add("SchemaError", py.get_type::<SchemaError>()).unwrap();
m.add("ArrowError", py.get_type::<ArrowErrorException>())
.unwrap();
m.add("DuplicateError", py.get_type::<DuplicateError>())
.unwrap();
m.add_class::<PySeries>().unwrap();
m.add_class::<PyDataFrame>().unwrap();
m.add_class::<PyLazyFrame>().unwrap();
Expand Down
50 changes: 49 additions & 1 deletion py-polars/tests/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,12 @@ def test_nested_struct() -> None:


def test_eager_struct() -> None:
    """
    Eagerly building a struct requires uniquely named fields.

    Two unnamed Series both carry the empty name ``''``, so combining them
    must raise ``DuplicateError``; uniquely named Series must yield a Series
    of ``Struct`` dtype.
    """
    # The construction below raises, so it has to live inside `pytest.raises`;
    # there is no result to bind.
    with pytest.raises(pl.DuplicateError, match="multiple fields with name '' found"):
        pl.struct([pl.Series([1, 2, 3]), pl.Series(["a", "b", "c"])], eager=True)

    s = pl.struct(
        [pl.Series("a", [1, 2, 3]), pl.Series("b", ["a", "b", "c"])], eager=True
    )
    assert s.dtype == pl.Struct


Expand Down Expand Up @@ -416,3 +421,46 @@ def test_struct_order() -> None:
"col1": [{"a": 1, "b": 2}, {"b": 4, "a": 3}],
}
)


def test_struct_schema_on_append_extend_3452() -> None:
    """
    Appending or extending struct Series with differently ordered fields fails.

    Both inputs hold the same keys, but the first starts with ``city`` while
    the second starts with ``address``. Either way of combining them must
    raise ``SchemaError`` naming the first mismatching pair of fields.
    """
    # Key order inside each dict is what the test is about -- keep it as is.
    city_first_rows = [
        {"city": "Chicago", "address": "100 Main St", "price": 250000, "nbr_bedrooms": 3},
        {"city": "New York", "address": "100 First Ave", "price": 450000, "nbr_bedrooms": 2},
    ]
    address_first_rows = [
        {
            "address": "303 Mockingbird Lane",
            "city": "Los Angeles",
            "nbr_bedrooms": 2,
            "price": 450000,
        },
        {
            "address": "404 Moldave Dr",
            "city": "Miami Beach",
            "nbr_bedrooms": 1,
            "price": 250000,
        },
    ]
    target = pl.Series(city_first_rows)
    incoming = pl.Series(address_first_rows)

    # (append_chunks flag, expected error message), checked in this order.
    expected_failures = (
        (
            True,
            "cannot append field with name: address to struct with field name: city, please check your schema",
        ),
        (
            False,
            "cannot extend field with name: address to struct with field name: city, please check your schema",
        ),
    )
    for as_chunks, message in expected_failures:
        with pytest.raises(pl.SchemaError, match=message):
            target.append(incoming, append_chunks=as_chunks)

0 comments on commit e3c7973

Please sign in to comment.