Skip to content

Commit

Permalink
improve equality checks & column generation of transpose
Browse files Browse the repository at this point in the history
Equality of Series and DataFrames now also checks
the column name. This also uncovered some inconsistencies
which are now fixed.

For the Python transpose, a generator option is
added so that a user can define column names.
  • Loading branch information
ritchie46 committed Nov 23, 2021
1 parent f1be04f commit fb84ac8
Show file tree
Hide file tree
Showing 14 changed files with 243 additions and 116 deletions.
4 changes: 2 additions & 2 deletions polars/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ impl<T> ChunkedArray<T> {
Arc::new(BooleanArray::from_data_default(bitmap, None)) as ArrayRef
})
.collect_vec();
BooleanChunked::new_from_chunks("is_null", chunks)
BooleanChunked::new_from_chunks(self.name(), chunks)
}

/// Get a mask of the valid values.
Expand All @@ -408,7 +408,7 @@ impl<T> ChunkedArray<T> {
Arc::new(BooleanArray::from_data_default(bitmap, None)) as ArrayRef
})
.collect_vec();
BooleanChunked::new_from_chunks("is_not_null", chunks)
BooleanChunked::new_from_chunks(self.name(), chunks)
}

/// Get data type of ChunkedArray.
Expand Down
4 changes: 3 additions & 1 deletion polars/polars-core/src/chunked_array/ops/reverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ macro_rules! impl_reverse {
($arrow_type:ident, $ca_type:ident) => {
impl ChunkReverse<$arrow_type> for $ca_type {
fn reverse(&self) -> Self {
self.into_iter().rev().collect_trusted()
let mut ca: Self = self.into_iter().rev().collect_trusted();
ca.rename(self.name());
ca
}
}
};
Expand Down
24 changes: 16 additions & 8 deletions polars/polars-core/src/chunked_array/ops/take/take_every.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,50 @@ where
T: PolarsNumericType,
{
fn take_every(&self, n: usize) -> ChunkedArray<T> {
if !self.has_validity() {
let mut ca = if !self.has_validity() {
let a: NoNull<_> = self.into_no_null_iter().step_by(n).collect();
a.into_inner()
} else {
self.into_iter().step_by(n).collect()
}
};
ca.rename(self.name());
ca
}
}

impl ChunkTakeEvery<BooleanType> for BooleanChunked {
fn take_every(&self, n: usize) -> BooleanChunked {
if !self.has_validity() {
let mut ca: Self = if !self.has_validity() {
self.into_no_null_iter().step_by(n).collect()
} else {
self.into_iter().step_by(n).collect()
}
};
ca.rename(self.name());
ca
}
}

impl ChunkTakeEvery<Utf8Type> for Utf8Chunked {
fn take_every(&self, n: usize) -> Utf8Chunked {
if !self.has_validity() {
let mut ca: Self = if !self.has_validity() {
self.into_no_null_iter().step_by(n).collect()
} else {
self.into_iter().step_by(n).collect()
}
};
ca.rename(self.name());
ca
}
}

impl ChunkTakeEvery<ListType> for ListChunked {
fn take_every(&self, n: usize) -> ListChunked {
if !self.has_validity() {
let mut ca: Self = if !self.has_validity() {
self.into_no_null_iter().step_by(n).collect()
} else {
self.into_iter().step_by(n).collect()
}
};
ca.rename(self.name());
ca
}
}

Expand Down
61 changes: 21 additions & 40 deletions polars/polars-core/src/testing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,49 +5,25 @@ use std::ops::Deref;
impl Series {
/// Check if series are equal. Note that `None == None` evaluates to `false`
pub fn series_equal(&self, other: &Series) -> bool {
if self.get_data_ptr() == other.get_data_ptr() {
return true;
}
if self.len() != other.len() || self.null_count() != other.null_count() {
return false;
}
if self.dtype() != other.dtype()
&& !(matches!(self.dtype(), DataType::Utf8 | DataType::Categorical)
|| matches!(other.dtype(), DataType::Utf8 | DataType::Categorical))
&& !(self.is_numeric() && other.is_numeric())
{
return false;
}
match self.equal(other).sum() {
None => false,
Some(sum) => sum as usize == self.len(),
if self.null_count() > 0 || other.null_count() > 0 {
false
} else {
self.series_equal_missing(other)
}
}

/// Check if all values in series are equal where `None == None` evaluates to `true`.
pub fn series_equal_missing(&self, other: &Series) -> bool {
if self.get_data_ptr() == other.get_data_ptr() {
return true;
}
let null_count_left = self.null_count();
if self.len() != other.len() || null_count_left != other.null_count() {
return false;
}
if self.dtype() != other.dtype()
&& !(matches!(self.dtype(), DataType::Utf8 | DataType::Categorical)
|| matches!(other.dtype(), DataType::Utf8 | DataType::Categorical))
&& !(self.is_numeric() && other.is_numeric())
{
return false;
}
// if all null and previous check did not return (so other is also all null)
if null_count_left == self.len() {
return true;
}
match self.eq_missing(other).sum() {
None => false,
Some(sum) => sum as usize == self.len(),
}
// differs from `PartialEq::eq` in that the numerical dtypes may be different
self.len() == other.len()
&& self.name() == other.name()
&& self.null_count() == other.null_count()
&& self
.eq_missing(other)
.sum()
.map(|s| s as usize)
.unwrap_or(0)
== self.len()
}

/// Get a pointer to the underlying data of this Series.
Expand All @@ -70,7 +46,12 @@ impl PartialEq for Series {
self.len() == other.len()
&& self.field() == other.field()
&& self.null_count() == other.null_count()
&& self.eq_missing(other).sum().map(|s| s as usize) == Some(self.len())
&& self
.eq_missing(other)
.sum()
.map(|s| s as usize)
.unwrap_or(0)
== self.len()
}
}

Expand Down Expand Up @@ -128,7 +109,7 @@ mod test {
#[test]
fn test_series_equal() {
let a = Series::new("a", &[1, 2, 3]);
let b = Series::new("b", &[1, 2, 3]);
let b = Series::new("a", &[1, 2, 3]);
assert!(a.series_equal(&b));

let s = Series::new("foo", &[None, Some(1i64)]);
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,7 @@ hello,","," ",world,"!"
assert!(df
.column(col)
.unwrap()
.series_equal(&Series::new("", &[&**val; 4])));
.series_equal(&Series::new(col, &[&**val; 4])));
}
}

Expand Down
4 changes: 2 additions & 2 deletions polars/polars-lazy/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
//! assert!(new.column("new_column")
//! .unwrap()
//! .series_equal(
//! &Series::new("valid", &[50, 40, 30, 20, 10])
//! &Series::new("new_column", &[50, 40, 30, 20, 10])
//! )
//! );
//! ```
Expand Down Expand Up @@ -82,7 +82,7 @@
//! assert!(new.column("new_column")
//! .unwrap()
//! .series_equal(
//! &Series::new("valid", &[100, 100, 3, 4, 5])
//! &Series::new("new_column", &[100, 100, 3, 4, 5])
//! )
//! );
//! ```
Expand Down
86 changes: 82 additions & 4 deletions py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,10 @@ def to_dicts(self) -> tp.List[Dict[str, Any]]:
]

def transpose(
self, include_header: bool = False, header_name: str = "column"
self,
include_header: bool = False,
header_name: str = "column",
column_names: Optional[Union[tp.Iterator[str], tp.Sequence[str]]] = None,
) -> "pli.DataFrame":
"""
Transpose a DataFrame over the diagonal.
Expand All @@ -788,6 +791,8 @@ def transpose(
If set, the column names will be added as first column.
header_name:
If `include_header` is set, this determines the name of the column that will be inserted
column_names:
Optional generator/iterator that yields column names. Will be used to replace the columns in the DataFrame.
Notes
-----
Expand All @@ -797,8 +802,81 @@ def transpose(
-------
DataFrame
"""
return wrap_df(self._df.transpose(include_header, header_name))
Examples
--------
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
>>> df.transpose(include_header=True)
shape: (2, 4)
┌────────┬──────────┬──────────┬──────────┐
│ column ┆ column_0 ┆ column_1 ┆ column_2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞════════╪══════════╪══════════╪══════════╡
│ a ┆ 1 ┆ 2 ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 1 ┆ 2 ┆ 3 │
└────────┴──────────┴──────────┴──────────┘
>>> # replace the auto-generated column names with a list
>>> df.transpose(include_header=False, column_names=["a", "b", "c"])
shape: (2, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1 ┆ 2 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 1 ┆ 2 ┆ 3 │
└─────┴─────┴─────┘
>>> # include the header as a separate column
>>> df.transpose(include_header=True, header_name="foo", column_names=["a", "b", "c"])
shape: (2, 4)
┌─────┬─────┬─────┬─────┐
│ foo ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 2 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 1 ┆ 2 ┆ 3 │
└─────┴─────┴─────┴─────┘
>>> import typing as tp
>>> # replace the auto generated column with column names from a generator function
>>> def name_generator() -> tp.Iterator[str]:
>>> base_name = "my_column_"
>>> count = 0
>>> while True:
>>> yield f"{base_name}{count}"
>>> count += 1
>>> df.transpose(include_header=False, column_names=name_generator())
shape: (2, 3)
┌─────────────┬─────────────┬─────────────┐
│ my_column_0 ┆ my_column_1 ┆ my_column_2 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════════════╪═════════════╪═════════════╡
│ 1 ┆ 2 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2 ┆ 3 │
└─────────────┴─────────────┴─────────────┘
"""
df = wrap_df(self._df.transpose(include_header, header_name))
if column_names is not None:
names = []
n = df.width
if include_header:
names.append(header_name)
n -= 1

column_names = iter(column_names)
for _ in range(n):
names.append(next(column_names))
df.columns = names
return df

def to_parquet(
self,
Expand Down Expand Up @@ -1573,7 +1651,7 @@ def sort(
else:
return wrap_df(self._df.sort(by, reverse))

def frame_equal(self, other: "DataFrame", null_equal: bool = False) -> bool:
def frame_equal(self, other: "DataFrame", null_equal: bool = True) -> bool:
"""
Check if DataFrame is equal to other.
Expand Down
6 changes: 5 additions & 1 deletion py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1674,7 +1674,11 @@ def shape(self) -> Tuple[int]:
def __len__(self) -> int:
return self.len()

def cast(self, dtype: Type[DataType], strict: bool = True) -> "Series":
def cast(
self,
dtype: Union[Type[DataType], Type[int], Type[float], Type[str], Type[bool]],
strict: bool = True,
) -> "Series":
"""
Cast between data types.
Expand Down

0 comments on commit fb84ac8

Please sign in to comment.