improve equality checks & column generation of transpose

Equality of Series and DataFrames now also checks the column name. This also uncovered some inconsistencies, which are now fixed. For Python's transpose, a generator option is added so that a user can define the column names.
pola-rs · Nov 23, 2021 · fb84ac8 · fb84ac8
1 parent f1be04f
commit fb84ac8
Show file tree

Hide file tree

Showing 14 changed files with 243 additions and 116 deletions.
diff --git a/polars/polars-core/src/chunked_array/mod.rs b/polars/polars-core/src/chunked_array/mod.rs
@@ -389,7 +389,7 @@ impl<T> ChunkedArray<T> {
                 Arc::new(BooleanArray::from_data_default(bitmap, None)) as ArrayRef
             })
             .collect_vec();
-        BooleanChunked::new_from_chunks("is_null", chunks)
+        BooleanChunked::new_from_chunks(self.name(), chunks)
     }
 
     /// Get a mask of the valid values.
@@ -408,7 +408,7 @@ impl<T> ChunkedArray<T> {
                 Arc::new(BooleanArray::from_data_default(bitmap, None)) as ArrayRef
             })
             .collect_vec();
-        BooleanChunked::new_from_chunks("is_not_null", chunks)
+        BooleanChunked::new_from_chunks(self.name(), chunks)
     }
 
     /// Get data type of ChunkedArray.

diff --git a/polars/polars-core/src/chunked_array/ops/reverse.rs b/polars/polars-core/src/chunked_array/ops/reverse.rs
@@ -32,7 +32,9 @@ macro_rules! impl_reverse {
     ($arrow_type:ident, $ca_type:ident) => {
         impl ChunkReverse<$arrow_type> for $ca_type {
             fn reverse(&self) -> Self {
-                self.into_iter().rev().collect_trusted()
+                let mut ca: Self = self.into_iter().rev().collect_trusted();
+                ca.rename(self.name());
+                ca
             }
         }
     };

diff --git a/polars/polars-core/src/chunked_array/ops/take/take_every.rs b/polars/polars-core/src/chunked_array/ops/take/take_every.rs
@@ -8,42 +8,50 @@ where
     T: PolarsNumericType,
 {
     fn take_every(&self, n: usize) -> ChunkedArray<T> {
-        if !self.has_validity() {
+        let mut ca = if !self.has_validity() {
             let a: NoNull<_> = self.into_no_null_iter().step_by(n).collect();
             a.into_inner()
         } else {
             self.into_iter().step_by(n).collect()
-        }
+        };
+        ca.rename(self.name());
+        ca
     }
 }
 
 impl ChunkTakeEvery<BooleanType> for BooleanChunked {
     fn take_every(&self, n: usize) -> BooleanChunked {
-        if !self.has_validity() {
+        let mut ca: Self = if !self.has_validity() {
             self.into_no_null_iter().step_by(n).collect()
         } else {
             self.into_iter().step_by(n).collect()
-        }
+        };
+        ca.rename(self.name());
+        ca
     }
 }
 
 impl ChunkTakeEvery<Utf8Type> for Utf8Chunked {
     fn take_every(&self, n: usize) -> Utf8Chunked {
-        if !self.has_validity() {
+        let mut ca: Self = if !self.has_validity() {
             self.into_no_null_iter().step_by(n).collect()
         } else {
             self.into_iter().step_by(n).collect()
-        }
+        };
+        ca.rename(self.name());
+        ca
     }
 }
 
 impl ChunkTakeEvery<ListType> for ListChunked {
     fn take_every(&self, n: usize) -> ListChunked {
-        if !self.has_validity() {
+        let mut ca: Self = if !self.has_validity() {
             self.into_no_null_iter().step_by(n).collect()
         } else {
             self.into_iter().step_by(n).collect()
-        }
+        };
+        ca.rename(self.name());
+        ca
     }
 }
 

diff --git a/polars/polars-core/src/testing.rs b/polars/polars-core/src/testing.rs
@@ -5,49 +5,25 @@ use std::ops::Deref;
 impl Series {
     /// Check if series are equal. Note that `None == None` evaluates to `false`
     pub fn series_equal(&self, other: &Series) -> bool {
-        if self.get_data_ptr() == other.get_data_ptr() {
-            return true;
-        }
-        if self.len() != other.len() || self.null_count() != other.null_count() {
-            return false;
-        }
-        if self.dtype() != other.dtype()
-            && !(matches!(self.dtype(), DataType::Utf8 | DataType::Categorical)
-                || matches!(other.dtype(), DataType::Utf8 | DataType::Categorical))
-            && !(self.is_numeric() && other.is_numeric())
-        {
-            return false;
-        }
-        match self.equal(other).sum() {
-            None => false,
-            Some(sum) => sum as usize == self.len(),
+        if self.null_count() > 0 || other.null_count() > 0 {
+            false
+        } else {
+            self.series_equal_missing(other)
         }
     }
 
     /// Check if all values in series are equal where `None == None` evaluates to `true`.
     pub fn series_equal_missing(&self, other: &Series) -> bool {
-        if self.get_data_ptr() == other.get_data_ptr() {
-            return true;
-        }
-        let null_count_left = self.null_count();
-        if self.len() != other.len() || null_count_left != other.null_count() {
-            return false;
-        }
-        if self.dtype() != other.dtype()
-            && !(matches!(self.dtype(), DataType::Utf8 | DataType::Categorical)
-                || matches!(other.dtype(), DataType::Utf8 | DataType::Categorical))
-            && !(self.is_numeric() && other.is_numeric())
-        {
-            return false;
-        }
-        // if all null and previous check did not return (so other is also all null)
-        if null_count_left == self.len() {
-            return true;
-        }
-        match self.eq_missing(other).sum() {
-            None => false,
-            Some(sum) => sum as usize == self.len(),
-        }
+        // differs from PartialEq::eq in that the numerical dtypes may be different
+        self.len() == other.len()
+            && self.name() == other.name()
+            && self.null_count() == other.null_count()
+            && self
+                .eq_missing(other)
+                .sum()
+                .map(|s| s as usize)
+                .unwrap_or(0)
+                == self.len()
     }
 
     /// Get a pointer to the underlying data of this Series.
@@ -70,7 +46,12 @@ impl PartialEq for Series {
         self.len() == other.len()
             && self.field() == other.field()
             && self.null_count() == other.null_count()
-            && self.eq_missing(other).sum().map(|s| s as usize) == Some(self.len())
+            && self
+                .eq_missing(other)
+                .sum()
+                .map(|s| s as usize)
+                .unwrap_or(0)
+                == self.len()
     }
 }
 
@@ -128,7 +109,7 @@ mod test {
     #[test]
     fn test_series_equal() {
         let a = Series::new("a", &[1, 2, 3]);
-        let b = Series::new("b", &[1, 2, 3]);
+        let b = Series::new("a", &[1, 2, 3]);
         assert!(a.series_equal(&b));
 
         let s = Series::new("foo", &[None, Some(1i64)]);

diff --git a/polars/polars-io/src/csv.rs b/polars/polars-io/src/csv.rs
@@ -859,7 +859,7 @@ hello,","," ",world,"!"
             assert!(df
                 .column(col)
                 .unwrap()
-                .series_equal(&Series::new("", &[&**val; 4])));
+                .series_equal(&Series::new(col, &[&**val; 4])));
         }
     }
 

diff --git a/polars/polars-lazy/src/lib.rs b/polars/polars-lazy/src/lib.rs
@@ -49,7 +49,7 @@
 //! assert!(new.column("new_column")
 //!     .unwrap()
 //!     .series_equal(
-//!         &Series::new("valid", &[50, 40, 30, 20, 10])
+//!         &Series::new("new_column", &[50, 40, 30, 20, 10])
 //!     )
 //! );
 //! ```
@@ -82,7 +82,7 @@
 //! assert!(new.column("new_column")
 //!     .unwrap()
 //!     .series_equal(
-//!         &Series::new("valid", &[100, 100, 3, 4, 5])
+//!         &Series::new("new_column", &[100, 100, 3, 4, 5])
 //!     )
 //! );
 //! ```

diff --git a/py-polars/polars/internals/frame.py b/py-polars/polars/internals/frame.py
@@ -777,7 +777,10 @@ def to_dicts(self) -> tp.List[Dict[str, Any]]:
         ]
 
     def transpose(
-        self, include_header: bool = False, header_name: str = "column"
+        self,
+        include_header: bool = False,
+        header_name: str = "column",
+        column_names: Optional[Union[tp.Iterator[str], tp.Sequence[str]]] = None,
     ) -> "pli.DataFrame":
         """
         Transpose a DataFrame over the diagonal.
@@ -788,6 +791,8 @@ def transpose(
             If set, the column names will be added as first column.
         header_name:
             If `include_header` is set, this determines the name of the column that will be inserted
+        column_names:
+            Optional generator/iterator that yields column names. Will be used to replace the columns in the DataFrame.
 
         Notes
         -----
@@ -797,8 +802,81 @@ def transpose(
         -------
         DataFrame
 
-        """
-        return wrap_df(self._df.transpose(include_header, header_name))
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
+        >>> df.transpose(include_header=True)
+        shape: (2, 4)
+        ┌────────┬──────────┬──────────┬──────────┐
+        │ column ┆ column_0 ┆ column_1 ┆ column_2 │
+        │ ---    ┆ ---      ┆ ---      ┆ ---      │
+        │ str    ┆ i64      ┆ i64      ┆ i64      │
+        ╞════════╪══════════╪══════════╪══════════╡
+        │ a      ┆ 1        ┆ 2        ┆ 3        │
+        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+        │ b      ┆ 1        ┆ 2        ┆ 3        │
+        └────────┴──────────┴──────────┴──────────┘
+
+        # replace the auto generated column names with a list
+        >>> df.transpose(include_header=False, column_names=["a", "b", "c"])
+        shape: (2, 3)
+        ┌─────┬─────┬─────┐
+        │ a   ┆ b   ┆ c   │
+        │ --- ┆ --- ┆ --- │
+        │ i64 ┆ i64 ┆ i64 │
+        ╞═════╪═════╪═════╡
+        │ 1   ┆ 2   ┆ 3   │
+        ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+        │ 1   ┆ 2   ┆ 3   │
+        └─────┴─────┴─────┘
+
+        >>> # include the header as a separate column
+        >>> df.transpose(include_header=True, header_name="foo", column_names=["a", "b", "c"])
+        shape: (2, 4)
+        ┌─────┬─────┬─────┬─────┐
+        │ foo ┆ a   ┆ b   ┆ c   │
+        │ --- ┆ --- ┆ --- ┆ --- │
+        │ str ┆ i64 ┆ i64 ┆ i64 │
+        ╞═════╪═════╪═════╪═════╡
+        │ a   ┆ 1   ┆ 2   ┆ 3   │
+        ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+        │ b   ┆ 1   ┆ 2   ┆ 3   │
+        └─────┴─────┴─────┴─────┘
+
+        >>> import typing as tp
+        >>> # replace the auto generated column with column names from a generator function
+        >>> def name_generator() -> tp.Iterator[str]:
+        >>>     base_name = "my_column_"
+        >>>     count = 0
+        >>>     while True:
+        >>>         yield f"{base_name}{count}"
+        >>>         count += 1
+        >>> df.transpose(include_header=False, column_names=name_generator())
+        shape: (2, 3)
+        ┌─────────────┬─────────────┬─────────────┐
+        │ my_column_0 ┆ my_column_1 ┆ my_column_2 │
+        │ ---         ┆ ---         ┆ ---         │
+        │ i64         ┆ i64         ┆ i64         │
+        ╞═════════════╪═════════════╪═════════════╡
+        │ 1           ┆ 2           ┆ 3           │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ 1           ┆ 2           ┆ 3           │
+        └─────────────┴─────────────┴─────────────┘
+
+        """
+        df = wrap_df(self._df.transpose(include_header, header_name))
+        if column_names is not None:
+            names = []
+            n = df.width
+            if include_header:
+                names.append(header_name)
+                n -= 1
+
+            column_names = iter(column_names)
+            for _ in range(n):
+                names.append(next(column_names))
+            df.columns = names
+        return df
 
     def to_parquet(
         self,
@@ -1573,7 +1651,7 @@ def sort(
         else:
             return wrap_df(self._df.sort(by, reverse))
 
-    def frame_equal(self, other: "DataFrame", null_equal: bool = False) -> bool:
+    def frame_equal(self, other: "DataFrame", null_equal: bool = True) -> bool:
         """
         Check if DataFrame is equal to other.
 

diff --git a/py-polars/polars/internals/series.py b/py-polars/polars/internals/series.py
@@ -1674,7 +1674,11 @@ def shape(self) -> Tuple[int]:
     def __len__(self) -> int:
         return self.len()
 
-    def cast(self, dtype: Type[DataType], strict: bool = True) -> "Series":
+    def cast(
+        self,
+        dtype: Union[Type[DataType], Type[int], Type[float], Type[str], Type[bool]],
+        strict: bool = True,
+    ) -> "Series":
         """
         Cast between data types.