feat(python): Update describe to use new count implementation (#1…
stinodego committed Dec 12, 2023
1 parent a6483c6 commit ac7ffa6
Showing 7 changed files with 320 additions and 213 deletions.
54 changes: 36 additions & 18 deletions py-polars/polars/dataframe/frame.py
@@ -36,7 +36,6 @@
from polars.datatypes import (
INTEGER_DTYPES,
N_INFER_DEFAULT,
NUMERIC_DTYPES,
Boolean,
Float64,
Object,
@@ -4340,7 +4339,7 @@ def describe(
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 2 ┆ 2 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │
@@ -4352,44 +4351,63 @@
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
"""
# determine metrics and optional/additional percentiles
if not self.columns:
raise TypeError("cannot describe a DataFrame without any columns")

# Determine which columns should get std/mean/percentile statistics
stat_cols = {
c for c, dt in self.schema.items() if dt.is_numeric() or dt == Boolean
}

# Determine metrics and optional/additional percentiles
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
for p in parse_percentiles(percentiles):
percentile_exprs.append(F.all().quantile(p).name.prefix(f"{p}:"))
for c in self.columns:
expr = F.col(c).quantile(p) if c in stat_cols else F.lit(None)
expr = expr.alias(f"{p}:{c}")
percentile_exprs.append(expr)
metrics.append(f"{p:.0%}")
metrics.append("max")

# execute metrics in parallel
mean_exprs = [
(F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
for c in self.columns
]
std_exprs = [
(F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}")
for c in self.columns
]

# Calculate metrics in parallel
df_metrics = self.select(
F.all().len().name.prefix("count:"),
F.all().count().name.prefix("count:"),
F.all().null_count().name.prefix("null_count:"),
F.all().mean().name.prefix("mean:"),
F.all().std().name.prefix("std:"),
*mean_exprs,
*std_exprs,
F.all().min().name.prefix("min:"),
*percentile_exprs,
F.all().max().name.prefix("max:"),
).row(0)
)

# reshape wide result
n_cols = len(self.columns)
# Reshape wide result
described = [
df_metrics[(n * n_cols) : (n + 1) * n_cols] for n in range(len(metrics))
df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
for n in range(len(metrics))
]

# cast by column type (numeric/bool -> float), (other -> string)
# Cast by column type (numeric/bool -> float), (other -> string)
summary = dict(zip(self.columns, list(zip(*described))))
num_or_bool = NUMERIC_DTYPES | {Boolean}
for c, tp in self.schema.items():
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
if (v is None or isinstance(v, dict))
else (float(v) if tp in num_or_bool else str(v))
else (float(v) if c in stat_cols else str(v))
for v in summary[c]
]

# return results as a frame
df_summary = self.__class__(summary)
# Return results as a DataFrame
df_summary = self._from_dict(summary)
df_summary.insert_column(0, pl.Series("describe", metrics))
return df_summary
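
The "count" statistic now comes from the null-aware Expr.count rather than Expr.len, and the mean/std/percentile expressions are built per column, emitting literal nulls for columns that are neither numeric nor Boolean. A minimal usage sketch of the resulting behaviour (the column names here are illustrative, not taken from the diff):

import polars as pl

df = pl.DataFrame({"ints": [1, 2, None], "strs": ["x", None, "z"]})
out = df.describe()
# The "count" row now holds non-null counts: 2.0 for "ints" and "2" for "strs";
# previously both reported the full column length, 3. Columns that are not
# numeric or Boolean still get null for mean/std and the percentile rows.
print(out)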

52 changes: 29 additions & 23 deletions py-polars/polars/series/series.py
@@ -1576,8 +1576,8 @@ def describe(
Examples
--------
>>> series_num = pl.Series([1, 2, 3, 4, 5])
>>> series_num.describe()
>>> s = pl.Series([1, 2, 3, 4, 5])
>>> s.describe()
shape: (9, 2)
┌────────────┬──────────┐
│ statistic ┆ value │
@@ -1595,64 +1595,70 @@
│ max ┆ 5.0 │
└────────────┴──────────┘
>>> series_str = pl.Series(["a", "a", None, "b", "c"])
>>> series_str.describe()
Non-numeric data types may not have all statistics available.
>>> s = pl.Series(["a", "a", None, "b", "c"])
>>> s.describe()
shape: (3, 2)
┌────────────┬───────┐
│ statistic ┆ value │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════════╪═══════╡
│ count ┆ 5 │
│ count ┆ 4 │
│ null_count ┆ 1 │
│ unique ┆ 4 │
└────────────┴───────┘
"""
stats: dict[str, PythonLiteral | None]
stats_dtype: PolarsDataType

if self.len() == 0:
raise ValueError("Series must contain at least one value")

elif self.dtype.is_numeric():
s = self.cast(Float64)
if self.dtype.is_numeric():
stats_dtype = Float64
stats = {
"count": s.len(),
"null_count": s.null_count(),
"mean": s.mean(),
"std": s.std(),
"min": s.min(),
"count": self.count(),
"null_count": self.null_count(),
"mean": self.mean(),
"std": self.std(),
"min": self.min(),
}
for p in parse_percentiles(percentiles):
stats[f"{p:.0%}"] = s.quantile(p)
stats["max"] = s.max()
stats[f"{p:.0%}"] = self.quantile(p)
stats["max"] = self.max()

elif self.dtype == Boolean:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"sum": self.sum(),
}
elif self.dtype == Utf8:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"unique": len(self.unique()),
"unique": self.n_unique(),
}
elif self.dtype.is_temporal():
# we coerce all to string, because a polars column
# only has a single dtype and dates: datetime and count: int don't match
stats_dtype = Utf8
stats = {
"count": str(self.len()),
"count": str(self.count()),
"null_count": str(self.null_count()),
"min": str(self.dt.min()),
"50%": str(self.dt.median()),
"max": str(self.dt.max()),
}
else:
raise TypeError("this type is not supported")
raise TypeError(f"cannot describe Series of data type {self.dtype}")

return pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
return pl.DataFrame(
{"statistic": stats.keys(), "value": stats.values()},
schema={"statistic": Utf8, "value": stats_dtype},
)
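
Series.describe follows the same convention: "count" comes from Series.count (non-null values only) instead of Series.len, and the result is built with an explicit schema so the value column keeps a stable dtype. A small usage sketch based on the docstring example above:

import polars as pl

s = pl.Series(["a", "a", None, "b", "c"])
out = s.describe()
# "count" is 4 (the null is excluded), "null_count" is 1, "unique" is 4.
# The value column is Int64 because the Series dtype is Utf8.
print(out)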

def sum(self) -> int | float:
"""
143 changes: 143 additions & 0 deletions py-polars/tests/unit/dataframe/test_describe.py
@@ -0,0 +1,143 @@
from __future__ import annotations

from datetime import date

import pytest

import polars as pl
from polars.testing import assert_frame_equal


def test_df_describe() -> None:
df = pl.DataFrame(
{
"a": [1.0, 2.8, 3.0],
"b": [4, 5, None],
"c": [True, False, True],
"d": [None, "b", "c"],
"e": ["usd", "eur", None],
"f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)],
},
schema_overrides={"e": pl.Categorical},
)

result = df.describe()

expected = pl.DataFrame(
[
("count", 3.0, 2.0, 3.0, 2, 2, 3),
("null_count", 0.0, 1.0, 0.0, 1, 1, 0),
("mean", 2.266667, 4.5, 0.666667, None, None, None),
("std", 1.101514, 0.707107, 0.57735, None, None, None),
("min", 1.0, 4.0, 0.0, "b", None, "2020-01-01"),
("25%", 1.0, 4.0, None, None, None, None),
("50%", 2.8, 5.0, None, None, None, None),
("75%", 3.0, 5.0, None, None, None, None),
("max", 3.0, 5.0, 1.0, "c", None, "2022-01-01"),
],
schema=["describe"] + df.columns,
schema_overrides={"e": pl.Utf8},
)
assert_frame_equal(result, expected)


def test_df_describe_nested() -> None:
df = pl.DataFrame(
{
"struct": [{"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 1, "y": 2}, None],
"list": [[1, 2], [3, 4], [1, 2], None],
}
)

result = df.describe()

expected = pl.DataFrame(
[
("count", 3, 3),
("null_count", 1, 1),
("mean", None, None),
("std", None, None),
("min", None, None),
("25%", None, None),
("50%", None, None),
("75%", None, None),
("max", None, None),
],
schema=["describe"] + df.columns,
schema_overrides={"struct": pl.Utf8, "list": pl.Utf8},
)
assert_frame_equal(result, expected)


def test_df_describe_custom_percentiles() -> None:
df = pl.DataFrame({"numeric": [1, 2, 1, None]})

result = df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8))

expected = pl.DataFrame(
[
("count", 3.0),
("null_count", 1.0),
("mean", 1.3333333333333333),
("std", 0.5773502691896257),
("min", 1.0),
("20%", 1.0),
("40%", 1.0),
("50%", 1.0),
("60%", 1.0),
("80%", 2.0),
("max", 2.0),
],
schema=["describe"] + df.columns,
)
assert_frame_equal(result, expected)


@pytest.mark.parametrize("pcts", [None, []])
def test_df_describe_no_percentiles(pcts: list[float] | None) -> None:
df = pl.DataFrame({"numeric": [1, 2, 1, None]})

result = df.describe(percentiles=pcts)

expected = pl.DataFrame(
[
("count", 3.0),
("null_count", 1.0),
("mean", 1.3333333333333333),
("std", 0.5773502691896257),
("min", 1.0),
("max", 2.0),
],
schema=["describe"] + df.columns,
)
assert_frame_equal(result, expected)


def test_df_describe_empty_column() -> None:
df = pl.DataFrame(schema={"a": pl.Int64})

result = df.describe()

expected = pl.DataFrame(
[
("count", 0.0),
("null_count", 0.0),
("mean", None),
("std", None),
("min", None),
("25%", None),
("50%", None),
("75%", None),
("max", None),
],
schema=["describe"] + df.columns,
)
assert_frame_equal(result, expected)


def test_df_describe_empty() -> None:
df = pl.DataFrame()
with pytest.raises(
TypeError, match="cannot describe a DataFrame without any columns"
):
df.describe()
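
As a quick interactive check of the behaviour covered by these tests (assuming py-polars is importable in the current environment), the percentiles parameter can be varied directly:

import polars as pl

df = pl.DataFrame({"numeric": [1, 2, 1, None]})
# Custom percentiles add 20%/40%/50%/60%/80% rows to the summary.
print(df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8)))
# Passing None (or an empty list) yields only count/null_count/mean/std/min/max.
print(df.describe(percentiles=None))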
