feat(python): Update describe to use new count implementation (#1…
stinodego committed Dec 12, 2023
1 parent a6483c6 commit ac7ffa6
Showing 7 changed files with 320 additions and 213 deletions.
54 changes: 36 additions & 18 deletions py-polars/polars/dataframe/frame.py
@@ -36,7 +36,6 @@
from polars.datatypes import (
INTEGER_DTYPES,
N_INFER_DEFAULT,
NUMERIC_DTYPES,
Boolean,
Float64,
Object,
@@ -4340,7 +4339,7 @@ def describe(
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 2 ┆ 2 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │
@@ -4352,44 +4351,63 @@
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
"""
# determine metrics and optional/additional percentiles
if not self.columns:
raise TypeError("cannot describe a DataFrame without any columns")

# Determine which columns should get std/mean/percentile statistics
stat_cols = {
c for c, dt in self.schema.items() if dt.is_numeric() or dt == Boolean
}

# Determine metrics and optional/additional percentiles
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
for p in parse_percentiles(percentiles):
percentile_exprs.append(F.all().quantile(p).name.prefix(f"{p}:"))
for c in self.columns:
expr = F.col(c).quantile(p) if c in stat_cols else F.lit(None)
expr = expr.alias(f"{p}:{c}")
percentile_exprs.append(expr)
metrics.append(f"{p:.0%}")
metrics.append("max")

# execute metrics in parallel
mean_exprs = [
(F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
for c in self.columns
]
std_exprs = [
(F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}")
for c in self.columns
]

# Calculate metrics in parallel
df_metrics = self.select(
F.all().len().name.prefix("count:"),
F.all().count().name.prefix("count:"),
F.all().null_count().name.prefix("null_count:"),
F.all().mean().name.prefix("mean:"),
F.all().std().name.prefix("std:"),
*mean_exprs,
*std_exprs,
F.all().min().name.prefix("min:"),
*percentile_exprs,
F.all().max().name.prefix("max:"),
).row(0)
)

# reshape wide result
n_cols = len(self.columns)
# Reshape wide result
described = [
df_metrics[(n * n_cols) : (n + 1) * n_cols] for n in range(len(metrics))
df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
for n in range(len(metrics))
]

# cast by column type (numeric/bool -> float), (other -> string)
# Cast by column type (numeric/bool -> float), (other -> string)
summary = dict(zip(self.columns, list(zip(*described))))
num_or_bool = NUMERIC_DTYPES | {Boolean}
for c, tp in self.schema.items():
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
if (v is None or isinstance(v, dict))
else (float(v) if tp in num_or_bool else str(v))
else (float(v) if c in stat_cols else str(v))
for v in summary[c]
]

# return results as a frame
df_summary = self.__class__(summary)
# Return results as a DataFrame
df_summary = self._from_dict(summary)
df_summary.insert_column(0, pl.Series("describe", metrics))
return df_summary
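
The "count" statistic now comes from the null-aware Expr.count rather than Expr.len, and the mean/std/percentile expressions are built per column, emitting literal nulls for columns that are neither numeric nor Boolean. A minimal usage sketch of the resulting behaviour (the column names here are illustrative, not taken from the diff):

import polars as pl

df = pl.DataFrame({"ints": [1, 2, None], "strs": ["x", None, "z"]})
out = df.describe()
# The "count" row now holds non-null counts: 2.0 for "ints" and "2" for "strs";
# previously both reported the full column length, 3. Columns that are not
# numeric or Boolean still get null for mean/std and the percentile rows.
print(out)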

52 changes: 29 additions & 23 deletions py-polars/polars/series/series.py
@@ -1576,8 +1576,8 @@ def describe(
Examples
--------
>>> series_num = pl.Series([1, 2, 3, 4, 5])
>>> series_num.describe()
>>> s = pl.Series([1, 2, 3, 4, 5])
>>> s.describe()
shape: (9, 2)
┌────────────┬──────────┐
│ statistic ┆ value │
@@ -1595,64 +1595,70 @@
│ max ┆ 5.0 │
└────────────┴──────────┘
>>> series_str = pl.Series(["a", "a", None, "b", "c"])
>>> series_str.describe()
Non-numeric data types may not have all statistics available.
>>> s = pl.Series(["a", "a", None, "b", "c"])
>>> s.describe()
shape: (3, 2)
┌────────────┬───────┐
│ statistic ┆ value │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════════╪═══════╡
│ count ┆ 5 │
│ count ┆ 4 │
│ null_count ┆ 1 │
│ unique ┆ 4 │
└────────────┴───────┘
"""
stats: dict[str, PythonLiteral | None]
stats_dtype: PolarsDataType

if self.len() == 0:
raise ValueError("Series must contain at least one value")

elif self.dtype.is_numeric():
s = self.cast(Float64)
if self.dtype.is_numeric():
stats_dtype = Float64
stats = {
"count": s.len(),
"null_count": s.null_count(),
"mean": s.mean(),
"std": s.std(),
"min": s.min(),
"count": self.count(),
"null_count": self.null_count(),
"mean": self.mean(),
"std": self.std(),
"min": self.min(),
}
for p in parse_percentiles(percentiles):
stats[f"{p:.0%}"] = s.quantile(p)
stats["max"] = s.max()
stats[f"{p:.0%}"] = self.quantile(p)
stats["max"] = self.max()

elif self.dtype == Boolean:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"sum": self.sum(),
}
elif self.dtype == Utf8:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"unique": len(self.unique()),
"unique": self.n_unique(),
}
elif self.dtype.is_temporal():
# we coerce all to string, because a polars column
# only has a single dtype and dates: datetime and count: int don't match
stats_dtype = Utf8
stats = {
"count": str(self.len()),
"count": str(self.count()),
"null_count": str(self.null_count()),
"min": str(self.dt.min()),
"50%": str(self.dt.median()),
"max": str(self.dt.max()),
}
else:
raise TypeError("this type is not supported")
raise TypeError(f"cannot describe Series of data type {self.dtype}")

return pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
return pl.DataFrame(
{"statistic": stats.keys(), "value": stats.values()},
schema={"statistic": Utf8, "value": stats_dtype},
)
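
Series.describe follows the same convention: "count" comes from Series.count (non-null values only) instead of Series.len, and the result is built with an explicit schema so the value column keeps a stable dtype. A small usage sketch based on the docstring example above:

import polars as pl

s = pl.Series(["a", "a", None, "b", "c"])
out = s.describe()
# "count" is 4 (the null is excluded), "null_count" is 1, "unique" is 4.
# The value column is Int64 because the Series dtype is Utf8.
print(out)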

def sum(self) -> int | float:
"""
143 changes: 143 additions & 0 deletions py-polars/tests/unit/dataframe/test_describe.py
@@ -0,0 +1,143 @@
from __future__ import annotations

from datetime import date

import pytest

import polars as pl
from polars.testing import assert_frame_equal


def test_df_describe() -> None:
df = pl.DataFrame(
{
"a": [1.0, 2.8, 3.0],
"b": [4, 5, None],
"c": [True, False, True],
"d": [None, "b", "c"],
"e": ["usd", "eur", None],
"f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)],
},
schema_overrides={"e": pl.Categorical},
)

result = df.describe()

expected = pl.DataFrame(
[
("count", 3.0, 2.0, 3.0, 2, 2, 3),
("null_count", 0.0, 1.0, 0.0, 1, 1, 0),
("mean", 2.266667, 4.5, 0.666667, None, None, None),
("std", 1.101514, 0.707107, 0.57735, None, None, None),
("min", 1.0, 4.0, 0.0, "b", None, "2020-01-01"),
("25%", 1.0, 4.0, None, None, None, None),
("50%", 2.8, 5.0, None, None, None, None),
("75%", 3.0, 5.0, None, None, None, None),
("max", 3.0, 5.0, 1.0, "c", None, "2022-01-01"),
],
schema=["describe"] + df.columns,
schema_overrides={"e": pl.Utf8},
)
assert_frame_equal(result, expected)


def test_df_describe_nested() -> None:
df = pl.DataFrame(
{
"struct": [{"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 1, "y": 2}, None],
"list": [[1, 2], [3, 4], [1, 2], None],
}
)

result = df.describe()

expected = pl.DataFrame(
[
("count", 3, 3),
("null_count", 1, 1),
("mean", None, None),
("std", None, None),
("min", None, None),
("25%", None, None),
("50%", None, None),
("75%", None, None),
("max", None, None),
],
schema=["describe"] + df.columns,
schema_overrides={"struct": pl.Utf8, "list": pl.Utf8},
)
assert_frame_equal(result, expected)


def test_df_describe_custom_percentiles() -> None:
df = pl.DataFrame({"numeric": [1, 2, 1, None]})

result = df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8))

expected = pl.DataFrame(
[
("count", 3.0),
("null_count", 1.0),
("mean", 1.3333333333333333),
("std", 0.5773502691896257),
("min", 1.0),
("20%", 1.0),
("40%", 1.0),
("50%", 1.0),
("60%", 1.0),
("80%", 2.0),
("max", 2.0),
],
schema=["describe"] + df.columns,
)
assert_frame_equal(result, expected)


@pytest.mark.parametrize("pcts", [None, []])
def test_df_describe_no_percentiles(pcts: list[float] | None) -> None:
df = pl.DataFrame({"numeric": [1, 2, 1, None]})

result = df.describe(percentiles=pcts)

expected = pl.DataFrame(
[
("count", 3.0),
("null_count", 1.0),
("mean", 1.3333333333333333),
("std", 0.5773502691896257),
("min", 1.0),
("max", 2.0),
],
schema=["describe"] + df.columns,
)
assert_frame_equal(result, expected)


def test_df_describe_empty_column() -> None:
df = pl.DataFrame(schema={"a": pl.Int64})

result = df.describe()

expected = pl.DataFrame(
[
("count", 0.0),
("null_count", 0.0),
("mean", None),
("std", None),
("min", None),
("25%", None),
("50%", None),
("75%", None),
("max", None),
],
schema=["describe"] + df.columns,
)
assert_frame_equal(result, expected)


def test_df_describe_empty() -> None:
df = pl.DataFrame()
with pytest.raises(
TypeError, match="cannot describe a DataFrame without any columns"
):
df.describe()
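
As a quick interactive check of the behaviour covered by these tests (assuming py-polars is importable in the current environment), the percentiles parameter can be varied directly:

import polars as pl

df = pl.DataFrame({"numeric": [1, 2, 1, None]})
# Custom percentiles add 20%/40%/50%/60%/80% rows to the summary.
print(df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8)))
# Passing None (or an empty list) yields only count/null_count/mean/std/min/max.
print(df.describe(percentiles=None))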
