Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Update describe to use new count implementation #12990

Merged
merged 5 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 36 additions & 18 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
from polars.datatypes import (
INTEGER_DTYPES,
N_INFER_DEFAULT,
NUMERIC_DTYPES,
Boolean,
Float64,
Object,
Expand Down Expand Up @@ -4340,7 +4339,7 @@ def describe(
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 2 ┆ 2 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │
Expand All @@ -4352,44 +4351,63 @@ def describe(
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘

"""
# determine metrics and optional/additional percentiles
if not self.columns:
raise TypeError("cannot describe a DataFrame without any columns")

# Determine which columns should get std/mean/percentile statistics
stat_cols = {
c for c, dt in self.schema.items() if dt.is_numeric() or dt == Boolean
}

# Determine metrics and optional/additional percentiles
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
for p in parse_percentiles(percentiles):
percentile_exprs.append(F.all().quantile(p).name.prefix(f"{p}:"))
for c in self.columns:
expr = F.col(c).quantile(p) if c in stat_cols else F.lit(None)
expr = expr.alias(f"{p}:{c}")
percentile_exprs.append(expr)
metrics.append(f"{p:.0%}")
metrics.append("max")

# execute metrics in parallel
mean_exprs = [
(F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
for c in self.columns
]
std_exprs = [
(F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}")
for c in self.columns
]

# Calculate metrics in parallel
df_metrics = self.select(
F.all().len().name.prefix("count:"),
F.all().count().name.prefix("count:"),
F.all().null_count().name.prefix("null_count:"),
F.all().mean().name.prefix("mean:"),
F.all().std().name.prefix("std:"),
*mean_exprs,
*std_exprs,
F.all().min().name.prefix("min:"),
*percentile_exprs,
F.all().max().name.prefix("max:"),
).row(0)
)

# reshape wide result
n_cols = len(self.columns)
# Reshape wide result
described = [
df_metrics[(n * n_cols) : (n + 1) * n_cols] for n in range(len(metrics))
df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
for n in range(len(metrics))
]

# cast by column type (numeric/bool -> float), (other -> string)
# Cast by column type (numeric/bool -> float), (other -> string)
summary = dict(zip(self.columns, list(zip(*described))))
num_or_bool = NUMERIC_DTYPES | {Boolean}
for c, tp in self.schema.items():
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
if (v is None or isinstance(v, dict))
else (float(v) if tp in num_or_bool else str(v))
else (float(v) if c in stat_cols else str(v))
for v in summary[c]
]

# return results as a frame
df_summary = self.__class__(summary)
# Return results as a DataFrame
df_summary = self._from_dict(summary)
df_summary.insert_column(0, pl.Series("describe", metrics))
return df_summary

Expand Down
52 changes: 29 additions & 23 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1576,8 +1576,8 @@ def describe(

Examples
--------
>>> series_num = pl.Series([1, 2, 3, 4, 5])
>>> series_num.describe()
>>> s = pl.Series([1, 2, 3, 4, 5])
>>> s.describe()
shape: (9, 2)
┌────────────┬──────────┐
│ statistic ┆ value │
Expand All @@ -1595,64 +1595,70 @@ def describe(
│ max ┆ 5.0 │
└────────────┴──────────┘

>>> series_str = pl.Series(["a", "a", None, "b", "c"])
>>> series_str.describe()
Non-numeric data types may not have all statistics available.

>>> s = pl.Series(["a", "a", None, "b", "c"])
>>> s.describe()
shape: (3, 2)
┌────────────┬───────┐
│ statistic ┆ value │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════════╪═══════╡
│ count ┆ 5 │
│ count ┆ 4 │
│ null_count ┆ 1 │
│ unique ┆ 4 │
└────────────┴───────┘

"""
stats: dict[str, PythonLiteral | None]
stats_dtype: PolarsDataType

if self.len() == 0:
raise ValueError("Series must contain at least one value")

elif self.dtype.is_numeric():
s = self.cast(Float64)
if self.dtype.is_numeric():
stats_dtype = Float64
stats = {
"count": s.len(),
"null_count": s.null_count(),
"mean": s.mean(),
"std": s.std(),
"min": s.min(),
"count": self.count(),
"null_count": self.null_count(),
"mean": self.mean(),
"std": self.std(),
"min": self.min(),
}
for p in parse_percentiles(percentiles):
stats[f"{p:.0%}"] = s.quantile(p)
stats["max"] = s.max()
stats[f"{p:.0%}"] = self.quantile(p)
stats["max"] = self.max()

elif self.dtype == Boolean:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"sum": self.sum(),
}
elif self.dtype == Utf8:
stats_dtype = Int64
stats = {
"count": self.len(),
"count": self.count(),
"null_count": self.null_count(),
"unique": len(self.unique()),
"unique": self.n_unique(),
}
elif self.dtype.is_temporal():
# we coerce all to string, because a polars column
# only has a single dtype and dates: datetime and count: int don't match
stats_dtype = Utf8
stats = {
"count": str(self.len()),
"count": str(self.count()),
"null_count": str(self.null_count()),
"min": str(self.dt.min()),
"50%": str(self.dt.median()),
"max": str(self.dt.max()),
}
else:
raise TypeError("this type is not supported")
raise TypeError(f"cannot describe Series of data type {self.dtype}")

return pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
return pl.DataFrame(
{"statistic": stats.keys(), "value": stats.values()},
schema={"statistic": Utf8, "value": stats_dtype},
)

def sum(self) -> int | float:
"""
Expand Down
143 changes: 143 additions & 0 deletions py-polars/tests/unit/dataframe/test_describe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
from __future__ import annotations

from datetime import date

import pytest

import polars as pl
from polars.testing import assert_frame_equal


def test_df_describe() -> None:
    """Check `describe` on float, int, bool, string, categorical and date columns."""
    columns = {
        "a": [1.0, 2.8, 3.0],
        "b": [4, 5, None],
        "c": [True, False, True],
        "d": [None, "b", "c"],
        "e": ["usd", "eur", None],
        "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)],
    }
    frame = pl.DataFrame(columns, schema_overrides={"e": pl.Categorical})

    actual = frame.describe()

    # One tuple per statistic: the label first, then one value per input column.
    # Columns "b", "d" and "e" each hold one null, so their count is 2.
    summary_rows = [
        ("count", 3.0, 2.0, 3.0, 2, 2, 3),
        ("null_count", 0.0, 1.0, 0.0, 1, 1, 0),
        ("mean", 2.266667, 4.5, 0.666667, None, None, None),
        ("std", 1.101514, 0.707107, 0.57735, None, None, None),
        ("min", 1.0, 4.0, 0.0, "b", None, "2020-01-01"),
        ("25%", 1.0, 4.0, None, None, None, None),
        ("50%", 2.8, 5.0, None, None, None, None),
        ("75%", 3.0, 5.0, None, None, None, None),
        ("max", 3.0, 5.0, 1.0, "c", None, "2022-01-01"),
    ]
    wanted = pl.DataFrame(
        summary_rows,
        schema=["describe", *frame.columns],
        schema_overrides={"e": pl.Utf8},
    )
    assert_frame_equal(actual, wanted)


def test_df_describe_nested() -> None:
    """Check `describe` on struct and list columns that each contain one null."""
    nested_data = {
        "struct": [{"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 1, "y": 2}, None],
        "list": [[1, 2], [3, 4], [1, 2], None],
    }
    frame = pl.DataFrame(nested_data)

    actual = frame.describe()

    # Only the count and null_count rows are expected to be populated.
    summary_rows = [
        ("count", 3, 3),
        ("null_count", 1, 1),
        ("mean", None, None),
        ("std", None, None),
        ("min", None, None),
        ("25%", None, None),
        ("50%", None, None),
        ("75%", None, None),
        ("max", None, None),
    ]
    wanted = pl.DataFrame(
        summary_rows,
        schema=["describe", *frame.columns],
        schema_overrides={"struct": pl.Utf8, "list": pl.Utf8},
    )
    assert_frame_equal(actual, wanted)


def test_df_describe_custom_percentiles() -> None:
    """Check that caller-supplied percentiles each produce their own row."""
    frame = pl.DataFrame({"numeric": [1, 2, 1, None]})
    requested = (0.2, 0.4, 0.5, 0.6, 0.8)

    actual = frame.describe(percentiles=requested)

    # The percentile rows sit between "min" and "max", in the order requested.
    summary_rows = [
        ("count", 3.0),
        ("null_count", 1.0),
        ("mean", 1.3333333333333333),
        ("std", 0.5773502691896257),
        ("min", 1.0),
        ("20%", 1.0),
        ("40%", 1.0),
        ("50%", 1.0),
        ("60%", 1.0),
        ("80%", 2.0),
        ("max", 2.0),
    ]
    wanted = pl.DataFrame(summary_rows, schema=["describe", *frame.columns])
    assert_frame_equal(actual, wanted)


@pytest.mark.parametrize("pcts", [None, []])
def test_df_describe_no_percentiles(pcts: list[float] | None) -> None:
    """Check that `None` or an empty list yields a summary with no percentile rows."""
    frame = pl.DataFrame({"numeric": [1, 2, 1, None]})

    actual = frame.describe(percentiles=pcts)

    # "max" follows "min" directly: no percentile rows in between.
    summary_rows = [
        ("count", 3.0),
        ("null_count", 1.0),
        ("mean", 1.3333333333333333),
        ("std", 0.5773502691896257),
        ("min", 1.0),
        ("max", 2.0),
    ]
    wanted = pl.DataFrame(summary_rows, schema=["describe", *frame.columns])
    assert_frame_equal(actual, wanted)


def test_df_describe_empty_column() -> None:
    """Check `describe` on a frame with one Int64 column and zero rows."""
    frame = pl.DataFrame(schema={"a": pl.Int64})

    actual = frame.describe()

    # Both counts are zero; every other statistic is expected to be null.
    summary_rows = [
        ("count", 0.0),
        ("null_count", 0.0),
        ("mean", None),
        ("std", None),
        ("min", None),
        ("25%", None),
        ("50%", None),
        ("75%", None),
        ("max", None),
    ]
    wanted = pl.DataFrame(summary_rows, schema=["describe", *frame.columns])
    assert_frame_equal(actual, wanted)


def test_df_describe_empty() -> None:
    """Check that describing a frame with no columns raises `TypeError`."""
    columnless = pl.DataFrame()
    message = "cannot describe a DataFrame without any columns"
    with pytest.raises(TypeError, match=message):
        columnless.describe()
Loading