Skip to content

Commit

Permalink
feat(python): Add DataFrame.glimpse() (#5622)
Browse files Browse the repository at this point in the history
  • Loading branch information
zundertj committed Dec 10, 2022
1 parent 76bade2 commit 60edad1
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 0 deletions.
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/dataframe/descriptive.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Descriptive
:toctree: api/

DataFrame.describe
DataFrame.glimpse
DataFrame.estimated_size
DataFrame.is_duplicated
DataFrame.is_empty
Expand Down
83 changes: 83 additions & 0 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2557,10 +2557,85 @@ def filter(
.collect(no_optimization=True)
)

def glimpse(self: DF) -> str:
    """
    Return a dense preview of the dataframe as a string.

    The preview has one line per column, so wide dataframes show nicely. Each
    line shows the column name, the data type and the first few values (at
    most 10). A two-line header with the row and column counts comes first.

    Returns
    -------
    str
        The formatted preview, terminated by a newline. Nothing is printed;
        pass the result to ``print`` to display it.

    See Also
    --------
    describe, head, tail

    Examples
    --------
    >>> from datetime import date
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1.0, 2.8, 3.0],
    ...         "b": [4, 5, None],
    ...         "c": [True, False, True],
    ...         "d": [None, "b", "c"],
    ...         "e": ["usd", "eur", None],
    ...         "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)],
    ...     }
    ... )
    >>> df.glimpse()  # doctest: +IGNORE_RESULT
    Rows: 3
    Columns: 6
    $ a <Float64> 1.0, 2.8, 3.0
    $ b   <Int64> 4, 5, None
    $ c <Boolean> True, False, True
    $ d    <Utf8> None, b, c
    $ e    <Utf8> usd, eur, None
    $ f    <Date> 2020-01-01, 2021-01-02, 2022-01-01

    """
    # always show at most this number of values, mainly to ensure we do not
    # cast long arrays to strings, which would be very slow
    max_num_values = min(10, self.height)

    def _parse_column(col_name: str) -> tuple[str, str, str]:
        # returns (name, "<DType>", comma-separated leading values)
        s = self[col_name]
        dtype_str = "<" + s.dtype.__name__ + ">"
        val_str = ", ".join(map(str, s[:max_num_values].to_list()))
        return col_name, dtype_str, val_str

    data = [_parse_column(col_name) for col_name in self.columns]

    # the name and dtype fields are as narrow as their longest entry allows;
    # ``default=0`` keeps a frame without columns from raising in ``max``
    max_col_name = max((len(name) for name, _, _ in data), default=0)
    max_col_dtype = max((len(dtype_str) for _, dtype_str, _ in data), default=0)

    # the values field is padded (never truncated) so that the three fields
    # add up to 100 characters; clamp at zero because very long column names
    # would otherwise produce a negative, and thus invalid, field width
    max_col_values = max(0, 100 - max_col_name - max_col_dtype)

    # header first, then one line per column
    lines = [f"Rows: {self.height}", f"Columns: {self.width}"]
    lines.extend(
        f"$ {col_name.ljust(max_col_name)}"
        f" {dtype_str.rjust(max_col_dtype)}"
        f" {val_str.ljust(max_col_values)}"
        for col_name, dtype_str, val_str in data
    )
    return "\n".join(lines) + "\n"

def describe(self: DF) -> DF:
"""
Summary statistics for a DataFrame.
See Also
--------
glimpse
Examples
--------
>>> from datetime import date
Expand Down Expand Up @@ -2911,6 +2986,10 @@ def head(self: DF, n: int = 5) -> DF:
n
Number of rows to return.
See Also
--------
tail, glimpse
Examples
--------
>>> df = pl.DataFrame(
Expand Down Expand Up @@ -2946,6 +3025,10 @@ def tail(self: DF, n: int = 5) -> DF:
n
Number of rows to return.
See Also
--------
head
Examples
--------
>>> df = pl.DataFrame(
Expand Down
25 changes: 25 additions & 0 deletions py-polars/tests/unit/test_df.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# flake8: noqa: W291
from __future__ import annotations

import sys
Expand Down Expand Up @@ -2520,3 +2521,27 @@ def test_init_physical_with_timezone() -> None:
datetime(2022, 10, 12, 21, 30, tzinfo=zoneinfo.ZoneInfo(tz_asia)),
)
]


# Checks that DataFrame.glimpse() returns the expected header and one line per
# column for a small frame mixing float, int, bool, string and date columns
# (with some None values).
def test_glimpse() -> None:
df = pl.DataFrame(
{
"a": [1.0, 2.8, 3.0],
"b": [4, 5, None],
"c": [True, False, True],
"d": [None, "b", "c"],
"e": ["usd", "eur", None],
"f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)],
}
)
result = df.glimpse()

# NOTE(review): the literal below is whitespace-sensitive. The file-level
# `# flake8: noqa: W291` suggests the lines carry trailing padding, and the
# dtype column is presumably right-aligned -- confirm the exact spacing
# against the repository before editing this string.
expected = """Rows: 3
Columns: 6
$ a <Float64> 1.0, 2.8, 3.0
$ b <Int64> 4, 5, None
$ c <Boolean> True, False, True
$ d <Utf8> None, b, c
$ e <Utf8> usd, eur, None
$ f <Date> 2020-01-01, 2021-01-02, 2022-01-01"""
# only leading/trailing whitespace of the whole result is stripped before
# comparing; whitespace inside the string must match exactly
assert result.strip() == expected

0 comments on commit 60edad1

Please sign in to comment.