feat[python] support units for estimated_size ("mb","gb", etc) (#4499)

pola-rs · Aug 19, 2022 · de93b31 · de93b31
1 parent 3c7302f
commit de93b31
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 9 deletions.
diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py
@@ -53,6 +53,7 @@
     is_int_sequence,
     is_str_sequence,
     range_to_slice,
+    scale_bytes,
 )
 
 try:
@@ -113,6 +114,7 @@
         ParallelStrategy,
         ParquetCompression,
         PivotAgg,
+        SizeUnit,
         UniqueKeepStrategy,
     )
 
@@ -300,10 +302,10 @@ def __init__(
         else:
             raise ValueError("DataFrame constructor not called properly.")
 
-    def estimated_size(self) -> int:
+    def estimated_size(self, unit: SizeUnit = "b") -> int | float:
         """
         Return an estimation of the total (heap) allocated size of the `DataFrame` in
-        bytes.
+        bytes (pass `unit` to return estimated size in kilobytes, megabytes, etc).
 
         This estimation is the sum of the size of its buffers, validity, including
         nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
@@ -315,8 +317,30 @@ def estimated_size(self) -> int:
         this function returns the visible size of the buffer, not its total capacity.
 
         FFI buffers are included in this estimation.
+
+        Parameters
+        ----------
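+        unit : {'b', 'kb', 'mb', 'gb', 'tb'}
+            Scale the returned size to the given unit. The long names
+            ('bytes', 'kilobytes', 'megabytes', 'gigabytes', 'terabytes')
+            are accepted as well.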
+
+        Examples
+        --------
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "x": list(reversed(range(1_000_000))),
+        ...         "y": [v / 1000 for v in range(1_000_000)],
+        ...         "z": [str(v) for v in range(1_000_000)],
+        ...     },
+        ...     columns=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)],
+        ... )
+        >>> df.estimated_size()
+        25888898
+        >>> df.estimated_size("mb")
+        24.689577102661133
+
         """
-        return self._df.estimated_size()
+        sz = self._df.estimated_size()
+        return scale_bytes(sz, to=unit)
 
     @classmethod
     def _from_pydf(cls: type[DF], py_df: PyDataFrame) -> DF:

diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py
@@ -54,6 +54,7 @@
     is_bool_sequence,
     is_int_sequence,
     range_to_slice,
+    scale_bytes,
 )
 
 try:
@@ -96,6 +97,7 @@
         InterpolationMethod,
         NullBehavior,
         RankMethod,
+        SizeUnit,
         TimeUnit,
     )
 
@@ -651,10 +653,10 @@ def flags(self) -> dict[str, bool]:
             "SORTED_DESC": self._s.is_sorted_reverse_flag(),
         }
 
-    def estimated_size(self) -> int:
+    def estimated_size(self, unit: SizeUnit = "b") -> int | float:
         """
         Return an estimation of the total (heap) allocated size of the `Series` in
-        bytes.
+        bytes (pass `unit` to return estimated size in kilobytes, megabytes, etc).
 
         This estimation is the sum of the size of its buffers, validity, including
         nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
@@ -667,8 +669,22 @@ def estimated_size(self) -> int:
 
         FFI buffers are included in this estimation.
 
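+        Parameters
+        ----------
+        unit : {'b', 'kb', 'mb', 'gb', 'tb'}
+            Scale the returned size to the given unit. The long names
+            ('bytes', 'kilobytes', 'megabytes', 'gigabytes', 'terabytes')
+            are accepted as well.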
+
+        Examples
+        --------
+        >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32)
+        >>> s.estimated_size()
+        4000000
+        >>> s.estimated_size("mb")
+        3.814697265625
+
         """
-        return self._s.estimated_size()
+        sz = self._s.estimated_size()
+        return scale_bytes(sz, to=unit)
 
     def sqrt(self) -> Series:
         """

diff --git a/py-polars/polars/internals/type_aliases.py b/py-polars/polars/internals/type_aliases.py
@@ -39,6 +39,18 @@
 RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"]
 TimeUnit: TypeAlias = Literal["ns", "us", "ms"]
 UniqueKeepStrategy: TypeAlias = Literal["first", "last"]
+SizeUnit: TypeAlias = Literal[
+    "b",
+    "kb",
+    "mb",
+    "gb",
+    "tb",
+    "bytes",
+    "kilobytes",
+    "megabytes",
+    "gigabytes",
+    "terabytes",
+]
 
 # The following have a Rust enum equivalent with a different name
 AsofJoinStrategy: TypeAlias = Literal["backward", "forward"]  # AsofStrategy

diff --git a/py-polars/polars/utils.py b/py-polars/polars/utils.py
@@ -34,7 +34,7 @@
     from typing_extensions import ParamSpec, TypeGuard
 
 if TYPE_CHECKING:
-    from polars.internals.type_aliases import TimeUnit
+    from polars.internals.type_aliases import SizeUnit, TimeUnit
 
 
 def _process_null_values(
@@ -344,3 +344,17 @@ def _rename_kwargs(
                 stacklevel=3,
             )
             kwargs[new] = kwargs.pop(alias)
+
+
+def scale_bytes(sz: int, to: SizeUnit) -> int | float:
+    """
+    Scale a size in bytes to another size unit (e.g. "kb", "mb", "gb", "tb").
+
+    Parameters
+    ----------
+    sz
+        Size, expressed in bytes.
+    to
+        Target unit. Only the first character is inspected, so the short
+        ("kb") and long ("kilobytes") spellings are equivalent. The lookup is
+        case-sensitive: the recognised first letters are the lowercase
+        'b', 'k', 'm', 'g' and 't'.
+
+    Returns
+    -------
+    int or float
+        `sz` itself (unchanged) when the target unit is bytes; otherwise `sz`
+        divided by the unit's factor using true division, i.e. a float.
+
+    Notes
+    -----
+    Units are binary multiples (1 kb == 1024 bytes). No explicit validation is
+    performed: a `to` whose first character is not one of the recognised
+    letters raises `KeyError` from the dictionary lookup, and an empty string
+    raises `IndexError` from `to[0]`.
+
+    """
+    # Keyed by the unit's first letter so one entry serves both the short and
+    # the long spelling of each unit.
+    scaling_factor = {
+        "b": 1,
+        "k": 1024,
+        "m": 1024**2,
+        "g": 1024**3,
+        "t": 1024**4,
+    }[to[0]]
+    # Skip the division for bytes so the caller gets the original int back
+    # rather than a float.
+    if scaling_factor > 1:
+        return sz / scaling_factor
+    return sz
diff --git a/py-polars/tests/test_utils.py b/py-polars/tests/test_utils.py
@@ -53,5 +53,13 @@ def test_timedelta_to_pl_timedelta() -> None:
 
 
 def test_estimated_size() -> None:
-    a = pl.Series([1, 2, 3])
-    assert a.estimated_size() == a.to_frame().estimated_size()
+    s = pl.Series("n", list(range(100)))
+    df = s.to_frame()
+
+    for sz in (s.estimated_size(), s.estimated_size("b"), s.estimated_size("bytes")):
+        assert sz == df.estimated_size()
+
+    assert s.estimated_size("kb") == (df.estimated_size("b") / 1024)
+    assert s.estimated_size("mb") == (df.estimated_size("kb") / 1024)
+    assert s.estimated_size("gb") == (df.estimated_size("mb") / 1024)
+    assert s.estimated_size("tb") == (df.estimated_size("gb") / 1024)