Skip to content

Commit

Permalink
add estimate_size methods (#3110)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Apr 11, 2022
1 parent 82ece67 commit 9c948b7
Show file tree
Hide file tree
Showing 8 changed files with 93 additions and 0 deletions.
16 changes: 16 additions & 0 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,22 @@ fn duplicate_err(name: &str) -> Result<()> {
}

impl DataFrame {
/// Estimates how many heap bytes this `DataFrame` occupies.
///
/// The result is the sum of the per-column estimates reported by each column's
/// own `estimated_size`.
///
/// # Implementation
/// Every column contributes the size of its buffers and validity bitmaps, nested
/// arrays included. Arrays may share buffers and bitmaps, so the size of two arrays
/// is not necessarily the sum of their individual estimates. In particular, the
/// value reported for a `StructArray` is an upper bound.
///
/// Slicing an array does not change its allocated size, because the underlying
/// buffer is unchanged. This function nevertheless reports a smaller number for a
/// slice: it measures the visible part of the buffer, not its total capacity.
///
/// FFI buffers are included in this estimation.
pub fn estimated_size(&self) -> usize {
    self.columns
        .iter()
        .fold(0, |total, column| total + column.estimated_size())
}

/// Get the index of the column.
fn check_name_to_idx(&self, name: &str) -> Result<usize> {
self.find_idx_by_name(name)
Expand Down
35 changes: 35 additions & 0 deletions polars/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use crate::utils::Wrap;
use crate::utils::{split_ca, split_series};
use crate::{series::arithmetic::coerce_lhs_rhs, POOL};
use ahash::RandomState;
use arrow::compute::aggregate::estimated_bytes_size;
pub use from::*;
use num::NumCast;
use rayon::prelude::*;
Expand Down Expand Up @@ -974,6 +975,40 @@ impl Series {
)),
}
}

/// Estimates how many heap bytes this `Series` occupies.
///
/// # Implementation
/// The estimate sums `estimated_bytes_size` over every chunk, i.e. the size of the
/// buffers and validity bitmaps, nested arrays included. Arrays may share buffers
/// and bitmaps, so the size of two arrays is not necessarily the sum of their
/// individual estimates. In particular, the value reported for a `StructArray` is
/// an upper bound.
///
/// With the `dtype-categorical` feature enabled, a categorical `Series` that
/// carries a reverse mapping also counts that mapping: a local mapping adds the
/// size of its array; a global mapping adds the size of its array plus
/// `capacity * 2 * size_of::<u32>()` bytes for the map.
///
/// Slicing an array does not change its allocated size, because the underlying
/// buffer is unchanged. This function nevertheless reports a smaller number for a
/// slice: it measures the visible part of the buffer, not its total capacity.
///
/// FFI buffers are included in this estimation.
pub fn estimated_size(&self) -> usize {
    // Bytes held by the arrow arrays backing each chunk.
    let chunk_bytes: usize = self
        .chunks()
        .iter()
        .map(|arr| estimated_bytes_size(&**arr))
        .sum();

    // Extra bytes owned by the categorical reverse mapping, if there is one.
    let rev_map_bytes = match self.dtype() {
        #[cfg(feature = "dtype-categorical")]
        DataType::Categorical(Some(rv)) => match &**rv {
            RevMapping::Local(arr) => estimated_bytes_size(arr),
            RevMapping::Global(map, arr, _) => {
                map.capacity() * std::mem::size_of::<u32>() * 2 + estimated_bytes_size(arr)
            }
        },
        _ => 0,
    };

    chunk_bytes + rev_map_bytes
}
}

impl Deref for Series {
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ Descriptive stats
:toctree: api/

DataFrame.describe
DataFrame.estimated_size
DataFrame.is_duplicated
DataFrame.is_unique
DataFrame.n_chunks
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Descriptive stats
:toctree: api/

Series.describe
Series.estimated_size
Series.unique_counts
Series.value_counts
Series.chunk_lengths
Expand Down
16 changes: 16 additions & 0 deletions py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,22 @@ def __init__(
else:
raise ValueError("DataFrame constructor not called properly.")

def estimated_size(self) -> int:
    """
    Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.

    This estimation is the sum of the size of its buffers, validity, including nested arrays.
    Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
    sum of the sizes computed from this function. In particular, `StructArray`'s size is an
    upper bound.

    When an array is sliced, its allocated size remains constant because the buffer is
    unchanged. However, this function will yield a smaller number. This is because this
    function returns the visible size of the buffer, not its total capacity.

    FFI buffers are included in this estimation.

    Returns
    -------
    int
        Estimated size in bytes.
    """
    # The native frame handle lives in `_df`; `_s` is the Series handle and does
    # not exist on a DataFrame, so using it here raised AttributeError.
    return self._df.estimated_size()

@classmethod
def _from_pydf(cls: Type[DF], py_df: "PyDataFrame") -> DF:
"""
Expand Down
16 changes: 16 additions & 0 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,22 @@ def __setitem__(
else:
raise ValueError(f'cannot use "{key}" for indexing')

def estimated_size(self) -> int:
    """
    Estimate the total (heap) allocated size of this `Series` in bytes.

    The estimate adds up the size of the buffers and validity bitmaps, nested arrays
    included. Arrays may share buffers and bitmaps, so the size of two arrays is not
    necessarily the sum of their individual estimates. In particular, the value
    reported for a `StructArray` is an upper bound.

    Slicing an array does not change its allocated size, because the underlying buffer
    is unchanged. This function nevertheless reports a smaller number for a slice: it
    measures the visible part of the buffer, not its total capacity.

    FFI buffers are included in this estimation.

    Returns
    -------
    int
        Estimated size in bytes.
    """
    # Delegate to the native series handle, which performs the actual estimation.
    size_in_bytes = self._s.estimated_size()
    return size_in_bytes

def sqrt(self) -> "Series":
"""
Compute the square root of the elements
Expand Down
4 changes: 4 additions & 0 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ impl PyDataFrame {
PyDataFrame { df }
}

/// Estimated heap size, in bytes, of the frame held in `self.df`.
///
/// Pure delegation: the value is whatever `estimated_size` on the inner frame
/// reports.
pub fn estimated_size(&self) -> usize {
    let frame = &self.df;
    frame.estimated_size()
}

fn finish_from_rows(rows: Vec<Row>) -> PyResult<Self> {
// replace inferred nulls with boolean
let schema = rows_to_schema(&rows);
Expand Down
4 changes: 4 additions & 0 deletions py-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,10 @@ impl PySeries {
}
}

/// Estimated heap size, in bytes, of the series held in `self.series`.
///
/// Pure delegation: the value is whatever `estimated_size` on the inner series
/// reports.
pub fn estimated_size(&self) -> usize {
    let inner = &self.series;
    inner.estimated_size()
}

pub fn get_object(&self, index: usize) -> PyObject {
let gil = Python::acquire_gil();
let python = gil.python();
Expand Down

0 comments on commit 9c948b7

Please sign in to comment.