Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(python): Optimize DataFrame.iter_rows for smaller buffer sizes #12804

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9553,7 +9553,7 @@ def iter_rows(
...

def iter_rows(
self, *, named: bool = False, buffer_size: int = 500
self, *, named: bool = False, buffer_size: int = 512
) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
"""
Returns an iterator over the rows of the DataFrame, as Python-native values.
Expand Down Expand Up @@ -9614,17 +9614,16 @@ def iter_rows(
# note: buffering rows results in a 2-4x speedup over individual calls
# to ".row(i)", so it should only be disabled in extremely specific cases.
if buffer_size and not has_object:
create_with_pyarrow = named and can_create_dicts_with_pyarrow(self.dtypes)
for offset in range(0, self.height, buffer_size):
zerocopy_slice = self.slice(offset, buffer_size)
if named and can_create_dicts_with_pyarrow(self.dtypes):
if create_with_pyarrow:
yield from zerocopy_slice.to_arrow().to_pylist()
elif named:
for row in zerocopy_slice.rows(named=False):
yield dict_(zip_(columns, row))
else:
rows_chunk = zerocopy_slice.rows(named=False)
if named:
for row in rows_chunk:
yield dict_(zip_(columns, row))
else:
yield from rows_chunk
yield from zerocopy_slice.rows(named=False)
elif named:
for i in range(self.height):
yield dict_(zip_(columns, get_row(i)))
Expand Down