Skip to content

Commit

Permalink
feat(python): large speedup for df.iterrows (~200-400%) (#5979)
Browse files (browse the repository at this point in the history)
  • Loading branch information
alexander-beedie committed Jan 1, 2023
1 parent 9c3a659 commit d5793fd
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 20 deletions.
50 changes: 46 additions & 4 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6447,6 +6447,11 @@ def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[Any]:
Return named tuples instead of regular tuples. This is more expensive than
returning regular tuples, but allows for accessing values by column name.
Warnings
--------
Row-iteration is not optimal as the underlying data is stored in columnar form;
where possible, prefer export via one of the dedicated export/output methods.
Examples
--------
>>> df = pl.DataFrame(
Expand All @@ -6460,6 +6465,10 @@ def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[Any]:
>>> df.rows(named=True)
[Row(a=1, b=2), Row(a=3, b=4), Row(a=5, b=6)]
See Also
--------
iterrows : row iterator over frame data (does not materialise all rows).
"""
if named:
Row = namedtuple("Row", self.columns) # type: ignore[misc]
Expand All @@ -6468,15 +6477,19 @@ def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[Any]:
return self._df.row_tuples()

@overload
def iterrows(self, named: Literal[False] = ...) -> Iterator[tuple[Any, ...]]:
def iterrows(
self, named: Literal[False] = ..., buffer_size: int = ...
) -> Iterator[tuple[Any, ...]]:
...

@overload
def iterrows(self, named: Literal[True] = ...) -> Iterator[Any]:
def iterrows(
self, named: Literal[True] = ..., buffer_size: int = ...
) -> Iterator[Any]:
...

def iterrows(
self, named: bool = False
self, named: bool = False, buffer_size: int = 500
) -> Iterator[tuple[Any, ...]] | Iterator[Any]:
"""
Returns an iterator over the rows in the DataFrame.
Expand All @@ -6487,9 +6500,22 @@ def iterrows(
Return named tuples instead of regular tuples. This is more expensive than
returning regular tuples, but allows for accessing values by column name.
buffer_size
Determines the number of rows that are buffered internally while iterating
over the data; you should only modify this in very specific cases where the
default value is determined not to be a good fit to your access pattern, as
the speedup from using the buffer is significant (~2-4x). Setting this
value to zero disables row buffering.
Warnings
--------
This is very expensive and should not be used in any performance critical code!
Row-iteration is not optimal as the underlying data is stored in columnar form;
where possible, prefer export via one of the dedicated export/output methods.
Notes
-----
If you are planning to materialise all frame data at once you should prefer
calling ``rows()``, which will be faster.
Examples
--------
Expand All @@ -6504,9 +6530,25 @@ def iterrows(
>>> [row.b for row in df.iterrows(named=True)]
[2, 4, 6]
See Also
--------
rows : materialises all frame data as a list of rows.
"""
# note: buffering rows results in a 2-4x speedup over individual calls
# to ".row(i)", so it should only be disabled in extremely specific cases.
if named:
Row = namedtuple("Row", self.columns) # type: ignore[misc]
if buffer_size:
for offset in range(0, self.height, buffer_size):
rows_chunk = self.slice(offset, buffer_size).rows(named=False)
if named:
for row in rows_chunk:
yield Row(*row)
else:
yield from rows_chunk

elif named:
for i in range(self.height):
yield Row(*self.row(i))
else:
Expand Down
41 changes: 25 additions & 16 deletions py-polars/tests/unit/test_rows.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -68,26 +68,35 @@ def test_rows() -> None:
def test_iterrows() -> None:
df = pl.DataFrame({"a": [1, 2, 3], "b": [None, False, None]})

# Regular iterrows
# Default iterrows behaviour
it = df.iterrows()
assert next(it) == (1, None)
assert next(it) == (2, False)
assert next(it) == (3, None)
with pytest.raises(StopIteration):
next(it)

# Named iterrows
it_named = df.iterrows(named=True)

row = next(it_named)
assert row.a == 1
assert row.b is None
row = next(it_named)
assert row.a == 2
assert row.b is False
row = next(it_named)
assert row.a == 3
assert row.b is None

with pytest.raises(StopIteration):
next(it_named)
# Apply explicit row-buffer size
for sz in (0, 1, 2, 3, 4):
it = df.iterrows(buffer_size=sz)
assert next(it) == (1, None)
assert next(it) == (2, False)
assert next(it) == (3, None)
with pytest.raises(StopIteration):
next(it)

# Return rows as namedtuples
it_named = df.iterrows(named=True, buffer_size=sz)

row = next(it_named)
assert row.a == 1
assert row.b is None
row = next(it_named)
assert row.a == 2
assert row.b is False
row = next(it_named)
assert row.a == 3
assert row.b is None

with pytest.raises(StopIteration):
next(it_named)

0 comments on commit d5793fd

Please sign in to comment.