performance: use faster method of item insert
Adding items onto publishes seemed a bit slower than expected.
After some profiling and experimenting, I found that changing the
method of bulk insert here significantly improves performance.

The SQLAlchemy docs hint at this. The documentation for Insert.values[1]
mentions that:

> To emit an INSERT statement against multiple rows, the normal method
> is to pass a multiple values list to the Connection.execute() method,
> which is supported by all database backends and is generally more
> efficient for a very large number of parameters.

The included test takes 15 seconds on my system to run prior to this
change and 6 seconds afterward.

[1] https://docs.sqlalchemy.org/en/20/core/dml.html#sqlalchemy.sql.expression.Insert.values
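
To illustrate the difference, here is a minimal standalone sketch of the
two approaches against an in-memory SQLite database. This is not the
exodus-gw code; the items table and its columns are hypothetical:

from sqlalchemy import (
    Column,
    Integer,
    MetaData,
    String,
    Table,
    create_engine,
    insert,
)

metadata = MetaData()

# Hypothetical table, for illustration only.
items = Table(
    "items",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("web_uri", String),
)

engine = create_engine("sqlite://")
metadata.create_all(engine)

rows = [{"web_uri": f"/path/{i}"} for i in range(500)]

with engine.begin() as conn:
    # Slower: one INSERT statement with an embedded multi-row VALUES
    # clause; every value becomes a distinct bound parameter.
    conn.execute(insert(items).values(rows))

with engine.begin() as conn:
    # Faster: pass the parameter list to execute() instead, so the
    # driver's executemany-style fast path can be used.
    conn.execute(insert(items), rows)

The second form hands the whole parameter list to the driver in one
executemany call, rather than compiling a single statement carrying tens
of thousands of bound parameters.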
rohanpm committed Dec 13, 2023
1 parent 803d98e commit 3ae38d3
Showing 2 changed files with 100 additions and 2 deletions.
4 changes: 2 additions & 2 deletions exodus_gw/routers/publish.py
@@ -298,7 +298,7 @@ def update_publish_items(
         extra={"event": "publish"},
     )
 
-    statement = insert(models.Item).values(items_data)
+    statement = insert(models.Item)
 
     # Update all target table columns, except for the primary_key column.
     update_dict = {c.name: c for c in statement.excluded if not c.primary_key}
@@ -308,7 +308,7 @@ def update_publish_items(
         set_=update_dict,
     )
 
-    db.execute(update_statement)
+    db.execute(update_statement, items_data)
 
     # If any of the items we just updated are an entry point, we also trigger
     # autoindex in the background.
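
For context, the parts of update_publish_items elided from the diff above
combine into roughly the following pattern. This is a sketch, not the exact
exodus-gw code: it assumes the PostgreSQL dialect's insert() (implied by
the on_conflict_do_update() call), and the conflict target shown is
hypothetical:

from sqlalchemy.dialects.postgresql import insert

from exodus_gw import models


def upsert_items(db, items_data):
    # Build a plain INSERT with no inline values; the rows are supplied
    # at execution time instead.
    statement = insert(models.Item)

    # On conflict, update every non-primary-key column with the value
    # from the proposed (EXCLUDED) row.
    update_dict = {c.name: c for c in statement.excluded if not c.primary_key}

    update_statement = statement.on_conflict_do_update(
        index_elements=["web_uri"],  # hypothetical conflict target
        set_=update_dict,
    )

    # Passing the parameter list here lets SQLAlchemy use the driver's
    # executemany-style fast path instead of compiling one huge
    # multi-VALUES statement.
    db.execute(update_statement, items_data)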
98 changes: 98 additions & 0 deletions tests/routers/test_publish_perf.py
@@ -0,0 +1,98 @@
from collections.abc import Iterable
from hashlib import sha256
from itertools import islice
from typing import TypeVar

from fastapi.testclient import TestClient

from exodus_gw.main import app
from exodus_gw.models import Publish

T = TypeVar("T")


def object_key(name: str):
    return sha256(name.encode()).hexdigest().lower()


def origin_items(count: int):
    for i in range(1, count + 1):
        filename = f"test-package-{i}.noarch.rpm"
        yield {
            "web_uri": f"/origin/rpms/{filename}",
            "object_key": object_key(filename),
            "content_type": "application/x-rpm",
        }


def package_items(count: int):
    for i in range(1, count + 1):
        filename = f"test-package-{i}.noarch.rpm"
        yield {
            "web_uri": f"/content/some-repo/Packages/{filename}",
            "link_to": f"/origin/rpms/{filename}",
        }


# TODO: in python 3.12 use itertools.batched
def batched(iterable: Iterable[T], n: int):
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


def test_update_publish_items_large(db, auth_header):
    """Performance test putting a large number of items onto a publish."""

    publish_id = "11224567-e89b-12d3-a456-426614174000"

    publish = Publish(id=publish_id, env="test", state="PENDING")
    db.add(publish)
    db.commit()

    # This test is trying to simulate performance of a force publish of
    # a large rhsm-pulp repo. 35000 is a realistic count of RPMs for
    # some repos.
    #
    # 10000 is the default batch size used by exodus-rsync.
    package_count = 35000
    batch_size = 10000

    # Produce two lists of items to add to the publish.
    #
    # The 'origin' list represents pulp's rsync under /origin (cdn_path) and
    # uses non-link items.
    #
    # The 'package' list represents Pulp's rsync of Packages directory in a yum
    # repo, which uses link items to /origin.
    #
    # In both cases we force eager creation of the lists now so it doesn't count
    # against later performance measurements.
    all_origin_items = list(origin_items(package_count))
    all_package_items = list(package_items(package_count))

    # Now arrange them in the actual batches which will be used during PUT.
    # This should be similar to the way exodus-rsync would batch them in real usage.
    batched_origin_items = batched(all_origin_items, batch_size)
    batched_package_items = batched(all_package_items, batch_size)

    with TestClient(app) as client:
        for batch in batched_origin_items:
            r = client.put(
                "/test/publish/%s" % publish_id,
                json=batch,
                headers=auth_header(roles=["test-publisher"]),
            )
            assert r.status_code == 200

        for batch in batched_package_items:
            r = client.put(
                "/test/publish/%s" % publish_id,
                json=batch,
                headers=auth_header(roles=["test-publisher"]),
            )
            assert r.status_code == 200

    # Verify expected number of items were added
    db.refresh(publish)
    assert len(publish.items) == package_count * 2
