performance: use faster method of item insert
Adding items onto publishes seemed a bit slower than expected.
After some profiling and experimenting, I found that changing the
method of bulk insert here significantly improves performance.

The SQLAlchemy docs hint at this. The documentation for Insert.values[1]
mentions that:

> To emit an INSERT statement against multiple rows, the normal method
> is to pass a multiple values list to the Connection.execute() method,
> which is supported by all database backends and is generally more
> efficient for a very large number of parameters.

The included test takes 15 seconds on my system to run prior to this
change and 6 seconds afterward.

[1] https://docs.sqlalchemy.org/en/20/core/dml.html#sqlalchemy.sql.expression.Insert.values
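
To illustrate the difference, here is a minimal standalone sketch of the
two approaches against an in-memory SQLite database. This is not the
exodus-gw code; the items table and its columns are hypothetical:

from sqlalchemy import (
    Column,
    Integer,
    MetaData,
    String,
    Table,
    create_engine,
    insert,
)

metadata = MetaData()

# Hypothetical table, for illustration only.
items = Table(
    "items",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("web_uri", String),
)

engine = create_engine("sqlite://")
metadata.create_all(engine)

rows = [{"web_uri": f"/path/{i}"} for i in range(500)]

with engine.begin() as conn:
    # Slower: one INSERT statement with an embedded multi-row VALUES
    # clause; every value becomes a distinct bound parameter.
    conn.execute(insert(items).values(rows))

with engine.begin() as conn:
    # Faster: pass the parameter list to execute() instead, so the
    # driver's executemany-style fast path can be used.
    conn.execute(insert(items), rows)

The second form hands the whole parameter list to the driver in one
executemany call, rather than compiling a single statement carrying tens
of thousands of bound parameters.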
rohanpm committed Dec 13, 2023
1 parent 803d98e commit 3ae38d3
Showing 2 changed files with 100 additions and 2 deletions.
4 changes: 2 additions & 2 deletions exodus_gw/routers/publish.py
@@ -298,7 +298,7 @@ def update_publish_items(
         extra={"event": "publish"},
     )
 
-    statement = insert(models.Item).values(items_data)
+    statement = insert(models.Item)
 
     # Update all target table columns, except for the primary_key column.
     update_dict = {c.name: c for c in statement.excluded if not c.primary_key}
@@ -308,7 +308,7 @@ def update_publish_items(
         set_=update_dict,
     )
 
-    db.execute(update_statement)
+    db.execute(update_statement, items_data)
 
     # If any of the items we just updated are an entry point, we also trigger
     # autoindex in the background.
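
For context, the parts of update_publish_items elided from the diff above
combine into roughly the following pattern. This is a sketch, not the exact
exodus-gw code: it assumes the PostgreSQL dialect's insert() (implied by
the on_conflict_do_update() call), and the conflict target shown is
hypothetical:

from sqlalchemy.dialects.postgresql import insert

from exodus_gw import models


def upsert_items(db, items_data):
    # Build a plain INSERT with no inline values; the rows are supplied
    # at execution time instead.
    statement = insert(models.Item)

    # On conflict, update every non-primary-key column with the value
    # from the proposed (EXCLUDED) row.
    update_dict = {c.name: c for c in statement.excluded if not c.primary_key}

    update_statement = statement.on_conflict_do_update(
        index_elements=["web_uri"],  # hypothetical conflict target
        set_=update_dict,
    )

    # Passing the parameter list here lets SQLAlchemy use the driver's
    # executemany-style fast path instead of compiling one huge
    # multi-VALUES statement.
    db.execute(update_statement, items_data)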
98 changes: 98 additions & 0 deletions tests/routers/test_publish_perf.py
@@ -0,0 +1,98 @@
from collections.abc import Iterable
from hashlib import sha256
from itertools import islice
from typing import TypeVar

from fastapi.testclient import TestClient

from exodus_gw.main import app
from exodus_gw.models import Publish

T = TypeVar("T")


def object_key(name: str):
    return sha256(name.encode()).hexdigest().lower()


def origin_items(count: int):
    for i in range(1, count + 1):
        filename = f"test-package-{i}.noarch.rpm"
        yield {
            "web_uri": f"/origin/rpms/{filename}",
            "object_key": object_key(filename),
            "content_type": "application/x-rpm",
        }


def package_items(count: int):
    for i in range(1, count + 1):
        filename = f"test-package-{i}.noarch.rpm"
        yield {
            "web_uri": f"/content/some-repo/Packages/{filename}",
            "link_to": f"/origin/rpms/{filename}",
        }


# TODO: in python 3.12 use itertools.batched
def batched(iterable: Iterable[T], n: int):
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


def test_update_publish_items_large(db, auth_header):
    """Performance test putting a large number of items onto a publish."""

    publish_id = "11224567-e89b-12d3-a456-426614174000"

    publish = Publish(id=publish_id, env="test", state="PENDING")
    db.add(publish)
    db.commit()

    # This test is trying to simulate performance of a force publish of
    # a large rhsm-pulp repo. 35000 is a realistic count of RPMs for
    # some repos.
    #
    # 10000 is the default batch size used by exodus-rsync.
    package_count = 35000
    batch_size = 10000

    # Produce two lists of items to add to the publish.
    #
    # The 'origin' list represents pulp's rsync under /origin (cdn_path) and
    # uses non-link items.
    #
    # The 'package' list represents Pulp's rsync of Packages directory in a yum
    # repo, which uses link items to /origin.
    #
    # In both cases we force eager creation of the lists now so it doesn't count
    # against later performance measurements.
    all_origin_items = list(origin_items(package_count))
    all_package_items = list(package_items(package_count))

    # Now arrange them in the actual batches which will be used during PUT.
    # This should be similar to the way exodus-rsync would batch them in real usage.
    batched_origin_items = batched(all_origin_items, batch_size)
    batched_package_items = batched(all_package_items, batch_size)

    with TestClient(app) as client:
        for batch in batched_origin_items:
            r = client.put(
                "/test/publish/%s" % publish_id,
                json=batch,
                headers=auth_header(roles=["test-publisher"]),
            )
            assert r.status_code == 200

        for batch in batched_package_items:
            r = client.put(
                "/test/publish/%s" % publish_id,
                json=batch,
                headers=auth_header(roles=["test-publisher"]),
            )
            assert r.status_code == 200

    # Verify expected number of items were added
    db.refresh(publish)
    assert len(publish.items) == package_count * 2
