From 69182ee1ca9bc0476dc073e82c0a44962dc20f54 Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Mon, 5 Apr 2021 11:57:58 -0400 Subject: [PATCH] Improve publish speed by 2x to 20x Use a more complex single query rather than N small queries to iterate content artifacts. Using a repository containing 20,000 content units, the publish time is improved from ~23 seconds to 1 second if the content is immediate synced, and 14 seconds (vs. the same) if content is on_demand. closes: #8508 https://pulp.plan.io/issues/8508 --- CHANGES/8508.misc | 1 + pulp_file/app/tasks/publishing.py | 44 +++++++++++++++++-------------- 2 files changed, 25 insertions(+), 20 deletions(-) create mode 100644 CHANGES/8508.misc diff --git a/CHANGES/8508.misc b/CHANGES/8508.misc new file mode 100644 index 00000000..c92d79f0 --- /dev/null +++ b/CHANGES/8508.misc @@ -0,0 +1 @@ +Substantially improved speed of publishing repositories, especially large ones. diff --git a/pulp_file/app/tasks/publishing.py b/pulp_file/app/tasks/publishing.py index 9fbeb554..c396f26a 100644 --- a/pulp_file/app/tasks/publishing.py +++ b/pulp_file/app/tasks/publishing.py @@ -5,9 +5,14 @@ from django.core.files import File -from pulpcore.plugin.models import RepositoryVersion, PublishedMetadata, RemoteArtifact - -from pulp_file.app.models import FileContent, FilePublication +from pulpcore.plugin.models import ( + ContentArtifact, + RepositoryVersion, + PublishedMetadata, + RemoteArtifact, +) + +from pulp_file.app.models import FilePublication from pulp_file.manifest import Entry, Manifest @@ -57,20 +62,19 @@ def populate(publication): """ - def find_artifact(): - _artifact = content_artifact.artifact - if not _artifact: - _artifact = RemoteArtifact.objects.filter(content_artifact=content_artifact).first() - return _artifact - - for content in FileContent.objects.filter( - pk__in=publication.repository_version.content - ).order_by("-pulp_created"): - for content_artifact in content.contentartifact_set.all(): - artifact = 
find_artifact() - entry = Entry( - relative_path=content_artifact.relative_path, - digest=artifact.sha256, - size=artifact.size, - ) - yield entry + content_artifacts = ContentArtifact.objects.filter( + content__in=publication.repository_version.content + ).order_by("-content__pulp_created") + + for content_artifact in content_artifacts.select_related("artifact").iterator(): + if content_artifact.artifact: + artifact = content_artifact.artifact + else: + # TODO: this scales poorly, one query per on_demand content being published. + artifact = RemoteArtifact.objects.filter(content_artifact=content_artifact).first() + entry = Entry( + relative_path=content_artifact.relative_path, + digest=artifact.sha256, + size=artifact.size, + ) + yield entry