From 70c11e0dceb82aca1d02556261fb7ae59e17aa66 Mon Sep 17 00:00:00 2001 From: Rohan McGovern Date: Mon, 23 Aug 2021 09:48:49 +1000 Subject: [PATCH] Map RpmUnit.sha256sum to Pulp 'checksum', not 'checksums.sha256' For RPM units in Pulp, the SHA256 checksum is stored in two places: once in a 'checksums' dict and once again in a top-level 'checksum' field. We previously used the former, but only the latter is indexed (being a part of the unit key). Switching to 'checksum' can make searches much faster (and can make the difference between a search being feasible at all or being unusably slow), so let's do that. This should be safe, as I have confirmed in the Pulp code (linked within) that the checksum field is always forced to sha256, and also verified on our live systems that there are no units with a checksumtype!='sha256'. Motivation: noticed some queries were slower than expected while working on RHELDST-5435. --- CHANGELOG.md | 4 ++- pubtools/pulplib/_impl/model/unit/rpm.py | 6 +++- pubtools/pulplib/_impl/schema/unit.yaml | 10 +++++++ tests/unit/test_rpm_sums.py | 37 ++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_rpm_sums.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bbd9088..4e45b15d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- n/a +- Searching for RPMs by sha256sum will now use the indexed `checksum` field on Pulp, + rather than the non-indexed `checksums.sha256` field. This can significantly improve + the performance of these searches on large systems. ## [2.12.1] - 2021-08-11 diff --git a/pubtools/pulplib/_impl/model/unit/rpm.py b/pubtools/pulplib/_impl/model/unit/rpm.py index f12da3d9..4065ea4b 100644 --- a/pubtools/pulplib/_impl/model/unit/rpm.py +++ b/pubtools/pulplib/_impl/model/unit/rpm.py @@ -88,7 +88,11 @@ class RpmUnit(Unit): sha256sum = pulp_attrib( default=None, type=str, - pulp_field="checksums.sha256", + # Use 'checksum' field because it's indexed and therefore much faster than + # searching for checksums.sha256. + # It's safe since this is always stored as a copy of the sha256 checksum, see: + # https://github.com/pulp/pulp_rpm/blob/69759d0fb9a16c0a47b1f49c78f6712e650912e1/plugins/pulp_rpm/plugins/importers/yum/upload.py#L436 + pulp_field="checksum", converter=lambda s: s.lower() if s else s, ) """SHA256 checksum of this RPM, as a hex string.""" diff --git a/pubtools/pulplib/_impl/schema/unit.yaml b/pubtools/pulplib/_impl/schema/unit.yaml index aa11b612..ba1129d7 100644 --- a/pubtools/pulplib/_impl/schema/unit.yaml +++ b/pubtools/pulplib/_impl/schema/unit.yaml @@ -97,6 +97,16 @@ definitions: type: string pattern: "^[a-f0-9]{64}$" + # SHA256 checksum. + # This duplicates checksums.sha256 above; the difference is that this field + # is a part of the unit key, so it's both mandatory & indexed. + # Also, though the original intent was probably to support multiple checksum + # types in this field, it is nowadays forced to sha256, see: + # https://github.com/pulp/pulp_rpm/blob/69759d0fb9a16c0a47b1f49c78f6712e650912e1/plugins/pulp_rpm/plugins/importers/yum/upload.py#L436 + checksum: + type: string + pattern: "^[a-f0-9]{64}$" + repository_memberships: type: array items: diff --git a/tests/unit/test_rpm_sums.py b/tests/unit/test_rpm_sums.py new file mode 100644 index 00000000..0ed34f96 --- /dev/null +++ b/tests/unit/test_rpm_sums.py @@ -0,0 +1,37 @@ +from pubtools.pulplib import Unit + + +def test_rpm_sums(): + """Checksum values come from expected fields on pulp unit.""" + + unit = Unit.from_data( + { + "_content_type_id": "rpm", + "name": "bash", + "epoch": "0", + "filename": "bash-x86_64.rpm", + "version": "4.0", + "release": "1", + "arch": "x86_64", + # Sums are stored in a dict per algorithm... + "checksums": { + "md5": "aaa07a382ec010c01889250fce66fb13", + "sha1": "bbb9ae4aeea6946a8668445395ba10b7399523a0", + "sha256": "ccce93732fcf8d63fe1cce759664982dbd5b23161f007dba8561862adc96d063", + }, + # But there is also a top-level "checksum" which is always sha256. + # Normally this should be exactly equal to checksums.sha256 of course; + # in this test we force a difference so we can tell which value was used. + "checksum": "ddde93732fcf8d63fe1cce759664982dbd5b23161f007dba8561862adc96d063", + } + ) + + # It should get these two from the checksums dict. + assert unit.md5sum == "aaa07a382ec010c01889250fce66fb13" + assert unit.sha1sum == "bbb9ae4aeea6946a8668445395ba10b7399523a0" + + # This one should instead come from "checksum". + assert ( + unit.sha256sum + == "ddde93732fcf8d63fe1cce759664982dbd5b23161f007dba8561862adc96d063" + )