Fix unreliable filelist parsing
.findall() is not reliable in the face of namespaces. Don't use it
anymore.

closes: #9107
https://pulp.plan.io/issues/9107
closes: #8972
https://pulp.plan.io/issues/8972
dralley committed Jul 23, 2021
1 parent 99ad991 commit 552b0b2
Showing 11 changed files with 398 additions and 67 deletions.
1 change: 1 addition & 0 deletions CHANGES/8972.misc
@@ -0,0 +1 @@
Added tests verifying that metadata is written and read correctly.
1 change: 1 addition & 0 deletions CHANGES/9107.bugfix
@@ -0,0 +1 @@
The fix for a previous issue resulting in incorrect metadata (#8995) still produced incorrect metadata in some circumstances. Implemented a complete fix and added tests to ensure the issue never recurs.
1 change: 1 addition & 0 deletions coverage.md
@@ -26,6 +26,7 @@ This file contains a list of features and their test coverage.
| As a user, I can re-sync custom repository metadata when it was the only change in a repository | YES | |
| As a user, I can sync an existing advisory whose dates are timestamps (as opposed to datetimes) | NO | Example: https://updates.suse.com/SUSE/Updates/SLE-SERVER/12-SP5/x86_64/update/ |
| As a user, I can sync repos with repomd.xml files whose filenames contain dashes (e.g., app-icons.tar.gz) | NO | Example: https://updates.suse.com/SUSE/Updates/SLE-SERVER/12-SP5/x86_64/update/ |
| As a user, the metadata being produced by Pulp (and being saved to Pulp) is correct | PART | Primary, Filelists and Other metadata is mostly covered. Updateinfo, Groups, and Distribution Tree metadata is not covered. "Weird" cases not covered. |
| **Duplicates** | | |
| As a user, I have only one advisory with the same id in a repo version | YES | |
| As a user, I have only one module with the same NSVCA in a repo version | NO | |
2 changes: 2 additions & 0 deletions functest_requirements.txt
@@ -2,3 +2,5 @@ git+https://github.com/pulp/pulp-smash.git#egg=pulp-smash
productmd>=1.25
pytest
git+https://github.com/pulp/pulpcore.git#egg=pulpcore # re: https://pulp.plan.io/issues/7883
dictdiffer
xmltodict
23 changes: 12 additions & 11 deletions pulp_rpm/app/metadata_parsing.py
@@ -100,11 +100,12 @@ def process_filelists_package_element(element):
pkgid = element.attrib["pkgid"]

files = []
for element in element.findall("{*}file"):
basename, filename = os.path.split(element.text)
ftype = element.attrib.get("type")

files.append((ftype, basename, filename))
for subelement in element:
if subelement.tag == "file" or re.sub(NS_STRIP_RE, "", subelement.tag) == "file":
basename, filename = os.path.split(subelement.text)
basename = f"{basename}/"
ftype = subelement.attrib.get("type")
files.append((ftype, basename, filename))

return pkgid, files

@@ -114,11 +115,11 @@ def process_other_package_element(element):
pkgid = element.attrib["pkgid"]

changelogs = []
for element in element.findall("{*}changelog"):
author = element.attrib["author"]
date = int(element.attrib["date"])
text = element.text

changelogs.append((author, date, text))
for subelement in element:
if subelement.tag == "changelog" or re.sub(NS_STRIP_RE, "", subelement.tag) == "changelog":
author = subelement.attrib["author"]
date = int(subelement.attrib["date"])
text = subelement.text
changelogs.append((author, date, text))

return pkgid, changelogs
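
The pattern in both functions above is the heart of the fix: rather than relying on .findall("{*}file") — one concrete pitfall is that the {*} namespace wildcard is only understood by ElementTree on Python 3.8 and newer — the parser iterates over child elements and compares tags with any namespace prefix stripped. A minimal, self-contained sketch of the same technique (NS_STRIP_RE is not shown in this hunk; a pattern along the lines of re.compile("^{.*?}") is assumed):

    import re
    import xml.etree.ElementTree as ET

    NS_STRIP_RE = re.compile("^{.*?}")  # assumed definition, not shown in this hunk

    namespaced = ET.fromstring(
        '<package xmlns="http://linux.duke.edu/metadata/filelists" pkgid="abc">'
        '<file type="dir">/usr/bin</file></package>'
    )
    bare = ET.fromstring('<package pkgid="abc"><file>/usr/bin/foo</file></package>')

    for pkg in (namespaced, bare):
        for subelement in pkg:
            # With a default xmlns, the tag reads "{http://...}file"; without
            # one it is plain "file". Stripping the "{...}" prefix handles both.
            if subelement.tag == "file" or re.sub(NS_STRIP_RE, "", subelement.tag) == "file":
                print(subelement.text, subelement.attrib.get("type"))

Child iteration with tag stripping behaves the same on every supported interpreter and also tolerates documents that omit the namespace declaration entirely, which is what makes it more reliable than the wildcard query.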
1 change: 1 addition & 0 deletions pulp_rpm/app/settings.py
@@ -8,3 +8,4 @@
INSTALLED_APPS = ["django_readonly_field", "dynaconf_merge"]
ALLOW_AUTOMATIC_UNSAFE_ADVISORY_CONFLICT_RESOLUTION = False
DEFAULT_ULN_SERVER_BASE_URL = "https://linux-update.oracle.com/"
RPM_ITERATIVE_PARSING = True
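
RPM_ITERATIVE_PARSING defaults to True and acts as an escape hatch for the new code path in the synchronizer below. A hedged sketch of how such a dynaconf-backed Pulp setting is typically consumed and overridden (the PULP_-prefixed environment variable follows Pulp's standard dynaconf convention and is an assumption, not something shown in this commit):

    # Application code reads the flag through Django settings, which dynaconf
    # populates from settings.py plus any environment overrides.
    from django.conf import settings

    def parse_repodata():
        if settings.RPM_ITERATIVE_PARSING:
            ...  # new path: lock-step parsing of filelists.xml and other.xml
        else:
            ...  # previous behavior: createrepo_c's bulk xml_parse_* calls

    # Assumed operator-side override (standard dynaconf convention):
    #   export PULP_RPM_ITERATIVE_PARSING=false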
49 changes: 25 additions & 24 deletions pulp_rpm/app/tasks/synchronizing.py
@@ -590,10 +590,9 @@ def newpkgcb(pkgId, name, arch):
# TODO: handle parsing errors/warnings, warningcb callback can be used below
cr.xml_parse_primary(primary_xml_path, pkgcb=pkgcb, do_files=False)

# only left behind to make swapping back for testing easier - we are doing our own separate
# parsing of other.xml and filelists.xml
# cr.xml_parse_filelists(filelists_xml_path, newpkgcb=newpkgcb)
# cr.xml_parse_other(other_xml_path, newpkgcb=newpkgcb)
if not settings.RPM_ITERATIVE_PARSING:
cr.xml_parse_filelists(filelists_xml_path, newpkgcb=newpkgcb)
cr.xml_parse_other(other_xml_path, newpkgcb=newpkgcb)
return packages

async def run(self):
@@ -1029,27 +1028,29 @@ async def parse_packages(self, primary_xml, filelists_xml, other_xml, file_exten
except KeyError:
break

while True:
pkgid_extra, files, changelogs = next(extra_repodata_parser)
if pkgid_extra in seen_pkgids:
# This is a dirty hack to handle cases that "shouldn't" happen.
# Sometimes repositories have packages listed twice under the same pkgid.
# This is a problem because the primary.xml parsing deduplicates the
# entries by placing them into a dict keyed by pkgid. So if the iterative
# parser(s) run into a package we've seen before, we should skip it and
# move on.
continue
else:
seen_pkgids.add(pkgid)
break

assert pkgid == pkgid_extra, (
"Package id from primary metadata ({}), does not match package id "
"from filelists, other metadata ({})"
).format(pkgid, pkgid_extra)
if settings.RPM_ITERATIVE_PARSING:
while True:
pkgid_extra, files, changelogs = next(extra_repodata_parser)
if pkgid_extra in seen_pkgids:
# This is a dirty hack to handle cases that "shouldn't" happen.
# Sometimes repositories have packages listed twice under the same
# pkgid. This is a problem because the primary.xml parsing
# deduplicates the entries by placing them into a dict keyed by pkgid.
# So if the iterative parser(s) run into a package we've seen before,
# we should skip it and move on.
continue
else:
seen_pkgids.add(pkgid)
break

assert pkgid == pkgid_extra, (
"Package id from primary metadata ({}), does not match package id "
"from filelists, other metadata ({})"
).format(pkgid, pkgid_extra)

pkg.files = files
pkg.changelogs = changelogs

pkg.files = files
pkg.changelogs = changelogs
package = Package(**Package.createrepo_to_dict(pkg))
del pkg

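The while-loop above advances the filelists/other parser until it yields a not-yet-seen pkgid, keeping it in lock-step with the packages already deduplicated during primary.xml parsing. A toy sketch of that contract, with a generator and hypothetical pkgids standing in for the real parser:

    def extra_repodata_parser():
        """Yield (pkgid, files, changelogs) in document order, duplicates included."""
        yield ("sha256:aaa", [("dir", "/usr/", "bin")], [])
        yield ("sha256:aaa", [("dir", "/usr/", "bin")], [])  # repeated entry
        yield ("sha256:bbb", [], [("alice", 1626000000, "- fix something")])

    seen_pkgids = set()
    parser = extra_repodata_parser()

    # Order of pkgids as produced by the (already deduplicated) primary.xml pass.
    for pkgid in ("sha256:aaa", "sha256:bbb"):
        while True:
            pkgid_extra, files, changelogs = next(parser)
            if pkgid_extra in seen_pkgids:
                continue  # primary.xml collapsed this duplicate; skip it here too
            seen_pkgids.add(pkgid)
            break
        assert pkgid == pkgid_extra, (pkgid, pkgid_extra)

On the second iteration the duplicate "sha256:aaa" entry is skipped, so both streams stay aligned and the assertion holds.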
208 changes: 181 additions & 27 deletions pulp_rpm/tests/functional/api/test_publish.py
@@ -1,10 +1,12 @@
"""Tests that publish rpm plugin repositories."""
import gzip
import collections
import os
from tempfile import NamedTemporaryFile
from random import choice
from xml.etree import ElementTree

import xmltodict
import dictdiffer

from pulp_smash import cli, config
from pulp_smash.utils import get_pulp_setting, http_get
from pulp_smash.pulp3.bindings import PulpTestCase, monitor_task
@@ -20,6 +22,7 @@

from pulp_rpm.tests.functional.constants import (
RPM_ALT_LAYOUT_FIXTURE_URL,
RPM_COMPLEX_FIXTURE_URL,
RPM_FIXTURE_SUMMARY,
RPM_KICKSTART_FIXTURE_URL,
RPM_KICKSTART_REPOSITORY_ROOT_CONTENT,
Expand All @@ -33,7 +36,7 @@
RPM_UNSIGNED_FIXTURE_URL,
SRPM_UNSIGNED_FIXTURE_URL,
)
from pulp_rpm.tests.functional.utils import gen_rpm_client, gen_rpm_remote, skip_if
from pulp_rpm.tests.functional.utils import gen_rpm_client, gen_rpm_remote, skip_if, read_xml_gz
from pulp_rpm.tests.functional.utils import set_up_module as setUpModule # noqa:F401

from pulpcore.client.pulp_rpm import (
@@ -47,25 +50,6 @@
from pulpcore.client.pulp_rpm.exceptions import ApiException


def read_xml_gz(content):
"""
Read xml and xml.gz.
Tests work normally but fail for S3 due to '.gz'
Why is it only compressed for S3?
"""
with NamedTemporaryFile() as temp_file:
temp_file.write(content)
temp_file.seek(0)

try:
content_xml = gzip.open(temp_file.name).read()
except OSError:
# FIXME: fix this as in CI primary/update_info.xml has '.gz' but it is not gzipped
content_xml = temp_file.read()
return content_xml


class PublishAnyRepoVersionTestCase(PulpTestCase):
"""Test whether a particular repository version can be published.
@@ -248,14 +232,12 @@ class ValidateNoChecksumTagTestCase(PulpTestCase):
"""Publish repository and validate the updateinfo.
This Test does the following:
1. Create a rpm repo and a remote.
2. Sync the repo with the remote.
3. Publish and distribute the repo.
4. Check whether the CheckSum tag ``sum`` is not present in ``updateinfo.xml``.
This test targets the following issue:
This test targets the following issues:
* `Pulp #4109 <https://pulp.plan.io/issues/4109>`_
* `Pulp #4033 <https://pulp.plan.io/issues/4033>`_
"""
@@ -332,6 +314,178 @@ def _get_updateinfo_xml_path(root_elem):
return data_elems[0].find(xpath).get("href")


class MetadataTestCase(PulpTestCase):
"""Publish repository and validate the metadata is the same.
This Test does the following:
1. Create a rpm repo and a remote.
2. Sync the repo with the remote.
3. Publish and distribute the repo.
4. Download the metadata of both the original and Pulp-created repos.
5. Convert it into a canonical form (XML -> JSON -> Dict -> apply a sorting order).
6. Compare the metadata.
"""

@classmethod
def setUpClass(cls):
"""Create class-wide variables."""
cls.cfg = config.get_config()
cls.client = gen_rpm_client()
cls.repo_api = RepositoriesRpmApi(cls.client)
cls.remote_api = RemotesRpmApi(cls.client)
cls.publications = PublicationsRpmApi(cls.client)
cls.distributions = DistributionsRpmApi(cls.client)

def test_complex_repo(self):
"""Test the "complex" fixture that covers more of the metadata cases.
The standard fixtures have no changelogs and don't cover "ghost" files. The repo
with the "complex-package" does, and also does a better job of covering rich deps
and other atypical metadata.
"""
delete_orphans()
self.do_test(RPM_COMPLEX_FIXTURE_URL)

def do_test(self, repo_url):
"""Sync and publish an RPM repository and verify the metadata is what was expected."""
# 1. create repo and remote
repo = self.repo_api.create(gen_repo())
self.addCleanup(self.repo_api.delete, repo.pulp_href)

body = gen_rpm_remote(repo_url, policy="on_demand")
remote = self.remote_api.create(body)
self.addCleanup(self.remote_api.delete, remote.pulp_href)

# 2. Sync it
repository_sync_data = RpmRepositorySyncURL(remote=remote.pulp_href)
sync_response = self.repo_api.sync(repo.pulp_href, repository_sync_data)
monitor_task(sync_response.task)

# 3. Publish and distribute
publish_data = RpmRpmPublication(repository=repo.pulp_href)
publish_response = self.publications.create(publish_data)
created_resources = monitor_task(publish_response.task).created_resources
publication_href = created_resources[0]
self.addCleanup(self.publications.delete, publication_href)

body = gen_distribution()
body["publication"] = publication_href
distribution_response = self.distributions.create(body)
created_resources = monitor_task(distribution_response.task).created_resources
distribution = self.distributions.read(created_resources[0])
self.addCleanup(self.distributions.delete, distribution.pulp_href)

# 4. Download and parse the metadata.
original_repomd = ElementTree.fromstring(
http_get(os.path.join(repo_url, "repodata/repomd.xml"))
)

reproduced_repomd = ElementTree.fromstring(
http_get(os.path.join(distribution.base_url, "repodata/repomd.xml"))
)

def get_metadata_content(base_url, repomd_elem, meta_type):
"""Return the text contents of metadata file.
Provided a url, a repomd root element, and a metadata type, locate the metadata
file's location href, download it from the provided url, un-gzip it, parse it, and
return the root element node.
Don't use this with large repos because it will blow up.
"""
# <ns0:repomd xmlns:ns0="http://linux.duke.edu/metadata/repo">
# <ns0:data type="primary">
# <ns0:checksum type="sha256">[…]</ns0:checksum>
# <ns0:location href="repodata/[…]-primary.xml.gz" />
# …
# </ns0:data>
# …
xpath = "{{{}}}data".format(RPM_NAMESPACES["metadata/repo"])
data_elems = [
elem for elem in repomd_elem.findall(xpath) if elem.get("type") == meta_type
]
xpath = "{{{}}}location".format(RPM_NAMESPACES["metadata/repo"])
location_href = data_elems[0].find(xpath).get("href")

return read_xml_gz(http_get(os.path.join(base_url, location_href)))

# 5, 6. Convert the metadata into a more workable form and then compare.
for metadata_file in ["primary", "filelists", "other"]:
with self.subTest(metadata_file):
original_metadata = get_metadata_content(repo_url, original_repomd, metadata_file)
generated_metadata = get_metadata_content(
distribution.base_url, reproduced_repomd, metadata_file
)

self.compare_metadata_file(original_metadata, generated_metadata, metadata_file)

def compare_metadata_file(self, original_metadata_text, generated_metadata_text, meta_type):
"""Compare two metadata files.
First convert the metadata into a canonical form. We convert from XML to JSON, and then to
a dict, and then apply transformations to the dict to standardize the ordering. Then, we
perform comparisons and observe any differences.
"""
metadata_block_names = {
"primary": "metadata",
"filelists": "filelists",
"other": "otherdata",
}
subsection = metadata_block_names[meta_type]

# First extract the package entries
original_metadata = xmltodict.parse(original_metadata_text)[subsection]["package"]
generated_metadata = xmltodict.parse(generated_metadata_text)[subsection]["package"]

# The other transformations are inside the package nodes - they differ by type of metadata
if meta_type == "primary":
# location_href gets rewritten by Pulp so we should ignore these differences
ignore = {"location.@href", "format.file"}
# we need to make sure all of the requirements are in the same order
requirement_types = [
"rpm:suggests",
"rpm:recommends",
"rpm:enhances",
"rpm:provides",
"rpm:requires",
"rpm:obsoletes",
"rpm:conflicts",
"rpm:supplements",
]
for md_file in [original_metadata, generated_metadata]:
for req in requirement_types:
if md_file["format"].get(req):
md_file["format"][req] = sorted(md_file["format"][req])
elif meta_type == "filelists":
ignore = {}
# make sure the files are all in the same order and type and sort them
# nodes with a "type" attribute in the XML become OrderedDicts, so we have to convert
# them to a string representation.
for md_file in [original_metadata, generated_metadata]:
if md_file.get("file"):
files = []
for f in md_file["file"]:
if isinstance(f, collections.OrderedDict):
files.append(
"{path} type={type}".format(path=f["@type"], type=f["#text"])
)
else:
files.append(f)

md_file["file"] = sorted(files)
elif meta_type == "other":
ignore = {}
# make sure the changelogs are in the same order
for md_file in [original_metadata, generated_metadata]:
if md_file.get("changelog"):
md_file["changelog"] = sorted(md_file["changelog"], key=lambda x: x["@author"])

# The metadata dicts should now be consistently ordered. Check for differences.
diff = dictdiffer.diff(original_metadata, generated_metadata, ignore=ignore)
self.assertListEqual(list(diff), [], list(diff))


class ChecksumTypeTestCase(PulpTestCase):
"""Publish repository and validate the updateinfo.
@@ -595,14 +749,14 @@ def do_test(self, with_sqlite):
sqlite_files = [elem for elem in data_elems if elem.get("type").endswith("_db")]

if with_sqlite:
self.assertEquals(3, len(sqlite_files))
self.assertEqual(3, len(sqlite_files))

for db_elem in sqlite_files:
location_xpath = "{{{}}}location".format(RPM_NAMESPACES["metadata/repo"])
db_href = db_elem.find(location_xpath).get("href")
http_get(os.path.join(distribution.base_url, db_href))
else:
self.assertEquals(0, len(sqlite_files))
self.assertEqual(0, len(sqlite_files))

def test_sqlite_metadata(self):
"""Sync and publish an RPM repository and verify that the sqlite metadata exists."""
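Zooming out, MetadataTestCase's comparison strategy is: parse both repomd trees, fetch the matching metadata files, convert each to a dict with xmltodict, normalize any order-sensitive fields the same way on both sides, then require dictdiffer to report no differences. A minimal sketch of that pipeline on toy filelists data (the isinstance check against str stands in for the OrderedDict check used in the test, purely for brevity):

    import dictdiffer
    import xmltodict

    original = xmltodict.parse(
        '<filelists><package pkgid="abc">'
        '<file>/usr/bin/foo</file><file type="dir">/usr/bin</file>'
        '</package></filelists>'
    )["filelists"]["package"]
    regenerated = xmltodict.parse(
        '<filelists><package pkgid="abc">'
        '<file type="dir">/usr/bin</file><file>/usr/bin/foo</file>'
        '</package></filelists>'
    )["filelists"]["package"]

    # Normalize order-sensitive fields identically on both sides before diffing.
    # Attribute-bearing <file> nodes parse to dicts, so flatten them to strings.
    for md in (original, regenerated):
        md["file"] = sorted(
            f if isinstance(f, str) else "{} type={}".format(f["#text"], f["@type"])
            for f in md["file"]
        )

    assert list(dictdiffer.diff(original, regenerated)) == []

Here the two documents list the same files in different orders, so the raw dicts differ; after the sort-and-flatten normalization, dictdiffer reports an empty diff, which is exactly the assertion the test makes.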
