Fix unreliable filelist parsing
.findall() is not reliable in the face of namespaces. Don't use it
anymore.

closes: #9107
https://pulp.plan.io/issues/9107
closes: #8972
https://pulp.plan.io/issues/8972
dralley committed Jul 23, 2021
1 parent 99ad991 commit 552b0b2
Showing 11 changed files with 398 additions and 67 deletions.
1 change: 1 addition & 0 deletions CHANGES/8972.misc
@@ -0,0 +1 @@
Added tests verifying that metadata is written and read correctly.
1 change: 1 addition & 0 deletions CHANGES/9107.bugfix
@@ -0,0 +1 @@
The fix for a previous issue resulting in incorrect metadata (#8995) still produced incorrect metadata in some circumstances. Implemented a complete fix and added tests to ensure the issue never recurs.
1 change: 1 addition & 0 deletions coverage.md
@@ -26,6 +26,7 @@ This file contains a list of features and their test coverage.
| As a user, I can re-sync custom repository metadata when it was the only change in a repository | YES | |
| As a user, I can sync an existing advisory whose dates are timestamps (as opposed to datetimes) | NO | Example: https://updates.suse.com/SUSE/Updates/SLE-SERVER/12-SP5/x86_64/update/ |
| As a user, I can sync repos with repomd.xml files whose filenames contain dashes (e.g., app-icons.tar.gz) | NO | Example: https://updates.suse.com/SUSE/Updates/SLE-SERVER/12-SP5/x86_64/update/ |
| As a user, the metadata being produced by Pulp (and being saved to Pulp) is correct | PART | Primary, Filelists and Other metadata is mostly covered. Updateinfo, Groups, and Distribution Tree metadata is not covered. "Weird" cases not covered. |
| **Duplicates** | | |
| As a user, I have only one advisory with the same id in a repo version | YES | |
| As a user, I have only one module with the same NSVCA in a repo version | NO | |
2 changes: 2 additions & 0 deletions functest_requirements.txt
@@ -2,3 +2,5 @@ git+https://github.com/pulp/pulp-smash.git#egg=pulp-smash
productmd>=1.25
pytest
git+https://github.com/pulp/pulpcore.git#egg=pulpcore # re: https://pulp.plan.io/issues/7883
dictdiffer
xmltodict
23 changes: 12 additions & 11 deletions pulp_rpm/app/metadata_parsing.py
@@ -100,11 +100,12 @@ def process_filelists_package_element(element):
pkgid = element.attrib["pkgid"]

files = []
for element in element.findall("{*}file"):
basename, filename = os.path.split(element.text)
ftype = element.attrib.get("type")

files.append((ftype, basename, filename))
for subelement in element:
if subelement.tag == "file" or re.sub(NS_STRIP_RE, "", subelement.tag) == "file":
basename, filename = os.path.split(subelement.text)
basename = f"{basename}/"
ftype = subelement.attrib.get("type")
files.append((ftype, basename, filename))

return pkgid, files

@@ -114,11 +115,11 @@ def process_other_package_element(element):
pkgid = element.attrib["pkgid"]

changelogs = []
for element in element.findall("{*}changelog"):
author = element.attrib["author"]
date = int(element.attrib["date"])
text = element.text

changelogs.append((author, date, text))
for subelement in element:
if subelement.tag == "changelog" or re.sub(NS_STRIP_RE, "", subelement.tag) == "changelog":
author = subelement.attrib["author"]
date = int(subelement.attrib["date"])
text = subelement.text
changelogs.append((author, date, text))

return pkgid, changelogs
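
The pattern in both functions above is the heart of the fix: rather than relying on .findall("{*}file") — one concrete pitfall is that the {*} namespace wildcard is only understood by ElementTree on Python 3.8 and newer — the parser iterates over child elements and compares tags with any namespace prefix stripped. A minimal, self-contained sketch of the same technique (NS_STRIP_RE is not shown in this hunk; a pattern along the lines of re.compile("^{.*?}") is assumed):

    import re
    import xml.etree.ElementTree as ET

    NS_STRIP_RE = re.compile("^{.*?}")  # assumed definition, not shown in this hunk

    namespaced = ET.fromstring(
        '<package xmlns="http://linux.duke.edu/metadata/filelists" pkgid="abc">'
        '<file type="dir">/usr/bin</file></package>'
    )
    bare = ET.fromstring('<package pkgid="abc"><file>/usr/bin/foo</file></package>')

    for pkg in (namespaced, bare):
        for subelement in pkg:
            # With a default xmlns, the tag reads "{http://...}file"; without
            # one it is plain "file". Stripping the "{...}" prefix handles both.
            if subelement.tag == "file" or re.sub(NS_STRIP_RE, "", subelement.tag) == "file":
                print(subelement.text, subelement.attrib.get("type"))

Child iteration with tag stripping behaves the same on every supported interpreter and also tolerates documents that omit the namespace declaration entirely, which is what makes it more reliable than the wildcard query.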
1 change: 1 addition & 0 deletions pulp_rpm/app/settings.py
@@ -8,3 +8,4 @@
INSTALLED_APPS = ["django_readonly_field", "dynaconf_merge"]
ALLOW_AUTOMATIC_UNSAFE_ADVISORY_CONFLICT_RESOLUTION = False
DEFAULT_ULN_SERVER_BASE_URL = "https://linux-update.oracle.com/"
RPM_ITERATIVE_PARSING = True
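
RPM_ITERATIVE_PARSING defaults to True and acts as an escape hatch for the new code path in the synchronizer below. A hedged sketch of how such a dynaconf-backed Pulp setting is typically consumed and overridden (the PULP_-prefixed environment variable follows Pulp's standard dynaconf convention and is an assumption, not something shown in this commit):

    # Application code reads the flag through Django settings, which dynaconf
    # populates from settings.py plus any environment overrides.
    from django.conf import settings

    def parse_repodata():
        if settings.RPM_ITERATIVE_PARSING:
            ...  # new path: lock-step parsing of filelists.xml and other.xml
        else:
            ...  # previous behavior: createrepo_c's bulk xml_parse_* calls

    # Assumed operator-side override (standard dynaconf convention):
    #   export PULP_RPM_ITERATIVE_PARSING=false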
49 changes: 25 additions & 24 deletions pulp_rpm/app/tasks/synchronizing.py
@@ -590,10 +590,9 @@ def newpkgcb(pkgId, name, arch):
# TODO: handle parsing errors/warnings, warningcb callback can be used below
cr.xml_parse_primary(primary_xml_path, pkgcb=pkgcb, do_files=False)

# only left behind to make swapping back for testing easier - we are doing our own separate
# parsing of other.xml and filelists.xml
# cr.xml_parse_filelists(filelists_xml_path, newpkgcb=newpkgcb)
# cr.xml_parse_other(other_xml_path, newpkgcb=newpkgcb)
if not settings.RPM_ITERATIVE_PARSING:
cr.xml_parse_filelists(filelists_xml_path, newpkgcb=newpkgcb)
cr.xml_parse_other(other_xml_path, newpkgcb=newpkgcb)
return packages

async def run(self):
@@ -1029,27 +1028,29 @@ async def parse_packages(self, primary_xml, filelists_xml, other_xml, file_exten
except KeyError:
break

while True:
pkgid_extra, files, changelogs = next(extra_repodata_parser)
if pkgid_extra in seen_pkgids:
# This is a dirty hack to handle cases that "shouldn't" happen.
# Sometimes repositories have packages listed twice under the same pkgid.
# This is a problem because the primary.xml parsing deduplicates the
# entries by placing them into a dict keyed by pkgid. So if the iterative
# parser(s) run into a package we've seen before, we should skip it and
# move on.
continue
else:
seen_pkgids.add(pkgid)
break

assert pkgid == pkgid_extra, (
"Package id from primary metadata ({}), does not match package id "
"from filelists, other metadata ({})"
).format(pkgid, pkgid_extra)
if settings.RPM_ITERATIVE_PARSING:
while True:
pkgid_extra, files, changelogs = next(extra_repodata_parser)
if pkgid_extra in seen_pkgids:
# This is a dirty hack to handle cases that "shouldn't" happen.
# Sometimes repositories have packages listed twice under the same
# pkgid. This is a problem because the primary.xml parsing
# deduplicates the entries by placing them into a dict keyed by pkgid.
# So if the iterative parser(s) run into a package we've seen before,
# we should skip it and move on.
continue
else:
seen_pkgids.add(pkgid)
break

assert pkgid == pkgid_extra, (
"Package id from primary metadata ({}), does not match package id "
"from filelists, other metadata ({})"
).format(pkgid, pkgid_extra)

pkg.files = files
pkg.changelogs = changelogs

pkg.files = files
pkg.changelogs = changelogs
package = Package(**Package.createrepo_to_dict(pkg))
del pkg

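The while-loop above advances the filelists/other parser until it yields a not-yet-seen pkgid, keeping it in lock-step with the packages already deduplicated during primary.xml parsing. A toy sketch of that contract, with a generator and hypothetical pkgids standing in for the real parser:

    def extra_repodata_parser():
        """Yield (pkgid, files, changelogs) in document order, duplicates included."""
        yield ("sha256:aaa", [("dir", "/usr/", "bin")], [])
        yield ("sha256:aaa", [("dir", "/usr/", "bin")], [])  # repeated entry
        yield ("sha256:bbb", [], [("alice", 1626000000, "- fix something")])

    seen_pkgids = set()
    parser = extra_repodata_parser()

    # Order of pkgids as produced by the (already deduplicated) primary.xml pass.
    for pkgid in ("sha256:aaa", "sha256:bbb"):
        while True:
            pkgid_extra, files, changelogs = next(parser)
            if pkgid_extra in seen_pkgids:
                continue  # primary.xml collapsed this duplicate; skip it here too
            seen_pkgids.add(pkgid)
            break
        assert pkgid == pkgid_extra, (pkgid, pkgid_extra)

On the second iteration the duplicate "sha256:aaa" entry is skipped, so both streams stay aligned and the assertion holds.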
208 changes: 181 additions & 27 deletions pulp_rpm/tests/functional/api/test_publish.py
@@ -1,10 +1,12 @@
"""Tests that publish rpm plugin repositories."""
import gzip
import collections
import os
from tempfile import NamedTemporaryFile
from random import choice
from xml.etree import ElementTree

import xmltodict
import dictdiffer

from pulp_smash import cli, config
from pulp_smash.utils import get_pulp_setting, http_get
from pulp_smash.pulp3.bindings import PulpTestCase, monitor_task
@@ -20,6 +22,7 @@

from pulp_rpm.tests.functional.constants import (
RPM_ALT_LAYOUT_FIXTURE_URL,
RPM_COMPLEX_FIXTURE_URL,
RPM_FIXTURE_SUMMARY,
RPM_KICKSTART_FIXTURE_URL,
RPM_KICKSTART_REPOSITORY_ROOT_CONTENT,
Expand All @@ -33,7 +36,7 @@
RPM_UNSIGNED_FIXTURE_URL,
SRPM_UNSIGNED_FIXTURE_URL,
)
from pulp_rpm.tests.functional.utils import gen_rpm_client, gen_rpm_remote, skip_if
from pulp_rpm.tests.functional.utils import gen_rpm_client, gen_rpm_remote, skip_if, read_xml_gz
from pulp_rpm.tests.functional.utils import set_up_module as setUpModule # noqa:F401

from pulpcore.client.pulp_rpm import (
@@ -47,25 +50,6 @@
from pulpcore.client.pulp_rpm.exceptions import ApiException


def read_xml_gz(content):
"""
Read xml and xml.gz.
Tests work normally but fail for S3 due to '.gz'
Why is it only compressed for S3?
"""
with NamedTemporaryFile() as temp_file:
temp_file.write(content)
temp_file.seek(0)

try:
content_xml = gzip.open(temp_file.name).read()
except OSError:
# FIXME: fix this as in CI primary/update_info.xml has '.gz' but it is not gzipped
content_xml = temp_file.read()
return content_xml


class PublishAnyRepoVersionTestCase(PulpTestCase):
"""Test whether a particular repository version can be published.
@@ -248,14 +232,12 @@ class ValidateNoChecksumTagTestCase(PulpTestCase):
"""Publish repository and validate the updateinfo.
This Test does the following:
1. Create a rpm repo and a remote.
2. Sync the repo with the remote.
3. Publish and distribute the repo.
4. Check whether the CheckSum tag ``sum`` is not present in ``updateinfo.xml``.
This test targets the following issue:
This test targets the following issues:
* `Pulp #4109 <https://pulp.plan.io/issues/4109>`_
* `Pulp #4033 <https://pulp.plan.io/issues/4033>`_
"""
@@ -332,6 +314,178 @@ def _get_updateinfo_xml_path(root_elem):
return data_elems[0].find(xpath).get("href")


class MetadataTestCase(PulpTestCase):
"""Publish repository and validate the metadata is the same.
This Test does the following:
1. Create a rpm repo and a remote.
2. Sync the repo with the remote.
3. Publish and distribute the repo.
4. Download the metadata of both the original and Pulp-created repos.
5. Convert it into a canonical form (XML -> JSON -> Dict -> apply a sorting order).
6. Compare the metadata.
"""

@classmethod
def setUpClass(cls):
"""Create class-wide variables."""
cls.cfg = config.get_config()
cls.client = gen_rpm_client()
cls.repo_api = RepositoriesRpmApi(cls.client)
cls.remote_api = RemotesRpmApi(cls.client)
cls.publications = PublicationsRpmApi(cls.client)
cls.distributions = DistributionsRpmApi(cls.client)

def test_complex_repo(self):
"""Test the "complex" fixture that covers more of the metadata cases.
The standard fixtures have no changelogs and don't cover "ghost" files. The repo
with the "complex-package" does, and also does a better job of covering rich deps
and other atypical metadata.
"""
delete_orphans()
self.do_test(RPM_COMPLEX_FIXTURE_URL)

def do_test(self, repo_url):
"""Sync and publish an RPM repository and verify the metadata is what was expected."""
# 1. create repo and remote
repo = self.repo_api.create(gen_repo())
self.addCleanup(self.repo_api.delete, repo.pulp_href)

body = gen_rpm_remote(repo_url, policy="on_demand")
remote = self.remote_api.create(body)
self.addCleanup(self.remote_api.delete, remote.pulp_href)

# 2. Sync it
repository_sync_data = RpmRepositorySyncURL(remote=remote.pulp_href)
sync_response = self.repo_api.sync(repo.pulp_href, repository_sync_data)
monitor_task(sync_response.task)

# 3. Publish and distribute
publish_data = RpmRpmPublication(repository=repo.pulp_href)
publish_response = self.publications.create(publish_data)
created_resources = monitor_task(publish_response.task).created_resources
publication_href = created_resources[0]
self.addCleanup(self.publications.delete, publication_href)

body = gen_distribution()
body["publication"] = publication_href
distribution_response = self.distributions.create(body)
created_resources = monitor_task(distribution_response.task).created_resources
distribution = self.distributions.read(created_resources[0])
self.addCleanup(self.distributions.delete, distribution.pulp_href)

# 4. Download and parse the metadata.
original_repomd = ElementTree.fromstring(
http_get(os.path.join(repo_url, "repodata/repomd.xml"))
)

reproduced_repomd = ElementTree.fromstring(
http_get(os.path.join(distribution.base_url, "repodata/repomd.xml"))
)

def get_metadata_content(base_url, repomd_elem, meta_type):
"""Return the text contents of metadata file.
Provided a url, a repomd root element, and a metadata type, locate the metadata
file's location href, download it from the provided url, un-gzip it, parse it, and
return the root element node.
Don't use this with large repos because it will blow up.
"""
# <ns0:repomd xmlns:ns0="http://linux.duke.edu/metadata/repo">
# <ns0:data type="primary">
# <ns0:checksum type="sha256">[…]</ns0:checksum>
# <ns0:location href="repodata/[…]-primary.xml.gz" />
# …
# </ns0:data>
# …
xpath = "{{{}}}data".format(RPM_NAMESPACES["metadata/repo"])
data_elems = [
elem for elem in repomd_elem.findall(xpath) if elem.get("type") == meta_type
]
xpath = "{{{}}}location".format(RPM_NAMESPACES["metadata/repo"])
location_href = data_elems[0].find(xpath).get("href")

return read_xml_gz(http_get(os.path.join(base_url, location_href)))

# 5, 6. Convert the metadata into a more workable form and then compare.
for metadata_file in ["primary", "filelists", "other"]:
with self.subTest(metadata_file):
original_metadata = get_metadata_content(repo_url, original_repomd, metadata_file)
generated_metadata = get_metadata_content(
distribution.base_url, reproduced_repomd, metadata_file
)

self.compare_metadata_file(original_metadata, generated_metadata, metadata_file)

def compare_metadata_file(self, original_metadata_text, generated_metadata_text, meta_type):
"""Compare two metadata files.
First convert the metadata into a canonical form. We convert from XML to JSON, and then to
a dict, and then apply transformations to the dict to standardize the ordering. Then, we
perform comparisons and observe any differences.
"""
metadata_block_names = {
"primary": "metadata",
"filelists": "filelists",
"other": "otherdata",
}
subsection = metadata_block_names[meta_type]

# First extract the package entries
original_metadata = xmltodict.parse(original_metadata_text)[subsection]["package"]
generated_metadata = xmltodict.parse(generated_metadata_text)[subsection]["package"]

# The other transformations are inside the package nodes - they differ by type of metadata
if meta_type == "primary":
# location_href gets rewritten by Pulp so we should ignore these differences
ignore = {"location.@href", "format.file"}
# we need to make sure all of the requirements are in the same order
requirement_types = [
"rpm:suggests",
"rpm:recommends",
"rpm:enhances",
"rpm:provides",
"rpm:requires",
"rpm:obsoletes",
"rpm:conflicts",
"rpm:supplements",
]
for md_file in [original_metadata, generated_metadata]:
for req in requirement_types:
if md_file["format"].get(req):
md_file["format"][req] = sorted(md_file["format"][req])
elif meta_type == "filelists":
ignore = {}
# make sure the files are all in the same order and type and sort them
# nodes with a "type" attribute in the XML become OrderedDicts, so we have to convert
# them to a string representation.
for md_file in [original_metadata, generated_metadata]:
if md_file.get("file"):
files = []
for f in md_file["file"]:
if isinstance(f, collections.OrderedDict):
files.append(
"{path} type={type}".format(path=f["@type"], type=f["#text"])
)
else:
files.append(f)

md_file["file"] = sorted(files)
elif meta_type == "other":
ignore = {}
# make sure the changelogs are in the same order
for md_file in [original_metadata, generated_metadata]:
if md_file.get("changelog"):
md_file["changelog"] = sorted(md_file["changelog"], key=lambda x: x["@author"])

# The metadata dicts should now be consistently ordered. Check for differences.
diff = dictdiffer.diff(original_metadata, generated_metadata, ignore=ignore)
self.assertListEqual(list(diff), [], list(diff))


class ChecksumTypeTestCase(PulpTestCase):
"""Publish repository and validate the updateinfo.
@@ -595,14 +749,14 @@ def do_test(self, with_sqlite):
sqlite_files = [elem for elem in data_elems if elem.get("type").endswith("_db")]

if with_sqlite:
self.assertEquals(3, len(sqlite_files))
self.assertEqual(3, len(sqlite_files))

for db_elem in sqlite_files:
location_xpath = "{{{}}}location".format(RPM_NAMESPACES["metadata/repo"])
db_href = db_elem.find(location_xpath).get("href")
http_get(os.path.join(distribution.base_url, db_href))
else:
self.assertEquals(0, len(sqlite_files))
self.assertEqual(0, len(sqlite_files))

def test_sqlite_metadata(self):
"""Sync and publish an RPM repository and verify that the sqlite metadata exists."""
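Zooming out, MetadataTestCase's comparison strategy is: parse both repomd trees, fetch the matching metadata files, convert each to a dict with xmltodict, normalize any order-sensitive fields the same way on both sides, then require dictdiffer to report no differences. A minimal sketch of that pipeline on toy filelists data (the isinstance check against str stands in for the OrderedDict check used in the test, purely for brevity):

    import dictdiffer
    import xmltodict

    original = xmltodict.parse(
        '<filelists><package pkgid="abc">'
        '<file>/usr/bin/foo</file><file type="dir">/usr/bin</file>'
        '</package></filelists>'
    )["filelists"]["package"]
    regenerated = xmltodict.parse(
        '<filelists><package pkgid="abc">'
        '<file type="dir">/usr/bin</file><file>/usr/bin/foo</file>'
        '</package></filelists>'
    )["filelists"]["package"]

    # Normalize order-sensitive fields identically on both sides before diffing.
    # Attribute-bearing <file> nodes parse to dicts, so flatten them to strings.
    for md in (original, regenerated):
        md["file"] = sorted(
            f if isinstance(f, str) else "{} type={}".format(f["#text"], f["@type"])
            for f in md["file"]
        )

    assert list(dictdiffer.diff(original, regenerated)) == []

Here the two documents list the same files in different orders, so the raw dicts differ; after the sort-and-flatten normalization, dictdiffer reports an empty diff, which is exactly the assertion the test makes.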
