From 73d67dd4ac855b6311a5622f7eb211f2e65d0f35 Mon Sep 17 00:00:00 2001
From: Sean Myers <sean.myers@redhat.com>
Date: Mon, 14 Mar 2016 23:08:58 +0000
Subject: [PATCH] published updateinfo only contains units in repo

Errata units in Pulp contain all units in all repos that are linked to
errata with the same id, which was resulting in published errata
referencing packages that weren't actually available in the published
repo. This limits packages in published errata updateinfo XML to only
the packages that are contained in the published repo.

fixes #1366
https://pulp.plan.io/issues/1366

fixes #1548
https://pulp.plan.io/issues/1548
---
 .../distributors/yum/metadata/updateinfo.py   |  74 +++++++++++-
 .../plugins/distributors/yum/publish.py       |   3 +-
 .../yum/metadata/test_metadata.py             | 111 +++++++++++++++++-
 3 files changed, 185 insertions(+), 3 deletions(-)

diff --git a/plugins/pulp_rpm/plugins/distributors/yum/metadata/updateinfo.py b/plugins/pulp_rpm/plugins/distributors/yum/metadata/updateinfo.py
index 532c9facc..ba97ecb5e 100644
--- a/plugins/pulp_rpm/plugins/distributors/yum/metadata/updateinfo.py
+++ b/plugins/pulp_rpm/plugins/distributors/yum/metadata/updateinfo.py
@@ -1,9 +1,12 @@
 import os
 from xml.etree import ElementTree
 
+import mongoengine
 from pulp.plugins.util.metadata_writer import XmlFileContext
+from pulp.server.db.model import RepositoryContentUnit
 
 from pulp_rpm.plugins.distributors.yum.metadata.metadata import REPO_DATA_DIR_NAME
+from pulp_rpm.plugins.db import models
 from pulp_rpm.yum_plugin import util
 
 
@@ -13,12 +16,65 @@
 
 
 class UpdateinfoXMLFileContext(XmlFileContext):
-    def __init__(self, working_dir, checksum_type=None):
+    def __init__(self, working_dir, checksum_type=None, conduit=None):
         metadata_file_path = os.path.join(working_dir, REPO_DATA_DIR_NAME,
                                           UPDATE_INFO_XML_FILE_NAME)
+        self.conduit = conduit
         super(UpdateinfoXMLFileContext, self).__init__(
             metadata_file_path, 'updates', checksum_type=checksum_type)
 
+    def _repo_unit_nevra(self, erratum_unit, repo_id):
+        """
+        Return a list of NEVRA dicts for units in a single repo referenced by the given errata.
+
+        Pulp errata units combine the known packages from all synced repos. Given an errata unit
+        and a repo, return a list of NEVRA dicts that can be used to filter out packages not
+        linked to that repo when generating a repo's updateinfo XML file. While returning that
+        list of NEVRA dicts is the main goal, doing so quickly and without running out of memory
+        is what makes this a little bit tricky.
+
+        Build up a super-fancy query to get the unit ids for all NEVRA seen in these errata
+        check repo/unit associations for this errata to limit the packages in the published
+        updateinfo to the units in the repo being currently published.
+
+        :param erratum_unit: The erratum unit that should be written to updateinfo.xml.
+        :type erratum_unit: pulp_rpm.plugins.db.models.Errata
+        :param repo_id: The repo_id of a pulp repository in which to find units
+        :type repo_id: str
+        :return: a list of NEVRA dicts for units in a single repo referenced by the given errata
+        :rtype: list
+        """
+        nevra_fields = ('name', 'epoch', 'version', 'release', 'arch')
+        nevra_q = mongoengine.Q()
+        for pkglist in erratum_unit.pkglist:
+            for pkg in pkglist['packages']:
+                pkg_nevra = dict((field, pkg[field]) for field in nevra_fields)
+                nevra_q |= mongoengine.Q(**pkg_nevra)
+
+        # Aim the super-fancy query at mongo to get the units that this errata refers to
+        # The scaler method on the end returns a list of tuples to try to save some memory
+        # and also cut down on mongoengine model instance hydration costs.
+        nevra_units = models.RPM.objects.filter(nevra_q).scalar('id', *nevra_fields)
+
+        # Split up the nevra unit entries into a mapping of the unit id to its nevra fields
+        nevra_unit_map = dict((nevra_unit[0], nevra_unit[1:]) for nevra_unit in nevra_units)
+
+        # Get all of the unit ids from this errata that are associated with the current repo.
+        # Cast this as a set for speedier lookups when iterating of the nevra unit map.
+        repo_unit_ids = set(RepositoryContentUnit.objects.filter(
+            unit_id__in=nevra_unit_map.keys(), repo_id=repo_id).scalar('unit_id'))
+
+        # Finally(!), intersect the repo unit ids with the unit nevra ids to
+        # create a list of nevra dicts that can be easily compared to the
+        # errata package nevra and exclude unrelated packages
+        repo_unit_nevra = []
+        for nevra_unit_id, nevra_field_values in nevra_unit_map.items():
+            # based on the args to scalar when nevra_units was created:
+            if nevra_unit_id in repo_unit_ids:
+                repo_unit_nevra.append(dict(zip(nevra_fields, nevra_field_values)))
+
+        return repo_unit_nevra
+
     def add_unit_metadata(self, item):
         """
         Write the XML representation of erratum_unit to self.metadata_file_handle
@@ -80,6 +136,12 @@ def add_unit_metadata(self, item):
                                     'href': reference['href']}
             ElementTree.SubElement(references_element, 'reference', reference_attributes)
 
+        # If we can pull a repo_id off the conduit, use that to generate repo-specific nevra
+        if self.conduit and hasattr(self.conduit, 'repo_id'):
+            repo_unit_nevra = self._repo_unit_nevra(erratum_unit, self.conduit.repo_id)
+        else:
+            repo_unit_nevra = None
+
         for pkglist in erratum_unit.pkglist:
 
             pkglist_element = ElementTree.SubElement(update_element, 'pkglist')
@@ -102,6 +164,16 @@ def add_unit_metadata(self, item):
                                       'epoch': package['epoch'] or '0',
                                       'arch': package['arch'],
                                       'src': package.get('src', '') or ''}
+
+                if repo_unit_nevra is not None:
+                    # If repo_unit_nevra can be used for comparison, take the src attr out of a
+                    # copy of this package's attrs to get a nevra dict for comparison
+                    package_nevra = package_attributes.copy()
+                    del(package_nevra['src'])
+                    if package_nevra not in repo_unit_nevra:
+                        # current package not in the specified repo, don't add it to the output
+                        continue
+
                 package_element = ElementTree.SubElement(collection_element, 'package',
                                                          package_attributes)
 
diff --git a/plugins/pulp_rpm/plugins/distributors/yum/publish.py b/plugins/pulp_rpm/plugins/distributors/yum/publish.py
index 2df7cbba0..a9e7adae6 100644
--- a/plugins/pulp_rpm/plugins/distributors/yum/publish.py
+++ b/plugins/pulp_rpm/plugins/distributors/yum/publish.py
@@ -562,7 +562,8 @@ def initialize(self):
         one that is built into the UpdateinfoXMLFileContext
         """
         checksum_type = self.parent.get_checksum_type()
-        self.context = UpdateinfoXMLFileContext(self.get_working_dir(), checksum_type)
+        self.context = UpdateinfoXMLFileContext(self.get_working_dir(), checksum_type,
+                                                self.get_conduit())
         self.context.initialize()
         # set the self.process_unit method to the corresponding method on the
         # UpdateInfoXMLFileContext as there is no other processing to be done for each unit.
diff --git a/plugins/test/unit/plugins/distributors/yum/metadata/test_metadata.py b/plugins/test/unit/plugins/distributors/yum/metadata/test_metadata.py
index 8ebd75814..f7bee97bd 100644
--- a/plugins/test/unit/plugins/distributors/yum/metadata/test_metadata.py
+++ b/plugins/test/unit/plugins/distributors/yum/metadata/test_metadata.py
@@ -6,7 +6,8 @@
 import unittest
 from xml.etree import cElementTree as et
 
-from mock import patch
+from mock import Mock, patch
+from mongoengine.queryset.visitor import QCombination
 from pulp.plugins.model import Unit
 
 from pulp_rpm.common.ids import TYPE_ID_RPM
@@ -396,6 +397,114 @@ def test_updateinfo_unit_metadata(self):
         self.assertEqual(content.count('<package'), 2)
         self.assertEqual(content.count('<sum type="md5">f3c197a29d9b66c5b65c5d62b25db5b4</sum>'), 1)
 
+    @patch.object(UpdateinfoXMLFileContext, '_repo_unit_nevra')
+    def test_updateinfo_unit_metadata_with_repo(self, repo_unit_nevra):
+
+        path = os.path.join(self.metadata_file_dir,
+                            REPO_DATA_DIR_NAME,
+                            UPDATE_INFO_XML_FILE_NAME)
+
+        handle = open(os.path.join(DATA_DIR, 'updateinfo.xml'), 'r')
+        generator = packages.package_list_generator(handle, 'update',
+                                                    updateinfo.process_package_element)
+
+        # mock out the repo/unit nevra matcher so that only one unit in the referenced errata
+        # is included in the output updateinfo XML
+        repo_unit_nevra.return_value = [
+            {'name': 'patb', 'epoch': '0', 'version': '0.1',
+             'release': '2', 'arch': 'x86_64'},
+        ]
+
+        erratum_unit = next(generator)
+
+        # just checking
+        self.assertEqual(erratum_unit.unit_key['errata_id'], 'RHEA-2010:9999')
+
+        mock_conduit = Mock()
+        mock_conduit.repo_id = 'mock_conduit_repo'
+        context = UpdateinfoXMLFileContext(self.metadata_file_dir, conduit=mock_conduit)
+        context._open_metadata_file_handle()
+        context.add_unit_metadata(erratum_unit)
+        context._close_metadata_file_handle()
+
+        self.assertNotEqual(os.path.getsize(path), 0)
+
+        updateinfo_handle = gzip.open(path, 'r')
+        content = updateinfo_handle.read()
+        updateinfo_handle.close()
+
+        self.assertEqual(content.count('from="enhancements@redhat.com"'), 1)
+        self.assertEqual(content.count('status="final"'), 1)
+        self.assertEqual(content.count('type="enhancements"'), 1)
+        self.assertEqual(content.count('version="1"'), 1)
+        self.assertEqual(content.count('<id>RHEA-2010:9999</id>'), 1)
+        self.assertEqual(content.count('<collection short="F13PTP">'), 1)
+        self.assertEqual(content.count('<package'), 1)
+        self.assertEqual(content.count('<sum type="md5">f3c197a29d9b66c5b65c5d62b25db5b4</sum>'), 1)
+
+    @patch('pulp_rpm.plugins.db.models.RPM')
+    def test_updateinfo_repo_unit_nevra_q_filter(self, mock_rpm):
+        # A mongoengine "QCombination" object is used to efficiently search for units
+        # by nevra. This checks that the QCombination object is properly created based
+        # on the errata unit parsed from the test updateinfo XML.
+        with open(os.path.join(DATA_DIR, 'updateinfo.xml'), 'r') as handle:
+            generator = packages.package_list_generator(
+                handle, 'update', updateinfo.process_package_element)
+            erratum_unit = next(generator)
+
+        context = UpdateinfoXMLFileContext(self.metadata_file_dir)
+        context._repo_unit_nevra(erratum_unit, 'mock_repo')
+
+        # Call 0 to mock_rpm's filter should have one arg, which should be the QCombination
+        # object that is built with an OR operator, with two children (one for each package
+        # in the errata unit that was passed to the method under test.
+        qcombination = mock_rpm.objects.filter.call_args_list[0][0][0]
+        self.assertTrue(isinstance(qcombination, QCombination))
+        self.assertEqual(qcombination.operation, qcombination.OR)
+        self.assertEqual(len(qcombination.children), 2)
+
+    @patch('pulp_rpm.plugins.db.models.RPM')
+    @patch('pulp_rpm.plugins.distributors.yum.metadata.updateinfo.RepositoryContentUnit')
+    def test_updateinfo_repo_unit_nevra_return(self, mock_rcu, mock_rpm):
+        # Build up the mock data as well as the expected returns
+        nevra_fields = ('name', 'epoch', 'version', 'release', 'arch')
+        unit1_nevra = ('n1', 'e1', 'v1', 'r1', 'a1')
+        unit1_nevra_dict = dict(zip(nevra_fields, unit1_nevra))
+        unit2_nevra = ('n2', 'e2', 'v2', 'r2', 'a2')
+        unit2_nevra_dict = dict(zip(nevra_fields, unit2_nevra))
+
+        # This is the result to the query for all units with a given nevra
+        # The expected value is a list of tuples containing unit ids and nevra fields;
+        mock_rpm.objects.filter().scalar.return_value = [
+            ('id1',) + unit1_nevra,
+            ('id2',) + unit2_nevra,
+        ]
+        # The expected value here is a list of unit IDs from the previous query that are
+        # associated with our mock repo.
+        mock_rcu.objects.filter().scalar.return_value = ['id1']
+
+        # Load the updateinfo XML to get an erratum unit to process
+        with open(os.path.join(DATA_DIR, 'updateinfo.xml'), 'r') as handle:
+            generator = packages.package_list_generator(
+                handle, 'update', updateinfo.process_package_element)
+            erratum_unit = next(generator)
+
+        context = UpdateinfoXMLFileContext(self.metadata_file_dir)
+        repo_unit_nevra = context._repo_unit_nevra(erratum_unit, 'mock_repo')
+
+        # Call 0 created the scalar mock, so we're interested in call 1. In this case, check
+        # that filter was called at least once with the expected filter kwargs and values.
+        mock_rcu.objects.filter.assert_any_call(unit_id__in=['id2', 'id1'], repo_id='mock_repo')
+
+        # And finally, make sure the return value is actually good!
+        # We made the RPM mock simulate two units known to pulp with the nevra seen in our errata.
+        # Then, we made the RepositoryContentUnit mock simulate that only one of those units is
+        # associated with the passed-in repo. The return value should be a list with only the
+        # single matching unit's nevra dict in it.
+        self.assertEqual(len(repo_unit_nevra), 1)
+        self.assertTrue(unit1_nevra_dict in repo_unit_nevra)
+        self.assertTrue(unit2_nevra_dict not in repo_unit_nevra)
+
     # -- prestodelta.xml testing -----------------------------------------------
 
     @skip_broken