Skip to content

Commit

Permalink
Merge colliding structure content
Browse files Browse the repository at this point in the history
closes #599

Includes a DB migration that re-writes any colliding structure content,
thus altering existing repository versions. Re-written repo versions
will still result in publications that are structurally identical to
those from before the change. The only difference is that post-migration
publications will lose any duplicate package paragraphs.

This change will be released as part of pulp_deb version 3.0.0!
  • Loading branch information
quba42 committed Jun 5, 2023
1 parent 8a59af6 commit f5c6018
Show file tree
Hide file tree
Showing 8 changed files with 173 additions and 45 deletions.
2 changes: 2 additions & 0 deletions CHANGES/599.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
This change includes a large DB migration to drop 'codename' and 'suite' from the uniqueness constraints of all structure content.
The migration will merge any resulting collisions and alter all records with a foreign key relation to the eliminated content so that they point at the merge result instead.
1 change: 1 addition & 0 deletions CHANGES/599.removal
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The codename and suite fields are removed from the ReleaseComponent and ReleaseArchitecture models and all associated filters and viewsets.
6 changes: 3 additions & 3 deletions docs/_scripts/structured_repo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ TASK_HREF=$(http ${BASE_ADDR}/pulp/api/v3/distributions/deb/apt/ name=myrepo bas
wait_until_task_finished $BASE_ADDR$TASK_HREF

# create the necessary content (release, comp, architecture)
RELEASE_HREF=$(http ${BASE_ADDR}/pulp/api/v3/content/deb/releases/ codename=mycodename suite=mysuite distribution=mydist | jq -r .pulp_href)
RELEASE_HREF=$(http ${BASE_ADDR}/pulp/api/v3/content/deb/releases/ distribution=mydist codename=mycodename suite=mysuite | jq -r .pulp_href)
# Note that creating the release is optional, but without it your published repo will use default values for the suite and the codename in the published Release file.
ARCH_HREF=$(http ${BASE_ADDR}/pulp/api/v3/content/deb/release_architectures/ architecture=ppc64 codename=mycodename suite=mysuite distribution=mydist | jq -r .pulp_href)
COMP_HREF=$(http ${BASE_ADDR}/pulp/api/v3/content/deb/release_components/ component=mycomp codename=mycodename suite=mysuite distribution=mydist | jq -r .pulp_href)
ARCH_HREF=$(http ${BASE_ADDR}/pulp/api/v3/content/deb/release_architectures/ distribution=mydist architecture=ppc64 | jq -r .pulp_href)
COMP_HREF=$(http ${BASE_ADDR}/pulp/api/v3/content/deb/release_components/ distribution=mydist component=mycomp | jq -r .pulp_href)
PKG_COMP_HREF=$(http ${BASE_ADDR}/pulp/api/v3/content/deb/package_release_components/ package=$PACKAGE_HREF release_component=$COMP_HREF | jq -r .pulp_href)

# add our content to the repository
Expand Down
142 changes: 142 additions & 0 deletions pulp_deb/app/migrations/0023_merge_colliding_structure_content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Generated by Django 3.2.19 on 2023-05-09 12:35, extended manually;

from django.db import migrations
from django.core.exceptions import ObjectDoesNotExist

BATCH_SIZE = 1000


# TODO: Batch everything!




def merge_colliding_structure_content(apps, schema_editor):
    """
    Merge structure content that collides once 'codename' and 'suite' are ignored.

    ReleaseArchitecture rows are grouped by (distribution, architecture) and
    ReleaseComponent rows by (distribution, component). For every group with
    more than one row, one row is kept (the last primary key returned by the
    query), all RepositoryContent rows referencing the other rows are re-pointed
    at the kept row, and the other rows are deleted. PackageReleaseComponents
    attached to an eliminated ReleaseComponent are merged the same way into a
    PackageReleaseComponent attached to the kept component.

    Args:
        apps: The historical app registry handed in by ``migrations.RunPython``.
        schema_editor: Unused; part of the ``RunPython`` callable signature.
    """
    ReleaseArchitecture = apps.get_model('deb', 'ReleaseArchitecture')
    ReleaseComponent = apps.get_model('deb', 'ReleaseComponent')
    PackageReleaseComponent = apps.get_model('deb', 'PackageReleaseComponent')
    RepositoryContent = apps.get_model('core', 'RepositoryContent')
    Content = apps.get_model('core', 'Content')

    def _get_or_create_prc_to_keep(duplicate_prc_package, component_to_keep):
        """
        Return the PackageReleaseComponent linking the package to the kept component.

        ``component_to_keep`` is the primary key of the surviving ReleaseComponent.
        If no matching PackageReleaseComponent exists yet, one is created.
        """
        try:
            return PackageReleaseComponent.objects.get(
                release_component=component_to_keep, package=duplicate_prc_package
            )
        except ObjectDoesNotExist:
            component = ReleaseComponent.objects.get(pk=component_to_keep)
            # objects.create() already persists the row, so no extra save() is needed.
            return PackageReleaseComponent.objects.create(
                pulp_type='deb.package_release_component',
                release_component=component,
                package=duplicate_prc_package,
            )

    def _update_repo_content(duplicate_content_ids, content_to_keep):
        """
        Re-point RepositoryContent rows from the duplicates to the kept content.

        Both arguments are primary keys: an iterable of duplicate content pks and
        the single pk of the content that survives the merge.
        """
        # NOTE(review): if a repository references both a duplicate and the kept
        # content, this may leave two rows for the same content - confirm whether
        # that matters for RepositoryContent.
        for repo_content in RepositoryContent.objects.filter(content_id__in=duplicate_content_ids):
            repo_content.content_id = content_to_keep
            repo_content.save()

    # Deduplicate ReleaseArchitecture:
    distributions = (
        ReleaseArchitecture.objects.all()
        .distinct('distribution')
        .values_list('distribution', flat=True)
    )

    for distribution in distributions:
        architectures = (
            ReleaseArchitecture.objects.filter(distribution=distribution)
            .distinct('architecture')
            .values_list('architecture', flat=True)
        )
        for architecture in architectures:
            duplicate_architecture_ids = list(
                ReleaseArchitecture.objects.filter(
                    distribution=distribution, architecture=architecture
                ).values_list('pk', flat=True)
            )
            if len(duplicate_architecture_ids) > 1:
                # Keep one row; everything left in the list is to be eliminated.
                architecture_to_keep = duplicate_architecture_ids.pop()
                _update_repo_content(duplicate_architecture_ids, architecture_to_keep)
                ReleaseArchitecture.objects.filter(pk__in=duplicate_architecture_ids).delete()
                Content.objects.filter(pk__in=duplicate_architecture_ids).delete()

    # Deduplicate ReleaseComponent:
    distributions = (
        ReleaseComponent.objects.all()
        .distinct('distribution')
        .values_list('distribution', flat=True)
    )
    for distribution in distributions:
        components = (
            ReleaseComponent.objects.filter(distribution=distribution)
            .distinct('component')
            .values_list('component', flat=True)
        )
        for component in components:
            duplicate_component_ids = list(
                ReleaseComponent.objects.filter(
                    distribution=distribution, component=component
                ).values_list('pk', flat=True)
            )
            if len(duplicate_component_ids) > 1:
                # Keep one row; everything left in the list is to be eliminated.
                component_to_keep = duplicate_component_ids.pop()
                _update_repo_content(duplicate_component_ids, component_to_keep)

                # Deduplicate PackageReleaseComponents
                duplicate_prc_ids = set()
                for duplicate_component in duplicate_component_ids:
                    duplicate_prcs = PackageReleaseComponent.objects.filter(
                        release_component=duplicate_component
                    )
                    for duplicate_prc in duplicate_prcs.iterator():
                        prc_to_keep = _get_or_create_prc_to_keep(
                            duplicate_prc.package, component_to_keep
                        )
                        # _update_repo_content() expects a primary key, not a
                        # model instance, for the content to keep.
                        _update_repo_content([duplicate_prc.pk], prc_to_keep.pk)
                        duplicate_prc_ids.add(duplicate_prc.pk)

                ReleaseComponent.objects.filter(pk__in=duplicate_component_ids).delete()
                Content.objects.filter(pk__in=duplicate_component_ids).delete()
                PackageReleaseComponent.objects.filter(pk__in=duplicate_prc_ids).delete()
                Content.objects.filter(pk__in=duplicate_prc_ids).delete()


class Migration(migrations.Migration):
    """Merge colliding structure content, then drop 'codename' and 'suite'."""

    dependencies = [
        ('deb', '0022_alter_aptdistribution_distribution_ptr_and_more'),
    ]

    operations = [
        # Resolve collisions in the data before the narrower uniqueness
        # constraints below are applied.
        migrations.RunPython(
            code=merge_colliding_structure_content,
            reverse_code=migrations.RunPython.noop,
            elidable=True,
        ),
        # NOTE(review): presumably forces deferred constraint checks from the
        # data migration to run before the schema changes - confirm.
        migrations.RunSQL(sql="SET CONSTRAINTS ALL IMMEDIATE;", reverse_sql=""),
        migrations.AlterUniqueTogether(
            name='releasearchitecture',
            unique_together={('distribution', 'architecture')},
        ),
        migrations.AlterUniqueTogether(
            name='releasecomponent',
            unique_together={('distribution', 'component')},
        ),
        # Drop 'codename' and 'suite' from both models; the nested loops yield
        # architecture/codename, architecture/suite, component/codename,
        # component/suite in that order.
        *[
            migrations.RemoveField(model_name=model_name, name=field_name)
            for model_name in ('releasearchitecture', 'releasecomponent')
            for field_name in ('codename', 'suite')
        ],
    ]
18 changes: 3 additions & 15 deletions pulp_deb/app/models/content/structure_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,12 @@ class ReleaseArchitecture(Content):

TYPE = "release_architecture"

architecture = models.TextField()
distribution = models.TextField()

# IMPORTANT: The following fields are only part of this model in order to avoid historical
# uniqueness constraint collisions. The plan is to drop these fields from this model ASAP! This
# will require a complex DB migration to sort out any collisions.
codename = models.TextField()
suite = models.TextField()
architecture = models.TextField()

class Meta:
default_related_name = "%(app_label)s_%(model_name)s"
unique_together = (("architecture", "distribution", "codename", "suite"),)
unique_together = (("distribution", "architecture"),)


class ReleaseComponent(Content):
Expand All @@ -63,12 +57,6 @@ class ReleaseComponent(Content):
distribution = models.TextField()
component = models.TextField()

# IMPORTANT: The following fields are only part of this model in order to avoid historical
# uniqueness constraint collisions. The plan is to drop these fields from this model ASAP! This
# will require a complex DB migration to sort out any collisions.
codename = models.TextField()
suite = models.TextField()

@property
def plain_component(self):
"""
Expand All @@ -87,7 +75,7 @@ def plain_component(self):

class Meta:
default_related_name = "%(app_label)s_%(model_name)s"
unique_together = (("distribution", "component", "codename", "suite"),)
unique_together = (("distribution", "component"),)


class PackageReleaseComponent(Content):
Expand Down
6 changes: 1 addition & 5 deletions pulp_deb/app/serializers/content_serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def create(self, validated_data):
release_component = ReleaseComponent(distribution=distribution, component=component)
release_component.save()
release_component_to_add = ReleaseComponent.objects.filter(
distribution=distribution, component=component, codename="", suite=""
distribution=distribution, component=component
)
package = content_to_add[0]
release_arch = ReleaseArchitecture(
Expand Down Expand Up @@ -679,8 +679,6 @@ class Meta(NoArtifactContentSerializer.Meta):
fields = NoArtifactContentSerializer.Meta.fields + (
"architecture",
"distribution",
"codename",
"suite",
)


Expand All @@ -697,8 +695,6 @@ class Meta(NoArtifactContentSerializer.Meta):
fields = NoArtifactContentSerializer.Meta.fields + (
"component",
"distribution",
"codename",
"suite",
)


Expand Down
39 changes: 19 additions & 20 deletions pulp_deb/app/tasks/synchronizing.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,15 +635,12 @@ async def _handle_distribution(self, distribution):
await pb.aincrement()
return

# For historic reasons, we have tied up the distribution with the codename and suite.
# Untangling this will require careful planning due to changing uniqueness constraints.
# See: https://github.com/pulp/pulp_deb/issues/599
distribution_dict = {
"codename": release_file.codename,
"suite": release_file.suite,
"distribution": distribution,
}
await self.put(DeclarativeContent(content=Release(**distribution_dict)))
release = Release(
codename=release_file.codename,
suite=release_file.suite,
distribution=distribution,
)
await self.put(DeclarativeContent(content=release))
# Create release architectures
if release_file.architectures:
architectures = _filter_split_architectures(
Expand All @@ -659,7 +656,7 @@ async def _handle_distribution(self, distribution):

for architecture in architectures:
release_architecture_dc = DeclarativeContent(
content=ReleaseArchitecture(architecture=architecture, **distribution_dict)
content=ReleaseArchitecture(architecture=architecture, distribution=distribution)
)
await self.put(release_architecture_dc)
# Parse release file
Expand Down Expand Up @@ -690,13 +687,13 @@ async def _handle_distribution(self, distribution):

if distribution[-1] == "/":
# Handle flat repo
sub_tasks = [self._handle_flat_repo(file_references, release_file, distribution_dict)]
sub_tasks = [self._handle_flat_repo(file_references, release_file, distribution)]
else:
# Handle components
sub_tasks = [
self._handle_component(
component,
distribution_dict,
distribution,
release_file,
file_references,
architectures,
Expand All @@ -711,15 +708,15 @@ async def _handle_distribution(self, distribution):
async def _handle_component(
self,
component,
distribution_dict,
distribution,
release_file,
file_references,
architectures,
hybrid_format,
):
# Create release_component
release_component_dc = DeclarativeContent(
content=ReleaseComponent(component=component, **distribution_dict)
content=ReleaseComponent(component=component, distribution=distribution)
)
release_component = await self._create_unit(release_component_dc)

Expand Down Expand Up @@ -799,10 +796,10 @@ async def _handle_component(
raise NotImplementedError("Syncing source repositories is not yet implemented.")
await asyncio.gather(*pending_tasks)

async def _handle_flat_repo(self, file_references, release_file, distribution_dict):
async def _handle_flat_repo(self, file_references, release_file, distribution):
# We are creating a component so the flat repo can be published as a structured repo!
release_component_dc = DeclarativeContent(
content=ReleaseComponent(component="flat-repo-component", **distribution_dict)
content=ReleaseComponent(component="flat-repo-component", distribution=distribution)
)
release_component = await self._create_unit(release_component_dc)
pending_tasks = []
Expand All @@ -814,7 +811,7 @@ async def _handle_flat_repo(self, file_references, release_file, distribution_di
release_component=release_component,
architecture="",
file_references=file_references,
distribution_dict=distribution_dict,
distribution=distribution,
)
)

Expand All @@ -832,7 +829,7 @@ async def _handle_package_index(
architecture,
file_references,
infix="",
distribution_dict=None,
distribution=None,
hybrid_format=False,
):
# Create package_index
Expand Down Expand Up @@ -1021,7 +1018,7 @@ async def _handle_package_index(
log.warning(_(message).format(architecture))
release_architecture_dc = DeclarativeContent(
content=ReleaseArchitecture(
architecture=architecture, **distribution_dict
architecture=architecture, distribution=distribution
)
)
await self.put(release_architecture_dc)
Expand All @@ -1039,7 +1036,9 @@ async def _handle_package_index(
log.warning(_(message).format(package_architectures_string))
for architecture in package_architectures:
release_architecture_dc = DeclarativeContent(
content=ReleaseArchitecture(architecture=architecture, **distribution_dict)
content=ReleaseArchitecture(
architecture=architecture, distribution=distribution
)
)
await self.put(release_architecture_dc)

Expand Down
4 changes: 2 additions & 2 deletions pulp_deb/app/viewsets/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ class ReleaseArchitectureFilter(ContentFilter):

class Meta:
model = models.ReleaseArchitecture
fields = ["architecture", "distribution", "codename", "suite"]
fields = ["architecture", "distribution"]


class ReleaseArchitectureViewSet(ContentViewSet):
Expand Down Expand Up @@ -446,7 +446,7 @@ class ReleaseComponentFilter(ContentFilter):

class Meta:
model = models.ReleaseComponent
fields = ["component", "distribution", "codename", "suite"]
fields = ["component", "distribution"]


class ReleaseComponentViewSet(ContentViewSet):
Expand Down

0 comments on commit f5c6018

Please sign in to comment.