From 0efe6b18c751c3f01f4e3a743c939ac23b244b5f Mon Sep 17 00:00:00 2001 From: Tatiana Tereshchenko Date: Mon, 8 Feb 2021 21:35:16 +0100 Subject: [PATCH] Only go through the changed repositories on the re-run. re #7779 https://pulp.plan.io/issues/7779 --- CHANGES/7779.bugfix | 1 + pulp_2to3_migration/app/pre_migration.py | 66 +++++++++++++++++-- .../functional/test_migration_behaviour.py | 8 ++- 3 files changed, 65 insertions(+), 10 deletions(-) create mode 100644 CHANGES/7779.bugfix diff --git a/CHANGES/7779.bugfix b/CHANGES/7779.bugfix new file mode 100644 index 00000000..ca76e3bc --- /dev/null +++ b/CHANGES/7779.bugfix @@ -0,0 +1 @@ +Fixed the re-run times when repositories/importers/distributors haven't changed much since the last run. diff --git a/pulp_2to3_migration/app/pre_migration.py b/pulp_2to3_migration/app/pre_migration.py index 79ecfd27..7d76b782 100644 --- a/pulp_2to3_migration/app/pre_migration.py +++ b/pulp_2to3_migration/app/pre_migration.py @@ -335,11 +335,18 @@ def pre_migrate_all_without_content(plan): """ Pre-migrate repositories, relations to their contents, importers and distributors. - NOTE: MongoDB and Django handle datetime fields differently. MongoDB doesn't care about - timezones and provides "naive" time, while Django is complaining about time without a timezone. - The problem is that naive time != time with specified timezone, that's why all the time for - MongoDB comparisons should be naive and all the time for Django/PostgreSQL should be timezone - aware. 
+ Look at the last updated times in the pulp2to3 tables for repositories/importers/distributors: + * pulp2_last_unit_added or pulp2_last_unit_removed for repositories + * pulp2_last_updated for importers and distributors + + Query empty-never-had-content repos (can't filter them out in any way) and repos for which + there were: + * content changes since the last run + * importer changes since the last run + * distributor changes since the last run + + Query in order of last_unit_added for the case when pre-migration is interrupted before we are + done with repositories. Args: plan(MigrationPlan): A Migration Plan @@ -355,8 +362,47 @@ def pre_migrate_all_without_content(plan): # filter by repo type repos_to_check = plan.type_to_repo_ids[plugin_plan.type] - mongo_repo_q = mongo_Q(repo_id__in=repos_to_check) - mongo_repo_qs = Repository.objects(mongo_repo_q) + epoch = datetime.utcfromtimestamp(0) + # figure out which repositories/importers/distributors were updated since the last run + repo_premigrated_last_by_added = Pulp2Repository.objects.aggregate( + Max('pulp2_last_unit_added') + )['pulp2_last_unit_added__max'] or epoch + repo_premigrated_last_by_removed = Pulp2Repository.objects.aggregate( + Max('pulp2_last_unit_removed') + )['pulp2_last_unit_removed__max'] or epoch + imp_premigrated_last = Pulp2Importer.objects.aggregate( + Max('pulp2_last_updated') + )['pulp2_last_updated__max'] or epoch + dist_premigrated_last = Pulp2Distributor.objects.aggregate( + Max('pulp2_last_updated') + )['pulp2_last_updated__max'] or epoch + + is_content_added_q = mongo_Q(last_unit_added__gte=repo_premigrated_last_by_added) + is_content_removed_q = mongo_Q(last_unit_removed__gte=repo_premigrated_last_by_removed) + is_new_enough_repo_q = is_content_added_q | is_content_removed_q + is_empty_repo_q = ( + mongo_Q(last_unit_added__exists=False) & mongo_Q(last_unit_removed__exists=False) + ) + is_new_enough_imp_q = mongo_Q(last_updated__gte=imp_premigrated_last) + is_new_enough_dist_q = 
mongo_Q(last_updated__gte=dist_premigrated_last) + repo_id_q = mongo_Q(repo_id__in=repos_to_check) + + updated_importers = Importer.objects( + repo_id_q & is_new_enough_imp_q + ).only('repo_id') + updated_imp_repos = set(imp.repo_id for imp in updated_importers) + updated_distributors = Distributor.objects( + repo_id_q & is_new_enough_dist_q + ).only('repo_id') + updated_dist_repos = set(dist.repo_id for dist in updated_distributors) + updated_impdist_repos = updated_imp_repos | updated_dist_repos + + mongo_updated_repo_q = repo_id_q & (is_new_enough_repo_q | is_empty_repo_q) + mongo_updated_imp_dist_repo_q = mongo_Q(repo_id__in=updated_impdist_repos) + + mongo_repo_qs = Repository.objects( + mongo_updated_repo_q | mongo_updated_imp_dist_repo_q + ).order_by('last_unit_added') pb.total += mongo_repo_qs.count() pb.save() @@ -392,6 +438,12 @@ def pre_migrate_repo(record, repo_id_to_type): """ Pre-migrate a pulp 2 repo. + NOTE: MongoDB and Django handle datetime fields differently. MongoDB doesn't care about + timezones and provides "naive" time, while Django is complaining about time without a timezone. + The problem is that naive time != time with specified timezone, that's why all the time for + MongoDB comparisons should be naive and all the time for Django/PostgreSQL should be timezone + aware. + Args: record(Repository): Pulp 2 repository data repo_id_to_type(dict): A mapping from a pulp 2 repo_id to pulp 2 repo types diff --git a/pulp_2to3_migration/tests/functional/test_migration_behaviour.py b/pulp_2to3_migration/tests/functional/test_migration_behaviour.py index 74c52401..fb999f97 100644 --- a/pulp_2to3_migration/tests/functional/test_migration_behaviour.py +++ b/pulp_2to3_migration/tests/functional/test_migration_behaviour.py @@ -251,7 +251,9 @@ def _test_pulp2repositories(self, plan): Check correctness of the dara for the first repo in the list. 
""" self._load_and_run(plan) - pulp2repository = self.pulp2repositories_api.list(ordering='pulp2_id', limit=1).results[0] + pulp2repository = self.pulp2repositories_api.list( + ordering='pulp2_repo_id', limit=1 + ).results[0] pulp3_repo = self.file_repo_api.read(pulp2repository.pulp3_repository_href) pulp3_remote = self.file_remotes_api.read(pulp2repository.pulp3_remote_href) pulp3_pub = self.file_publications_api.read(pulp2repository.pulp3_publication_href) @@ -315,7 +317,7 @@ def test_importer_different_repo(self): Importers are swapped in the plan. """ self._load_and_run(IMPORTER_DIFF_PLAN) - pulp2repositories = self.pulp2repositories_api.list(ordering='pulp2_id').results + pulp2repositories = self.pulp2repositories_api.list(ordering='pulp2_repo_id').results pulp2repo1, pulp2repo2 = pulp2repositories pulp3_remote1 = self.file_remotes_api.read(pulp2repo1.pulp3_remote_href) pulp3_remote2 = self.file_remotes_api.read(pulp2repo2.pulp3_remote_href) @@ -336,7 +338,7 @@ def test_distributor_different_repo(self): Distributors are swapped in the plan. """ self._load_and_run(DISTRIBUTOR_DIFF_PLAN) - pulp2repositories = self.pulp2repositories_api.list(ordering='pulp2_id').results + pulp2repositories = self.pulp2repositories_api.list(ordering='pulp2_repo_id').results pulp2repo1, pulp2repo2 = pulp2repositories pulp3_pub1 = self.file_publications_api.read(pulp2repo1.pulp3_publication_href) pulp3_pub2 = self.file_publications_api.read(pulp2repo2.pulp3_publication_href)