Skip to content
This repository has been archived by the owner on Jan 9, 2023. It is now read-only.

Commit

Permalink
Only go through the changed repositories on the re-run.
Browse files Browse the repository at this point in the history
  • Loading branch information
goosemania committed Feb 12, 2021
1 parent 86af5f4 commit 0efe6b1
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGES/7779.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed the re-run times when repositories/importers/distributors haven't changed much since the last run.
66 changes: 59 additions & 7 deletions pulp_2to3_migration/app/pre_migration.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,11 +335,18 @@ def pre_migrate_all_without_content(plan):
"""
Pre-migrate repositories, relations to their contents, importers and distributors.
NOTE: MongoDB and Django handle datetime fields differently. MongoDB doesn't care about
timezones and provides "naive" time, while Django complains about time without a timezone.
The problem is that a naive time != a time with a specified timezone, which is why all times
used in MongoDB comparisons should be naive and all times for Django/PostgreSQL should be
timezone-aware.
Look at the last updated times in the pulp2to3 tables for repositories/importers/distributors:
* pulp2_last_unit_added or pulp2_last_unit_removed for repositories
* pulp2_last_updated for importers and distributors
Query empty-never-had-content repos (can't filter them out in any way) and repos for which
there were:
* content changes since the last run
* importer changes since the last run
* distributor changes since the last run
Query in order of last_unit_added for the case when pre-migration is interrupted before we are
done with repositories.
Args:
plan(MigrationPlan): A Migration Plan
Expand All @@ -355,8 +362,47 @@ def pre_migrate_all_without_content(plan):
# filter by repo type
repos_to_check = plan.type_to_repo_ids[plugin_plan.type]

mongo_repo_q = mongo_Q(repo_id__in=repos_to_check)
mongo_repo_qs = Repository.objects(mongo_repo_q)
epoch = datetime.utcfromtimestamp(0)
# figure out which repositories/importers/distributors were updated since the last run
repo_premigrated_last_by_added = Pulp2Repository.objects.aggregate(
Max('pulp2_last_unit_added')
)['pulp2_last_unit_added__max'] or epoch
repo_premigrated_last_by_removed = Pulp2Repository.objects.aggregate(
Max('pulp2_last_unit_removed')
)['pulp2_last_unit_removed__max'] or epoch
imp_premigrated_last = Pulp2Importer.objects.aggregate(
Max('pulp2_last_updated')
)['pulp2_last_updated__max'] or epoch
dist_premigrated_last = Pulp2Distributor.objects.aggregate(
Max('pulp2_last_updated')
)['pulp2_last_updated__max'] or epoch

is_content_added_q = mongo_Q(last_unit_added__gte=repo_premigrated_last_by_added)
is_content_removed_q = mongo_Q(last_unit_removed__gte=repo_premigrated_last_by_removed)
is_new_enough_repo_q = is_content_added_q | is_content_removed_q
is_empty_repo_q = (
mongo_Q(last_unit_added__exists=False) & mongo_Q(last_unit_removed__exists=False)
)
is_new_enough_imp_q = mongo_Q(last_updated__gte=imp_premigrated_last)
is_new_enough_dist_q = mongo_Q(last_updated__gte=dist_premigrated_last)
repo_id_q = mongo_Q(repo_id__in=repos_to_check)

updated_importers = Importer.objects(
repo_id_q & is_new_enough_imp_q
).only('repo_id')
updated_imp_repos = set(imp.repo_id for imp in updated_importers)
updated_distributors = Distributor.objects(
repo_id_q & is_new_enough_dist_q
).only('repo_id')
updated_dist_repos = set(dist.repo_id for dist in updated_distributors)
updated_impdist_repos = updated_imp_repos | updated_dist_repos

mongo_updated_repo_q = repo_id_q & (is_new_enough_repo_q | is_empty_repo_q)
mongo_updated_imp_dist_repo_q = mongo_Q(repo_id__in=updated_impdist_repos)

mongo_repo_qs = Repository.objects(
mongo_updated_repo_q | mongo_updated_imp_dist_repo_q
).order_by('last_unit_added')

pb.total += mongo_repo_qs.count()
pb.save()
Expand Down Expand Up @@ -392,6 +438,12 @@ def pre_migrate_repo(record, repo_id_to_type):
"""
Pre-migrate a pulp 2 repo.
NOTE: MongoDB and Django handle datetime fields differently. MongoDB doesn't care about
timezones and provides "naive" time, while Django complains about time without a timezone.
The problem is that a naive time != a time with a specified timezone, which is why all times
used in MongoDB comparisons should be naive and all times for Django/PostgreSQL should be
timezone-aware.
Args:
record(Repository): Pulp 2 repository data
repo_id_to_type(dict): A mapping from a pulp 2 repo_id to pulp 2 repo types
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,9 @@ def _test_pulp2repositories(self, plan):
Check correctness of the data for the first repo in the list.
"""
self._load_and_run(plan)
pulp2repository = self.pulp2repositories_api.list(ordering='pulp2_id', limit=1).results[0]
pulp2repository = self.pulp2repositories_api.list(
ordering='pulp2_repo_id', limit=1
).results[0]
pulp3_repo = self.file_repo_api.read(pulp2repository.pulp3_repository_href)
pulp3_remote = self.file_remotes_api.read(pulp2repository.pulp3_remote_href)
pulp3_pub = self.file_publications_api.read(pulp2repository.pulp3_publication_href)
Expand Down Expand Up @@ -315,7 +317,7 @@ def test_importer_different_repo(self):
Importers are swapped in the plan.
"""
self._load_and_run(IMPORTER_DIFF_PLAN)
pulp2repositories = self.pulp2repositories_api.list(ordering='pulp2_id').results
pulp2repositories = self.pulp2repositories_api.list(ordering='pulp2_repo_id').results
pulp2repo1, pulp2repo2 = pulp2repositories
pulp3_remote1 = self.file_remotes_api.read(pulp2repo1.pulp3_remote_href)
pulp3_remote2 = self.file_remotes_api.read(pulp2repo2.pulp3_remote_href)
Expand All @@ -336,7 +338,7 @@ def test_distributor_different_repo(self):
Distributors are swapped in the plan.
"""
self._load_and_run(DISTRIBUTOR_DIFF_PLAN)
pulp2repositories = self.pulp2repositories_api.list(ordering='pulp2_id').results
pulp2repositories = self.pulp2repositories_api.list(ordering='pulp2_repo_id').results
pulp2repo1, pulp2repo2 = pulp2repositories
pulp3_pub1 = self.file_publications_api.read(pulp2repo1.pulp3_publication_href)
pulp3_pub2 = self.file_publications_api.read(pulp2repo2.pulp3_publication_href)
Expand Down

0 comments on commit 0efe6b1

Please sign in to comment.