From 44bb0623a01b7578e2ec442844c7f5849754b237 Mon Sep 17 00:00:00 2001
From: Grant Gainey
Date: Fri, 22 May 2020 15:47:41 -0400
Subject: [PATCH] Teach exporter to understand, validate, and respect the
 chunk_size= parameter.

Since we can now have multiple output files, replaced the filename/sha256
columns with output_file_info, a JSONField which is a dictionary of
filename: hash pairs.

closes #6736
---
 CHANGES/6736.feature                          |   1 +
 CHANGES/6736.removal                          |   1 +
 docs/workflows/import-export.rst              |  26 ++++
 .../app/migrations/0032_export_to_chunks.py   |  27 ++++
 pulpcore/app/models/exporter.py               |   4 +-
 pulpcore/app/serializers/exporter.py          |  52 +++++--
 pulpcore/app/tasks/export.py                  | 138 +++++++++++-------
 pulpcore/app/viewsets/exporter.py             |   1 +
 .../api/using_plugin/test_pulpexport.py       |  20 ++-
 .../api/using_plugin/test_pulpimport.py       |   5 +-
 .../tests/unit/serializers/test_pulpexport.py |  67 ++++++++-
 11 files changed, 267 insertions(+), 75 deletions(-)
 create mode 100644 CHANGES/6736.feature
 create mode 100644 CHANGES/6736.removal
 create mode 100644 pulpcore/app/migrations/0032_export_to_chunks.py

diff --git a/CHANGES/6736.feature b/CHANGES/6736.feature
new file mode 100644
index 0000000000..c9b0908af2
--- /dev/null
+++ b/CHANGES/6736.feature
@@ -0,0 +1 @@
+Taught export how to split the export file into chunks of no more than chunk_size bytes.
diff --git a/CHANGES/6736.removal b/CHANGES/6736.removal
new file mode 100644
index 0000000000..0032480375
--- /dev/null
+++ b/CHANGES/6736.removal
@@ -0,0 +1 @@
+Replaced the PulpExport filename/sha256 fields with output_file_info, a '<filename>': '<hash>' dictionary.
diff --git a/docs/workflows/import-export.rst b/docs/workflows/import-export.rst
index 6b0dbfd611..b84b9855db 100644
--- a/docs/workflows/import-export.rst
+++ b/docs/workflows/import-export.rst
@@ -210,6 +210,32 @@ accomplish this by setting the ``full`` parameter on the ``/exports/`` invocation:
 This results in an export of all content-entities, but only :term:`Artifacts` that have
 been **added** since the `last_export` of the same Exporter.
 
+Exporting Chunked Files
+-----------------------
+
+By default, PulpExport streams data into a single ``.tar.gz`` file. Since :term:`Repositories`
+can contain many artifacts and content units, this can result in a file too large to be
+copied to transport media. In that case, you can specify a maximum file size, and the
+export process will chunk the tar.gz into a series of files no larger than that limit.
+
+You accomplish this by setting the ``chunk_size`` parameter to the desired maximum number of
+bytes. This parameter takes an integer, or a size with units of B, KB, MB, GB, or TB.
+Files appear in the Exporter.path directory, with a four-digit sequence-number suffix::
+
+    http POST :/pulp/api/v3/exporters/core/pulp/1ddbe6bf-a6c3-4a88-8614-ad9511d21b94/exports/ chunk_size="10KB"
+    {
+        "task": "/pulp/api/v3/tasks/da3350f7-0102-4dd5-81e0-81becf3ffdc7/"
+    }
+    ls -l /tmp/exports/
+    total 76
+    10K export-780822a4-d280-4ed0-a53c-382a887576a6-20200522_2325.tar.gz.0000
+    10K export-780822a4-d280-4ed0-a53c-382a887576a6-20200522_2325.tar.gz.0001
+    10K export-780822a4-d280-4ed0-a53c-382a887576a6-20200522_2325.tar.gz.0002
+    10K export-780822a4-d280-4ed0-a53c-382a887576a6-20200522_2325.tar.gz.0003
+    10K export-780822a4-d280-4ed0-a53c-382a887576a6-20200522_2325.tar.gz.0004
+    10K export-780822a4-d280-4ed0-a53c-382a887576a6-20200522_2325.tar.gz.0005
+    2.3K export-780822a4-d280-4ed0-a53c-382a887576a6-20200522_2325.tar.gz.0006
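+
+If needed, the chunks can later be reassembled into the original tarfile. Because the
+sequence-number suffixes sort in order, a plain concatenation works; for example
+(the target filename here is illustrative)::
+
+    cat export-780822a4-d280-4ed0-a53c-382a887576a6-20200522_2325.tar.gz.* > export.tar.gz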
+
 Updating an Exporter
 --------------------
 
diff --git a/pulpcore/app/migrations/0032_export_to_chunks.py b/pulpcore/app/migrations/0032_export_to_chunks.py
new file mode 100644
index 0000000000..3f32a50663
--- /dev/null
+++ b/pulpcore/app/migrations/0032_export_to_chunks.py
@@ -0,0 +1,27 @@
+# Generated by Django 2.2.11 on 2020-05-22 18:31
+
+import django.contrib.postgres.fields.jsonb
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0031_import_export_validate_params'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='pulpexport',
+            name='filename',
+        ),
+        migrations.RemoveField(
+            model_name='pulpexport',
+            name='sha256',
+        ),
+        migrations.AddField(
+            model_name='pulpexport',
+            name='output_file_info',
+            field=django.contrib.postgres.fields.jsonb.JSONField(null=True),
+        ),
+    ]
diff --git a/pulpcore/app/models/exporter.py b/pulpcore/app/models/exporter.py
index 9bfef0e040..abf87a3a26 100644
--- a/pulpcore/app/models/exporter.py
+++ b/pulpcore/app/models/exporter.py
@@ -163,8 +163,8 @@ class PulpExport(Export):
     tarfile = None
     validated_versions = None
-    sha256 = models.CharField(max_length=64, null=True)
-    filename = models.CharField(max_length=4096, null=True)
+    validated_chunk_size = None
+    output_file_info = JSONField(null=True)
 
     def export_tarfile_path(self):
         """
diff --git a/pulpcore/app/serializers/exporter.py b/pulpcore/app/serializers/exporter.py
index baa96f1693..3401249750 100644
--- a/pulpcore/app/serializers/exporter.py
+++ b/pulpcore/app/serializers/exporter.py
@@ -1,4 +1,5 @@
 import os
+import re
 from gettext import gettext as _
 
 from rest_framework import serializers
@@ -109,12 +110,9 @@ class PulpExportSerializer(ExportSerializer):
     Serializer for PulpExports.
     """
 
-    sha256 = serializers.CharField(
-        help_text=_("The SHA-256 checksum of the exported .tar.gz."), read_only=True,
-    )
-
-    filename = serializers.CharField(
-        help_text=_("The full-path filename of the exported .tar.gz."), read_only=True,
+    output_file_info = serializers.JSONField(
+        help_text=_("Dictionary of filename: sha256-hash entries for the export output file(s)."),
+        read_only=True,
     )
 
     dry_run = serializers.BooleanField(
@@ -136,6 +134,15 @@ class PulpExportSerializer(ExportSerializer):
         write_only=True,
     )
 
+    chunk_size = serializers.CharField(
+        help_text=_(
+            "Chunk the export-tarfile into pieces of chunk_size bytes. "
+            + "Recognizes units of B/KB/MB/GB/TB."
+        ),
+        required=False,
+        write_only=True,
+    )
+
     def validate_versions(self, versions):
         """
         If specifying repo-versions explicitly, must provide a version for each exporter-repository
@@ -146,8 +153,8 @@ def validate_versions(self, versions):
         if num_repos != len(versions):
             raise serializers.ValidationError(
                 _(
-                    "Number of versions does not match the number of Repositories for the owning "
-                    + "Exporter!"
-                ).format(num_repos, len(versions))
+                    "Number of versions ({}) does not match the number of Repositories ({}) for "
+                    + "the owning Exporter!"
+                ).format(len(versions), num_repos)
             )
 
@@ -159,18 +166,41 @@ def validate_versions(self, versions):
                 _(
                     "Requested RepositoryVersions must belong to the Repositories named by the "
                     + "Exporter!"
-                ).format(exporter_repos, version_repos)
+                )
             )
 
         return versions
 
+    @staticmethod
+    def _parse_size(size):
+        try:
+            # based on https://stackoverflow.com/a/42865957/2002471
+            units = {"B": 1, "KB": 2 ** 10, "MB": 2 ** 20, "GB": 2 ** 30, "TB": 2 ** 40}
+            size = size.upper()
+            # insert a space ahead of the unit-suffix ("100KB" -> "100 KB") so that
+            # split() below can separate the number from the unit
+            if not re.match(r" ", size):
+                size = re.sub(r"([KMGT]?B)", r" \1", size)
+            number, unit = [string.strip() for string in size.split()]
+            return int(float(number) * units[unit])
+        except (ValueError, KeyError):
+            raise serializers.ValidationError(
+                _("chunk_size '{}' is not valid (valid units are B/KB/MB/GB/TB)").format(size)
+            )
+
+    def validate_chunk_size(self, chunk_size):
+        the_size = self._parse_size(chunk_size)
+        if the_size <= 0:
+            raise serializers.ValidationError(
+                _("Chunk size {} is not greater than zero!").format(the_size)
+            )
+        return the_size
+
     class Meta:
         model = models.PulpExport
         fields = ExportSerializer.Meta.fields + (
-            "sha256",
-            "filename",
             "full",
             "dry_run",
             "versions",
+            "chunk_size",
+            "output_file_info",
         )
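The size-parsing above can be exercised standalone. A minimal sketch of the same
logic (the parse_size/UNITS names are illustrative, not part of the patch)::

    import re

    UNITS = {"B": 1, "KB": 2 ** 10, "MB": 2 ** 20, "GB": 2 ** 30, "TB": 2 ** 40}

    def parse_size(size):
        """Turn strings like '100KB' or '2.4GB' into a byte-count."""
        size = size.upper()
        # "100KB" -> "100 KB", so the number and the unit can be split apart
        if not re.match(r" ", size):
            size = re.sub(r"([KMGT]?B)", r" \1", size)
        number, unit = [part.strip() for part in size.split()]
        return int(float(number) * UNITS[unit])

    assert parse_size("100B") == 100
    assert parse_size("100KB") == 100 * 1024
    assert parse_size("2.4GB") == int(2.4 * 1024 ** 3)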
diff --git a/pulpcore/app/tasks/export.py b/pulpcore/app/tasks/export.py
index efc3c399b7..b900683c00 100644
--- a/pulpcore/app/tasks/export.py
+++ b/pulpcore/app/tasks/export.py
@@ -1,10 +1,13 @@
 import hashlib
 import logging
 import os
+import subprocess
 import tarfile
 
 from distutils.util import strtobool
 from gettext import gettext as _
+from glob import glob
+from pathlib import Path
 from pkg_resources import get_distribution
 
 from pulpcore.app.models import (
@@ -132,7 +135,7 @@ def pulp_export(the_export):
 
     1) Spit out all Artifacts, ArtifactResource.json, and RepositoryResource.json
     2) Spit out all *resource JSONs in per-repo-version directories
-    3) Compute and store the sha256 and filename of the resulting tar.gz
+    3) Compute and store the sha256 and filename of the resulting tar.gz/chunks
 
     Args:
         the_export (models.PulpExport): PulpExport instance
@@ -141,62 +144,99 @@ def pulp_export(the_export):
         ValidationError: When path is not in the ALLOWED_EXPORT_PATHS setting,
             OR path exists and is not a directory
     """
+
     pulp_exporter = the_export.exporter
     the_export.task = Task.current()
 
     tarfile_fp = the_export.export_tarfile_path()
     os.makedirs(pulp_exporter.path, exist_ok=True)
+    rslts = {}
+
+    if the_export.validated_chunk_size:
+        # write it into chunks
+        with subprocess.Popen(
+            [
+                "split",
+                "-a",
+                "4",
+                "-b",
+                str(the_export.validated_chunk_size),
+                "-d",
+                "-",
+                tarfile_fp + ".",
+            ],
+            stdin=subprocess.PIPE,
+        ) as split_process:
+            with tarfile.open(tarfile_fp, "w|gz", fileobj=split_process.stdin) as tar:
+                _do_export(pulp_exporter, tar, the_export)
+
+        # compute the hashes
+        paths = [str(Path(p)) for p in glob(tarfile_fp + ".*")]
+        for a_file in paths:
+            a_hash = _compute_hash(a_file)
+            rslts[a_file] = a_hash
+    else:
+        # write into the file
+        with tarfile.open(tarfile_fp, "w:gz") as tar:
+            _do_export(pulp_exporter, tar, the_export)
+        # compute the hash
+        tarfile_hash = _compute_hash(tarfile_fp)
+        rslts[tarfile_fp] = tarfile_hash
+
+    # store the output-file/hash info
+    the_export.output_file_info = rslts
+    # save the export
+    the_export.save()
+    # mark it as 'last'
+    pulp_exporter.last_export = the_export
+    # save the exporter
+    pulp_exporter.save()
 
-    with tarfile.open(tarfile_fp, "w:gz") as tar:
-        the_export.tarfile = tar
-        CreatedResource.objects.create(content_object=the_export)
-        versions_to_export = _get_versions_to_export(pulp_exporter, the_export)
-        plugin_version_info = _get_versions_info(pulp_exporter)
-
-        do_incremental = _incremental_requested(the_export)
-
-        # list-of-previous-versions, or None
-        if do_incremental:
-            prev_versions = [
-                er.content_object
-                for er in ExportedResource.objects.filter(export=pulp_exporter.last_export).all()
-            ]
-        else:
-            prev_versions = None
-
-        vers_match = _version_match(versions_to_export, prev_versions)
-
-        # Gather up versions and artifacts
-        artifacts = []
-        for version in versions_to_export:
-            # Check version-content to make sure we're not being asked to export an on_demand repo
-            content_artifacts = ContentArtifact.objects.filter(content__in=version.content)
-            if content_artifacts.filter(artifact=None).exists():
-                RuntimeError(_("Remote artifacts cannot be exported."))
-
-            if do_incremental:
-                vers_artifacts = version.artifacts.difference(vers_match[version].artifacts).all()
-            else:
-                vers_artifacts = version.artifacts.all()
-            artifacts.extend(vers_artifacts)
-
-        # export plugin-version-info
-        export_versions(the_export, plugin_version_info)
-        # Export the top-level entities (artifacts and repositories)
-        # Note: we've already handled "what about incrementals" when building the 'artifacts' list
-        export_artifacts(the_export, artifacts)
-        # Export the repository-version data, per-version
-        for version in versions_to_export:
-            export_content(the_export, version)
-            ExportedResource.objects.create(export=the_export, content_object=version)
 
+def _compute_hash(filename):
     sha256_hash = hashlib.sha256()
-    with open(tarfile_fp, "rb") as f:
+    with open(filename, "rb") as f:
         # Read and update hash string value in blocks of 4K
         for byte_block in iter(lambda: f.read(4096), b""):
             sha256_hash.update(byte_block)
-    the_export.sha256 = sha256_hash.hexdigest()
-    the_export.filename = tarfile_fp
-    the_export.save()
-    pulp_exporter.last_export = the_export
-    pulp_exporter.save()
+    return sha256_hash.hexdigest()
+
+
+def _do_export(pulp_exporter, tar, the_export):
+    the_export.tarfile = tar
+    CreatedResource.objects.create(content_object=the_export)
+    versions_to_export = _get_versions_to_export(pulp_exporter, the_export)
+    plugin_version_info = _get_versions_info(pulp_exporter)
+    do_incremental = _incremental_requested(the_export)
+    # list-of-previous-versions, or None
+    if do_incremental:
+        prev_versions = [
+            er.content_object
+            for er in ExportedResource.objects.filter(export=pulp_exporter.last_export).all()
+        ]
+    else:
+        prev_versions = None
+    vers_match = _version_match(versions_to_export, prev_versions)
+    # Gather up versions and artifacts
+    artifacts = []
+    for version in versions_to_export:
+        # Check version-content to make sure we're not being asked to export
+        # an on_demand repo
        content_artifacts = ContentArtifact.objects.filter(content__in=version.content)
+        if content_artifacts.filter(artifact=None).exists():
+            raise RuntimeError(_("Remote artifacts cannot be exported."))
+
+        if do_incremental:
+            vers_artifacts = version.artifacts.difference(vers_match[version].artifacts).all()
+        else:
+            vers_artifacts = version.artifacts.all()
+        artifacts.extend(vers_artifacts)
+    # export plugin-version-info
+    export_versions(the_export, plugin_version_info)
+    # Export the top-level entities (artifacts and repositories)
+    # Note: we've already handled "what about incrementals" when building the 'artifacts' list
+    export_artifacts(the_export, artifacts)
+    # Export the repository-version data, per-version
+    for version in versions_to_export:
+        export_content(the_export, version)
+        ExportedResource.objects.create(export=the_export, content_object=version)
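The chunked path works by streaming the gzipped tar into coreutils' split through a
pipe, so a single oversized file is never written to disk. A self-contained sketch of
the same pattern (the paths and sample member are illustrative)::

    import subprocess
    import tarfile

    chunk_size = 10 * 1024  # 10KB, as in the docs example above
    base = "/tmp/exports/out.tar.gz"

    # split reads the tar-stream from stdin and writes base.0000, base.0001, ...,
    # each at most chunk_size bytes ("-a 4" = 4-char suffixes, "-d" = numeric)
    with subprocess.Popen(
        ["split", "-a", "4", "-b", str(chunk_size), "-d", "-", base + "."],
        stdin=subprocess.PIPE,
    ) as split_process:
        # "w|gz" writes a non-seekable gzip stream, which is what a pipe requires
        with tarfile.open(mode="w|gz", fileobj=split_process.stdin) as tar:
            tar.add("/etc/hostname")  # stand-in for the real export content
    # leaving the Popen context closes stdin and waits for split to exit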
diff --git a/pulpcore/app/viewsets/exporter.py b/pulpcore/app/viewsets/exporter.py
index 174bd406ce..db6e383271 100644
--- a/pulpcore/app/viewsets/exporter.py
+++ b/pulpcore/app/viewsets/exporter.py
@@ -127,6 +127,7 @@ def create(self, request, exporter_pk):
         # Invoke the export
         export = PulpExport.objects.create(exporter=exporter, params=request.data)
         export.validated_versions = serializer.validated_data.get("versions", None)
+        export.validated_chunk_size = serializer.validated_data.get("chunk_size", None)
 
         result = enqueue_with_reservation(pulp_export, [exporter], kwargs={"the_export": export})
 
diff --git a/pulpcore/tests/functional/api/using_plugin/test_pulpexport.py b/pulpcore/tests/functional/api/using_plugin/test_pulpexport.py
index 8b08b17c35..af01a5de1f 100644
--- a/pulpcore/tests/functional/api/using_plugin/test_pulpexport.py
+++ b/pulpcore/tests/functional/api/using_plugin/test_pulpexport.py
@@ -202,9 +202,10 @@ def test_export(self):
             export = self._gen_export(exporter)
             self.assertIsNotNone(export)
             self.assertEqual(len(exporter.repositories), len(export.exported_resources))
-            self.assertIsNotNone(export.filename)
-            self.assertIsNotNone(export.sha256)
-            self.assertFalse("//" in export.filename)
+            self.assertIsNotNone(export.output_file_info)
+            for an_export_filename in export.output_file_info.keys():
+                self.assertFalse("//" in an_export_filename)
+
         finally:
             self._delete_exporter(exporter)
 
@@ -352,3 +353,16 @@ def test_incremental(self):
             self._gen_export(exporter, body)
         finally:
             self._delete_exporter(exporter)
+
+    def test_chunking(self):
+        a_repo = self.repo_api.create(gen_repo())
+        self.addCleanup(self.client.delete, a_repo.pulp_href)
+        (exporter, body) = self._create_exporter(use_repos=[a_repo], cleanup=False)
+        try:
+            body = {"chunk_size": "250B"}
+            export = self._gen_export(exporter, body)
+            info = export.output_file_info
+            self.assertIsNotNone(info)
+            self.assertTrue(len(info) > 1)
+        finally:
+            self._delete_exporter(exporter)
diff --git a/pulpcore/tests/functional/api/using_plugin/test_pulpimport.py b/pulpcore/tests/functional/api/using_plugin/test_pulpimport.py
index ca92bee3be..93ef81457d 100644
--- a/pulpcore/tests/functional/api/using_plugin/test_pulpimport.py
+++ b/pulpcore/tests/functional/api/using_plugin/test_pulpimport.py
@@ -191,9 +191,8 @@ def test_import(self):
         importer = self.importer_api.create(body)
         self.addCleanup(self.importer_api.delete, importer.pulp_href)
 
-        import_response = self.imports_api.create(
-            importer.pulp_href, {"path": self.export.filename}
-        )
+        filenames = list(self.export.output_file_info.keys())
+        import_response = self.imports_api.create(importer.pulp_href, {"path": filenames[0]})
         monitor_task(import_response.task)
         task = self.client.get(import_response.task)
         resources = task["created_resources"]
diff --git a/pulpcore/tests/unit/serializers/test_pulpexport.py b/pulpcore/tests/unit/serializers/test_pulpexport.py
index 1eed973966..1ce4a79802 100644
--- a/pulpcore/tests/unit/serializers/test_pulpexport.py
+++ b/pulpcore/tests/unit/serializers/test_pulpexport.py
@@ -1,5 +1,4 @@
 from unittest import TestCase
-
 from pulpcore.app.serializers import PulpExportSerializer
 
 
@@ -20,14 +19,68 @@ def test_bad_params(self):
         self.assertFalse(serializer.is_valid())
 
     def test_read_only_params(self):
-        data = {"full": True, "dry_run": False, "sha256": "bar", "filename": "blech"}
+        data = {"full": True, "dry_run": False, "output_file_info": {"bar": "blech"}}
         serializer = PulpExportSerializer(data=data)
+        self.assertTrue(serializer.is_valid())
 
         with self.assertRaises(AttributeError):
-            serializer.sha256
+            serializer.output_file_info["bar"]
 
-        with self.assertRaises(AttributeError):
-            serializer.filename
+    def test_chunk_size(self):
+        # positive tests
+        # bytes
+        data = {"chunk_size": "100B"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertTrue(serializer.is_valid())
+        self.assertEqual(100, serializer.validated_data["chunk_size"])
 
-        with self.assertRaises(AttributeError):
-            serializer.sha256
+        # kilobytes
+        data = {"chunk_size": "100KB"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertTrue(serializer.is_valid())
+        self.assertEqual(100 * 1024, serializer.validated_data["chunk_size"])
+
+        # megabytes
+        data = {"chunk_size": "100MB"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertTrue(serializer.is_valid())
+        self.assertEqual(100 * 1024 * 1024, serializer.validated_data["chunk_size"])
+
+        # gigabytes
+        data = {"chunk_size": "100GB"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertTrue(serializer.is_valid())
+        self.assertEqual(100 * 1024 * 1024 * 1024, serializer.validated_data["chunk_size"])
+
+        # terabytes
+        data = {"chunk_size": "100TB"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertTrue(serializer.is_valid())
+        self.assertEqual(100 * 1024 * 1024 * 1024 * 1024, serializer.validated_data["chunk_size"])
+
+        # float-units
+        data = {"chunk_size": "2.4GB"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertTrue(serializer.is_valid())
+        self.assertEqual(int(2.4 * 1024 * 1024 * 1024), serializer.validated_data["chunk_size"])
+
+        # negative tests
+        # no units
+        data = {"chunk_size": "100"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertFalse(serializer.is_valid())
+
+        # not-a-number
+        data = {"chunk_size": "bazMB"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertFalse(serializer.is_valid())
+
+        # zero
+        data = {"chunk_size": "0GB"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertFalse(serializer.is_valid())
+
+        # negative
+        data = {"chunk_size": "-10KB"}
+        serializer = PulpExportSerializer(data=data)
+        self.assertFalse(serializer.is_valid())
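A consumer-side check of output_file_info might look like the following sketch
(the helper name is illustrative; the hashing mirrors _compute_hash() above)::

    import hashlib

    def verify_output_files(output_file_info):
        """Recompute each output file's sha256 and compare to the stored value."""
        for filename, expected in output_file_info.items():
            sha256_hash = hashlib.sha256()
            with open(filename, "rb") as f:
                # same 4K-block reading pattern as _compute_hash()
                for byte_block in iter(lambda: f.read(4096), b""):
                    sha256_hash.update(byte_block)
            assert sha256_hash.hexdigest() == expected, filename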