From f01a001527f16c6e0551cd615b17ea258b9ccc05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lubom=C3=ADr=20Sedl=C3=A1=C5=99?= Date: Fri, 16 Aug 2019 12:00:03 +0200 Subject: [PATCH 1/2] Refactor splitting zip files When directory is given as input, simply pack it all and publish. When zip file is specified, it is repacked to create smaller archives. The tool tries to find the actual maven-repository files. If that succeeds, only those files are pushed (meaning any ancillary files are removed, e.g. licences, examples). Fixes: NEXUS-237 --- rcm_nexus/archive.py | 117 ++++++++++++++++++++---------- tests/test_archive_dir.py | 146 ++++++++++++++++---------------------- tests/test_archive_zip.py | 126 ++++++++++++++++---------------- 3 files changed, 204 insertions(+), 185 deletions(-) diff --git a/rcm_nexus/archive.py b/rcm_nexus/archive.py index 56e06fc..43d6b7c 100644 --- a/rcm_nexus/archive.py +++ b/rcm_nexus/archive.py @@ -5,51 +5,93 @@ MAX_COUNT = 1000 -MAX_SIZE = 1000000000 #1GB +MAX_SIZE = 10 ** 9 # 1GB OUT_ZIP_FORMAT = "part-%03d.zip" -def create_partitioned_zips(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): - if os.path.isdir(src) is True: - return create_partitioned_zips_from_dir(src, out_dir, max_count, max_size) - elif src.endswith('.zip') and os.path.exists(src): - return create_partitioned_zips_from_zip(src, out_dir, max_count, max_size) - else: - raise Exception("Invalid input: %s" % src) -def create_partitioned_zips_from_dir(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): +def create_partitioned_zips_from_dir( + src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE +): + """ + Given directory, create a set of zips that contain all files in there. No + filtering is done here. + """ zips = Zipper(out_dir, max_count, max_size) for (dirpath, dirnames, filenames) in os.walk(src): dir_skip = len(src) dirname = dirpath[dir_skip:] - if dirname.startswith('/'): + if dirname.startswith("/"): dirname = dirname[1:] for filename in filenames: path = os.path.join(dirpath, filename) entry_name = os.path.join(dirname, filename) - # print("Path: %s (uncompressed size: %s)" % (os.path.join(dirname, filename), os.path.getsize(path))) - with open(path, 'rb') as f: + with open(path, "rb") as f: zips.append(entry_name, os.path.getsize(path), lambda: f.read()) return zips.list() -def create_partitioned_zips_from_zip(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): + +def create_partitioned_zips_from_zip( + src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE +): + """ + Given a zip archive, split it into smaller chunks and possibly filter out + only some parts. + + The general structure is like this (given foo-1.0.0-maven-repository.zip): + + foo-1.0-maven-repository/ + foo-1.0-maven-repository/examples/ + foo-1.0-maven-repository/maven-repository/... + foo-1.0-maven-repository/licenses/ + foo-1.0-maven-repository/example-config.xml + + This function will look for a maven-subdirectory inside the top-level + directory and only repackage its contents. If there is no such + subdirectory, all content will be taken without any changes. + The top-level directory name does not matter at all. + """ zips = Zipper(out_dir, max_count, max_size) zf = zipfile.ZipFile(src) - for info in zf.infolist(): + repodir = None + + zip_objects = zf.infolist() + + # Find if there is a maven-repository subdir under top-level directory. + for info in zip_objects: + parts = info.filename.split("/") + if len(parts) < 3: + # Not a subdirectory of top-level dir or a file in there. + continue + if parts[1] == "maven-repository": + repodir = os.path.join(*parts[:2]) + "/" + break + + # Iterate over all objects in the directory. + for info in zip_objects: if info.filename.endswith("/") and info.file_size == 0: - # Skip directories + # Skip directories for this iteration. continue - # print("Path: %s (uncompressed size: %s)" % (info.filename, info.file_size)) - zips.append(info.filename, info.file_size, lambda: zf.read(info.filename) ) + filename = info.filename + # We found maven-repository subdirectory previously, only content from + # there should be taken. + if repodir: + if filename.startswith(repodir): + # It's in correct location, move to top-level. + filename = filename[len(repodir):] + else: + # Not correct location, ignore it. + continue + + zips.append(filename, info.file_size, lambda: zf.read(info.filename)) return zips.list() class Zipper(object): - def __init__(self, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): self.out_dir = out_dir self.max_count = max_count @@ -59,23 +101,27 @@ def __init__(self, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): self.counter = 0 self.zip = None - def append(self, filename, size, stream_func): - if self.zip is None or self.file_count >= self.max_count or self.file_size + size >= self.max_size: - if self.zip is not None: - self.zip.close() - self.counter += 1 - self.file_count = 0 - self.file_size = 0 - - self.zip = zipfile.ZipFile(os.path.join(self.out_dir, OUT_ZIP_FORMAT % self.counter), mode='w') + def should_rollover(self, size): + return ( + self.zip is None # No zip created yet + or self.file_count >= self.max_count # Maximum file count reached + or self.file_size + size >= self.max_size # Maximum size reached + ) - if '/' in filename: - filename_parts = filename.split('/') + def rollover(self): + if self.zip is not None: + self.zip.close() + self.counter += 1 + self.file_count = 0 + self.file_size = 0 - while filename_parts and "maven-repository" in filename_parts[0] and len(filename_parts) > 1: - del filename_parts[0] + self.zip = zipfile.ZipFile( + os.path.join(self.out_dir, OUT_ZIP_FORMAT % self.counter), mode="w" + ) - filename = "/".join(filename_parts) + def append(self, filename, size, stream_func): + if self.should_rollover(size): + self.rollover() self.zip.writestr(filename, stream_func()) self.file_count += 1 @@ -86,7 +132,6 @@ def close(self): self.zip.close() def list(self): - return sorted([os.path.join(self.out_dir, fname) for fname in os.listdir(self.out_dir)]) - - - + return sorted( + os.path.join(self.out_dir, fname) for fname in os.listdir(self.out_dir) + ) diff --git a/tests/test_archive_dir.py b/tests/test_archive_dir.py index cd4e3a1..3d7d8b2 100644 --- a/tests/test_archive_dir.py +++ b/tests/test_archive_dir.py @@ -3,90 +3,68 @@ from rcm_nexus import archive from .base import NexupBaseTest import tempfile -import os -from random import randint import zipfile -class ArchiveZipest(NexupBaseTest): - - def test_small(self): - self.load_words() - - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - - srcdir = tempfile.mkdtemp() - self.write_dir(srcdir, paths) - - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_dir(srcdir, outdir) - self.assertEqual(len(zips), 1) - - print(zips) - - z = zips[0] - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) - - - def test_trim_maven_dir(self): - self.load_words() - - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - maven_paths = ["maven-repository/%s" % path for path in paths] - - srcdir = tempfile.mkdtemp() - self.write_dir(srcdir, maven_paths) - - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_dir(srcdir, outdir) - self.assertEqual(len(zips), 1) - - print(zips) - z = zips[0] - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) - - - def test_count_rollover(self): - self.load_words() - - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - - srcdir = tempfile.mkdtemp() - self.write_dir(srcdir, paths) - - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_count=2) - self.assertEqual(len(zips), 2) - print(zips) - - for z in zips: - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) - - def test_size_rollover(self): - self.load_words() - - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - src = "This is a test of the system" - - srcdir = tempfile.mkdtemp() - self.write_dir(srcdir, paths, content=src) - - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_size=2*len(src) + 1) - self.assertEqual(len(zips), 2) - print(zips) - - for z in zips: - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) +class ArchiveZipest(NexupBaseTest): + def test_small(self): + self.load_words() + + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + + srcdir = tempfile.mkdtemp() + self.write_dir(srcdir, paths) + + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_dir(srcdir, outdir) + self.assertEqual(len(zips), 1) + + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + sorted(paths), + ) + + def test_count_rollover(self): + self.load_words() + + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + + srcdir = tempfile.mkdtemp() + self.write_dir(srcdir, paths) + + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_count=2) + self.assertEqual(len(zips), 2) + + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + paths[:2], + ) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()), + paths[2:], + ) + + def test_size_rollover(self): + self.load_words() + + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + src = "This is a test of the system" + + srcdir = tempfile.mkdtemp() + self.write_dir(srcdir, paths, content=src) + + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_dir( + srcdir, outdir, max_size=2 * len(src) + 1 + ) + self.assertEqual(len(zips), 2) + + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + paths[:2], + ) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()), + paths[2:], + ) diff --git a/tests/test_archive_zip.py b/tests/test_archive_zip.py index 8f25ade..db845b2 100644 --- a/tests/test_archive_zip.py +++ b/tests/test_archive_zip.py @@ -3,94 +3,90 @@ from rcm_nexus import archive from .base import NexupBaseTest import tempfile -import os -from random import randint import zipfile -class ArchiveZipest(NexupBaseTest): - - def test_small(self): - self.load_words() - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] +class ArchiveZipest(NexupBaseTest): + def test_small(self): + self.load_words() - (_f,src_zip) = tempfile.mkstemp(suffix='.zip') + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] - self.write_zip(src_zip, paths) + (_f, src_zip) = tempfile.mkstemp(suffix=".zip") - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_zip(src_zip, outdir) - self.assertEqual(len(zips), 1) + self.write_zip(src_zip, paths) - print(zips) + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_zip(src_zip, outdir) + self.assertEqual(len(zips), 1) - z = zips[0] - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + sorted(paths), + ) + def test_trim_maven_dir(self): + self.load_words() - def test_trim_maven_dir(self): - self.load_words() + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + maven_paths = ["top-level/maven-repository/%s" % path for path in paths] - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - maven_paths = ["maven-repository/%s" % path for path in paths] + (_f, src_zip) = tempfile.mkstemp(suffix=".zip") - (_f,src_zip) = tempfile.mkstemp(suffix='.zip') - - self.write_zip(src_zip, maven_paths) + self.write_zip(src_zip, maven_paths) - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_zip(src_zip, outdir) - self.assertEqual(len(zips), 1) + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_zip(src_zip, outdir) + self.assertEqual(len(zips), 1) - print(zips) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + sorted(paths), + ) - z = zips[0] - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) + def test_count_rollover(self): + self.load_words() + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] - def test_count_rollover(self): - self.load_words() + (_f, src_zip) = tempfile.mkstemp(suffix=".zip") - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] + self.write_zip(src_zip, paths) - (_f,src_zip) = tempfile.mkstemp(suffix='.zip') - - self.write_zip(src_zip, paths) + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_zip(src_zip, outdir, max_count=2) + self.assertEqual(len(zips), 2) - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_zip(src_zip, outdir, max_count=2) - self.assertEqual(len(zips), 2) - print(zips) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + paths[:2], + ) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()), + paths[2:], + ) - for z in zips: - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) + def test_size_rollover(self): + self.load_words() - def test_size_rollover(self): - self.load_words() + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + src = "This is a test of the system" - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - src = "This is a test of the system" + (_f, src_zip) = tempfile.mkstemp(suffix=".zip") - (_f,src_zip) = tempfile.mkstemp(suffix='.zip') - - self.write_zip(src_zip, paths, content=src) + self.write_zip(src_zip, paths, content=src) - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_zip(src_zip, outdir, max_size=2*len(src) + 1) - self.assertEqual(len(zips), 2) - print(zips) + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_zip( + src_zip, outdir, max_size=2 * len(src) + 1 + ) + self.assertEqual(len(zips), 2) - for z in zips: - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + paths[:2], + ) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()), + paths[2:], + ) From 6a55552cfde85f7573220a84e50db75c9456a409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lubom=C3=ADr=20Sedl=C3=A1=C5=99?= Date: Fri, 16 Aug 2019 12:02:16 +0200 Subject: [PATCH 2/2] Tweak tox.ini Increase line length at which warnings are generated, and make test-runner capture output. It will be printed on error, and it tends to drown important messages in lots of noise. --- tox.ini | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 1c053fa..b8b98f8 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,11 @@ [tox] envlist = py27,py36,py37 +[flake8] +max-line-length = 88 + [testenv] -commands = nosetests -s --with-coverage --cover-package=rcm_nexus --cover-html --cover-html-dir=coverage +commands = nosetests --with-coverage --cover-package=rcm_nexus --cover-html --cover-html-dir=coverage setenv = XDG_CACHE_HOME={envtmpdir}/ deps = .[test,ci]