diff --git a/rcm_nexus/archive.py b/rcm_nexus/archive.py index 56e06fc..43d6b7c 100644 --- a/rcm_nexus/archive.py +++ b/rcm_nexus/archive.py @@ -5,51 +5,93 @@ MAX_COUNT = 1000 -MAX_SIZE = 1000000000 #1GB +MAX_SIZE = 10 ** 9 # 1GB OUT_ZIP_FORMAT = "part-%03d.zip" -def create_partitioned_zips(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): - if os.path.isdir(src) is True: - return create_partitioned_zips_from_dir(src, out_dir, max_count, max_size) - elif src.endswith('.zip') and os.path.exists(src): - return create_partitioned_zips_from_zip(src, out_dir, max_count, max_size) - else: - raise Exception("Invalid input: %s" % src) -def create_partitioned_zips_from_dir(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): +def create_partitioned_zips_from_dir( + src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE +): + """ + Given directory, create a set of zips that contain all files in there. No + filtering is done here. + """ zips = Zipper(out_dir, max_count, max_size) for (dirpath, dirnames, filenames) in os.walk(src): dir_skip = len(src) dirname = dirpath[dir_skip:] - if dirname.startswith('/'): + if dirname.startswith("/"): dirname = dirname[1:] for filename in filenames: path = os.path.join(dirpath, filename) entry_name = os.path.join(dirname, filename) - # print("Path: %s (uncompressed size: %s)" % (os.path.join(dirname, filename), os.path.getsize(path))) - with open(path, 'rb') as f: + with open(path, "rb") as f: zips.append(entry_name, os.path.getsize(path), lambda: f.read()) return zips.list() -def create_partitioned_zips_from_zip(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): + +def create_partitioned_zips_from_zip( + src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE +): + """ + Given a zip archive, split it into smaller chunks and possibly filter out + only some parts. + + The general structure is like this (given foo-1.0.0-maven-repository.zip): + + foo-1.0-maven-repository/ + foo-1.0-maven-repository/examples/ + foo-1.0-maven-repository/maven-repository/... + foo-1.0-maven-repository/licenses/ + foo-1.0-maven-repository/example-config.xml + + This function will look for a maven-subdirectory inside the top-level + directory and only repackage its contents. If there is no such + subdirectory, all content will be taken without any changes. + The top-level directory name does not matter at all. + """ zips = Zipper(out_dir, max_count, max_size) zf = zipfile.ZipFile(src) - for info in zf.infolist(): + repodir = None + + zip_objects = zf.infolist() + + # Find if there is a maven-repository subdir under top-level directory. + for info in zip_objects: + parts = info.filename.split("/") + if len(parts) < 3: + # Not a subdirectory of top-level dir or a file in there. + continue + if parts[1] == "maven-repository": + repodir = os.path.join(*parts[:2]) + "/" + break + + # Iterate over all objects in the directory. + for info in zip_objects: if info.filename.endswith("/") and info.file_size == 0: - # Skip directories + # Skip directories for this iteration. continue - # print("Path: %s (uncompressed size: %s)" % (info.filename, info.file_size)) - zips.append(info.filename, info.file_size, lambda: zf.read(info.filename) ) + filename = info.filename + # We found maven-repository subdirectory previously, only content from + # there should be taken. + if repodir: + if filename.startswith(repodir): + # It's in correct location, move to top-level. + filename = filename[len(repodir):] + else: + # Not correct location, ignore it. + continue + + zips.append(filename, info.file_size, lambda: zf.read(info.filename)) return zips.list() class Zipper(object): - def __init__(self, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): self.out_dir = out_dir self.max_count = max_count @@ -59,23 +101,27 @@ def __init__(self, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE): self.counter = 0 self.zip = None - def append(self, filename, size, stream_func): - if self.zip is None or self.file_count >= self.max_count or self.file_size + size >= self.max_size: - if self.zip is not None: - self.zip.close() - self.counter += 1 - self.file_count = 0 - self.file_size = 0 - - self.zip = zipfile.ZipFile(os.path.join(self.out_dir, OUT_ZIP_FORMAT % self.counter), mode='w') + def should_rollover(self, size): + return ( + self.zip is None # No zip created yet + or self.file_count >= self.max_count # Maximum file count reached + or self.file_size + size >= self.max_size # Maximum size reached + ) - if '/' in filename: - filename_parts = filename.split('/') + def rollover(self): + if self.zip is not None: + self.zip.close() + self.counter += 1 + self.file_count = 0 + self.file_size = 0 - while filename_parts and "maven-repository" in filename_parts[0] and len(filename_parts) > 1: - del filename_parts[0] + self.zip = zipfile.ZipFile( + os.path.join(self.out_dir, OUT_ZIP_FORMAT % self.counter), mode="w" + ) - filename = "/".join(filename_parts) + def append(self, filename, size, stream_func): + if self.should_rollover(size): + self.rollover() self.zip.writestr(filename, stream_func()) self.file_count += 1 @@ -86,7 +132,6 @@ def close(self): self.zip.close() def list(self): - return sorted([os.path.join(self.out_dir, fname) for fname in os.listdir(self.out_dir)]) - - - + return sorted( + os.path.join(self.out_dir, fname) for fname in os.listdir(self.out_dir) + ) diff --git a/tests/test_archive_dir.py b/tests/test_archive_dir.py index cd4e3a1..3d7d8b2 100644 --- a/tests/test_archive_dir.py +++ b/tests/test_archive_dir.py @@ -3,90 +3,68 @@ from rcm_nexus import archive from .base import NexupBaseTest import tempfile -import os -from random import randint import zipfile -class ArchiveZipest(NexupBaseTest): - - def test_small(self): - self.load_words() - - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - - srcdir = tempfile.mkdtemp() - self.write_dir(srcdir, paths) - - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_dir(srcdir, outdir) - self.assertEqual(len(zips), 1) - - print(zips) - - z = zips[0] - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) - - - def test_trim_maven_dir(self): - self.load_words() - - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - maven_paths = ["maven-repository/%s" % path for path in paths] - - srcdir = tempfile.mkdtemp() - self.write_dir(srcdir, maven_paths) - - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_dir(srcdir, outdir) - self.assertEqual(len(zips), 1) - - print(zips) - z = zips[0] - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) - - - def test_count_rollover(self): - self.load_words() - - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - - srcdir = tempfile.mkdtemp() - self.write_dir(srcdir, paths) - - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_count=2) - self.assertEqual(len(zips), 2) - print(zips) - - for z in zips: - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) - - def test_size_rollover(self): - self.load_words() - - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - src = "This is a test of the system" - - srcdir = tempfile.mkdtemp() - self.write_dir(srcdir, paths, content=src) - - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_size=2*len(src) + 1) - self.assertEqual(len(zips), 2) - print(zips) - - for z in zips: - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) +class ArchiveZipest(NexupBaseTest): + def test_small(self): + self.load_words() + + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + + srcdir = tempfile.mkdtemp() + self.write_dir(srcdir, paths) + + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_dir(srcdir, outdir) + self.assertEqual(len(zips), 1) + + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + sorted(paths), + ) + + def test_count_rollover(self): + self.load_words() + + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + + srcdir = tempfile.mkdtemp() + self.write_dir(srcdir, paths) + + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_count=2) + self.assertEqual(len(zips), 2) + + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + paths[:2], + ) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()), + paths[2:], + ) + + def test_size_rollover(self): + self.load_words() + + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + src = "This is a test of the system" + + srcdir = tempfile.mkdtemp() + self.write_dir(srcdir, paths, content=src) + + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_dir( + srcdir, outdir, max_size=2 * len(src) + 1 + ) + self.assertEqual(len(zips), 2) + + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + paths[:2], + ) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()), + paths[2:], + ) diff --git a/tests/test_archive_zip.py b/tests/test_archive_zip.py index 8f25ade..db845b2 100644 --- a/tests/test_archive_zip.py +++ b/tests/test_archive_zip.py @@ -3,94 +3,90 @@ from rcm_nexus import archive from .base import NexupBaseTest import tempfile -import os -from random import randint import zipfile -class ArchiveZipest(NexupBaseTest): - - def test_small(self): - self.load_words() - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] +class ArchiveZipest(NexupBaseTest): + def test_small(self): + self.load_words() - (_f,src_zip) = tempfile.mkstemp(suffix='.zip') + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] - self.write_zip(src_zip, paths) + (_f, src_zip) = tempfile.mkstemp(suffix=".zip") - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_zip(src_zip, outdir) - self.assertEqual(len(zips), 1) + self.write_zip(src_zip, paths) - print(zips) + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_zip(src_zip, outdir) + self.assertEqual(len(zips), 1) - z = zips[0] - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + sorted(paths), + ) + def test_trim_maven_dir(self): + self.load_words() - def test_trim_maven_dir(self): - self.load_words() + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + maven_paths = ["top-level/maven-repository/%s" % path for path in paths] - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - maven_paths = ["maven-repository/%s" % path for path in paths] + (_f, src_zip) = tempfile.mkstemp(suffix=".zip") - (_f,src_zip) = tempfile.mkstemp(suffix='.zip') - - self.write_zip(src_zip, maven_paths) + self.write_zip(src_zip, maven_paths) - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_zip(src_zip, outdir) - self.assertEqual(len(zips), 1) + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_zip(src_zip, outdir) + self.assertEqual(len(zips), 1) - print(zips) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + sorted(paths), + ) - z = zips[0] - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) + def test_count_rollover(self): + self.load_words() + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] - def test_count_rollover(self): - self.load_words() + (_f, src_zip) = tempfile.mkstemp(suffix=".zip") - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] + self.write_zip(src_zip, paths) - (_f,src_zip) = tempfile.mkstemp(suffix='.zip') - - self.write_zip(src_zip, paths) + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_zip(src_zip, outdir, max_count=2) + self.assertEqual(len(zips), 2) - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_zip(src_zip, outdir, max_count=2) - self.assertEqual(len(zips), 2) - print(zips) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + paths[:2], + ) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()), + paths[2:], + ) - for z in zips: - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) + def test_size_rollover(self): + self.load_words() - def test_size_rollover(self): - self.load_words() + paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"] + src = "This is a test of the system" - paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt'] - src = "This is a test of the system" + (_f, src_zip) = tempfile.mkstemp(suffix=".zip") - (_f,src_zip) = tempfile.mkstemp(suffix='.zip') - - self.write_zip(src_zip, paths, content=src) + self.write_zip(src_zip, paths, content=src) - outdir = tempfile.mkdtemp() - zips = archive.create_partitioned_zips_from_zip(src_zip, outdir, max_size=2*len(src) + 1) - self.assertEqual(len(zips), 2) - print(zips) + outdir = tempfile.mkdtemp() + zips = archive.create_partitioned_zips_from_zip( + src_zip, outdir, max_size=2 * len(src) + 1 + ) + self.assertEqual(len(zips), 2) - for z in zips: - zf = zipfile.ZipFile(z) - for info in zf.infolist(): - print("%s contains: %s" % (z, info.filename)) - self.assertEqual(info.filename in paths, True) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()), + paths[:2], + ) + self.assertEqual( + sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()), + paths[2:], + ) diff --git a/tox.ini b/tox.ini index 1c053fa..b8b98f8 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,11 @@ [tox] envlist = py27,py36,py37 +[flake8] +max-line-length = 88 + [testenv] -commands = nosetests -s --with-coverage --cover-package=rcm_nexus --cover-html --cover-html-dir=coverage +commands = nosetests --with-coverage --cover-package=rcm_nexus --cover-html --cover-html-dir=coverage setenv = XDG_CACHE_HOME={envtmpdir}/ deps = .[test,ci]