Skip to content

Commit

Permalink
Merge pull request #19 from lubomir/zip-split
Browse files Browse the repository at this point in the history
Fix zip file splitting
  • Loading branch information
lubomir committed Aug 16, 2019
2 parents d36d5ae + 6a55552 commit d7c1bca
Show file tree
Hide file tree
Showing 4 changed files with 208 additions and 186 deletions.
117 changes: 81 additions & 36 deletions rcm_nexus/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,51 +5,93 @@


MAX_COUNT = 1000
MAX_SIZE = 1000000000 #1GB
MAX_SIZE = 10 ** 9 # 1GB
OUT_ZIP_FORMAT = "part-%03d.zip"

def create_partitioned_zips(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):
if os.path.isdir(src) is True:
return create_partitioned_zips_from_dir(src, out_dir, max_count, max_size)
elif src.endswith('.zip') and os.path.exists(src):
return create_partitioned_zips_from_zip(src, out_dir, max_count, max_size)
else:
raise Exception("Invalid input: %s" % src)

def create_partitioned_zips_from_dir(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):
def create_partitioned_zips_from_dir(
src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE
):
"""
Given directory, create a set of zips that contain all files in there. No
filtering is done here.
"""
zips = Zipper(out_dir, max_count, max_size)

for (dirpath, dirnames, filenames) in os.walk(src):
dir_skip = len(src)
dirname = dirpath[dir_skip:]
if dirname.startswith('/'):
if dirname.startswith("/"):
dirname = dirname[1:]

for filename in filenames:
path = os.path.join(dirpath, filename)
entry_name = os.path.join(dirname, filename)
# print("Path: %s (uncompressed size: %s)" % (os.path.join(dirname, filename), os.path.getsize(path)))
with open(path, 'rb') as f:
with open(path, "rb") as f:
zips.append(entry_name, os.path.getsize(path), lambda: f.read())

return zips.list()

def create_partitioned_zips_from_zip(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):

def create_partitioned_zips_from_zip(
src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE
):
"""
Given a zip archive, split it into smaller chunks and possibly filter out
only some parts.
The general structure is like this (given foo-1.0.0-maven-repository.zip):
foo-1.0-maven-repository/
foo-1.0-maven-repository/examples/
foo-1.0-maven-repository/maven-repository/...
foo-1.0-maven-repository/licenses/
foo-1.0-maven-repository/example-config.xml
This function will look for a maven-subdirectory inside the top-level
directory and only repackage its contents. If there is no such
subdirectory, all content will be taken without any changes.
The top-level directory name does not matter at all.
"""
zips = Zipper(out_dir, max_count, max_size)
zf = zipfile.ZipFile(src)
for info in zf.infolist():
repodir = None

zip_objects = zf.infolist()

# Find if there is a maven-repository subdir under top-level directory.
for info in zip_objects:
parts = info.filename.split("/")
if len(parts) < 3:
# Not a subdirectory of top-level dir or a file in there.
continue
if parts[1] == "maven-repository":
repodir = os.path.join(*parts[:2]) + "/"
break

# Iterate over all objects in the directory.
for info in zip_objects:
if info.filename.endswith("/") and info.file_size == 0:
# Skip directories
# Skip directories for this iteration.
continue

# print("Path: %s (uncompressed size: %s)" % (info.filename, info.file_size))
zips.append(info.filename, info.file_size, lambda: zf.read(info.filename) )
filename = info.filename
# We found maven-repository subdirectory previously, only content from
# there should be taken.
if repodir:
if filename.startswith(repodir):
# It's in correct location, move to top-level.
filename = filename[len(repodir):]
else:
# Not correct location, ignore it.
continue

zips.append(filename, info.file_size, lambda: zf.read(info.filename))

return zips.list()


class Zipper(object):

def __init__(self, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):
self.out_dir = out_dir
self.max_count = max_count
Expand All @@ -59,23 +101,27 @@ def __init__(self, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):
self.counter = 0
self.zip = None

def append(self, filename, size, stream_func):
if self.zip is None or self.file_count >= self.max_count or self.file_size + size >= self.max_size:
if self.zip is not None:
self.zip.close()
self.counter += 1
self.file_count = 0
self.file_size = 0

self.zip = zipfile.ZipFile(os.path.join(self.out_dir, OUT_ZIP_FORMAT % self.counter), mode='w')
def should_rollover(self, size):
return (
self.zip is None # No zip created yet
or self.file_count >= self.max_count # Maximum file count reached
or self.file_size + size >= self.max_size # Maximum size reached
)

if '/' in filename:
filename_parts = filename.split('/')
def rollover(self):
if self.zip is not None:
self.zip.close()
self.counter += 1
self.file_count = 0
self.file_size = 0

while filename_parts and "maven-repository" in filename_parts[0] and len(filename_parts) > 1:
del filename_parts[0]
self.zip = zipfile.ZipFile(
os.path.join(self.out_dir, OUT_ZIP_FORMAT % self.counter), mode="w"
)

filename = "/".join(filename_parts)
def append(self, filename, size, stream_func):
if self.should_rollover(size):
self.rollover()

self.zip.writestr(filename, stream_func())
self.file_count += 1
Expand All @@ -86,7 +132,6 @@ def close(self):
self.zip.close()

def list(self):
return sorted([os.path.join(self.out_dir, fname) for fname in os.listdir(self.out_dir)])



return sorted(
os.path.join(self.out_dir, fname) for fname in os.listdir(self.out_dir)
)
146 changes: 62 additions & 84 deletions tests/test_archive_dir.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,90 +3,68 @@
from rcm_nexus import archive
from .base import NexupBaseTest
import tempfile
import os
from random import randint
import zipfile

class ArchiveZipest(NexupBaseTest):

def test_small(self):
self.load_words()

paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt']

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir)
self.assertEqual(len(zips), 1)

print(zips)

z = zips[0]
zf = zipfile.ZipFile(z)
for info in zf.infolist():
print("%s contains: %s" % (z, info.filename))
self.assertEqual(info.filename in paths, True)


def test_trim_maven_dir(self):
self.load_words()

paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt']
maven_paths = ["maven-repository/%s" % path for path in paths]

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, maven_paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir)
self.assertEqual(len(zips), 1)

print(zips)

z = zips[0]
zf = zipfile.ZipFile(z)
for info in zf.infolist():
print("%s contains: %s" % (z, info.filename))
self.assertEqual(info.filename in paths, True)


def test_count_rollover(self):
self.load_words()

paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt']

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_count=2)
self.assertEqual(len(zips), 2)
print(zips)

for z in zips:
zf = zipfile.ZipFile(z)
for info in zf.infolist():
print("%s contains: %s" % (z, info.filename))
self.assertEqual(info.filename in paths, True)

def test_size_rollover(self):
self.load_words()

paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt']
src = "This is a test of the system"

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths, content=src)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_size=2*len(src) + 1)
self.assertEqual(len(zips), 2)
print(zips)

for z in zips:
zf = zipfile.ZipFile(z)
for info in zf.infolist():
print("%s contains: %s" % (z, info.filename))
self.assertEqual(info.filename in paths, True)
class ArchiveZipest(NexupBaseTest):
def test_small(self):
self.load_words()

paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"]

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir)
self.assertEqual(len(zips), 1)

self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()),
sorted(paths),
)

def test_count_rollover(self):
self.load_words()

paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"]

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_count=2)
self.assertEqual(len(zips), 2)

self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()),
paths[:2],
)
self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()),
paths[2:],
)

def test_size_rollover(self):
self.load_words()

paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"]
src = "This is a test of the system"

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths, content=src)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(
srcdir, outdir, max_size=2 * len(src) + 1
)
self.assertEqual(len(zips), 2)

self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()),
paths[:2],
)
self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()),
paths[2:],
)
Loading

0 comments on commit d7c1bca

Please sign in to comment.