Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix zip file splitting #19

Merged
merged 2 commits into from
Aug 16, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 81 additions & 36 deletions rcm_nexus/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,51 +5,93 @@


MAX_COUNT = 1000
MAX_SIZE = 1000000000 #1GB
MAX_SIZE = 10 ** 9 # 1GB
OUT_ZIP_FORMAT = "part-%03d.zip"

def create_partitioned_zips(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):
if os.path.isdir(src) is True:
return create_partitioned_zips_from_dir(src, out_dir, max_count, max_size)
elif src.endswith('.zip') and os.path.exists(src):
return create_partitioned_zips_from_zip(src, out_dir, max_count, max_size)
else:
raise Exception("Invalid input: %s" % src)

def create_partitioned_zips_from_dir(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):
def create_partitioned_zips_from_dir(
src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE
):
"""
Given directory, create a set of zips that contain all files in there. No
filtering is done here.
"""
zips = Zipper(out_dir, max_count, max_size)

for (dirpath, dirnames, filenames) in os.walk(src):
dir_skip = len(src)
dirname = dirpath[dir_skip:]
if dirname.startswith('/'):
if dirname.startswith("/"):
dirname = dirname[1:]

for filename in filenames:
path = os.path.join(dirpath, filename)
entry_name = os.path.join(dirname, filename)
# print("Path: %s (uncompressed size: %s)" % (os.path.join(dirname, filename), os.path.getsize(path)))
with open(path, 'rb') as f:
with open(path, "rb") as f:
zips.append(entry_name, os.path.getsize(path), lambda: f.read())

return zips.list()

def create_partitioned_zips_from_zip(src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):

def create_partitioned_zips_from_zip(
src, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE
):
"""
Given a zip archive, split it into smaller chunks and possibly filter out
only some parts.

The general structure is like this (given foo-1.0.0-maven-repository.zip):

foo-1.0-maven-repository/
foo-1.0-maven-repository/examples/
foo-1.0-maven-repository/maven-repository/...
foo-1.0-maven-repository/licenses/
foo-1.0-maven-repository/example-config.xml

This function will look for a maven-subdirectory inside the top-level
directory and only repackage its contents. If there is no such
subdirectory, all content will be taken without any changes.
The top-level directory name does not matter at all.
"""
zips = Zipper(out_dir, max_count, max_size)
zf = zipfile.ZipFile(src)
for info in zf.infolist():
repodir = None

zip_objects = zf.infolist()

# Find if there is a maven-repository subdir under top-level directory.
for info in zip_objects:
parts = info.filename.split("/")
if len(parts) < 3:
# Not a subdirectory of top-level dir or a file in there.
continue
if parts[1] == "maven-repository":
repodir = os.path.join(*parts[:2]) + "/"
break

# Iterate over all objects in the directory.
for info in zip_objects:
if info.filename.endswith("/") and info.file_size == 0:
# Skip directories
# Skip directories for this iteration.
continue

# print("Path: %s (uncompressed size: %s)" % (info.filename, info.file_size))
zips.append(info.filename, info.file_size, lambda: zf.read(info.filename) )
filename = info.filename
# We found maven-repository subdirectory previously, only content from
# there should be taken.
if repodir:
if filename.startswith(repodir):
# It's in correct location, move to top-level.
filename = filename[len(repodir):]
else:
# Not correct location, ignore it.
continue

zips.append(filename, info.file_size, lambda: zf.read(info.filename))

return zips.list()


class Zipper(object):

def __init__(self, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):
self.out_dir = out_dir
self.max_count = max_count
Expand All @@ -59,23 +101,27 @@ def __init__(self, out_dir, max_count=MAX_COUNT, max_size=MAX_SIZE):
self.counter = 0
self.zip = None

def append(self, filename, size, stream_func):
if self.zip is None or self.file_count >= self.max_count or self.file_size + size >= self.max_size:
if self.zip is not None:
self.zip.close()
self.counter += 1
self.file_count = 0
self.file_size = 0

self.zip = zipfile.ZipFile(os.path.join(self.out_dir, OUT_ZIP_FORMAT % self.counter), mode='w')
def should_rollover(self, size):
return (
self.zip is None # No zip created yet
or self.file_count >= self.max_count # Maximum file count reached
or self.file_size + size >= self.max_size # Maximum size reached
)

if '/' in filename:
filename_parts = filename.split('/')
def rollover(self):
if self.zip is not None:
self.zip.close()
self.counter += 1
self.file_count = 0
self.file_size = 0

while filename_parts and "maven-repository" in filename_parts[0] and len(filename_parts) > 1:
del filename_parts[0]
self.zip = zipfile.ZipFile(
os.path.join(self.out_dir, OUT_ZIP_FORMAT % self.counter), mode="w"
)

filename = "/".join(filename_parts)
def append(self, filename, size, stream_func):
if self.should_rollover(size):
self.rollover()

self.zip.writestr(filename, stream_func())
self.file_count += 1
Expand All @@ -86,7 +132,6 @@ def close(self):
self.zip.close()

def list(self):
return sorted([os.path.join(self.out_dir, fname) for fname in os.listdir(self.out_dir)])



return sorted(
os.path.join(self.out_dir, fname) for fname in os.listdir(self.out_dir)
)
146 changes: 62 additions & 84 deletions tests/test_archive_dir.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,90 +3,68 @@
from rcm_nexus import archive
from .base import NexupBaseTest
import tempfile
import os
from random import randint
import zipfile

class ArchiveZipest(NexupBaseTest):

def test_small(self):
self.load_words()

paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt']

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir)
self.assertEqual(len(zips), 1)

print(zips)

z = zips[0]
zf = zipfile.ZipFile(z)
for info in zf.infolist():
print("%s contains: %s" % (z, info.filename))
self.assertEqual(info.filename in paths, True)


def test_trim_maven_dir(self):
self.load_words()

paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt']
maven_paths = ["maven-repository/%s" % path for path in paths]

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, maven_paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir)
self.assertEqual(len(zips), 1)

print(zips)

z = zips[0]
zf = zipfile.ZipFile(z)
for info in zf.infolist():
print("%s contains: %s" % (z, info.filename))
self.assertEqual(info.filename in paths, True)


def test_count_rollover(self):
self.load_words()

paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt']

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_count=2)
self.assertEqual(len(zips), 2)
print(zips)

for z in zips:
zf = zipfile.ZipFile(z)
for info in zf.infolist():
print("%s contains: %s" % (z, info.filename))
self.assertEqual(info.filename in paths, True)

def test_size_rollover(self):
self.load_words()

paths = ['path/one.txt', 'path/to/two.txt', 'path/to/stuff/three.txt']
src = "This is a test of the system"

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths, content=src)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_size=2*len(src) + 1)
self.assertEqual(len(zips), 2)
print(zips)

for z in zips:
zf = zipfile.ZipFile(z)
for info in zf.infolist():
print("%s contains: %s" % (z, info.filename))
self.assertEqual(info.filename in paths, True)
class ArchiveZipest(NexupBaseTest):
def test_small(self):
self.load_words()

paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"]

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir)
self.assertEqual(len(zips), 1)

self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()),
sorted(paths),
)

def test_count_rollover(self):
self.load_words()

paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"]

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(srcdir, outdir, max_count=2)
self.assertEqual(len(zips), 2)

self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()),
paths[:2],
)
self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()),
paths[2:],
)

def test_size_rollover(self):
self.load_words()

paths = ["path/one.txt", "path/to/two.txt", "path/to/stuff/three.txt"]
src = "This is a test of the system"

srcdir = tempfile.mkdtemp()
self.write_dir(srcdir, paths, content=src)

outdir = tempfile.mkdtemp()
zips = archive.create_partitioned_zips_from_dir(
srcdir, outdir, max_size=2 * len(src) + 1
)
self.assertEqual(len(zips), 2)

self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[0]).infolist()),
paths[:2],
)
self.assertEqual(
sorted(info.filename for info in zipfile.ZipFile(zips[1]).infolist()),
paths[2:],
)
Loading