From a47f3b4d6e0287b69621a96f5e219132bb1220a5 Mon Sep 17 00:00:00 2001
From: Avery Pennarun
Date: Wed, 6 Jan 2010 00:19:11 -0500
Subject: [PATCH] split: Prevent memory drain from excessively long shalists.

This avoids huge RAM usage when you're splitting a really huge object, plus
git probably doesn't work too well with single trees that contain millions
of objects anyway.
---
 cmd-split.py | 5 +++++
 hashsplit.py | 21 ++++++++++++++++++++-
 test-sh | 11 ++++++++---
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/cmd-split.py b/cmd-split.py
index e7ca12299..5f4164621 100755
--- a/cmd-split.py
+++ b/cmd-split.py
@@ -17,6 +17,7 @@
 bench print benchmark timings to stderr
 max-pack-size= maximum bytes in a single pack
 max-pack-objects= maximum number of objects in a single pack
+fanout= maximum number of blobs in a single tree
 """
 o = options.Options('bup split', optspec)
 (opt, flags, extra) = o.parse(sys.argv[1:])
@@ -34,6 +35,10 @@
     hashsplit.max_pack_size = int(opt.max_pack_size)
 if opt.max_pack_objects:
     hashsplit.max_pack_objects = int(opt.max_pack_objects)
+if opt.fanout:
+    hashsplit.fanout = int(opt.fanout)
+if opt.blobs:
+    hashsplit.fanout = 0
 
 start_time = time.time()
 
diff --git a/hashsplit.py b/hashsplit.py
index 16f723f0d..6a928726f 100644
--- a/hashsplit.py
+++ b/hashsplit.py
@@ -8,6 +8,7 @@
 split_verbosely = 0
 max_pack_size = 1000*1000*1000
 max_pack_objects = 10*1000*1000
+fanout = 4096
 
 class Buf:
     def __init__(self):
@@ -122,8 +123,26 @@ def split_to_shalist(w, files):
         yield ('100644', 'bup.chunk.%016x' % cn, sha)
 
 
+def _next(i):
+    try:
+        return i.next()
+    except StopIteration:
+        return None
+
+
 def split_to_tree(w, files):
-    shalist = list(split_to_shalist(w, files))
+    sl = iter(split_to_shalist(w, files))
+    if not fanout:
+        shalist = list(sl)
+    else:
+        shalist = []
+        tmplist = []
+        for e in sl:
+            tmplist.append(e)
+            if len(tmplist) >= fanout and len(tmplist) >= 3:
+                shalist.append(('40000', tmplist[0][1], w.new_tree(tmplist)))
+                tmplist = []
+        shalist += tmplist
     tree = w.new_tree(shalist)
     return (shalist, tree)
 
diff --git a/test-sh b/test-sh
index 56ac25a9d..91fde5968 100755
--- a/test-sh
+++ b/test-sh
@@ -16,8 +16,13 @@ bup init
 bup split --bench -b tags1.tmp
 bup split -vvvv -b testfile2 >tags2.tmp
 bup split -t testfile2 >tags2t.tmp
+bup split -t testfile2 --fanout 3 >tags2tf.tmp
 bup split -r "$BUP_DIR" -c testfile2 >tags2c.tmp
 diff -u tags1.tmp tags2.tmp || true
+if diff -q tags2t.tmp tags2tf.tmp; then
+    echo "fanout tree same as non-fanout tree!?"
+    false
+fi
 wc -c testfile1 testfile2
 wc -l tags1.tmp tags2.tmp
 bup join $(cat tags1.tmp) >out1.tmp
@@ -32,11 +37,11 @@ diff -u testfile2 out2c.tmp
 (
     set -e
     cd "$BUP_DIR" || exit 1
-    git repack -Ad
-    git prune
+    #git repack -Ad
+    #git prune
     (cd "$TOP/t/sampledata" && bup save -vvn master .) || exit 1
     n=$(git fsck --full --strict 2>&1 |
-        grep -v 'dangling commit' |
+        egrep -v 'dangling (commit|tree)' |
         tee -a /dev/stderr |
         wc -l)
     if [ "$n" != 0 ]; then