Skip to content

Commit

Permalink
split: Prevent memory drain from excessively long shalists.
Browse files Browse the repository at this point in the history
This avoids huge RAM usage when you're splitting a really huge object, plus
git probably doesn't work too well with single trees that contain millions
of objects anyway.
  • Loading branch information
apenwarr committed Jan 6, 2010
1 parent eacc2d3 commit a47f3b4
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 4 deletions.
5 changes: 5 additions & 0 deletions cmd-split.py
Expand Up @@ -17,6 +17,7 @@
bench print benchmark timings to stderr
max-pack-size= maximum bytes in a single pack
max-pack-objects= maximum number of objects in a single pack
fanout= maximum number of blobs in a single tree
"""
o = options.Options('bup split', optspec)
(opt, flags, extra) = o.parse(sys.argv[1:])
Expand All @@ -34,6 +35,10 @@
hashsplit.max_pack_size = int(opt.max_pack_size)
if opt.max_pack_objects:
hashsplit.max_pack_objects = int(opt.max_pack_objects)
if opt.fanout:
hashsplit.fanout = int(opt.fanout)
if opt.blobs:
hashsplit.fanout = 0

start_time = time.time()

Expand Down
21 changes: 20 additions & 1 deletion hashsplit.py
Expand Up @@ -8,6 +8,7 @@
# Module-level tunables. The max_pack_* values and fanout are overridden by
# the 'bup split' command from its command-line options.
# NOTE(review): split_verbosely is presumably a verbosity level for progress
# output -- its users are not visible here, confirm before relying on this.
split_verbosely = 0
# Maximum bytes in a single pack.
max_pack_size = 1000*1000*1000
# Maximum number of objects in a single pack.
max_pack_objects = 10*1000*1000
# Maximum number of blobs in a single tree; a false value (0) disables
# grouping, so split_to_tree() puts every chunk directly in one flat tree.
fanout = 4096

class Buf:
def __init__(self):
Expand Down Expand Up @@ -122,8 +123,26 @@ def split_to_shalist(w, files):
yield ('100644', 'bup.chunk.%016x' % cn, sha)


def _next(i):
try:
return i.next()
except StopIteration:
return None


def split_to_tree(w, files):
    """Split `files` into chunks and store them under a single top-level tree.

    The chunk entries produced by split_to_shalist(w, files) are consumed
    lazily.  When the module-level `fanout` is non-zero, each run of
    `fanout` consecutive entries is written out as its own subtree via
    w.new_tree() and replaced in the top-level list by a single '40000'
    entry, so no single tree (and no in-memory list) grows without bound.
    When `fanout` is zero, all entries go directly into one flat tree.

    Returns a (shalist, tree) tuple: the list of top-level entries and the
    result of w.new_tree() for that list.
    """
    # Do not materialize the whole shalist up front: building the full list
    # before grouping would defeat the memory savings and run the split twice.
    sl = iter(split_to_shalist(w, files))
    if not fanout:
        shalist = list(sl)
    else:
        shalist = []
        tmplist = []
        for e in sl:
            tmplist.append(e)
            # A group is never flushed with fewer than three entries, even
            # if fanout has been set lower than that.
            if len(tmplist) >= fanout and len(tmplist) >= 3:
                # The subtree takes the name of its first entry.
                shalist.append(('40000', tmplist[0][1], w.new_tree(tmplist)))
                tmplist = []
        # Entries left over from an incomplete final group stay directly in
        # the top-level tree rather than getting a subtree of their own.
        shalist += tmplist
    tree = w.new_tree(shalist)
    return (shalist, tree)

Expand Down
11 changes: 8 additions & 3 deletions test-sh
Expand Up @@ -16,8 +16,13 @@ bup init
bup split --bench -b <testfile1 >tags1.tmp
bup split -vvvv -b testfile2 >tags2.tmp
bup split -t testfile2 >tags2t.tmp
bup split -t testfile2 --fanout 3 >tags2tf.tmp
bup split -r "$BUP_DIR" -c testfile2 >tags2c.tmp
diff -u tags1.tmp tags2.tmp || true
if diff -q tags2t.tmp tags2tf.tmp; then
echo "fanout tree same as non-fanout tree!?"
false
fi
wc -c testfile1 testfile2
wc -l tags1.tmp tags2.tmp
bup join $(cat tags1.tmp) >out1.tmp
Expand All @@ -32,11 +37,11 @@ diff -u testfile2 out2c.tmp
(
set -e
cd "$BUP_DIR" || exit 1
git repack -Ad
git prune
#git repack -Ad
#git prune
(cd "$TOP/t/sampledata" && bup save -vvn master .) || exit 1
n=$(git fsck --full --strict 2>&1 |
grep -v 'dangling commit' |
egrep -v 'dangling (commit|tree)' |
tee -a /dev/stderr |
wc -l)
if [ "$n" != 0 ]; then
Expand Down

0 comments on commit a47f3b4

Please sign in to comment.