Skip to content

Commit

Permalink
Merge pull request #3290 from aawsome/prune-handle-duplicates
Browse files Browse the repository at this point in the history
prune: Handle duplicate blobs more efficiently
  • Loading branch information
MichaelEischer committed Jul 17, 2022
2 parents 6cbeb4a + 715d457 commit d9ea1e9
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 44 deletions.
12 changes: 12 additions & 0 deletions changelog/unreleased/issue-3114
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Enhancement: Optimize handling of duplicate blobs in `prune`

Restic `prune` always used to repack all data files containing duplicate
blobs. This effectively removed all duplicates during prune. However, as a
consequence all these data files were repacked even if the unused repository
space threshold could be reached with less work.

This is now changed and `prune` works nice and fast also if there are lots
of duplicates.

https://github.com/restic/restic/issues/3114
https://github.com/restic/restic/pull/3290
126 changes: 82 additions & 44 deletions cmd/restic/cmd_prune.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,12 @@ func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.R
}

type packInfo struct {
usedBlobs uint
unusedBlobs uint
duplicateBlobs uint
usedSize uint64
unusedSize uint64
tpe restic.BlobType
uncompressed bool
usedBlobs uint
unusedBlobs uint
usedSize uint64
unusedSize uint64
tpe restic.BlobType
uncompressed bool
}

type packInfoWithID struct {
Expand Down Expand Up @@ -243,7 +242,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
Verbosef("searching used packs...\n")

keepBlobs := restic.NewBlobSet()
duplicateBlobs := restic.NewBlobSet()
duplicateBlobs := make(map[restic.BlobHandle]uint8)

// iterate over all blobs in index to find out which blobs are duplicates
for blob := range repo.Index().Each(ctx) {
Expand All @@ -256,7 +255,16 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
stats.size.used += size
stats.blobs.used++
case keepBlobs.Has(bh): // duplicate blob
duplicateBlobs.Insert(bh)
count, ok := duplicateBlobs[bh]
if !ok {
count = 2 // this one is already the second blob!
} else if count < math.MaxUint8 {
// don't overflow, but saturate count at 255
// this can lead to a non-optimal pack selection, but won't cause
// problems otherwise
count++
}
duplicateBlobs[bh] = count
stats.size.duplicate += size
stats.blobs.duplicate++
default:
Expand Down Expand Up @@ -299,10 +307,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

bh := blob.BlobHandle
size := uint64(blob.Length)
_, isDuplicate := duplicateBlobs[bh]
switch {
case duplicateBlobs.Has(bh): // duplicate blob
ip.usedSize += size
ip.duplicateBlobs++
case isDuplicate: // duplicate blobs will be handled later
case keepBlobs.Has(bh): // used blob, not duplicate
ip.usedSize += size
ip.usedBlobs++
Expand All @@ -317,21 +324,49 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
indexPack[blob.PackID] = ip
}

// if duplicate blobs exist, those will be set to either "used" or "unused":
// - mark only one occurence of duplicate blobs as used
// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
if len(duplicateBlobs) > 0 {
// iterate again over all blobs in index (this is pretty cheap, all in-mem)
for blob := range repo.Index().Each(ctx) {
bh := blob.BlobHandle
count, isDuplicate := duplicateBlobs[bh]
if !isDuplicate {
continue
}

ip := indexPack[blob.PackID]
size := uint64(blob.Length)
switch {
case count == 0:
// used duplicate exists -> mark as unused
ip.unusedSize += size
ip.unusedBlobs++
case ip.usedBlobs > 0, count == 1:
// other used blobs in pack or "last" occurency -> mark as used
ip.usedSize += size
ip.usedBlobs++
// let other occurences be marked as unused
duplicateBlobs[bh] = 0
default:
// mark as unused and decrease counter
ip.unusedSize += size
ip.unusedBlobs++
duplicateBlobs[bh] = count - 1
}
// update indexPack
indexPack[blob.PackID] = ip
}
}

Verbosef("collecting packs for deletion and repacking\n")
removePacksFirst := restic.NewIDSet()
removePacks := restic.NewIDSet()
repackPacks := restic.NewIDSet()

var repackCandidates []packInfoWithID
repackAllPacksWithDuplicates := true

keep := func(p packInfo) {
stats.packs.keep++
if p.duplicateBlobs > 0 {
repackAllPacksWithDuplicates = false
}
}

repoVersion := repo.Config().Version

// loop over all packs and decide what to do
Expand All @@ -346,8 +381,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
return nil
}

if p.unusedSize+p.usedSize != uint64(packSize) &&
!(p.usedBlobs == 0 && p.duplicateBlobs == 0) {
if p.unusedSize+p.usedSize != uint64(packSize) && p.usedBlobs != 0 {
// Pack size does not fit and pack is needed => error
// If the pack is not needed, this is no error, the pack can
// and will be simply removed, see below.
Expand All @@ -358,7 +392,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

// statistics
switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
case p.usedBlobs == 0:
stats.packs.unused++
case p.unusedBlobs == 0:
stats.packs.used++
Expand All @@ -377,19 +411,19 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

// decide what to do
switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
case p.usedBlobs == 0:
// All blobs in pack are no longer used => remove pack!
removePacks.Insert(id)
stats.blobs.remove += p.unusedBlobs
stats.size.remove += p.unusedSize

case opts.RepackCachableOnly && p.tpe == restic.DataBlob:
// if this is a data pack and --repack-cacheable-only is set => keep pack!
keep(p)
stats.packs.keep++

case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
// All blobs in pack are used and not duplicates/mixed => keep pack!
keep(p)
case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
// All blobs in pack are used and not mixed => keep pack!
stats.packs.keep++

default:
// all other packs are candidates for repacking
Expand All @@ -410,7 +444,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// missing packs that are not needed can be ignored
ignorePacks := restic.NewIDSet()
for id, p := range indexPack {
if p.usedBlobs == 0 && p.duplicateBlobs == 0 {
if p.usedBlobs == 0 {
ignorePacks.Insert(id)
stats.blobs.remove += p.unusedBlobs
stats.size.remove += p.unusedSize
Expand Down Expand Up @@ -439,15 +473,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// This is equivalent to sorting by unused / total space.
// Instead of unused[i] / used[i] > unused[j] / used[j] we use
// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
// Morover duplicates and packs containing trees are sorted to the beginning
// Morover packs containing trees are sorted to the beginning
sort.Slice(repackCandidates, func(i, j int) bool {
pi := repackCandidates[i].packInfo
pj := repackCandidates[j].packInfo
switch {
case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
return true
case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
return false
case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob:
return true
case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
Expand All @@ -458,7 +488,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

repack := func(id restic.ID, p packInfo) {
repackPacks.Insert(id)
stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs
stats.blobs.repack += p.unusedBlobs + p.usedBlobs
stats.size.repack += p.unusedSize + p.usedSize
stats.blobs.repackrm += p.unusedBlobs
stats.size.repackrm += p.unusedSize
Expand All @@ -470,25 +500,33 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

switch {
case reachedRepackSize:
keep(p.packInfo)
stats.packs.keep++

case p.duplicateBlobs > 0, p.tpe != restic.DataBlob, p.uncompressed:
// repacking duplicates/non-data/uncompressed-trees is only limited by repackSize
case p.tpe != restic.DataBlob, p.uncompressed:
// repacking non-data packs / uncompressed-trees is only limited by repackSize
repack(p.ID, p.packInfo)

case reachedUnusedSizeAfter:
// for all other packs stop repacking if tolerated unused size is reached.
keep(p.packInfo)
stats.packs.keep++

default:
repack(p.ID, p.packInfo)
}
}

// if all duplicates are repacked, print out correct statistics
if repackAllPacksWithDuplicates {
stats.blobs.repackrm += stats.blobs.duplicate
stats.size.repackrm += stats.size.duplicate
if len(repackPacks) != 0 {
// when repacking, we do not want to keep blobs which are
// already contained in kept packs, so delete them from keepBlobs
for blob := range repo.Index().Each(ctx) {
if removePacks.Has(blob.PackID) || repackPacks.Has(blob.PackID) {
continue
}
keepBlobs.Delete(blob.BlobHandle)
}
} else {
// keepBlobs is only needed if packs are repacked
keepBlobs = nil
}

Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))
Expand Down

0 comments on commit d9ea1e9

Please sign in to comment.