Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prune repack threshold #1994

Closed
wants to merge 11 commits into from
16 changes: 16 additions & 0 deletions changelog/unreleased/issue-1985
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Enhancement: Add adjustment for re-writing packs during prune

We've added the `--repack-threshold` flag to the `prune` command which
allows the user to specify the maximum percentage of unused space to
allow in a pack before prune will rewrite that pack. This gives the user
some control over the read/write activity that occurs on the backend, which
is important for those backends with metered transactions.

Setting the value of this new flag to zero causes prune to re-write
packs with any amount of unused data (previous behavior). Setting the
value to 100 causes prune to only re-write packs that are completely
unused. The flag defaults to 20, which means that packs with less than
20% unused data are not re-written while packs with 20% or more
unused data are re-written.

https://github.com/restic/restic/issues/1985
3 changes: 2 additions & 1 deletion cmd/restic/cmd_forget.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,8 @@ func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error {
if removeSnapshots > 0 && opts.Prune {
Verbosef("%d snapshots have been removed, running prune\n", removeSnapshots)
if !opts.DryRun {
return pruneRepository(gopts, repo)
pruneOptions := PruneOptions{RepackThreshold: DefaultRepackThreshold}
return pruneRepository(pruneOptions, gopts, repo)
}
}

Expand Down
179 changes: 93 additions & 86 deletions cmd/restic/cmd_prune.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ import (
"github.com/spf13/cobra"
)

// DefaultRepackThreshold is the default value for --repack-threshold when the
// flag is not specified. It is a percentage: a pack's unused-space percentage
// is compared against this value to decide whether prune rewrites the pack.
const DefaultRepackThreshold int = 20
fd0 marked this conversation as resolved.
Show resolved Hide resolved

var cmdPrune = &cobra.Command{
Use: "prune [flags]",
Short: "Remove unneeded data from the repository",
Expand All @@ -22,12 +25,24 @@ referenced and therefore not needed any more.
`,
DisableAutoGenTag: true,
RunE: func(cmd *cobra.Command, args []string) error {
return runPrune(globalOptions)
return runPrune(pruneOptions, globalOptions)
},
}

// PruneOptions collects all options for the prune command.
type PruneOptions struct {
// RepackThreshold is the value of the --repack-threshold flag: the
// percentage of unused space a pack must reach before prune rebuilds it.
RepackThreshold int
}

var pruneOptions PruneOptions

// init registers the prune command on the root command and declares its
// command-line flags.
func init() {
cmdRoot.AddCommand(cmdPrune)

f := cmdPrune.Flags()
// The empty shorthand string means --repack-threshold has no one-letter alias;
// the backquoted `n` in the usage text names the flag's value placeholder.
f.IntVarP(&pruneOptions.RepackThreshold, "repack-threshold", "", DefaultRepackThreshold, "only rebuild packs with at least `n`% unused space")

// NOTE(review): presumably keeps flags in declaration order in help output
// instead of sorting them — confirm against the pflag documentation.
f.SortFlags = false
}

func shortenStatus(maxLength int, s string) string {
Expand Down Expand Up @@ -70,7 +85,7 @@ func newProgressMax(show bool, max uint64, description string) *restic.Progress
return p
}

func runPrune(gopts GlobalOptions) error {
func runPrune(opts PruneOptions, gopts GlobalOptions) error {
repo, err := OpenRepository(gopts)
if err != nil {
return err
Expand All @@ -82,7 +97,7 @@ func runPrune(gopts GlobalOptions) error {
return err
}

return pruneRepository(gopts, repo)
return pruneRepository(opts, gopts, repo)
}

func mixedBlobs(list []restic.Blob) bool {
Expand All @@ -104,7 +119,7 @@ func mixedBlobs(list []restic.Blob) bool {
return false
}

func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
func pruneRepository(opts PruneOptions, gopts GlobalOptions, repo restic.Repository) error {
ctx := gopts.ctx

err := repo.LoadIndex(ctx)
Expand All @@ -113,24 +128,29 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
}

var stats struct {
blobs int
packs int
snapshots int
bytes int64
totalFiles int
totalPacks int
totalBlobs int
totalBytes uint64
snapshots int
usedBlobs int
duplicateBlobs int
duplicateBytes uint64
remainingBytes uint64
removeBytes uint64
}

Verbosef("counting files in repo\n")
err = repo.List(ctx, restic.DataFile, func(restic.ID, int64) error {
stats.packs++
stats.totalFiles++
return nil
})
if err != nil {
return err
}

Verbosef("building new index for repo\n")

bar := newProgressMax(!gopts.Quiet, uint64(stats.packs), "packs")
bar := newProgressMax(!gopts.Quiet, uint64(stats.totalFiles), "packs")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you meant stats.totalPacks here.

idx, invalidFiles, err := index.New(ctx, repo, restic.NewIDSet(), bar)
if err != nil {
return err
Expand All @@ -140,50 +160,38 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
Warnf("incomplete pack file (will be removed): %v\n", id)
}

blobs := 0
blobCounts := make(map[restic.BlobHandle]int)
for _, pack := range idx.Packs {
stats.bytes += pack.Size
blobs += len(pack.Entries)
}
Verbosef("repository contains %v packs (%v blobs) with %v\n",
len(idx.Packs), blobs, formatBytes(uint64(stats.bytes)))

blobCount := make(map[restic.BlobHandle]int)
var duplicateBlobs uint64
var duplicateBytes uint64

// find duplicate blobs
for _, p := range idx.Packs {
for _, entry := range p.Entries {
stats.blobs++
h := restic.BlobHandle{ID: entry.ID, Type: entry.Type}
blobCount[h]++

if blobCount[h] > 1 {
duplicateBlobs++
duplicateBytes += uint64(entry.Length)
stats.totalPacks++
stats.totalBytes += uint64(pack.Size)
for _, blob := range pack.Entries {
stats.totalBlobs++
h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
blobCounts[h]++
if blobCounts[h] > 1 {
stats.duplicateBlobs++
stats.duplicateBytes += uint64(blob.Length)
stats.removeBytes += uint64(blob.Length)
}
}
}
Verbosef("repository contains %v packs (%v blobs) with %v\n",
stats.totalPacks, stats.totalBlobs, formatBytes(stats.totalBytes))
Verbosef("found %d duplicate blobs, %v duplicate\n",
stats.duplicateBlobs, formatBytes(stats.duplicateBytes))

Verbosef("processed %d blobs: %d duplicate blobs, %v duplicate\n",
stats.blobs, duplicateBlobs, formatBytes(uint64(duplicateBytes)))
Verbosef("load all snapshots\n")

// find referenced blobs
snapshots, err := restic.LoadAllSnapshots(ctx, repo)
if err != nil {
return err
}

stats.snapshots = len(snapshots)

Verbosef("find data that is still in use for %d snapshots\n", stats.snapshots)

usedBlobs := restic.NewBlobSet()
seenBlobs := restic.NewBlobSet()

bar = newProgressMax(!gopts.Quiet, uint64(len(snapshots)), "snapshots")
bar = newProgressMax(!gopts.Quiet, uint64(stats.snapshots), "snapshots")
bar.Start()
for _, sn := range snapshots {
debug.Log("process snapshot %v", sn.ID())
Expand All @@ -193,83 +201,80 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
if repo.Backend().IsNotExist(err) {
return errors.Fatal("unable to load a tree from the repo: " + err.Error())
}

return err
}

debug.Log("processed snapshot %v", sn.ID())
bar.Report(restic.Stat{Blobs: 1})
}
bar.Done()
stats.usedBlobs = len(usedBlobs)

Verbosef("found %d of %d data blobs still in use, %d blobs unused\n",
stats.usedBlobs, stats.totalBlobs, stats.totalBlobs-stats.usedBlobs)

if len(usedBlobs) > stats.blobs {
if stats.usedBlobs > stats.totalBlobs {
return errors.Fatalf("number of used blobs is larger than number of available blobs!\n" +
"Please report this error (along with the output of the 'prune' run) at\n" +
"https://github.com/restic/restic/issues/new")
}

Verbosef("found %d of %d data blobs still in use, removing %d blobs\n",
len(usedBlobs), stats.blobs, stats.blobs-len(usedBlobs))
// get packs to be removed
removePacks := restic.NewIDSet()
for _, id := range invalidFiles {
removePacks.Insert(id)
}

// find packs that need a rewrite
rewritePacks := restic.NewIDSet()
for _, pack := range idx.Packs {
packNeedsRewrite := false
packNeedsRemoval := false

if mixedBlobs(pack.Entries) {
rewritePacks.Insert(pack.ID)
continue
Verbosef("found deprecated mixed data/tree pack %v, marking for rewrite\n", pack.ID)
packNeedsRewrite = true
}

packTotalBytes := uint64(0)
packUnusedBytes := uint64(0)
for _, blob := range pack.Entries {
packTotalBytes += uint64(blob.Length)
h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
if !usedBlobs.Has(h) {
rewritePacks.Insert(pack.ID)
continue
packUnusedBytes += uint64(blob.Length)
}

if blobCount[h] > 1 {
rewritePacks.Insert(pack.ID)
// if pack has a duplicated blob, force rewrite
if blobCounts[h] > 1 {
packNeedsRewrite = true
}
}
}

removeBytes := duplicateBytes

// find packs that are unneeded
removePacks := restic.NewIDSet()

Verbosef("will remove %d invalid files\n", len(invalidFiles))
for _, id := range invalidFiles {
removePacks.Insert(id)
}

for packID, p := range idx.Packs {

hasActiveBlob := false
for _, blob := range p.Entries {
h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
if usedBlobs.Has(h) {
hasActiveBlob = true
continue
if packUnusedBytes >= packTotalBytes {
packNeedsRemoval = true
} else if packUnusedBytes > 0 {
unusedPercent := int((packUnusedBytes * 100) / packTotalBytes)
if unusedPercent >= opts.RepackThreshold {
packNeedsRewrite = true
}

removeBytes += uint64(blob.Length)
}

if hasActiveBlob {
continue
}

removePacks.Insert(packID)

if !rewritePacks.Has(packID) {
return errors.Fatalf("pack %v is unneeded, but not contained in rewritePacks", packID.Str())
if packNeedsRemoval {
removePacks.Insert(pack.ID)
stats.removeBytes += packTotalBytes
} else if packNeedsRewrite {
rewritePacks.Insert(pack.ID)
stats.removeBytes += packUnusedBytes
} else {
stats.remainingBytes += packUnusedBytes
}

rewritePacks.Delete(packID)
}

Verbosef("will delete %d packs and rewrite %d packs, this frees %s\n",
len(removePacks), len(rewritePacks), formatBytes(uint64(removeBytes)))
Verbosef("will delete %d packs and rewrite %d packs\n",
len(removePacks), len(rewritePacks))
Verbosef("frees %s with %s unused remaining\n",
formatBytes(stats.removeBytes), formatBytes(stats.remainingBytes))

var obsoletePacks restic.IDSet
if len(rewritePacks) != 0 {
Expand All @@ -280,12 +285,14 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
return err
}
bar.Done()
}

removePacks.Merge(obsoletePacks)
removePacks.Merge(obsoletePacks)
}

if err = rebuildIndex(ctx, repo, removePacks); err != nil {
return err
if len(rewritePacks) != 0 || len(removePacks) != 0 {
if err = rebuildIndex(ctx, repo, removePacks); err != nil {
return err
}
}

if len(removePacks) != 0 {
Expand Down
3 changes: 2 additions & 1 deletion cmd/restic/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ func testRunForget(t testing.TB, gopts GlobalOptions, args ...string) {
}

func testRunPrune(t testing.TB, gopts GlobalOptions) {
rtest.OK(t, runPrune(gopts))
opts := PruneOptions{RepackThreshold: DefaultRepackThreshold}
rtest.OK(t, runPrune(opts, gopts))
}

func TestBackup(t *testing.T) {
Expand Down