Skip to content

Commit

Permalink
Add incremental backups!
Browse files Browse the repository at this point in the history
Fixes #69

If the file mtime and size are the same as in the previous
(complete) band, we can fairly safely
assume its content is unchanged, without reading all of it.

This'll be safer if we track higher-resolution
mtimes, which is #81.
  • Loading branch information
sourcefrog committed Jan 21, 2020
1 parent 57ea8da commit 7bb93bf
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 18 deletions.
24 changes: 13 additions & 11 deletions NEWS.md
Expand Up @@ -2,23 +2,25 @@

## UNRELEASED

The archive format has changed to "0.6": install an older Conserve release
to read from older archives. This format removes the whole-file hash, in favor
of just per-block hashes. The whole-file hash slows backups and adds little
protection.
* Changed to new archive format "0.6", which has common block storage across
bands, and removes the whole-file hash in favor of per-block hashes.

* Change from `rustc_serialize` to `serde`.
To read from Conserve 0.5 archives, use an old Conserve binary. Until 1.0,
support for old formats won't be kept in the head version.

* Very basic `conserve diff` command, which compares a source directory
* Added incremental backups! If files have the same size and mtime (tracked with
integer second accuracy), they aren't read and stored but rather a reference
to the previous block is added.

* Added a basic `conserve diff` command, which compares a source directory
to a stored tree.

* Move to Rust edition 2018.
* Changed to Rust edition 2018.

* New command `conserve debug index dump`.
* Added command `conserve debug index dump`.

* Remove `conserve versions --sizes` options, as storage is now shared
across bands. The size of one stored tree can be measured with
`conserve tree size`.
* Removed `conserve versions --sizes` options, as storage is now shared across
bands. The size of one stored tree can be measured with `conserve tree size`.

## Conserve 0.5.1 2018-11-11

Expand Down
81 changes: 81 additions & 0 deletions src/backup.rs
Expand Up @@ -9,27 +9,40 @@ use snafu::ResultExt;

use super::blockdir::StoreFiles;
use super::*;
use crate::index::IndexEntryIter;

/// Accepts files to write in the archive (in apath order.)
pub struct BackupWriter {
    // The new band this backup writes into.
    band: Band,
    // Accumulates index entries for the new band as they are pushed.
    index_builder: IndexBuilder,
    // Counters (e.g. "file", "file.unchanged") and messages for this run.
    report: Report,
    // Stores file content into the archive's block directory.
    store_files: StoreFiles,
    // Block storage, consulted to check whether basis blocks still exist.
    block_dir: BlockDir,

    /// The index for the last stored band, used as hints for whether newly
    /// stored files have changed.
    basis_index: Option<IndexEntryIter>,
}

impl BackupWriter {
/// Create a new BackupWriter.
///
/// This currently makes a new top-level band.
pub fn begin(archive: &Archive) -> Result<BackupWriter> {
    // Look up the basis band before creating the new band, so the new
    // (still empty, incomplete) band can never be chosen as its own basis.
    let basis_index = match archive.last_complete_band()? {
        Some(basis_band) => Some(basis_band.iter_entries(&archive.report())?),
        None => None,
    };
    let band = Band::create(archive)?;
    let index_builder = band.index_builder();
    Ok(BackupWriter {
        band,
        index_builder,
        report: archive.report().clone(),
        store_files: StoreFiles::new(archive.block_dir().clone()),
        block_dir: archive.block_dir().clone(),
        basis_index,
    })
}

Expand Down Expand Up @@ -62,6 +75,35 @@ impl tree::WriteTree for BackupWriter {
fn write_file(&mut self, source_entry: &Entry, content: &mut dyn std::io::Read) -> Result<()> {
self.report.increment("file", 1);
let apath = source_entry.apath();
if let Some(basis_entry) = self
.basis_index
.as_mut()
.map(|bi| bi.advance_to(&apath))
.flatten()
{
if source_entry.is_unchanged_from(&basis_entry) {
// TODO: In verbose mode, say if the file is changed, unchanged,
// etc.
//
// self.report.print(&format!("unchanged file {}", apath));
if self.block_dir.contains_all_blocks(&basis_entry.addrs) {
self.report.increment("file.unchanged", 1);
self.report.increment_size(
"file.bytes",
Sizes {
uncompressed: source_entry.size().unwrap_or_default(),
compressed: 0,
},
);
return self.push_entry(basis_entry);
} else {
// self.report.problem(&format!("Some blocks of basis file {} are missing from the blockdir; writing them again", apath));
}
} else {
self.report.print(&format!("changed file {}", apath));
}
}

let addrs = self
.store_files
.store_file_content(&apath, content, &self.report)?;
Expand Down Expand Up @@ -199,4 +241,43 @@ mod tests {
assert_eq!(sf.read_to_string(&mut s).unwrap(), 0);
assert_eq!(s.len(), 0);
}

#[test]
pub fn detect_unchanged() {
    let archive = ScratchArchive::new();
    let src = TreeFixture::new();
    src.create_file("aaa");
    src.create_file("bbb");

    // First backup: every file is new, so nothing counts as unchanged.
    let mut writer = BackupWriter::begin(&archive).unwrap();
    let report = archive.report();
    copy_tree(&src.live_tree(), &mut writer).unwrap();

    assert_eq!(report.get_count("file"), 2);
    assert_eq!(report.get_count("file.unchanged"), 0);

    // Second backup of the identical tree: both files should be detected
    // as unchanged.
    let mut writer = BackupWriter::begin(&archive).unwrap();
    writer.report = Report::new();
    copy_tree(&src.live_tree(), &mut writer).unwrap();

    assert_eq!(writer.report.get_count("file"), 2);
    assert_eq!(writer.report.get_count("file.unchanged"), 2);

    // Now modify one file and check it is seen as changed.
    //
    // Detection keys on (size, mtime), and mtime is only tracked with
    // one-second granularity, so a change within that window with the same
    // size could be missed. The proper fix is a more precise mtime
    // <https://github.com/sourcefrog/conserve/issues/81>; for now, make
    // sure the length changes too.
    src.create_file_with_contents("bbb", b"longer content for bbb");

    let mut writer = BackupWriter::begin(&archive).unwrap();
    writer.report = Report::new();
    copy_tree(&src.live_tree(), &mut writer).unwrap();

    assert_eq!(writer.report.get_count("file"), 2);
    assert_eq!(writer.report.get_count("file.unchanged"), 1);
}
}
6 changes: 6 additions & 0 deletions src/blockdir.rs
Expand Up @@ -287,6 +287,12 @@ impl BlockDir {
.context(errors::ReadBlock { path })?
.len())
}

/// True if every block referenced by `addrs` is present in this blockdir.
///
/// A presence check that fails is conservatively treated as "missing".
pub(crate) fn contains_all_blocks(&self, addrs: &[Address]) -> bool {
    for addr in addrs {
        if !self.contains(&addr.hash).unwrap_or(false) {
            return false;
        }
    }
    true
}
}

/// Manages storage into the BlockDir of any number of files.
Expand Down
22 changes: 15 additions & 7 deletions src/entry.rs
Expand Up @@ -77,12 +77,20 @@ impl Entry {

/// Size of the file, if it is a file. None for directories and symlinks.
pub fn size(&self) -> Option<u64> {
if self.size.is_some() {
self.size
} else if self.addrs.is_empty() {
None
} else {
Some(self.addrs.iter().map(|a| a.len).sum())
}
// TODO: This is a little gross, because really there are two distinct
// cases and we should know in advance which it is: files read from a
// live tree should always have the `size` field populated, and files in
// a stored tree should always have a list of addrs.
self.size
.or_else(|| Some(self.addrs.iter().map(|a| a.len).sum()))
}

/// True if the metadata supports an assumption the file contents have
/// not changed.
pub fn is_unchanged_from(&self, basis_entry: &Entry) -> bool {
    // Different kinds can never be "the same file".
    if basis_entry.kind != self.kind {
        return false;
    }
    // Without a recorded basis mtime there is no safe basis for skipping.
    if basis_entry.mtime.is_none() || basis_entry.mtime != self.mtime {
        return false;
    }
    basis_entry.size() == self.size()
}
}
3 changes: 3 additions & 0 deletions src/report.rs
Expand Up @@ -34,6 +34,7 @@ static KNOWN_COUNTERS: &[&str] = &[
"file.empty",
"file.medium",
"file.large",
"file.unchanged",
"symlink",
"backup.error.stat",
"block.read",
Expand Down Expand Up @@ -375,6 +376,7 @@ impl Counts {
// read and write for incremental indexes.
format!(
"{:>12} MB in {} files, {} directories, {} symlinks.\n\
{:>12} files are unchanged.\n\
{:>12} MB/s input rate.\n\
{:>12} MB after deduplication.\n\
{:>12} MB in {} blocks after {:.1}x compression.\n\
Expand All @@ -384,6 +386,7 @@ impl Counts {
self.get_count("file").separate_with_commas(),
self.get_count("dir").separate_with_commas(),
self.get_count("symlink").separate_with_commas(),
self.get_count("file.unchanged").separate_with_commas(),
(mbps_rate(
self.get_size("file.bytes").uncompressed,
self.elapsed_time()
Expand Down

0 comments on commit 7bb93bf

Please sign in to comment.