Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

sync --preserve checks mtime #35

Closed
wants to merge 1 commit into from

1 participant

Matt Domsch
Matt Domsch
Owner

This causes an extra HEAD request for each remote file, which greatly
slows down execution, and increases monetary cost ($0.01/10,000
requests), but guarantees files whose mtime has changed will get
resync'd.

This is necessary for yum repositories, where repodata/* files may be
updated but not change size. It also correctly handles large files
whose md5 values as returned by S3 are incorrect having their content
(and thus mtime) changed, perhaps by RPM signing.

Matt Domsch mdomsch sync --preserve checks mtime
This causes an extra HEAD request for each remote file, which greatly
slows down execution, and increases monetary cost ($0.01/10,000
requests), but guarantees files whose mtime has changed will get
resync'd.

This is necessary for yum repositories, where repodata/* files may be
updated but not change size.  It also correctly handles large files
whose md5 values as returned by S3 are incorrect having their content
(and thus mtime) changed, perhaps by RPM signing.
282136a
Matt Domsch
Owner

we're trading a ton of local disk I/O to calculate md5 on each file, for a HEAD call to S3 for each file
we do get the LastModified (uploaded) time from S3 w/o the HEAD call
I wonder if we can simply look at files with mtimes newer than LastModified...
and assume if file mtime is newer than LastModified, then it needs to be updated.

For regularly occurring sync runs, I think that's valid...

Matt Domsch
Owner

With this patch, syncing takes 10x longer. Probably the wrong approach then. Maybe LastModified as a proxy for mtime is good enough...

Matt Domsch
Owner

Killing this pull request. What I've done elsewhere in my tree works better w/o the I/O penalty.

Matt Domsch mdomsch closed this
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Mar 2, 2012
  1. Matt Domsch

    sync --preserve checks mtime

    mdomsch authored
    This causes an extra HEAD request for each remote file, which greatly
    slows down execution, and increases monetary cost ($0.01/10,000
    requests), but guarantees files whose mtime has changed will get
    resync'd.
    
    This is necessary for yum repositories, where repodata/* files may be
    updated but not change size.  It also correctly handles large files
    whose md5 values as returned by S3 are incorrect having their content
    (and thus mtime) changed, perhaps by RPM signing.
This page is out of date. Refresh to see the latest.
Showing with 32 additions and 11 deletions.
  1. +1 −1  S3/Config.py
  2. +30 −2 S3/FileLists.py
  3. +1 −8 s3cmd
2  S3/Config.py
View
@@ -66,7 +66,7 @@ class Config(object):
enable_multipart = True
multipart_chunk_size_mb = 15 # MB
# List of checks to be performed for 'sync'
- sync_checks = ['size', 'md5'] # 'weak-timestamp'
+ sync_checks = ['size', 'mtime', 'md5'] # 'weak-timestamp'
# List of compiled REGEXPs
exclude = []
include = []
32 S3/FileLists.py
View
@@ -15,7 +15,7 @@
import os
import glob
-__all__ = ["fetch_local_list", "fetch_remote_list", "compare_filelists", "filter_exclude_include"]
+__all__ = ["fetch_local_list", "fetch_remote_list", "compare_filelists", "filter_exclude_include", "parse_attrs_header"]
def _fswalk_follow_symlinks(path):
'''
@@ -149,6 +149,13 @@ def _get_filelist_local(local_uri):
return local_list, single_file
+def parse_attrs_header(attrs_header):
+ attrs = {}
+ for attr in attrs_header.split("/"):
+ key, val = attr.split(":")
+ attrs[key] = val
+ return attrs
+
def fetch_remote_list(args, require_attribs = False, recursive = None):
def _get_filelist_remote(remote_uri, recursive = True):
## If remote_uri ends with '/' then all remote files will have
@@ -197,6 +204,14 @@ def _get_filelist_remote(remote_uri, recursive = True):
'object_uri_str' : object_uri_str,
'base_uri' : remote_uri,
}
+ if cfg.preserve_attrs:
+ objinfo = S3(cfg).object_info(S3Uri(object_uri_str))
+ if objinfo['headers'].has_key('x-amz-meta-s3cmd-attrs'):
+ attrs = parse_attrs_header(objinfo['headers']['x-amz-meta-s3cmd-attrs'])
+ if attrs.has_key('mtime'):
+ rem_list[key].update({
+ 'mtime':int(attrs['mtime'])
+ })
if break_now:
break
return rem_list
@@ -259,7 +274,13 @@ def _get_filelist_remote(remote_uri, recursive = True):
'md5': response['headers']['etag'].strip('"\''),
'timestamp' : dateRFC822toUnix(response['headers']['date'])
})
- remote_list[key] = remote_item
+ if response['headers'].has_key('x-amz-meta-s3cmd-attrs'): # we have the data, it costs nothing to hold onto it
+ attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
+ if attrs.has_key('mtime'):
+ remote_item.update({
+ 'mtime':int(attrs['mtime'])
+ })
+ remote_list[key] = remote_item
return remote_list
def compare_filelists(src_list, dst_list, src_remote, dst_remote):
@@ -295,6 +316,13 @@ def __direction_str(is_remote):
debug(u"XFER: %s (size mismatch: src=%s dst=%s)" % (file, src_list[file]['size'], dst_list[file]['size']))
attribs_match = False
+ ## Check mtime next
+ if 'mtime' in cfg.sync_checks:
+ if dst_list[file].has_key('mtime') and src_list[file].has_key('mtime'):
+ if dst_list[file]['mtime'] != src_list[file]['mtime']:
+ debug(u"XFER: %s (mtime mismatch: src=%s dst=%s)" % (file, src_list[file]['mtime'], dst_list[file]['mtime']))
+ attribs_match = False
+
## Check MD5
compare_md5 = 'md5' in cfg.sync_checks
# Multipart-uploaded files don't have a valid MD5 sum - it ends with "...-NN"
9 s3cmd
View
@@ -656,13 +656,6 @@ def cmd_sync_remote2remote(args):
info(outstr)
def cmd_sync_remote2local(args):
- def _parse_attrs_header(attrs_header):
- attrs = {}
- for attr in attrs_header.split("/"):
- key, val = attr.split(":")
- attrs[key] = val
- return attrs
-
s3 = S3(Config())
destination_base = args[-1]
@@ -747,7 +740,7 @@ def cmd_sync_remote2local(args):
response = s3.object_get(uri, dst_stream, extra_label = seq_label)
dst_stream.close()
if response['headers'].has_key('x-amz-meta-s3cmd-attrs') and cfg.preserve_attrs:
- attrs = _parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
+ attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
if attrs.has_key('mode'):
os.chmod(dst_file, int(attrs['mode']))
if attrs.has_key('mtime') or attrs.has_key('atime'):
Something went wrong with that request. Please try again.