Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

sync --preserve checks mtime #35

Closed
wants to merge 1 commit into from

1 participant

@mdomsch
Owner

This causes an extra HEAD request for each remote file, which greatly
slows down execution, and increases monetary cost ($0.01/10000
requests), but guarantees files whose mtime has changed will get
resync'd.

This is necessary for yum repositories, where repodata/* files may be
updated but not change size. It also correctly handles large files
whose md5 values as returned by S3 are incorrect having their content
(and thus mtime) changed, perhaps by RPM signing.

@mdomsch mdomsch sync --preserve checks mtime
This causes an extra HEAD request for each remote file, which greatly
slows down execution, and increases monetary cost ($0.01/10000
requests), but guarantees files whose mtime has changed will get
resync'd.

This is necessary for yum repositories, where repodata/* files may be
updated but not change size.  It also correctly handles large files
whose md5 values as returned by S3 are incorrect having their content
(and thus mtime) changed, perhaps by RPM signing.
282136a
@mdomsch
Owner

we're trading a ton of local disk I/O to calculate md5 on each file, for a HEAD call to S3 for each file
we do get the LastModified (uploaded) time from S3 w/o the HEAD call
I wonder if we can simply look at files with mtimes newer than LastModified...
and assume if file mtime is newer than LastModified, then it needs to be updated.

For regularly occurring sync runs, I think that's valid...

@mdomsch
Owner

With this patch, syncing takes 10x longer. Probably the wrong approach then. Maybe LastModified as a proxy for mtime is good enough...

@mdomsch
Owner

Killing this pull request. What I've done elsewhere in my tree works better w/o the I/O penalty.

@mdomsch mdomsch closed this
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Mar 2, 2012
  1. @mdomsch

    sync --preserve checks mtime

    mdomsch authored
    This causes an extra HEAD request for each remote file, which greatly
    slows down execution, and increases monetary cost ($0.01/10000
    requests), but guarantees files whose mtime has changed will get
    resync'd.
    
    This is necessary for yum repositories, where repodata/* files may be
    updated but not change size.  It also correctly handles large files
    whose md5 values as returned by S3 are incorrect having their content
    (and thus mtime) changed, perhaps by RPM signing.
This page is out of date. Refresh to see the latest.
Showing with 32 additions and 11 deletions.
  1. +1 −1  S3/Config.py
  2. +30 −2 S3/FileLists.py
  3. +1 −8 s3cmd
View
2  S3/Config.py
@@ -66,7 +66,7 @@ class Config(object):
enable_multipart = True
multipart_chunk_size_mb = 15 # MB
# List of checks to be performed for 'sync'
- sync_checks = ['size', 'md5'] # 'weak-timestamp'
+ sync_checks = ['size', 'mtime', 'md5'] # 'weak-timestamp'
# List of compiled REGEXPs
exclude = []
include = []
View
32 S3/FileLists.py
@@ -15,7 +15,7 @@
import os
import glob
-__all__ = ["fetch_local_list", "fetch_remote_list", "compare_filelists", "filter_exclude_include"]
+__all__ = ["fetch_local_list", "fetch_remote_list", "compare_filelists", "filter_exclude_include", "parse_attrs_header"]
def _fswalk_follow_symlinks(path):
'''
@@ -149,6 +149,13 @@ def _get_filelist_local(local_uri):
return local_list, single_file
+def parse_attrs_header(attrs_header):
+ attrs = {}
+ for attr in attrs_header.split("/"):
+ key, val = attr.split(":")
+ attrs[key] = val
+ return attrs
+
def fetch_remote_list(args, require_attribs = False, recursive = None):
def _get_filelist_remote(remote_uri, recursive = True):
## If remote_uri ends with '/' then all remote files will have
@@ -197,6 +204,14 @@ def _get_filelist_remote(remote_uri, recursive = True):
'object_uri_str' : object_uri_str,
'base_uri' : remote_uri,
}
+ if cfg.preserve_attrs:
+ objinfo = S3(cfg).object_info(S3Uri(object_uri_str))
+ if objinfo['headers'].has_key('x-amz-meta-s3cmd-attrs'):
+ attrs = parse_attrs_header(objinfo['headers']['x-amz-meta-s3cmd-attrs'])
+ if attrs.has_key('mtime'):
+ rem_list[key].update({
+ 'mtime':int(attrs['mtime'])
+ })
if break_now:
break
return rem_list
@@ -259,7 +274,13 @@ def _get_filelist_remote(remote_uri, recursive = True):
'md5': response['headers']['etag'].strip('"\''),
'timestamp' : dateRFC822toUnix(response['headers']['date'])
})
- remote_list[key] = remote_item
+ if response['headers'].has_key('x-amz-meta-s3cmd-attrs'): # we have the data, it costs nothing to hold onto it
+ attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
+ if attrs.has_key('mtime'):
+ remote_item.update({
+ 'mtime':int(attrs['mtime'])
+ })
+ remote_list[key] = remote_item
return remote_list
def compare_filelists(src_list, dst_list, src_remote, dst_remote):
@@ -295,6 +316,13 @@ def __direction_str(is_remote):
debug(u"XFER: %s (size mismatch: src=%s dst=%s)" % (file, src_list[file]['size'], dst_list[file]['size']))
attribs_match = False
+ ## Check mtime next
+ if 'mtime' in cfg.sync_checks:
+ if dst_list[file].has_key('mtime') and src_list[file].has_key('mtime'):
+ if dst_list[file]['mtime'] != src_list[file]['mtime']:
+ debug(u"XFER: %s (mtime mismatch: src=%s dst=%s)" % (file, src_list[file]['mtime'], dst_list[file]['mtime']))
+ attribs_match = False
+
## Check MD5
compare_md5 = 'md5' in cfg.sync_checks
# Multipart-uploaded files don't have a valid MD5 sum - it ends with "...-NN"
View
9 s3cmd
@@ -656,13 +656,6 @@ def cmd_sync_remote2remote(args):
info(outstr)
def cmd_sync_remote2local(args):
- def _parse_attrs_header(attrs_header):
- attrs = {}
- for attr in attrs_header.split("/"):
- key, val = attr.split(":")
- attrs[key] = val
- return attrs
-
s3 = S3(Config())
destination_base = args[-1]
@@ -747,7 +740,7 @@ def cmd_sync_remote2local(args):
response = s3.object_get(uri, dst_stream, extra_label = seq_label)
dst_stream.close()
if response['headers'].has_key('x-amz-meta-s3cmd-attrs') and cfg.preserve_attrs:
- attrs = _parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
+ attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
if attrs.has_key('mode'):
os.chmod(dst_file, int(attrs['mode']))
if attrs.has_key('mtime') or attrs.has_key('atime'):
Something went wrong with that request. Please try again.