Skip to content

Commit

Permalink
add ability to exclude files in rsync command
Browse files Browse the repository at this point in the history
  • Loading branch information
hhuuggoo committed Mar 14, 2024
1 parent a716678 commit e42d21d
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 2 deletions.
120 changes: 119 additions & 1 deletion saturnfs/client/saturnfs.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from __future__ import annotations

import logging
import math
import os
import weakref
from concurrent.futures import ThreadPoolExecutor
from copy import copy
from datetime import datetime
from fnmatch import fnmatch
from functools import partial
from glob import has_magic
from io import BytesIO, TextIOWrapper
Expand Down Expand Up @@ -48,6 +50,9 @@
# Shared NoOpCallback instance used as the default value of ``callback``
# parameters in this module (e.g. ``rm_bulk``).
DEFAULT_CALLBACK = NoOpCallback()


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class _CachedTyped(_Cached):
# Add typing to the metaclass to get around an issue with pylance
# https://github.com/microsoft/pylance-release/issues/4384
Expand Down Expand Up @@ -781,9 +786,10 @@ def rm_bulk(self, paths: List[str], callback: Callback = DEFAULT_CALLBACK):
self.invalidate_cache(full_path(owner_name, path))
i += settings.OBJECT_STORAGE_MAX_LIST_COUNT


def rsync(self, source: str, destination: str, delete_missing: bool = False, **kwargs):
    """Sync the directory tree at ``source`` into ``destination``.

    Delegates to the module-level ``_rsync`` helper, always supplying a fresh
    ``SaturnGenericFilesystem`` as the ``fs`` argument (any ``fs`` given by
    the caller is overwritten).

    Parameters
    ----------
    source: str
        Root of the directory tree to copy from.
    destination: str
        Root of the directory tree to copy into.
    delete_missing: bool
        Forwarded to ``_rsync``; when True, destination files that have no
        counterpart in the source are removed.
    **kwargs:
        Forwarded to ``_rsync`` (for example ``exclude_globs`` or
        ``update_cond``).
    """
    kwargs["fs"] = SaturnGenericFilesystem()
    # Call the local ``_rsync`` rather than the previous ``rsync`` function:
    # only ``_rsync`` understands ``exclude_globs``.
    return _rsync(source, destination, delete_missing=delete_missing, **kwargs)

def list_uploads(
self, path: str, is_copy: Optional[bool] = None
Expand Down Expand Up @@ -1324,3 +1330,115 @@ def _is_saturnfs(self, protocol: str) -> bool:


# Register the SaturnFS class under its protocol name at import time.
# NOTE(review): presumably this is fsspec's registry, so that URLs using the
# protocol resolve to SaturnFS -- confirm against the import of
# ``register_implementation``.
register_implementation(SaturnFS.protocol, SaturnFS)


def check_exclude_globs(input_string, exclude_globs) -> bool:
    """
    Report whether ``input_string`` should be excluded, i.e. whether it
    matches at least one of the ``fnmatch``-style patterns in ``exclude_globs``
    """
    return any(fnmatch(input_string, pattern) for pattern in exclude_globs)


def _reroot(path, old_root, new_root):
    """Return ``path`` with its leading ``old_root`` swapped for ``new_root``.

    Only the prefix is rewritten, so a root string that happens to reappear
    deeper in the path is left untouched. Paths that do not start with
    ``old_root`` fall back to plain substring replacement.
    """
    if path.startswith(old_root):
        return new_root + path[len(old_root):]
    return path.replace(old_root, new_root)


def _rsync(
    source,
    destination,
    delete_missing=False,
    source_field="size",
    dest_field="size",
    update_cond="different",
    inst_kwargs=None,
    fs=None,
    exclude_globs=None,
    **kwargs,
):
    """Sync files between two directory trees

    (experimental)

    Parameters
    ----------
    source: str
        Root of the directory tree to take files from. This must be a directory, but
        do not include any terminating "/" character
    destination: str
        Root path to copy into. The contents of this location should be
        identical to the contents of ``source`` when done. This will be made a
        directory, and the terminal "/" should not be included.
    delete_missing: bool
        If there are paths in the destination that don't exist in the
        source and this is True, delete them. Otherwise, leave them alone.
        Destination files whose source-side path matches ``exclude_globs``
        are never deleted.
    source_field: str | callable
        If ``update_cond`` is "different", this is the key in the info
        of source files to consider for difference. May be a function of the
        info dict.
    dest_field: str | callable
        If ``update_cond`` is "different", this is the key in the info
        of destination files to consider for difference. May be a function of
        the info dict.
    update_cond: "different"|"always"|"never"
        If "always", every file is copied, regardless of whether it exists in
        the destination. If "never", files that exist in the destination are
        not copied again. If "different" (default), only copy if the info
        fields given by ``source_field`` and ``dest_field`` (usually "size")
        are different. Other comparisons may be added in the future.
    inst_kwargs: dict|None
        If ``fs`` is None, use this set of keyword arguments to make a
        GenericFileSystem instance
    fs: GenericFileSystem|None
        Instance to use if explicitly given. The instance defines how to
        to make downstream file system instances from paths.
    exclude_globs: list[str]|str|None
        ``fnmatch``-style patterns. Source entries (files and directories)
        whose ``"name"`` matches any pattern are ignored entirely. A single
        string is treated as one pattern.
    **kwargs:
        Passed through to ``fs.cp``.

    Raises
    ------
    ValueError
        If ``update_cond`` is not one of the supported values, or if
        ``source`` is not a directory.
    """
    if update_cond not in ("different", "always", "never"):
        raise ValueError(
            f"update_cond must be one of 'different', 'always' or 'never', got {update_cond!r}"
        )
    if exclude_globs is None:
        exclude_globs = []
    elif isinstance(exclude_globs, str):
        # Iterating a bare string would test single characters as patterns
        # ("*" alone matches everything), so wrap it.
        exclude_globs = [exclude_globs]

    fs = fs or GenericFileSystem(**(inst_kwargs or {}))
    source = fs._strip_protocol(source)
    destination = fs._strip_protocol(destination)
    allfiles = fs.find(source, withdirs=True, detail=True)
    if not fs.isdir(source):
        raise ValueError("Can only rsync on a directory")
    allfiles = {
        a: v
        for a, v in allfiles.items()
        if not check_exclude_globs(v["name"], exclude_globs)
    }
    otherfiles = fs.find(destination, withdirs=True, detail=True)

    dirs = [
        a
        for a, v in allfiles.items()
        if v["type"] == "directory"
        and _reroot(a, source, destination) not in otherfiles
    ]
    logger.debug("%d directories to create", len(dirs))
    for dirn in dirs:
        # no async
        fs.mkdirs(_reroot(dirn, source, destination), exist_ok=True)

    allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
    logger.debug("%d files to consider for copy", len(allfiles))

    to_delete = []
    for o, v in otherfiles.items():
        if v["type"] != "file":
            continue
        counterpart = _reroot(o, destination, source)
        if counterpart in allfiles:
            continue
        # Excluded paths are outside the scope of the sync: do not delete
        # them from the destination just because the source side was filtered.
        if check_exclude_globs(counterpart, exclude_globs):
            continue
        to_delete.append(o)

    # Map each source file that needs copying to its destination path.
    to_copy = {}
    for k, v in allfiles.items():
        otherfile = _reroot(k, source, destination)
        if otherfile in otherfiles:
            if update_cond == "never":
                # already present in the target, leave it alone
                continue
            if update_cond == "different":
                inf1 = source_field(v) if callable(source_field) else v[source_field]
                v2 = otherfiles[otherfile]
                inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
                if inf1 == inf2:
                    # details match, don't copy
                    continue
        # file not in target yet, or it must be refreshed
        to_copy[k] = otherfile

    logger.debug("%d files to copy", len(to_copy))
    if to_copy:
        source_files, target_files = zip(*to_copy.items())
        fs.cp(source_files, target_files, **kwargs)
    logger.debug("%d files to delete", len(to_delete))
    if delete_missing and to_delete:
        fs.rm(to_delete)
2 changes: 1 addition & 1 deletion saturnfs/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@

# URL scheme used for SaturnFS paths, and the matching "<scheme>://" prefix.
SATURNFS_PROTOCOL = "sfs"
SATURNFS_FILE_PREFIX = f"{SATURNFS_PROTOCOL}://"
# Default worker count; overridable through the SATURNFS_DEFAULT_MAX_WORKERS
# environment variable (default 20) and clamped to at least 1.
SATURNFS_DEFAULT_MAX_WORKERS = max(1, int(os.getenv("SATURNFS_DEFAULT_MAX_WORKERS", "20")))

0 comments on commit e42d21d

Please sign in to comment.