[3007.x] Don't unnecessarily download remote sources to cache #66343

Merged · 2 commits · May 13, 2024
1 change: 1 addition & 0 deletions changelog/66342.fixed.md
@@ -0,0 +1 @@
Made `file.managed` skip download of a remote source if the managed file already exists with the correct hash
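
For context, a minimal sketch of the kind of setup this fix helps: a large file managed from a remote source with a known hash and keep_source set to False. This uses Salt's local Caller client and the state.single execution function on a minion; the path, URL, and digest below are hypothetical placeholders, not values from this PR.

# Hedged sketch: apply file.managed for a big remote artifact whose hash is known.
import salt.client

caller = salt.client.Caller()
ret = caller.cmd(
    "state.single",
    "file.managed",
    name="/opt/artifacts/big-image.iso",          # placeholder path
    source="https://example.com/big-image.iso",   # placeholder URL
    source_hash="sha256=0123456789abcdef...",     # placeholder digest
    keep_source=False,  # discard the cached copy once the state completes
)
print(ret)

With this change, re-running such a state no longer re-downloads the artifact as long as the local file still matches the expected hash.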
61 changes: 59 additions & 2 deletions salt/states/file.py
@@ -2501,8 +2501,12 @@ def managed(
        Set to ``False`` to discard the cached copy of the source file once the
        state completes. This can be useful for larger files to keep them from
        taking up space in minion cache. However, keep in mind that discarding
        the source file will result in the state needing to re-download the
        source file if the state is run again.
        the source file might result in the state needing to re-download the
        source file if the state is run again. If the source is a remote one
        (neither local nor ``salt://``), the source hash is known, ``skip_verify``
        is not set, the file is not templated, and the managed file already
        exists with the correct hash, the download is skipped (i.e. the remote
        source is not re-fetched when the local hash matches the expected one).

        .. versionadded:: 2017.7.3

@@ -3220,6 +3224,59 @@ def managed(
    if defaults and not isinstance(defaults, dict):
        return _error(ret, "Defaults must be formed as a dict")

    # If we're pulling from a remote source untemplated and we have a source hash,
    # check early if the local file exists with the correct hash and skip
    # managing contents if so. This avoids a lot of overhead.
    if (
        contents is None
        and not template
        and source
        and not skip_verify
        and os.path.exists(name)
        and replace
    ):
        try:
            # If the source is a list, find the first existing file.
            # We're doing this after basic checks to not slow down
            # runs where it does not matter.
            source, source_hash = __salt__["file.source_list"](
                source, source_hash, __env__
            )
            source_sum = None
            if (
                source
                and source_hash
                and urllib.parse.urlparse(source).scheme
                not in (
                    "salt",
                    "file",
                )
                and not os.path.isabs(source)
            ):
                source_sum = __salt__["file.get_source_sum"](
                    name,
                    source,
                    source_hash,
                    source_hash_name,
                    __env__,
                    verify_ssl=verify_ssl,
                    source_hash_sig=source_hash_sig,
                    signed_by_any=signed_by_any,
                    signed_by_all=signed_by_all,
                    keyring=keyring,
                    gnupghome=gnupghome,
                )
                hsum = __salt__["file.get_hash"](name, source_sum["hash_type"])
        except (CommandExecutionError, OSError) as err:
            log.error(
                "Failed checking existing file's hash against specified source_hash: %s",
                err,
                exc_info_on_loglevel=logging.DEBUG,
            )
        else:
            if source_sum and source_sum["hsum"] == hsum:
                replace = False

    if not replace and os.path.exists(name):
        ret_perms = {}
        # Check and set the permissions if necessary
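
The essence of the new early check, stripped of Salt's helpers, is simply "hash the local file and compare it against the expected digest before fetching anything". A simplified, self-contained sketch of that idea follows (standard library only; it assumes a plain sha256 digest, whereas the state above derives the hash type via file.get_source_sum and file.get_hash):

# Simplified illustration of the early hash check, not the Salt code above.
import hashlib
import os


def local_file_matches(path, expected_hex, hash_type="sha256"):
    """Return True if ``path`` exists and its digest equals ``expected_hex``."""
    if not os.path.exists(path):
        return False
    digest = hashlib.new(hash_type)
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_hex.lower()


# If this returns True, there is no need to download the remote source at all.
# Example (hypothetical values): local_file_matches("/opt/artifacts/big-image.iso", "0123...abcd")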
23 changes: 23 additions & 0 deletions tests/pytests/functional/states/file/test_managed.py
@@ -1042,3 +1042,26 @@ def test_issue_60203(
    assert "Unable to manage file" in ret.comment
    assert "/files/test.tar.gz.sha256" in ret.comment
    assert "dontshowme" not in ret.comment


def test_file_managed_remote_source_does_not_refetch_existing_file_with_correct_digest(
    file, tmp_path, grail_scene33_file, grail_scene33_file_hash
):
    """
    If an existing file is managed from a remote source and its source hash is
    known beforehand, ensure that `file.managed` checks the local file's digest
    and, if it matches the expected one, does not download the file to the local
    cache unnecessarily.
    This is especially important when huge files are managed with `keep_source`
    set to False.
    Issue #64373
    """
    name = tmp_path / "scene33"
    name.write_bytes(grail_scene33_file.read_bytes())
    ret = file.managed(
        str(name),
        source="http://127.0.0.1:1337/does/not/exist",
        source_hash=grail_scene33_file_hash,
    )
    assert ret.result is True
    assert not ret.changes
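
A possible companion test, not part of this PR and only a sketch, would cover the opposite path: when the local copy's digest does not match, the download cannot be skipped, so managing the file from the same unreachable source should fail rather than silently succeed. The exact failure mode assumed here is an assumption.

def test_file_managed_remote_source_refetches_when_local_digest_differs(
    file, tmp_path, grail_scene33_file_hash
):
    """
    Hypothetical companion sketch: when the existing file does not match the
    expected digest, the early check cannot skip the download, so managing it
    from an unreachable source is expected to fail.
    """
    name = tmp_path / "scene33"
    name.write_bytes(b"these are not the contents you are looking for")
    ret = file.managed(
        str(name),
        source="http://127.0.0.1:1337/does/not/exist",
        source_hash=grail_scene33_file_hash,
    )
    # The state must now attempt the download, which fails against the
    # unreachable placeholder source (assumed behavior).
    assert ret.result is False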