From 28e367c0c2429a07cc9e0344c3e269abab6aa897 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sun, 5 Mar 2023 17:26:47 +0100 Subject: [PATCH 1/3] Add failing test. --- docs/source/changes.md | 1 + tests/test_execute.py | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/docs/source/changes.md b/docs/source/changes.md index cd17b6ff..e2555ad6 100644 --- a/docs/source/changes.md +++ b/docs/source/changes.md @@ -10,6 +10,7 @@ releases are available on [PyPI](https://pypi.org/project/pytask) and - {pull}`345` updates the version numbers in animations. - {pull}`352` publishes `db` that is required by pytask-environment. - {pull}`355` refactors a lot of things related to nodes. +- {pull}`356` add hashing for task files to detect changes. ## 0.3.1 - 2023-12-25 diff --git a/tests/test_execute.py b/tests/test_execute.py index 5724945b..51573096 100644 --- a/tests/test_execute.py +++ b/tests/test_execute.py @@ -383,3 +383,16 @@ def task_example(): assert result.exit_code == ExitCode.OK assert "task_example" in result.output assert "Collect 1 task" + + +@pytest.mark.end_to_end() +def test_task_is_not_reexecuted_when_modification_changed_file_not(runner, tmp_path): + tmp_path.joinpath("task_example.py").write_text("def task_example(): pass") + result = runner.invoke(cli, [tmp_path.as_posix()]) + assert result.exit_code == ExitCode.OK + assert "1 Succeeded" in result.output + + tmp_path.joinpath("task_example.py").write_text("def task_example(): pass") + result = runner.invoke(cli, [tmp_path.as_posix()]) + assert result.exit_code == ExitCode.OK + assert "1 Skipped" in result.output From aae1ba4e569d0093c42d22bef0c7994459976427 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Mon, 6 Mar 2023 00:08:08 +0100 Subject: [PATCH 2/3] Implement hashin. --- .pre-commit-config.yaml | 40 +++++++++++++++++------------------ docs/source/changes.md | 3 ++- src/_pytask/dag.py | 27 +++++++++++++++++------ src/_pytask/database_utils.py | 31 +++++++++++++++++++++------ src/_pytask/report.py | 1 + tests/test_database.py | 4 ++-- tests/test_dry_run.py | 36 +++++++++++-------------------- tests/test_persist.py | 4 ++-- 8 files changed, 85 insertions(+), 61 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ecc6613c..d98f24f7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,6 +48,26 @@ repos: hooks: - id: refurb args: [--ignore, FURB126] +- repo: https://github.com/econchick/interrogate + rev: 1.5.0 + hooks: + - id: interrogate + args: [-v, --fail-under=75] + exclude: ^(tests/|docs/|scripts/) +- repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v1.0.1' + hooks: + - id: mypy + args: [ + --no-strict-optional, + --ignore-missing-imports, + ] + additional_dependencies: [ + attrs>=21.3.0, + click, + types-setuptools + ] + pass_filenames: false - repo: https://github.com/executablebooks/mdformat rev: 0.7.16 hooks: @@ -82,12 +102,6 @@ repos: docs/source/tutorials/selecting_tasks.md| docs/source/tutorials/set_up_a_project.md )$ -- repo: https://github.com/econchick/interrogate - rev: 1.5.0 - hooks: - - id: interrogate - args: [-v, --fail-under=75] - exclude: ^(tests/|docs/|scripts/) - repo: https://github.com/codespell-project/codespell rev: v2.2.2 hooks: @@ -99,20 +113,6 @@ repos: - id: check-manifest args: [--no-build-isolation] additional_dependencies: [setuptools-scm, toml] -- repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.0.1' - hooks: - - id: mypy - args: [ - --no-strict-optional, - --ignore-missing-imports, - ] - additional_dependencies: [ - attrs>=21.3.0, - click, - types-setuptools - ] - pass_filenames: false - repo: meta hooks: - id: check-hooks-apply diff --git a/docs/source/changes.md b/docs/source/changes.md index a3c7303f..8b1a69bf 100644 --- a/docs/source/changes.md +++ b/docs/source/changes.md @@ -12,7 +12,8 @@ releases are available on [PyPI](https://pypi.org/project/pytask) and - {pull}`354` adds a `-f/--force` flag to execute tasks even though nothing may have changed. - {pull}`355` refactors a lot of things related to nodes. -- {pull}`356` add hashing for task files to detect changes. +- {pull}`356` add hashing for task files to detect changes when modification times do + not match. ## 0.3.1 - 2023-12-25 diff --git a/src/_pytask/dag.py b/src/_pytask/dag.py index d35c0d99..a7068557 100644 --- a/src/_pytask/dag.py +++ b/src/_pytask/dag.py @@ -1,6 +1,7 @@ """This module contains code related to resolving dependencies.""" from __future__ import annotations +import hashlib import itertools import sys @@ -130,16 +131,30 @@ def _have_task_or_neighbors_changed( def pytask_dag_has_node_changed(session: Session, node: Node, task_name: str) -> bool: """Indicate whether a single dependency or product has changed.""" if isinstance(node, (FilePathNode, Task)): - state = session.hook.pytask_node_state(node=node) - if state is None: + # If node does not exist, we receive None. + file_state = session.hook.pytask_node_state(node=node) + if file_state is None: return True + # If the node is not in the database. try: - state_in_db = State[task_name, node.name].state # type: ignore[misc] - has_changed = state != state_in_db + name = node.name + db_state = State[task_name, name] # type: ignore[type-arg, valid-type] except orm.ObjectNotFound: - has_changed = True - return has_changed + return True + + # If the modification times match, the node has not been changed. + if file_state == db_state.modification_time: + return False + + # If the modification time changed, quickly return for non-tasks. + if isinstance(node, FilePathNode): + return True + + # When modification times changed, we are still comparing the hash of the file + # to avoid unnecessary and expensive reexecutions of tasks. + file_hash = hashlib.sha256(node.path.read_bytes()).hexdigest() + return file_hash != db_state.file_hash return None diff --git a/src/_pytask/database_utils.py b/src/_pytask/database_utils.py index cc1e167f..1de2c1f4 100644 --- a/src/_pytask/database_utils.py +++ b/src/_pytask/database_utils.py @@ -1,7 +1,10 @@ """This module contains utilities for the database.""" from __future__ import annotations +import hashlib + from _pytask.dag_utils import node_and_neighbors +from _pytask.nodes_utils import Task from _pytask.session import Session from pony import orm @@ -17,7 +20,8 @@ class State(db.Entity): # type: ignore[name-defined] task = orm.Required(str) node = orm.Required(str) - state = orm.Required(str) + modification_time = orm.Required(str) + file_hash = orm.Optional(str) orm.PrimaryKey(task, node) @@ -34,19 +38,34 @@ def create_database( @orm.db_session -def _create_or_update_state(first_key: str, second_key: str, state: str) -> None: +def _create_or_update_state( + first_key: str, second_key: str, modification_time: str, file_hash: str +) -> None: """Create or update a state.""" try: state_in_db = State[first_key, second_key] # type: ignore[type-arg, valid-type] except orm.ObjectNotFound: - State(task=first_key, node=second_key, state=state) + State( + task=first_key, + node=second_key, + modification_time=modification_time, + file_hash=file_hash, + ) else: - state_in_db.state = state + state_in_db.modification_time = modification_time + state_in_db.file_hash = file_hash def update_states_in_database(session: Session, task_name: str) -> None: """Update the state for each node of a task in the database.""" for name in node_and_neighbors(session.dag, task_name): node = session.dag.nodes[name].get("task") or session.dag.nodes[name]["node"] - state = session.hook.pytask_node_state(node=node) - _create_or_update_state(task_name, node.name, state) + + modification_time = session.hook.pytask_node_state(node=node) + + if isinstance(node, Task): + hash_ = hashlib.sha256(node.path.read_bytes()).hexdigest() + else: + hash_ = "" + + _create_or_update_state(task_name, node.name, modification_time, hash_) diff --git a/src/_pytask/report.py b/src/_pytask/report.py index 24ce81b8..4e3c4ced 100644 --- a/src/_pytask/report.py +++ b/src/_pytask/report.py @@ -49,6 +49,7 @@ class DagReport: @classmethod def from_exception(cls, exc_info: ExceptionInfo) -> DagReport: + exc_info = remove_internal_traceback_frames_from_exc_info(exc_info) return cls(exc_info) diff --git a/tests/test_database.py b/tests/test_database.py index 6f3feb42..a1185255 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -46,8 +46,8 @@ def task_write(produces): (in_path.as_posix(), in_path), (out_path.as_posix(), out_path), ): - state = State[task_id, id_].state - assert float(state) == path.stat().st_mtime + modification_time = State[task_id, id_].modification_time + assert float(modification_time) == path.stat().st_mtime @pytest.mark.end_to_end() diff --git a/tests/test_dry_run.py b/tests/test_dry_run.py index 4d10e2ff..d73a5ab6 100644 --- a/tests/test_dry_run.py +++ b/tests/test_dry_run.py @@ -52,7 +52,10 @@ def task_example(produces): assert result.exit_code == ExitCode.OK assert "2 Succeeded" in result.output - tmp_path.joinpath("task_example_first.py").write_text(textwrap.dedent(source)) + # Add a new line to the file or the task would be skipped. + tmp_path.joinpath("task_example_first.py").write_text( + textwrap.dedent(source + "\n") + ) result = runner.invoke(cli, ["--dry-run", tmp_path.as_posix()]) @@ -63,16 +66,16 @@ def task_example(produces): @pytest.mark.end_to_end() def test_dry_run_w_subsequent_skipped_task(runner, tmp_path): """A skip is more important than a would be run.""" - source = """ + source_1 = """ import pytask @pytask.mark.produces("out.txt") def task_example(produces): produces.touch() """ - tmp_path.joinpath("task_example_first.py").write_text(textwrap.dedent(source)) + tmp_path.joinpath("task_example_first.py").write_text(textwrap.dedent(source_1)) - source = """ + source_2 = """ import pytask @pytask.mark.depends_on("out.txt") @@ -80,32 +83,17 @@ def task_example(produces): def task_example(produces): produces.touch() """ - tmp_path.joinpath("task_example_second.py").write_text(textwrap.dedent(source)) + tmp_path.joinpath("task_example_second.py").write_text(textwrap.dedent(source_2)) result = runner.invoke(cli, [tmp_path.as_posix()]) assert result.exit_code == ExitCode.OK assert "2 Succeeded" in result.output - source = """ - import pytask - - @pytask.mark.produces("out.txt") - def task_example(produces): - produces.touch() - """ - tmp_path.joinpath("task_example_first.py").write_text(textwrap.dedent(source)) - - source = """ - import pytask - - @pytask.mark.skip - @pytask.mark.depends_on("out.txt") - @pytask.mark.produces("out_2.txt") - def task_example(produces): - produces.touch() - """ - tmp_path.joinpath("task_example_second.py").write_text(textwrap.dedent(source)) + tmp_path.joinpath("task_example_first.py").write_text(textwrap.dedent(source_1)) + tmp_path.joinpath("task_example_second.py").write_text( + textwrap.dedent(source_2 + "\n") + ) result = runner.invoke(cli, ["--dry-run", tmp_path.as_posix()]) diff --git a/tests/test_persist.py b/tests/test_persist.py index 794f2ab9..0b6bf8bb 100644 --- a/tests/test_persist.py +++ b/tests/test_persist.py @@ -73,8 +73,8 @@ def task_dummy(depends_on, produces): task_id = tmp_path.joinpath("task_module.py").as_posix() + "::task_dummy" node_id = tmp_path.joinpath("out.txt").as_posix() - state = State[task_id, node_id].state - assert float(state) == tmp_path.joinpath("out.txt").stat().st_mtime + modification_time = State[task_id, node_id].modification_time + assert float(modification_time) == tmp_path.joinpath("out.txt").stat().st_mtime session = main({"paths": tmp_path}) From 319ce025f73850df8ef38a65674ab9e757f5048e Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Mon, 6 Mar 2023 00:10:17 +0100 Subject: [PATCH 3/3] Fix. --- docs/source/changes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/changes.md b/docs/source/changes.md index 8b1a69bf..d5a1b06c 100644 --- a/docs/source/changes.md +++ b/docs/source/changes.md @@ -12,7 +12,7 @@ releases are available on [PyPI](https://pypi.org/project/pytask) and - {pull}`354` adds a `-f/--force` flag to execute tasks even though nothing may have changed. - {pull}`355` refactors a lot of things related to nodes. -- {pull}`356` add hashing for task files to detect changes when modification times do +- {pull}`357` add hashing for task files to detect changes when modification times do not match. ## 0.3.1 - 2023-12-25