From b57bb81f1104bf9ec761495260233eaf01382b30 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 30 Apr 2025 11:57:47 +0200 Subject: [PATCH 01/45] switch to ported slurm docker cluster --- cluster_tools/dockered-slurm/docker-compose.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cluster_tools/dockered-slurm/docker-compose.yml b/cluster_tools/dockered-slurm/docker-compose.yml index 1e8746cc3..4f18d0cd4 100644 --- a/cluster_tools/dockered-slurm/docker-compose.yml +++ b/cluster_tools/dockered-slurm/docker-compose.yml @@ -13,7 +13,7 @@ services: - ..:/cluster_tools slurmdbd: - image: scalableminds/slurm-docker-cluster:master__11274637426 + image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14751549755 command: [ "slurmdbd" ] container_name: slurmdbd hostname: slurmdbd @@ -29,7 +29,7 @@ services: - mysql slurmctld: - image: scalableminds/slurm-docker-cluster:master__11274637426 + image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14751549755 command: [ "slurmctld" ] container_name: slurmctld environment: @@ -50,7 +50,7 @@ services: - "slurmdbd" c1: - image: scalableminds/slurm-docker-cluster:master__11274637426 + image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14751549755 command: [ "slurmd" ] hostname: c1 container_name: c1 @@ -68,7 +68,7 @@ services: - "slurmctld" c2: - image: scalableminds/slurm-docker-cluster:master__11274637426 + image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14751549755 command: [ "slurmd" ] hostname: c2 container_name: c2 From 56177ec078969fb024a304d27e2f340be594c915 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 30 Apr 2025 13:44:28 +0200 Subject: [PATCH 02/45] fix slurm config and new slurm image --- cluster_tools/dockered-slurm/docker-compose.yml | 8 ++++---- cluster_tools/dockered-slurm/slurm.conf | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cluster_tools/dockered-slurm/docker-compose.yml b/cluster_tools/dockered-slurm/docker-compose.yml index 4f18d0cd4..73c89843c 100644 --- a/cluster_tools/dockered-slurm/docker-compose.yml +++ b/cluster_tools/dockered-slurm/docker-compose.yml @@ -13,7 +13,7 @@ services: - ..:/cluster_tools slurmdbd: - image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14751549755 + image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14753584102 command: [ "slurmdbd" ] container_name: slurmdbd hostname: slurmdbd @@ -29,7 +29,7 @@ services: - mysql slurmctld: - image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14751549755 + image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14753584102 command: [ "slurmctld" ] container_name: slurmctld environment: @@ -50,7 +50,7 @@ services: - "slurmdbd" c1: - image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14751549755 + image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14753584102 command: [ "slurmd" ] hostname: c1 container_name: c1 @@ -68,7 +68,7 @@ services: - "slurmctld" c2: - image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14751549755 + image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14753584102 command: [ "slurmd" ] hostname: c2 container_name: c2 diff --git a/cluster_tools/dockered-slurm/slurm.conf b/cluster_tools/dockered-slurm/slurm.conf index aac01df67..5308702af 100644 --- a/cluster_tools/dockered-slurm/slurm.conf +++ 
b/cluster_tools/dockered-slurm/slurm.conf @@ -57,7 +57,7 @@ SchedulerType=sched/backfill #SchedulerAuth= #SchedulerPort= #SchedulerRootFilter= -SelectType=select/cons_res +SelectType=select/cons_tres SelectTypeParameters=CR_CPU_Memory #PriorityType=priority/multifactor #PriorityDecayHalfLife=14-0 From 3522a3efaae668d22e5f43c423671d4b487d4607 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 6 May 2025 16:24:57 +0200 Subject: [PATCH 03/45] split ci steps for better readability --- .github/workflows/ci.yml | 176 +++++++++++++++++++++++---------------- 1 file changed, 103 insertions(+), 73 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fa439966c..11f43af9d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,38 +29,34 @@ jobs: cluster_tools: - 'cluster_tools/**' - cluster_tools: + cluster_tools[slurm]: needs: changes if: ${{ needs.changes.outputs.cluster_tools == 'true' }} runs-on: ubuntu-latest timeout-minutes: 30 strategy: - max-parallel: 4 matrix: - executors: [multiprocessing, slurm, kubernetes, dask] python-version: ["3.13", "3.12", "3.11", "3.10"] - defaults: - run: - working-directory: cluster_tools steps: - - uses: actions/checkout@v3 - - name: Install uv - uses: astral-sh/setup-uv@v3 + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v6 with: version: "0.6.3" enable-cache: true cache-dependency-glob: "cluster_tools/uv.lock" - - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - - name: Build/pull dockered-slurm image - if: ${{ matrix.executors == 'slurm' }} + - run: uv python install ${{ matrix.python-version }} + - name: Start Docker Cluster run: | + set -xe cd ./dockered-slurm - - echo docker compose up docker compose up -d - + - name: Install UV dependencies + run: | + for name in "slurmctld" "c1" "c2"; do + docker exec -w /cluster_tools "$name" bash -c "uv sync --frozen" + done + - name: Wait for Cluster to become ready + run: | # Register cluster (with retry) for i in {1..5}; do echo register_cluster @@ -68,82 +64,116 @@ jobs: sleep 10 done - # Show log output for debugging - docker logs slurmctld - docker logs c1 - docker logs c2 - - # Run setup.py on all three nodes - docker exec -w /cluster_tools slurmctld bash -c "uv sync --frozen" & - docker exec -w /cluster_tools c1 bash -c "uv sync --frozen" & - docker exec -w /cluster_tools c2 bash -c "uv sync --frozen" & - wait - - - name: Setup Kubernetes-in-Docker - if: ${{ matrix.executors == 'kubernetes' }} + for name in "slurmctld" "c1" "c2"; do + docker logs "$name" + done + - name: "Run Tests (test_all, test_slurm)" run: | - curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.11.1/kind-linux-amd64 - chmod +x ./kind - sed -i "s#__PATH__#$(pwd)#g" tests/cluster-config.yaml - ./kind create cluster --config=tests/cluster-config.yaml - ./kind export kubeconfig - - docker build \ - --build-arg PYTHON_VERSION=${{ matrix.python-version }} \ - -f tests/Dockerfile \ - -t scalableminds/cluster-tools:latest \ - . 
- ./kind load docker-image scalableminds/cluster-tools:latest - - - name: Install dependencies (without docker) - if: ${{ matrix.executors == 'multiprocessing' }} + docker exec \ + -w /cluster_tools/tests \ + -e PYTEST_EXECUTORS=slurm \ + slurmctld bash -c "uv run --frozen python -m pytest -sv test_all.py test_slurm.py" + - name: "Run Tests (test_deref_main)" run: | - uv sync --frozen + docker exec \ + -w /cluster_tools/tests \ + slurmctld bash -c "uv run --frozen python test_deref_main.py" + cluster_tools[multiprocessing]: + needs: changes + if: ${{ needs.changes.outputs.cluster_tools == 'true' }} + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ["3.13", "3.12", "3.11", "3.10"] + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.6.3" + enable-cache: true + cache-dependency-glob: "cluster_tools/uv.lock" + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} - name: Install dependencies (without docker) - if: ${{ matrix.executors == 'kubernetes' || matrix.executors == 'dask' }} - run: | - uv sync --all-extras --frozen - + run: uv sync --frozen - name: Check typing - if: ${{ matrix.executors == 'multiprocessing' && matrix.python-version == '3.11' }} + if: ${{ matrix.python-version == '3.11' }} run: ./typecheck.sh - - name: Check formatting - if: ${{ matrix.executors == 'multiprocessing' && matrix.python-version == '3.11' }} + if: ${{ matrix.python-version == '3.11' }} run: ./format.sh check - - name: Lint code - if: ${{ matrix.executors == 'multiprocessing' && matrix.python-version == '3.11' }} + if: ${{ matrix.python-version == '3.11' }} run: ./lint.sh - - name: Run multiprocessing tests - if: ${{ matrix.executors == 'multiprocessing' }} run: | cd tests PYTEST_EXECUTORS=multiprocessing,sequential,multiprocessing_with_pickling,sequential_with_pickling \ uv run --frozen python -m pytest -sv test_all.py test_multiprocessing.py - - name: Run slurm tests - if: ${{ matrix.executors == 'slurm' }} + cluster_tools[kubernetes]: + needs: changes + if: ${{ needs.changes.outputs.cluster_tools == 'true' }} + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ["3.13", "3.12", "3.11", "3.10"] + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.6.3" + enable-cache: true + cache-dependency-glob: "cluster_tools/uv.lock" + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + - name: Setup Kubernetes-in-Docker run: | - cd ./dockered-slurm - docker exec \ - -w /cluster_tools/tests \ - -e PYTEST_EXECUTORS=slurm \ - slurmctld bash -c "uv run --frozen python -m pytest -sv test_all.py test_slurm.py" - docker exec \ - -w /cluster_tools/tests \ - slurmctld bash -c "uv run --frozen python test_deref_main.py" + curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.11.1/kind-linux-amd64 + chmod +x ./kind + sed -i "s#__PATH__#$(pwd)#g" tests/cluster-config.yaml + ./kind create cluster --config=tests/cluster-config.yaml + ./kind export kubeconfig - - name: Run kubernetes tests - if: ${{ matrix.executors == 'kubernetes' }} + docker build \ + --build-arg PYTHON_VERSION=${{ matrix.python-version }} \ + -f tests/Dockerfile \ + -t scalableminds/cluster-tools:latest \ + . 
+ ./kind load docker-image scalableminds/cluster-tools:latest + - name: Install dependencies (without docker) + run: uv sync --all-extras --frozen + - name: "Run Kubernetes" run: | cd tests PYTEST_EXECUTORS=kubernetes uv run --frozen python -m pytest -sv test_all.py test_kubernetes.py - - name: Run dask tests - if: ${{ matrix.executors == 'dask' }} + cluster_tools[dask]: + needs: changes + if: ${{ needs.changes.outputs.cluster_tools == 'true' }} + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ["3.13", "3.12", "3.11", "3.10"] + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.6.3" + enable-cache: true + cache-dependency-glob: "cluster_tools/uv.lock" + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + - name: Install dependencies (without docker) + run: uv sync --all-extras --frozen + - name: "Run Kubernetes" run: | cd tests PYTEST_EXECUTORS=dask uv run --frozen python -m pytest -sv test_all.py test_dask.py @@ -177,7 +207,7 @@ jobs: - name: Install proxay run: npm install -g proxay - + - name: Set up Python ${{ matrix.python-version }} run: uv python install ${{ matrix.python-version }} @@ -258,7 +288,7 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} thresholdAll: 0.8 thresholdNew: 0.8 - + - name: Cleanup temporary files run: rm -rf ~/coverage-files From 190a35c733e7b4ead317c89d2912b16c280f848c Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 6 May 2025 16:29:47 +0200 Subject: [PATCH 04/45] fix ci names --- .github/workflows/ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 11f43af9d..1d62ca843 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: cluster_tools: - 'cluster_tools/**' - cluster_tools[slurm]: + cluster_tools_slurm: needs: changes if: ${{ needs.changes.outputs.cluster_tools == 'true' }} runs-on: ubuntu-latest @@ -79,7 +79,7 @@ jobs: -w /cluster_tools/tests \ slurmctld bash -c "uv run --frozen python test_deref_main.py" - cluster_tools[multiprocessing]: + cluster_tools_multiprocessing: needs: changes if: ${{ needs.changes.outputs.cluster_tools == 'true' }} runs-on: ubuntu-latest @@ -114,7 +114,7 @@ jobs: PYTEST_EXECUTORS=multiprocessing,sequential,multiprocessing_with_pickling,sequential_with_pickling \ uv run --frozen python -m pytest -sv test_all.py test_multiprocessing.py - cluster_tools[kubernetes]: + cluster_tools_kubernetes: needs: changes if: ${{ needs.changes.outputs.cluster_tools == 'true' }} runs-on: ubuntu-latest @@ -153,7 +153,7 @@ jobs: cd tests PYTEST_EXECUTORS=kubernetes uv run --frozen python -m pytest -sv test_all.py test_kubernetes.py - cluster_tools[dask]: + cluster_tools_dask: needs: changes if: ${{ needs.changes.outputs.cluster_tools == 'true' }} runs-on: ubuntu-latest @@ -187,7 +187,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: ["3.12", "3.13", "3.11", "3.10"] + python-version: ["3.13", "3.12", "3.11", "3.10"] group: [1, 2, 3] fail-fast: false defaults: From 32dc40a973d31eae7903bc2b2e699cd100a7062d Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 6 May 2025 16:34:13 +0200 Subject: [PATCH 05/45] fix job references --- .github/workflows/ci.yml | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1d62ca843..b924eb705 100644 --- 
a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -293,7 +293,12 @@ jobs: run: rm -rf ~/coverage-files webknossos_cli_docker: - needs: [cluster_tools, webknossos_linux] + needs: + - cluster_tools_slurm + - cluster_tools_multiprocessing + - cluster_tools_kubernetes + - cluster_tools_dask + - webknossos_linux if: | always() && !contains(needs.*.result, 'failure') && @@ -365,7 +370,12 @@ jobs: docker push scalableminds/webknossos-cli:$NORMALIZED_CI_BRANCH docs: - needs: [cluster_tools, webknossos_linux] + needs: + - cluster_tools_slurm + - cluster_tools_multiprocessing + - cluster_tools_kubernetes + - cluster_tools_dask + - webknossos_linux runs-on: ubuntu-latest if: | always() && @@ -421,7 +431,12 @@ jobs: "$SLACK_HOOK" pypi_and_gh_release: - needs: [cluster_tools, webknossos_linux] + needs: + - cluster_tools_slurm + - cluster_tools_multiprocessing + - cluster_tools_kubernetes + - cluster_tools_dask + - webknossos_linux if: | always() && !contains(needs.*.result, 'failure') && @@ -459,7 +474,10 @@ jobs: complete: needs: [ - cluster_tools, + cluster_tools_dask, + cluster_tools_kubernetes, + cluster_tools_multiprocessing, + cluster_tools_slurm, webknossos_linux, webknossos_cli_docker, docs, From 065830dc2b37826d54f3c8a67bbc6459a291157f Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 6 May 2025 16:38:26 +0200 Subject: [PATCH 06/45] set default working directory --- .github/workflows/ci.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b924eb705..9e371de31 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,6 +37,9 @@ jobs: strategy: matrix: python-version: ["3.13", "3.12", "3.11", "3.10"] + defaults: + run: + working-directory: cluster_tools steps: - uses: actions/checkout@v4 - uses: astral-sh/setup-uv@v6 @@ -87,6 +90,9 @@ jobs: strategy: matrix: python-version: ["3.13", "3.12", "3.11", "3.10"] + defaults: + run: + working-directory: cluster_tools steps: - uses: actions/checkout@v4 - name: Install uv @@ -122,6 +128,9 @@ jobs: strategy: matrix: python-version: ["3.13", "3.12", "3.11", "3.10"] + defaults: + run: + working-directory: cluster_tools steps: - uses: actions/checkout@v4 - name: Install uv @@ -161,6 +170,9 @@ jobs: strategy: matrix: python-version: ["3.13", "3.12", "3.11", "3.10"] + defaults: + run: + working-directory: cluster_tools steps: - uses: actions/checkout@v4 - name: Install uv From b047832c1001e288413cdef7b4934cb31fa154f6 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 7 May 2025 17:56:02 +0200 Subject: [PATCH 07/45] ci: docker compose: improve startup --- .github/workflows/ci.yml | 12 ------ .../dockered-slurm/docker-compose.yml | 37 ++++++++++++++----- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9e371de31..fa4a2c18c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,18 +58,6 @@ jobs: for name in "slurmctld" "c1" "c2"; do docker exec -w /cluster_tools "$name" bash -c "uv sync --frozen" done - - name: Wait for Cluster to become ready - run: | - # Register cluster (with retry) - for i in {1..5}; do - echo register_cluster - ./register_cluster.sh && s=0 && break || s=$? 
- sleep 10 - done - - for name in "slurmctld" "c1" "c2"; do - docker logs "$name" - done - name: "Run Tests (test_all, test_slurm)" run: | docker exec \ diff --git a/cluster_tools/dockered-slurm/docker-compose.yml b/cluster_tools/dockered-slurm/docker-compose.yml index 73c89843c..7a8afa119 100644 --- a/cluster_tools/dockered-slurm/docker-compose.yml +++ b/cluster_tools/dockered-slurm/docker-compose.yml @@ -1,6 +1,6 @@ services: mysql: - image: mysql:5.7 + image: mysql:9.3 hostname: mysql container_name: mysql environment: @@ -8,12 +8,17 @@ services: MYSQL_DATABASE: slurm_acct_db MYSQL_USER: slurm MYSQL_PASSWORD: password + healthcheck: + test: ["CMD", "mysqladmin", "ping", "-h", "localhost"] + interval: 5s + timeout: 5s + retries: 5 volumes: - var_lib_mysql:/var/lib/mysql - ..:/cluster_tools slurmdbd: - image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian__14753584102 + image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian command: [ "slurmdbd" ] container_name: slurmdbd hostname: slurmdbd @@ -25,11 +30,17 @@ services: - var_log_slurm:/var/log/slurm expose: - "6819" + healthcheck: + test: ["CMD", "bash", "-c", " Date: Wed, 7 May 2025 18:01:36 +0200 Subject: [PATCH 08/45] update test script --- cluster_tools/tests/test_slurm.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index f0672e1d5..0f72cc8e7 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -45,9 +45,10 @@ def expect_fork() -> bool: def search_and_replace_in_slurm_config(search_string: str, replace_string: str) -> None: - chcall( - f"sed -ci 's/{search_string}/{replace_string}/g' /etc/slurm/slurm.conf && scontrol reconfigure" - ) + chcall(f"sed 's/{search_string}/{replace_string}/g' /etc/slurm/slurm.conf > /etc/slurm/slurm.conf.bak") + chcall(f"cp /etc/slurm/slurm.conf.bak /etc/slurm/slurm.conf") + chcall("scontrol reconfigure") + sleep(310) def test_map_with_spawn() -> None: @@ -273,6 +274,7 @@ def test_slurm_number_of_submitted_jobs() -> None: assert executor.get_number_of_submitted_jobs() == 0 +@pytest.mark.slurm_change_config def test_slurm_max_array_size() -> None: max_array_size = 2 @@ -282,7 +284,8 @@ def test_slurm_max_array_size() -> None: command = f"MaxArraySize={max_array_size}" try: - chcall(f"echo -e '{command}' >> /etc/slurm/slurm.conf && scontrol reconfigure") + chcall(f"echo '{command}' >> /etc/slurm/slurm.conf && scontrol reconfigure") + sleep(310) new_max_array_size = executor.get_max_array_size() assert new_max_array_size == max_array_size @@ -323,6 +326,7 @@ def test_slurm_time_limit() -> None: ) +@pytest.mark.slurm_change_config def test_slurm_memory_limit() -> None: # Request 1 MB executor = cluster_tools.get_executor( From 6cbc36c480c9d7f9da63a8fe0497df6e80e90998 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 7 May 2025 18:02:40 +0200 Subject: [PATCH 09/45] only run test without config change --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fa4a2c18c..c409390f4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,7 +63,7 @@ jobs: docker exec \ -w /cluster_tools/tests \ -e PYTEST_EXECUTORS=slurm \ - slurmctld bash -c "uv run --frozen python -m pytest -sv test_all.py test_slurm.py" + slurmctld bash -c "uv run --frozen python -m pytest -sv test_all.py test_slurm.py -m 'not slurm_change_config'" - 
name: "Run Tests (test_deref_main)" run: | docker exec \ From f4ac2bca2b3096f06634d2c8caaa20622f1803ed Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 7 May 2025 18:06:19 +0200 Subject: [PATCH 10/45] ci: fix logging and make slurm node privilged --- .github/workflows/ci.yml | 5 +++++ cluster_tools/dockered-slurm/docker-compose.yml | 2 ++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c409390f4..500a0e7bb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,6 +53,11 @@ jobs: set -xe cd ./dockered-slurm docker compose up -d + - name: Log Core Container + run: | + for name in "slurmctld" "c1" "c2"; do + docker logs "$name" + done - name: Install UV dependencies run: | for name in "slurmctld" "c1" "c2"; do diff --git a/cluster_tools/dockered-slurm/docker-compose.yml b/cluster_tools/dockered-slurm/docker-compose.yml index 7a8afa119..032160902 100644 --- a/cluster_tools/dockered-slurm/docker-compose.yml +++ b/cluster_tools/dockered-slurm/docker-compose.yml @@ -69,6 +69,7 @@ services: c1: image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian command: [ "slurmd" ] + privileged: true hostname: c1 container_name: c1 volumes: @@ -88,6 +89,7 @@ services: c2: image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian command: [ "slurmd" ] + privileged: true hostname: c2 container_name: c2 volumes: From 8429f2320421704cb55e22918297f04f491d89f9 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 7 May 2025 18:20:22 +0200 Subject: [PATCH 11/45] apply format --- cluster_tools/tests/test_slurm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 0f72cc8e7..bfb34a6e9 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -45,7 +45,9 @@ def expect_fork() -> bool: def search_and_replace_in_slurm_config(search_string: str, replace_string: str) -> None: - chcall(f"sed 's/{search_string}/{replace_string}/g' /etc/slurm/slurm.conf > /etc/slurm/slurm.conf.bak") + chcall( + f"sed 's/{search_string}/{replace_string}/g' /etc/slurm/slurm.conf > /etc/slurm/slurm.conf.bak" + ) chcall(f"cp /etc/slurm/slurm.conf.bak /etc/slurm/slurm.conf") chcall("scontrol reconfigure") sleep(310) From f358045d02fe02f9f70662a0e261d7f7d9995829 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 7 May 2025 18:49:12 +0200 Subject: [PATCH 12/45] ci: add extra slurm test run --- .github/workflows/ci.yml | 22 +++++-- cluster_tools/dockered-slurm/slurm.conf | 24 ++------ cluster_tools/tests/test_slurm.py | 77 +++++++------------------ 3 files changed, 43 insertions(+), 80 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 500a0e7bb..4e9b2c5e6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,10 +49,7 @@ jobs: cache-dependency-glob: "cluster_tools/uv.lock" - run: uv python install ${{ matrix.python-version }} - name: Start Docker Cluster - run: | - set -xe - cd ./dockered-slurm - docker compose up -d + run: cd ./dockered-slurm && docker compose up -d - name: Log Core Container run: | for name in "slurmctld" "c1" "c2"; do @@ -75,6 +72,23 @@ jobs: -w /cluster_tools/tests \ slurmctld bash -c "uv run --frozen python test_deref_main.py" + - name: Stop Slurm Cluster + run: cd ./dockered-slurm && docker compose stop + - name: Update Slurm Config + run: | + echo "MaxArraySize=2" >> ./dockered-slurm/slurm.conf + sed 
"s/JobAcctGatherFrequency=30/JobAcctGatherFrequency=1/g" ./dockered-slurm/slurm.conf > ./dockered-slurm/slurm.conf.bak + cp ./dockered-slurm/slurm.conf.bak ./dockered-slurm/slurm.conf + - name: Start Slurm Cluster + run: cd ./dockered-slurm && docker compose start + + - name: "Run Tests (test_all, test_slurm)" + run: | + docker exec \ + -w /cluster_tools/tests \ + -e PYTEST_EXECUTORS=slurm \ + slurmctld bash -c "uv run --frozen python -m pytest -sv test_slurm.py -m 'slurm_change_config'" + cluster_tools_multiprocessing: needs: changes if: ${{ needs.changes.outputs.cluster_tools == 'true' }} diff --git a/cluster_tools/dockered-slurm/slurm.conf b/cluster_tools/dockered-slurm/slurm.conf index 5308702af..6d570e885 100644 --- a/cluster_tools/dockered-slurm/slurm.conf +++ b/cluster_tools/dockered-slurm/slurm.conf @@ -3,8 +3,7 @@ # See the slurm.conf man page for more information. # ClusterName=linux -ControlMachine=slurmctld -ControlAddr=slurmctld +SlurmctldHost=slurmctld #BackupController= #BackupAddr= # @@ -22,22 +21,7 @@ MpiDefault=none SlurmctldPidFile=/var/run/slurmd/slurmctld.pid SlurmdPidFile=/var/run/slurmd/slurmd.pid ProctrackType=proctrack/linuxproc -#PluginDir= -#FirstJobId= -ReturnToService=0 -#MaxJobCount= -#PlugStackConfig= -#PropagatePrioProcess= -#PropagateResourceLimits= -#PropagateResourceLimitsExcept= -#Prolog= -#Epilog= -#SrunProlog= -#SrunEpilog= -#TaskProlog= -#TaskEpilog= -# The task/affinity plugin needs to be enabled for the SLURM_CPU_BIND environment variable to have an effect. -# We test for a regression that's related to CPU binding in the `test_cpu_bind_regression` test in `test_slurm.py`. + TaskPlugin=task/affinity #TrackWCKey=no #TreeWidth=50 @@ -69,9 +53,9 @@ SelectTypeParameters=CR_CPU_Memory #PriorityMaxAge=1-0 # # LOGGING -SlurmctldDebug=3 +SlurmctldDebug=debug SlurmctldLogFile=/var/log/slurm/slurmctld.log -SlurmdDebug=3 +SlurmdDebug=debug SlurmdLogFile=/var/log/slurm/slurmd.log JobCompType=jobcomp/filetxt JobCompLoc=/var/log/slurm/jobcomp.log diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index bfb34a6e9..898d1b2d9 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -44,15 +44,6 @@ def expect_fork() -> bool: return True -def search_and_replace_in_slurm_config(search_string: str, replace_string: str) -> None: - chcall( - f"sed 's/{search_string}/{replace_string}/g' /etc/slurm/slurm.conf > /etc/slurm/slurm.conf.bak" - ) - chcall(f"cp /etc/slurm/slurm.conf.bak /etc/slurm/slurm.conf") - chcall("scontrol reconfigure") - sleep(310) - - def test_map_with_spawn() -> None: with cluster_tools.get_executor( "slurm", max_workers=5, start_method="spawn" @@ -278,33 +269,20 @@ def test_slurm_number_of_submitted_jobs() -> None: @pytest.mark.slurm_change_config def test_slurm_max_array_size() -> None: - max_array_size = 2 + expected_max_array_size = 2 - executor = cluster_tools.get_executor("slurm", debug=True) - original_max_array_size = executor.get_max_array_size() - - command = f"MaxArraySize={max_array_size}" + max_array_size = executor.get_max_array_size() + assert max_array_size == expected_max_array_size - try: - chcall(f"echo '{command}' >> /etc/slurm/slurm.conf && scontrol reconfigure") - sleep(310) + with executor: + futures = executor.map_to_futures(square, range(6)) + concurrent.futures.wait(futures) + job_ids = [fut.cluster_jobid for fut in futures] # type: ignore[attr-defined] - new_max_array_size = executor.get_max_array_size() - assert new_max_array_size == max_array_size + # Count how 
often each job_id occurs which corresponds to the array size of the job + occurrences = list(Counter(job_ids).values()) - with executor: - futures = executor.map_to_futures(square, range(6)) - concurrent.futures.wait(futures) - job_ids = [fut.cluster_jobid for fut in futures] # type: ignore[attr-defined] - - # Count how often each job_id occurs which corresponds to the array size of the job - occurrences = list(Counter(job_ids).values()) - - assert all(array_size <= max_array_size for array_size in occurrences) - finally: - search_and_replace_in_slurm_config(command, "") - reset_max_array_size = executor.get_max_array_size() - assert reset_max_array_size == original_max_array_size + assert all(array_size <= expected_max_array_size for array_size in occurrences) @pytest.mark.skip( @@ -335,32 +313,19 @@ def test_slurm_memory_limit() -> None: "slurm", debug=True, job_resources={"mem": "1M"} ) - original_gather_frequency_config = "JobAcctGatherFrequency=30" # from slurm.conf - new_gather_frequency_config = "JobAcctGatherFrequency=1" - - try: - # Increase the frequency at which slurm checks whether a job uses too much memory - search_and_replace_in_slurm_config( - original_gather_frequency_config, new_gather_frequency_config + with executor: + # Schedule a job that allocates more than 1 MB and let it run for more than 1 second + # because the frequency of the memory polling is 1 second + duration = 3 + futures = executor.map_to_futures( + partial(allocate, duration), [1024 * 1024 * 2] ) + concurrent.futures.wait(futures) - with executor: - # Schedule a job that allocates more than 1 MB and let it run for more than 1 second - # because the frequency of the memory polling is 1 second - duration = 3 - futures = executor.map_to_futures( - partial(allocate, duration), [1024 * 1024 * 2] - ) - concurrent.futures.wait(futures) - - # Job should have been killed with a RemoteOutOfMemoryException - assert all( - isinstance(fut.exception(), cluster_tools.RemoteOutOfMemoryException) - for fut in futures - ) - finally: - search_and_replace_in_slurm_config( - new_gather_frequency_config, original_gather_frequency_config + # Job should have been killed with a RemoteOutOfMemoryException + assert all( + isinstance(fut.exception(), cluster_tools.RemoteOutOfMemoryException) + for fut in futures ) From dbbcbbdc1b4e31848c42f27294080fd219758445 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 7 May 2025 18:53:17 +0200 Subject: [PATCH 13/45] add missing executor --- cluster_tools/tests/test_slurm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 898d1b2d9..6d9927db7 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -271,6 +271,8 @@ def test_slurm_number_of_submitted_jobs() -> None: def test_slurm_max_array_size() -> None: expected_max_array_size = 2 + executor = cluster_tools.get_executor("slurm", debug=True) + max_array_size = executor.get_max_array_size() assert max_array_size == expected_max_array_size From ebf99af9b162c184a7e987b5b776b3d7c1d3c80e Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 13:34:01 +0200 Subject: [PATCH 14/45] register pytest mark --- cluster_tools/pyproject.toml | 12 ++++++++---- cluster_tools/tests/test_slurm.py | 2 ++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cluster_tools/pyproject.toml b/cluster_tools/pyproject.toml index 0b62a0a59..d1d935017 100644 --- a/cluster_tools/pyproject.toml +++ b/cluster_tools/pyproject.toml @@ 
-2,9 +2,10 @@ name = "cluster_tools" version = "0.0.0" # filled by dunamai description = "Utility library for easily distributing code execution on clusters" -authors = [{name= "scalable minds", email="hello@scalableminds.com"}] +authors = [{name= "scalable minds", email="hello@scalableminds.com"}] readme = "README.md" -license = {text = "MIT"} +license-files = ["LICENSE"] +license = "MIT" requires-python = ">=3.10" dependencies = [ "typing-extensions ~=4.12.0", @@ -39,8 +40,6 @@ exclude = ["cluster_tools.tests"] # This is a fix for an issue in setuptools. See: https://github.com/pypa/setuptools/issues/4759 # This should be removed when the issue is resolved. -[tool.setuptools] -license-files = [] [tool.ruff] # Exclude a variety of commonly ignored directories. @@ -108,3 +107,8 @@ namespace_packages = true strict_equality = true show_error_codes = true no_implicit_optional = true + +[tool.pytest.ini_options] +markers = [ + "slurm_config_change: marks a test which needs additional changes to the slurm.conf", +] diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 6d9927db7..1ea062c6f 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -324,6 +324,8 @@ def test_slurm_memory_limit() -> None: ) concurrent.futures.wait(futures) + print([fut.exception() for fut in futures]) + # Job should have been killed with a RemoteOutOfMemoryException assert all( isinstance(fut.exception(), cluster_tools.RemoteOutOfMemoryException) From 4387776cb51ed0d23915090c5bd9147226a2838b Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 13:47:15 +0200 Subject: [PATCH 15/45] move marker to .py and print slurm logs --- cluster_tools/pyproject.toml | 5 ----- cluster_tools/tests/test_slurm.py | 8 ++++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cluster_tools/pyproject.toml b/cluster_tools/pyproject.toml index d1d935017..f7f8d0307 100644 --- a/cluster_tools/pyproject.toml +++ b/cluster_tools/pyproject.toml @@ -107,8 +107,3 @@ namespace_packages = true strict_equality = true show_error_codes = true no_implicit_optional = true - -[tool.pytest.ini_options] -markers = [ - "slurm_config_change: marks a test which needs additional changes to the slurm.conf", -] diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 1ea062c6f..8ac8762af 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -36,6 +36,10 @@ def allocate(duration: float, num_bytes: int) -> int: return sys.getsizeof(data) +def pytest_configuration(config): + config.addinivalue_line("makers", "slurm_change_config") + + logging.basicConfig() @@ -324,6 +328,10 @@ def test_slurm_memory_limit() -> None: ) concurrent.futures.wait(futures) + with open(".cfut/slurmpy.91_0.log.stdout", "r") as file: + print("=== LOG ===") + print(file.readall().decode()) + print([fut.exception() for fut in futures]) # Job should have been killed with a RemoteOutOfMemoryException From 08d6e182fe28171ba1624d3532064fb9f3568982 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 13:58:38 +0200 Subject: [PATCH 16/45] fix python read --- cluster_tools/tests/test_slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 8ac8762af..10aa2fb0c 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -330,7 +330,7 @@ def test_slurm_memory_limit() -> None: with 
open(".cfut/slurmpy.91_0.log.stdout", "r") as file: print("=== LOG ===") - print(file.readall().decode()) + print(file.read().decode()) print([fut.exception() for fut in futures]) From 0c08004a64e74570ddc26e3622660bed174fe8c8 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 14:00:38 +0200 Subject: [PATCH 17/45] remove unnecessary decode --- cluster_tools/tests/test_slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 10aa2fb0c..39d206b34 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -330,7 +330,7 @@ def test_slurm_memory_limit() -> None: with open(".cfut/slurmpy.91_0.log.stdout", "r") as file: print("=== LOG ===") - print(file.read().decode()) + print(file.read()) print([fut.exception() for fut in futures]) From 936cc077e8d2b4248e9a478b7201210bc1c9ce0d Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 14:03:32 +0200 Subject: [PATCH 18/45] add pytest_configuration type annotation --- cluster_tools/tests/test_slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 39d206b34..68e8360b8 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -36,7 +36,7 @@ def allocate(duration: float, num_bytes: int) -> int: return sys.getsizeof(data) -def pytest_configuration(config): +def pytest_configuration(config) -> None: config.addinivalue_line("makers", "slurm_change_config") From d99f7d7c8607e11e76ea97a57bb487f27a7d0284 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 14:32:55 +0200 Subject: [PATCH 19/45] print seff output --- cluster_tools/cluster_tools/schedulers/slurm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 55e0ba50b..0d657c8fb 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -439,6 +439,8 @@ def parse_key_value_pairs( if exit_code != 0: return None + print("seff", stdout) + # Parse stdout into a key-value object properties = parse_key_value_pairs(stdout, "\n", ":") From aa794bd20bc82e546843c22eaf3d1eb5a3329747 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 14:49:49 +0200 Subject: [PATCH 20/45] add more prints --- cluster_tools/cluster_tools/schedulers/slurm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 0d657c8fb..6496d7f5b 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -410,6 +410,8 @@ def investigate_failed_job( # of the job: If the job was killed with signal 9, it's very likely due to some # resource limit. + print("JobId", job_id_with_index) + def parse_key_value_pairs( text: str, pair_delimiter: str, key_value_delimiter: str ) -> dict[str, str]: @@ -423,7 +425,8 @@ def parse_key_value_pairs( # Call `scontrol show jobid=` which should return some output including # key=value pairs, such as: "Reason=...", "TimeLimit=...", and "RunTime=..." 
- stdout, _, exit_code = call(f"scontrol show jobid={job_id_with_index}") + stdout, stderr, exit_code = call(f"scontrol show jobid={job_id_with_index}") + print("exit code", exit_code, stderr) if exit_code == 0: # Parse stdout into a key-value object @@ -436,6 +439,7 @@ def parse_key_value_pairs( # Call `seff job_id` which should return some output including a line, # such as: "Memory Efficiency: 25019.18% of 1.00 GB" stdout, _, exit_code = call(f"seff {job_id_with_index}") + print("seff exit code", exit_code) if exit_code != 0: return None From f0f35bb1476b3de4ecb7a7910ee87cf835740bcb Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 15:20:59 +0200 Subject: [PATCH 21/45] make seff optional --- cluster_tools/cluster_tools/schedulers/slurm.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 6496d7f5b..25e632139 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -428,6 +428,7 @@ def parse_key_value_pairs( stdout, stderr, exit_code = call(f"scontrol show jobid={job_id_with_index}") print("exit code", exit_code, stderr) + properties = None if exit_code == 0: # Parse stdout into a key-value object properties = parse_key_value_pairs(stdout, " ", "=") @@ -440,17 +441,18 @@ def parse_key_value_pairs( # such as: "Memory Efficiency: 25019.18% of 1.00 GB" stdout, _, exit_code = call(f"seff {job_id_with_index}") print("seff exit code", exit_code) - if exit_code != 0: + if exit_code == 0: return None - print("seff", stdout) + # Parse stdout into a key-value object + properties = parse_key_value_pairs(stdout, "\n", ":") - # Parse stdout into a key-value object - properties = parse_key_value_pairs(stdout, "\n", ":") + memory_limit_investigation = self._investigate_memory_consumption(properties) + if memory_limit_investigation: + return memory_limit_investigation - memory_limit_investigation = self._investigate_memory_consumption(properties) - if memory_limit_investigation: - return memory_limit_investigation + if properties is None: + return None return self._investigate_exit_code(properties) From e0dc6b8ced8202f191a0b3dd2bc81e971ef6fab2 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 15:31:37 +0200 Subject: [PATCH 22/45] print properties --- cluster_tools/cluster_tools/schedulers/slurm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 25e632139..c2eeedf05 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -451,6 +451,7 @@ def parse_key_value_pairs( if memory_limit_investigation: return memory_limit_investigation + print("properties", properties) if properties is None: return None From a083455cebc570fb90f3602b5f71da850d093b69 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 15 May 2025 16:29:10 +0200 Subject: [PATCH 23/45] replace seff with sacct --- .../cluster_tools/schedulers/slurm.py | 30 +++---------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index c2eeedf05..8294db4b1 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -439,11 +439,9 @@ def parse_key_value_pairs( # Call `seff job_id` which 
should return some output including a line, # such as: "Memory Efficiency: 25019.18% of 1.00 GB" - stdout, _, exit_code = call(f"seff {job_id_with_index}") + stdout, _, exit_code = call(f"sacct -P -j {job_id_with_index}") print("seff exit code", exit_code) if exit_code == 0: - return None - # Parse stdout into a key-value object properties = parse_key_value_pairs(stdout, "\n", ":") @@ -474,31 +472,11 @@ def _investigate_time_limit( reason = f"The job was probably terminated because it ran for too long ({time_limit_note})." return (reason, RemoteTimeLimitException) - def _investigate_memory_consumption( - self, properties: dict[str, str] - ) -> tuple[str, type[RemoteOutOfMemoryException]] | None: - if not properties.get("Memory Efficiency", None): - return None - - # Extract the "25019.18% of 1.00 GB" part of the line - efficiency_note = properties["Memory Efficiency"] - PERCENTAGE_REGEX = r"([0-9]+(\.[0-9]+)?)%" - - # Extract the percentage to see whether it exceeds 100%. - match = re.search(PERCENTAGE_REGEX, efficiency_note) - percentage = None - if match is None: - return None - - try: - percentage = float(match.group(1)) - except ValueError: - return None - - if percentage < 100: + def _investigate_memory_consumption(self, stdout: str) -> tuple[str, type[RemoteOutOfMemoryException]] | None: + if "OUT_OF_MEMORY" not in stdout: return None - reason = f"The job was probably terminated because it consumed too much memory ({efficiency_note})." + reason = "The job was probably terminated because it consumed too much memory." return (reason, RemoteOutOfMemoryException) def _investigate_exit_code( From a49895133a4390391bd66436af319764da20d632 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 20 May 2025 11:36:16 +0200 Subject: [PATCH 24/45] detect out of memory using sacct --- cluster_tools/cluster_tools/schedulers/slurm.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 8294db4b1..97fa942d1 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -439,13 +439,11 @@ def parse_key_value_pairs( # Call `seff job_id` which should return some output including a line, # such as: "Memory Efficiency: 25019.18% of 1.00 GB" - stdout, _, exit_code = call(f"sacct -P -j {job_id_with_index}") - print("seff exit code", exit_code) + stdout, _, exit_code = call(f"sacct -P --format=JobID,State -j {job_id_with_index}") + print("sacct exit code", exit_code) if exit_code == 0: # Parse stdout into a key-value object - properties = parse_key_value_pairs(stdout, "\n", ":") - - memory_limit_investigation = self._investigate_memory_consumption(properties) + memory_limit_investigation = self._investigate_memory_consumption(stdout) if memory_limit_investigation: return memory_limit_investigation From 0dc1a3c77c28ad51de8f4fe6dcac495016fc7c51 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 20 May 2025 11:43:28 +0200 Subject: [PATCH 25/45] sacct print stdout --- cluster_tools/cluster_tools/schedulers/slurm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 97fa942d1..d5f82c2aa 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -440,6 +440,7 @@ def parse_key_value_pairs( # Call `seff job_id` which should return some output including a line, # such as: 
"Memory Efficiency: 25019.18% of 1.00 GB" stdout, _, exit_code = call(f"sacct -P --format=JobID,State -j {job_id_with_index}") + print("sacct stdout:\n", stdout) print("sacct exit code", exit_code) if exit_code == 0: # Parse stdout into a key-value object From 572eb036032956d6ebff418aa8f121b52d3f95d6 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 20 May 2025 11:58:33 +0200 Subject: [PATCH 26/45] slurm conf constrain ram space --- cluster_tools/dockered-slurm/cgroup.conf | 1 + cluster_tools/dockered-slurm/slurm.conf | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cluster_tools/dockered-slurm/cgroup.conf b/cluster_tools/dockered-slurm/cgroup.conf index 981525869..0097ebbb8 100644 --- a/cluster_tools/dockered-slurm/cgroup.conf +++ b/cluster_tools/dockered-slurm/cgroup.conf @@ -1,3 +1,4 @@ # autodetect which is the default detects cgroup/v2 in the github CI, # which fails during the initialization of the c1 and c2 nodes CgroupPlugin=cgroup/v1 +ConstrainRAMSpace=yes diff --git a/cluster_tools/dockered-slurm/slurm.conf b/cluster_tools/dockered-slurm/slurm.conf index 6d570e885..5faa5b110 100644 --- a/cluster_tools/dockered-slurm/slurm.conf +++ b/cluster_tools/dockered-slurm/slurm.conf @@ -20,9 +20,9 @@ SwitchType=switch/none MpiDefault=none SlurmctldPidFile=/var/run/slurmd/slurmctld.pid SlurmdPidFile=/var/run/slurmd/slurmd.pid -ProctrackType=proctrack/linuxproc +ProctrackType=proctrack/cgroup -TaskPlugin=task/affinity +TaskPlugin=task/cgroup,task/affinity #TrackWCKey=no #TreeWidth=50 #TmpFS= @@ -61,7 +61,7 @@ JobCompType=jobcomp/filetxt JobCompLoc=/var/log/slurm/jobcomp.log # # ACCOUNTING -JobAcctGatherType=jobacct_gather/linux +JobAcctGatherType=jobacct_gather/cgroup JobAcctGatherFrequency=30 JobAcctGatherParams=OverMemoryKill # From b9335af36740e14fe77567379d8aa5dadfba3cb1 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 20 May 2025 13:06:11 +0200 Subject: [PATCH 27/45] slurm job acct: disable over memory kill --- cluster_tools/dockered-slurm/slurm.conf | 1 - 1 file changed, 1 deletion(-) diff --git a/cluster_tools/dockered-slurm/slurm.conf b/cluster_tools/dockered-slurm/slurm.conf index 5faa5b110..f31d80489 100644 --- a/cluster_tools/dockered-slurm/slurm.conf +++ b/cluster_tools/dockered-slurm/slurm.conf @@ -63,7 +63,6 @@ JobCompLoc=/var/log/slurm/jobcomp.log # ACCOUNTING JobAcctGatherType=jobacct_gather/cgroup JobAcctGatherFrequency=30 -JobAcctGatherParams=OverMemoryKill # AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=slurmdbd From a51ade291d6902ced950c46a89fb91eb05b78a0d Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 20 May 2025 13:29:49 +0200 Subject: [PATCH 28/45] switch cgroup v2 and ignore systemd --- cluster_tools/dockered-slurm/cgroup.conf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cluster_tools/dockered-slurm/cgroup.conf b/cluster_tools/dockered-slurm/cgroup.conf index 0097ebbb8..605d285e4 100644 --- a/cluster_tools/dockered-slurm/cgroup.conf +++ b/cluster_tools/dockered-slurm/cgroup.conf @@ -1,4 +1,5 @@ # autodetect which is the default detects cgroup/v2 in the github CI, # which fails during the initialization of the c1 and c2 nodes -CgroupPlugin=cgroup/v1 +CgroupPlugin=cgroup/v2 ConstrainRAMSpace=yes +IgnoreSystemd=yes From bb5db2905b40efbbffbe14b55b825a21ddd66291 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 21 May 2025 18:04:22 +0200 Subject: [PATCH 29/45] fix out of memory detection --- .../cluster_tools/schedulers/slurm.py | 33 ++++++++++++++++--- 
cluster_tools/dockered-slurm/cgroup.conf | 1 + .../dockered-slurm/docker-compose.yml | 2 ++ cluster_tools/tests/test_slurm.py | 8 ++--- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index d5f82c2aa..f890e3ef9 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -439,9 +439,10 @@ def parse_key_value_pairs( # Call `seff job_id` which should return some output including a line, # such as: "Memory Efficiency: 25019.18% of 1.00 GB" - stdout, _, exit_code = call(f"sacct -P --format=JobID,State -j {job_id_with_index}") + stdout, _, exit_code = call(f"sacct -P --format=JobID,State,MaxRSS,ReqMem --unit K -j {job_id_with_index}") print("sacct stdout:\n", stdout) print("sacct exit code", exit_code) + if exit_code == 0: # Parse stdout into a key-value object memory_limit_investigation = self._investigate_memory_consumption(stdout) @@ -472,10 +473,34 @@ def _investigate_time_limit( return (reason, RemoteTimeLimitException) def _investigate_memory_consumption(self, stdout: str) -> tuple[str, type[RemoteOutOfMemoryException]] | None: - if "OUT_OF_MEMORY" not in stdout: + stdout_lines = stdout.splitlines() + max_rss = 0 + req_mem = 0 + if len(stdout_lines) > 0: + for line in stdout_lines[1:]: + params = line.split("|") + print(params) + try: + print(params[2][:-1]) + max_rss = max(max_rss, int(params[2][:-1])) + except: + pass + try: + print(params[3][:-1]) + req_mem = max(max_rss, int(params[3][:-1])) + except: + pass + + if "OUT_OF_MEMORY" in stdout: + # Check if task plugin killed the job. This is the case if cgroup is used as TaskPlugin and + # memory limits are enforced. + reason = f"The job was terminated because it consumed too much memory (Requested: {req_mem / 1000} MB)." + elif max_rss > req_mem: + # Check if job accounting canceled the job. This is the case if JobAcctGatherParam=OverMemoryKill + # is enabled. + reason = f"The job was probably terminated because it consumed too much memory. Required {max_rss / 1000} MB but requeseted {req_mem / 1000} MB." + else: return None - - reason = "The job was probably terminated because it consumed too much memory." 
return (reason, RemoteOutOfMemoryException) def _investigate_exit_code( diff --git a/cluster_tools/dockered-slurm/cgroup.conf b/cluster_tools/dockered-slurm/cgroup.conf index 605d285e4..5455f2622 100644 --- a/cluster_tools/dockered-slurm/cgroup.conf +++ b/cluster_tools/dockered-slurm/cgroup.conf @@ -2,4 +2,5 @@ # which fails during the initialization of the c1 and c2 nodes CgroupPlugin=cgroup/v2 ConstrainRAMSpace=yes +ConstrainSwapSpace=yes IgnoreSystemd=yes diff --git a/cluster_tools/dockered-slurm/docker-compose.yml b/cluster_tools/dockered-slurm/docker-compose.yml index 032160902..8e21cd387 100644 --- a/cluster_tools/dockered-slurm/docker-compose.yml +++ b/cluster_tools/dockered-slurm/docker-compose.yml @@ -72,6 +72,7 @@ services: privileged: true hostname: c1 container_name: c1 + init: true volumes: - etc_munge:/etc/munge - etc_slurm:/etc/slurm @@ -92,6 +93,7 @@ services: privileged: true hostname: c2 container_name: c2 + init: true volumes: - etc_munge:/etc/munge - etc_slurm:/etc/slurm diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 68e8360b8..6aba43029 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -316,7 +316,7 @@ def test_slurm_time_limit() -> None: def test_slurm_memory_limit() -> None: # Request 1 MB executor = cluster_tools.get_executor( - "slurm", debug=True, job_resources={"mem": "1M"} + "slurm", debug=True, job_resources={"mem": "30M"} # 30M is minimal required memory ) with executor: @@ -324,14 +324,10 @@ def test_slurm_memory_limit() -> None: # because the frequency of the memory polling is 1 second duration = 3 futures = executor.map_to_futures( - partial(allocate, duration), [1024 * 1024 * 2] + partial(allocate, duration), [1024 * 1024 * 50] ) concurrent.futures.wait(futures) - with open(".cfut/slurmpy.91_0.log.stdout", "r") as file: - print("=== LOG ===") - print(file.read()) - print([fut.exception() for fut in futures]) # Job should have been killed with a RemoteOutOfMemoryException From 11fb6fbff3784d1f6edac54d984bed72ef3a182f Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 21 May 2025 18:14:32 +0200 Subject: [PATCH 30/45] restart slurm --- .github/workflows/ci.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4e9b2c5e6..d83c354b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,15 +72,13 @@ jobs: -w /cluster_tools/tests \ slurmctld bash -c "uv run --frozen python test_deref_main.py" - - name: Stop Slurm Cluster - run: cd ./dockered-slurm && docker compose stop - name: Update Slurm Config run: | echo "MaxArraySize=2" >> ./dockered-slurm/slurm.conf sed "s/JobAcctGatherFrequency=30/JobAcctGatherFrequency=1/g" ./dockered-slurm/slurm.conf > ./dockered-slurm/slurm.conf.bak cp ./dockered-slurm/slurm.conf.bak ./dockered-slurm/slurm.conf - name: Start Slurm Cluster - run: cd ./dockered-slurm && docker compose start + run: cd ./dockered-slurm && docker compose restart slurmctld c1 c2 - name: "Run Tests (test_all, test_slurm)" run: | From b0083f50ae1b9f3c26d467dc51bd2fa16e18c4e7 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Wed, 21 May 2025 18:14:38 +0200 Subject: [PATCH 31/45] remove pytest_configuration --- cluster_tools/tests/test_slurm.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 6aba43029..52c097ad0 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py 
@@ -36,10 +36,6 @@ def allocate(duration: float, num_bytes: int) -> int: return sys.getsizeof(data) -def pytest_configuration(config) -> None: - config.addinivalue_line("makers", "slurm_change_config") - - logging.basicConfig() From 3e9fe4c3cced82d4a4bb239ee5ed49a9770429f9 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 22 May 2025 14:17:54 +0200 Subject: [PATCH 32/45] apply format --- cluster_tools/cluster_tools/schedulers/slurm.py | 8 ++++++-- cluster_tools/tests/test_slurm.py | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index f890e3ef9..4b3341cc8 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -439,7 +439,9 @@ def parse_key_value_pairs( # Call `seff job_id` which should return some output including a line, # such as: "Memory Efficiency: 25019.18% of 1.00 GB" - stdout, _, exit_code = call(f"sacct -P --format=JobID,State,MaxRSS,ReqMem --unit K -j {job_id_with_index}") + stdout, _, exit_code = call( + f"sacct -P --format=JobID,State,MaxRSS,ReqMem --unit K -j {job_id_with_index}" + ) print("sacct stdout:\n", stdout) print("sacct exit code", exit_code) @@ -472,7 +474,9 @@ def _investigate_time_limit( reason = f"The job was probably terminated because it ran for too long ({time_limit_note})." return (reason, RemoteTimeLimitException) - def _investigate_memory_consumption(self, stdout: str) -> tuple[str, type[RemoteOutOfMemoryException]] | None: + def _investigate_memory_consumption( + self, stdout: str + ) -> tuple[str, type[RemoteOutOfMemoryException]] | None: stdout_lines = stdout.splitlines() max_rss = 0 req_mem = 0 diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 52c097ad0..be7017ffc 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -312,7 +312,9 @@ def test_slurm_time_limit() -> None: def test_slurm_memory_limit() -> None: # Request 1 MB executor = cluster_tools.get_executor( - "slurm", debug=True, job_resources={"mem": "30M"} # 30M is minimal required memory + "slurm", + debug=True, + job_resources={"mem": "30M"}, # 30M is minimal required memory ) with executor: From e8964b8d59e3d359affd0d614e823ca9132cbe8e Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 22 May 2025 14:21:11 +0200 Subject: [PATCH 33/45] retry gathering job information --- .../cluster_tools/schedulers/slurm.py | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 4b3341cc8..dead46fca 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -4,6 +4,7 @@ import os import re import sys +import time import threading from collections.abc import Callable, Iterable from concurrent.futures import Future @@ -437,20 +438,28 @@ def parse_key_value_pairs( if time_limit_investigation: return time_limit_investigation - # Call `seff job_id` which should return some output including a line, - # such as: "Memory Efficiency: 25019.18% of 1.00 GB" - stdout, _, exit_code = call( - f"sacct -P --format=JobID,State,MaxRSS,ReqMem --unit K -j {job_id_with_index}" - ) - print("sacct stdout:\n", stdout) - print("sacct exit code", exit_code) + # Request gathered job information. 
+ for _ in range(10): + stdout, _, exit_code = call( + f"sacct -P --format=JobID,State,MaxRSS,ReqMem --unit K -j {job_id_with_index}" + ) + print("sacct stdout:\n", stdout) + print("sacct exit code", exit_code) + + if exit_code != 0: + break + + if len(stdout.splitlines()) <= 1: + time.sleep(0.1) + continue - if exit_code == 0: # Parse stdout into a key-value object memory_limit_investigation = self._investigate_memory_consumption(stdout) if memory_limit_investigation: return memory_limit_investigation + break + print("properties", properties) if properties is None: return None From e9aa7c452928edf38831b20884ef3a28acb23cf5 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 22 May 2025 14:22:54 +0200 Subject: [PATCH 34/45] fix linting errors --- cluster_tools/cluster_tools/schedulers/slurm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index dead46fca..8079c30b0 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -4,8 +4,8 @@ import os import re import sys -import time import threading +import time from collections.abc import Callable, Iterable from concurrent.futures import Future from functools import lru_cache @@ -496,12 +496,12 @@ def _investigate_memory_consumption( try: print(params[2][:-1]) max_rss = max(max_rss, int(params[2][:-1])) - except: + except Exception: pass try: print(params[3][:-1]) req_mem = max(max_rss, int(params[3][:-1])) - except: + except Exception: pass if "OUT_OF_MEMORY" in stdout: From 8d67f8b599a02b8c7e1a72f5c2991657c6235c61 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 22 May 2025 14:27:30 +0200 Subject: [PATCH 35/45] remove max parallel ci jobs --- .github/workflows/ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d83c354b7..a844792c5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -202,7 +202,6 @@ jobs: ${{ needs.changes.outputs.webknossos == 'true' }} runs-on: ubuntu-latest strategy: - max-parallel: 4 matrix: python-version: ["3.13", "3.12", "3.11", "3.10"] group: [1, 2, 3] From 164e62c9700be150cccaecf1ed2f1325b2c78d4b Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 22 May 2025 14:40:57 +0200 Subject: [PATCH 36/45] decrease sacct request frequencies --- cluster_tools/cluster_tools/schedulers/slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 8079c30b0..7723ef48c 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -450,7 +450,7 @@ def parse_key_value_pairs( break if len(stdout.splitlines()) <= 1: - time.sleep(0.1) + time.sleep(0.2) continue # Parse stdout into a key-value object From 5f2d0d845b51a6c33dd2f5e546cda852efb7ae56 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 22 May 2025 14:44:56 +0200 Subject: [PATCH 37/45] remove prints --- cluster_tools/cluster_tools/schedulers/slurm.py | 6 ------ cluster_tools/tests/test_slurm.py | 2 -- 2 files changed, 8 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 7723ef48c..f2e05e273 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -443,8 +443,6 @@ def 
parse_key_value_pairs( stdout, _, exit_code = call( f"sacct -P --format=JobID,State,MaxRSS,ReqMem --unit K -j {job_id_with_index}" ) - print("sacct stdout:\n", stdout) - print("sacct exit code", exit_code) if exit_code != 0: break @@ -460,7 +458,6 @@ def parse_key_value_pairs( break - print("properties", properties) if properties is None: return None @@ -492,14 +489,11 @@ def _investigate_memory_consumption( if len(stdout_lines) > 0: for line in stdout_lines[1:]: params = line.split("|") - print(params) try: - print(params[2][:-1]) max_rss = max(max_rss, int(params[2][:-1])) except Exception: pass try: - print(params[3][:-1]) req_mem = max(max_rss, int(params[3][:-1])) except Exception: pass diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index be7017ffc..0144ab214 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -326,8 +326,6 @@ def test_slurm_memory_limit() -> None: ) concurrent.futures.wait(futures) - print([fut.exception() for fut in futures]) - # Job should have been killed with a RemoteOutOfMemoryException assert all( isinstance(fut.exception(), cluster_tools.RemoteOutOfMemoryException) From 5b3674386814d468abc7a26e274f68bc01d876b4 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Thu, 22 May 2025 14:52:41 +0200 Subject: [PATCH 38/45] remove prints 2 --- cluster_tools/cluster_tools/schedulers/slurm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index f2e05e273..1a2580bb4 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -411,8 +411,6 @@ def investigate_failed_job( # of the job: If the job was killed with signal 9, it's very likely due to some # resource limit. - print("JobId", job_id_with_index) - def parse_key_value_pairs( text: str, pair_delimiter: str, key_value_delimiter: str ) -> dict[str, str]: @@ -427,7 +425,6 @@ def parse_key_value_pairs( # Call `scontrol show jobid=` which should return some output including # key=value pairs, such as: "Reason=...", "TimeLimit=...", and "RunTime=..." 
stdout, stderr, exit_code = call(f"scontrol show jobid={job_id_with_index}") - print("exit code", exit_code, stderr) properties = None if exit_code == 0: From 7a3b25825c2ab183741c751c026cb0a59ffe26ca Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 27 May 2025 09:16:53 +0200 Subject: [PATCH 39/45] use maste docker cluster --- cluster_tools/dockered-slurm/docker-compose.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cluster_tools/dockered-slurm/docker-compose.yml b/cluster_tools/dockered-slurm/docker-compose.yml index 8e21cd387..f7abffc6a 100644 --- a/cluster_tools/dockered-slurm/docker-compose.yml +++ b/cluster_tools/dockered-slurm/docker-compose.yml @@ -18,7 +18,7 @@ services: - ..:/cluster_tools slurmdbd: - image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian + image: scalableminds/slurm-docker-cluster:master command: [ "slurmdbd" ] container_name: slurmdbd hostname: slurmdbd @@ -40,7 +40,7 @@ services: condition: service_healthy slurmctld: - image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian + image: scalableminds/slurm-docker-cluster:master command: [ "slurmctld" ] container_name: slurmctld environment: @@ -67,7 +67,7 @@ services: condition: service_healthy c1: - image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian + image: scalableminds/slurm-docker-cluster:master command: [ "slurmd" ] privileged: true hostname: c1 @@ -88,7 +88,7 @@ services: condition: service_healthy c2: - image: scalableminds/slurm-docker-cluster:port_slurm_in_docker_to_debian + image: scalableminds/slurm-docker-cluster:master command: [ "slurmd" ] privileged: true hostname: c2 From 76602e3c579840750d915b08b21d514f39d3ab85 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 27 May 2025 10:11:17 +0200 Subject: [PATCH 40/45] apply suggestions --- .github/workflows/ci.yml | 17 +++---- .../cluster_tools/schedulers/slurm.py | 44 +++++++++---------- cluster_tools/tests/test_slurm.py | 10 ++--- 3 files changed, 35 insertions(+), 36 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a844792c5..9bce19eda 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,12 +60,12 @@ jobs: for name in "slurmctld" "c1" "c2"; do docker exec -w /cluster_tools "$name" bash -c "uv sync --frozen" done - - name: "Run Tests (test_all, test_slurm)" + - name: "Run Tests (test_all, test_slurm) without modified slurm.conf" run: | docker exec \ -w /cluster_tools/tests \ -e PYTEST_EXECUTORS=slurm \ - slurmctld bash -c "uv run --frozen python -m pytest -sv test_all.py test_slurm.py -m 'not slurm_change_config'" + slurmctld bash -c "uv run --frozen python -m pytest -sv test_all.py test_slurm.py -m 'not requires_modified_slurm_config'" - name: "Run Tests (test_deref_main)" run: | docker exec \ @@ -75,17 +75,18 @@ jobs: - name: Update Slurm Config run: | echo "MaxArraySize=2" >> ./dockered-slurm/slurm.conf - sed "s/JobAcctGatherFrequency=30/JobAcctGatherFrequency=1/g" ./dockered-slurm/slurm.conf > ./dockered-slurm/slurm.conf.bak - cp ./dockered-slurm/slurm.conf.bak ./dockered-slurm/slurm.conf - - name: Start Slurm Cluster + sed "s/JobAcctGatherFrequency=30/JobAcctGatherFrequency=1/g" ./dockered-slurm/slurm.conf > ./dockered-slurm/slurm.conf.tmp + mv ./dockered-slurm/slurm.conf.tmp ./dockered-slurm/slurm.conf + - name: Restart Slurm Cluster run: cd ./dockered-slurm && docker compose restart slurmctld c1 c2 - - name: "Run Tests (test_all, test_slurm)" + - name: "Run Tests (test_all, test_slurm) with 
modified slurn.conf" run: | + # Run tests requiring a modified slurm config docker exec \ -w /cluster_tools/tests \ -e PYTEST_EXECUTORS=slurm \ - slurmctld bash -c "uv run --frozen python -m pytest -sv test_slurm.py -m 'slurm_change_config'" + slurmctld bash -c "uv run --frozen python -m pytest -sv test_slurm.py -m 'requires_modified_slurm_config'" cluster_tools_multiprocessing: needs: changes @@ -190,7 +191,7 @@ jobs: run: uv python install ${{ matrix.python-version }} - name: Install dependencies (without docker) run: uv sync --all-extras --frozen - - name: "Run Kubernetes" + - name: "Run Dask" run: | cd tests PYTEST_EXECUTORS=dask uv run --frozen python -m pytest -sv test_all.py test_dask.py diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 1a2580bb4..3241b0473 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -443,17 +443,14 @@ def parse_key_value_pairs( if exit_code != 0: break - - if len(stdout.splitlines()) <= 1: + elif len(stdout_lines()) <= 1: time.sleep(0.2) - continue - - # Parse stdout into a key-value object - memory_limit_investigation = self._investigate_memory_consumption(stdout) - if memory_limit_investigation: - return memory_limit_investigation - - break + else: + # Parse stdout into a key-value object + memory_limit_investigation = self._investigate_memory_consumption(stdout) + if memory_limit_investigation: + return memory_limit_investigation + break if properties is None: return None @@ -483,19 +480,20 @@ def _investigate_memory_consumption( stdout_lines = stdout.splitlines() max_rss = 0 req_mem = 0 - if len(stdout_lines) > 0: - for line in stdout_lines[1:]: - params = line.split("|") - try: - max_rss = max(max_rss, int(params[2][:-1])) - except Exception: - pass - try: - req_mem = max(max_rss, int(params[3][:-1])) - except Exception: - pass - - if "OUT_OF_MEMORY" in stdout: + states = [] + # Table Format: + # + # JobID|State|MaxRSS|ReqMem + # 91_0|FAILED||30720K + # 91_0.batch|FAILED|248K| + # 91_0.0|OUT_OF_MEMORY|164K| + linefilter = re.compile(r"^([^|]*)\|(\w*)\|(\d*)K?\|(\d*)K?$", re.MULTILINE) + for job_id, state, rss, mem in linefilter.findall(text): + max_rss = max(max_rss, int(rss or 0)) + req_mem = max(req_mem, int(mem or 0)) + states += [state] + + if "OUT_OF_MEMORY" in states: # Check if task plugin killed the job. This is the case if cgroup is used as TaskPlugin and # memory limits are enforced. reason = f"The job was terminated because it consumed too much memory (Requested: {req_mem / 1000} MB)." 
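For reference, a minimal standalone sketch of how the pipe-delimited `sacct` table from the comment above reduces to the maximum RSS, the requested memory, and the per-step states; the sample output, the assertion values, and the variable names outside the regex are illustrative assumptions, not output captured from a real cluster:

    import re

    # Illustrative `sacct -P --format=JobID,State,MaxRSS,ReqMem --unit K` output (assumed);
    # all sizes are reported in KB because of `--unit K`.
    sample = (
        "JobID|State|MaxRSS|ReqMem\n"
        "91_0|FAILED||30720K\n"
        "91_0.batch|FAILED|248K|\n"
        "91_0.0|OUT_OF_MEMORY|164K|\n"
    )

    linefilter = re.compile(r"^([^|]*)\|(\w*)\|(\d*)K?\|(\d*)K?$", re.MULTILINE)
    max_rss, req_mem, states = 0, 0, []
    for _job_id, state, rss, mem in linefilter.findall(sample):
        max_rss = max(max_rss, int(rss or 0))  # MaxRSS column (KB); empty for the array parent
        req_mem = max(req_mem, int(mem or 0))  # ReqMem column (KB); only set on the array parent
        states.append(state)

    # The header row does not match the regex, so only the three job records contribute.
    assert (max_rss, req_mem) == (248, 30720)
    assert "OUT_OF_MEMORY" in states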
diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 0144ab214..491b14fe3 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -267,7 +267,7 @@ def test_slurm_number_of_submitted_jobs() -> None: assert executor.get_number_of_submitted_jobs() == 0 -@pytest.mark.slurm_change_config +@pytest.mark.requires_modified_slurm_config def test_slurm_max_array_size() -> None: expected_max_array_size = 2 @@ -308,17 +308,17 @@ def test_slurm_time_limit() -> None: ) -@pytest.mark.slurm_change_config +@pytest.mark.requires_modified_slurm_config def test_slurm_memory_limit() -> None: - # Request 1 MB + # Request 30 MB executor = cluster_tools.get_executor( "slurm", debug=True, - job_resources={"mem": "30M"}, # 30M is minimal required memory + job_resources={"mem": "30M"}, # 30M is the smallest limited enforced by Cgroups ) with executor: - # Schedule a job that allocates more than 1 MB and let it run for more than 1 second + # Schedule a job that allocates more than 30 MB and let it run for more than 1 second # because the frequency of the memory polling is 1 second duration = 3 futures = executor.map_to_futures( From 5286668dde5c1581a4d0a073bafb336a044bcd32 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 27 May 2025 10:15:05 +0200 Subject: [PATCH 41/45] fix lints and format --- cluster_tools/cluster_tools/schedulers/slurm.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cluster_tools/cluster_tools/schedulers/slurm.py b/cluster_tools/cluster_tools/schedulers/slurm.py index 3241b0473..0af69b062 100644 --- a/cluster_tools/cluster_tools/schedulers/slurm.py +++ b/cluster_tools/cluster_tools/schedulers/slurm.py @@ -440,14 +440,17 @@ def parse_key_value_pairs( stdout, _, exit_code = call( f"sacct -P --format=JobID,State,MaxRSS,ReqMem --unit K -j {job_id_with_index}" ) + stdout_lines = stdout.splitlines() if exit_code != 0: break - elif len(stdout_lines()) <= 1: + elif len(stdout_lines) <= 1: time.sleep(0.2) else: # Parse stdout into a key-value object - memory_limit_investigation = self._investigate_memory_consumption(stdout) + memory_limit_investigation = self._investigate_memory_consumption( + stdout + ) if memory_limit_investigation: return memory_limit_investigation break @@ -477,18 +480,17 @@ def _investigate_time_limit( def _investigate_memory_consumption( self, stdout: str ) -> tuple[str, type[RemoteOutOfMemoryException]] | None: - stdout_lines = stdout.splitlines() - max_rss = 0 - req_mem = 0 - states = [] # Table Format: # # JobID|State|MaxRSS|ReqMem # 91_0|FAILED||30720K # 91_0.batch|FAILED|248K| # 91_0.0|OUT_OF_MEMORY|164K| + max_rss = 0 + req_mem = 0 + states = [] linefilter = re.compile(r"^([^|]*)\|(\w*)\|(\d*)K?\|(\d*)K?$", re.MULTILINE) - for job_id, state, rss, mem in linefilter.findall(text): + for job_id, state, rss, mem in linefilter.findall(stdout): max_rss = max(max_rss, int(rss or 0)) req_mem = max(req_mem, int(mem or 0)) states += [state] From 26eeaf1830e76c76623070c2e5b9a53df3a88eb9 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 27 May 2025 10:19:16 +0200 Subject: [PATCH 42/45] add Changelog entry --- cluster_tools/Changelog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/cluster_tools/Changelog.md b/cluster_tools/Changelog.md index 89f3299e3..811e2a1a5 100644 --- a/cluster_tools/Changelog.md +++ b/cluster_tools/Changelog.md @@ -14,6 +14,7 @@ For upgrade instructions, please check the respective *Breaking Changes* section ### Added ### Changed +- Use 
`sacct` to detect out of memory errors instead of `seff` for Slurm executor. [#1297](https://github.com/scalableminds/webknossos-libs/pull/1297) ### Fixed From 20c2236b9d66fdf62934f4eb8e085d0e65c35ab5 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 27 May 2025 10:21:46 +0200 Subject: [PATCH 43/45] add missing setuptools entry --- cluster_tools/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cluster_tools/pyproject.toml b/cluster_tools/pyproject.toml index f7f8d0307..d9ac4cec2 100644 --- a/cluster_tools/pyproject.toml +++ b/cluster_tools/pyproject.toml @@ -40,6 +40,8 @@ exclude = ["cluster_tools.tests"] # This is a fix for an issue in setuptools. See: https://github.com/pypa/setuptools/issues/4759 # This should be removed when the issue is resolved. +[tool.setuptools] +license-files = [] [tool.ruff] # Exclude a variety of commonly ignored directories. From e3ed6921a15b30b1c76befd26463fb8e8190cb16 Mon Sep 17 00:00:00 2001 From: Robert Oleynik Date: Tue, 27 May 2025 11:52:48 +0200 Subject: [PATCH 44/45] cluster_tools: remove setuptools entry --- cluster_tools/pyproject.toml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cluster_tools/pyproject.toml b/cluster_tools/pyproject.toml index d9ac4cec2..3b03093d9 100644 --- a/cluster_tools/pyproject.toml +++ b/cluster_tools/pyproject.toml @@ -38,11 +38,6 @@ where = ["."] include = ["cluster_tools*"] exclude = ["cluster_tools.tests"] -# This is a fix for an issue in setuptools. See: https://github.com/pypa/setuptools/issues/4759 -# This should be removed when the issue is resolved. -[tool.setuptools] -license-files = [] - [tool.ruff] # Exclude a variety of commonly ignored directories. exclude = [ From 110611d6d221681324608d11c89822ed30cbf7f2 Mon Sep 17 00:00:00 2001 From: robert-oleynik <62473688+robert-oleynik@users.noreply.github.com> Date: Tue, 27 May 2025 15:16:37 +0200 Subject: [PATCH 45/45] Update cluster_tools/tests/test_slurm.py Co-authored-by: Philipp Otto --- cluster_tools/tests/test_slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py index 491b14fe3..b847b5469 100644 --- a/cluster_tools/tests/test_slurm.py +++ b/cluster_tools/tests/test_slurm.py @@ -314,7 +314,7 @@ def test_slurm_memory_limit() -> None: executor = cluster_tools.get_executor( "slurm", debug=True, - job_resources={"mem": "30M"}, # 30M is the smallest limited enforced by Cgroups + job_resources={"mem": "30M"}, # 30M is the smallest limit enforced by Cgroups ) with executor:
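As background for the retry loop introduced in patch 33: `sacct` accounting records can lag behind the termination of a job, so the investigation polls until the table contains more than the header row. A minimal sketch of that polling pattern, assuming a `call()` helper that returns (stdout, stderr, exit_code) as in slurm.py; the function name, attempt count, and delay are illustrative rather than taken from the patch:

    import time
    from collections.abc import Callable

    def poll_sacct(
        job_id_with_index: str,
        call: Callable[[str], tuple[str, str, int]],
        attempts: int = 10,
        delay: float = 0.2,
    ) -> str | None:
        # Poll sacct until at least one record appears below the header line;
        # accounting data may not be written yet right after the job was killed.
        for _ in range(attempts):
            stdout, _, exit_code = call(
                f"sacct -P --format=JobID,State,MaxRSS,ReqMem --unit K -j {job_id_with_index}"
            )
            if exit_code != 0:
                return None  # sacct itself failed; nothing to investigate
            if len(stdout.splitlines()) > 1:
                return stdout  # header plus at least one job record
            time.sleep(delay)
        return None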