Kill leftover test processes before a CI retry.

Notes on this revision of the patch (text before the first "diff --git" line
is ignored by `git apply` and `patch`):
  * tests/utils/kill_process_group.py now owns the command-line entry point.
    The workflows execute that file as a script, so the entry point cannot
    live in the test module.
  * hvd-tests.yml calls the same Python helper as the other workflows instead
    of tests/kill_process_group.sh, which this patch does not create.
  * "index" lines are omitted for the files whose contents changed in this
    revision, because the recorded blob ids would no longer match.
  * Leading whitespace of context lines was rebuilt by hand; verify it against
    the target tree before applying.

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index 92345b3baed..59eef28d29b 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -121,18 +121,13 @@ jobs:
 
       - name: Run GPU Unit Tests
         continue-on-error: false
-        run: |
-
-          script=$(cat << EOF
-
-          set -xe
-
-          bash tests/run_gpu_tests.sh 2
-
-          EOF
-          )
-
-          docker exec -t pthd /bin/bash -c "${script}"
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 5
+          timeout_minutes: 30
+          shell: bash
+          command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
+          new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
 
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
diff --git a/.github/workflows/hvd-tests.yml b/.github/workflows/hvd-tests.yml
--- a/.github/workflows/hvd-tests.yml
+++ b/.github/workflows/hvd-tests.yml
@@ -81,6 +81,11 @@ jobs:
           timeout_minutes: 25
           shell: bash
           command: bash tests/run_cpu_tests.sh
+          on_retry_command: |
+            if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
+              echo sending kill signal until process group no longer exists...
+              python tests/utils/kill_process_group.py .ignite_testing.pid
+            fi
           new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh
 
       - name: Upload coverage to Codecov
diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml
index 62020d0aba1..73bcc7367e1 100644
--- a/.github/workflows/pytorch-version-tests.yml
+++ b/.github/workflows/pytorch-version-tests.yml
@@ -98,6 +98,11 @@ jobs:
           timeout_minutes: 25
           shell: bash
           command: bash tests/run_cpu_tests.sh "not test_time_profilers"
+          on_retry_command: |
+            if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
+              echo sending kill signal until process group no longer exists...
+              python tests/utils/kill_process_group.py .ignite_testing.pid
+            fi
           new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"
 
   # create-issue:
diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml
index a7a52949114..a9b27646ce2 100644
--- a/.github/workflows/tpu-tests.yml
+++ b/.github/workflows/tpu-tests.yml
@@ -97,6 +97,11 @@ jobs:
           command: |
             python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
             bash tests/run_tpu_tests.sh
+          on_retry_command: |
+            if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
+              echo sending kill signal until process group no longer exists...
+              python tests/utils/kill_process_group.py .ignite_testing.pid
+            fi
           new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
         env:
           LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index fd91bc18d4a..43695cf7376 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -126,6 +126,11 @@ jobs:
           timeout_minutes: 25
           shell: bash
           command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+          on_retry_command: |
+            if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
+              echo sending kill signal until process group no longer exists...
+              python tests/utils/kill_process_group.py .ignite_testing.pid
+            fi
           new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
 
       - name: Upload coverage to Codecov
diff --git a/tests/common-test-functionality.sh b/tests/common-test-functionality.sh
index 1670da14500..b289d6bb37f 100644
--- a/tests/common-test-functionality.sh
+++ b/tests/common-test-functionality.sh
@@ -17,6 +17,8 @@ run_tests() {
     local world_size=0
     # Always clean up pytest.ini
     trap 'rm -f pytest.ini' RETURN
+    # write out the process id so that we can send signals...
+    echo $$ > .ignite_testing.pid
     # Parse arguments
     while [[ $# -gt 0 ]]
     do
diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/utils/kill_process_group.py b/tests/utils/kill_process_group.py
new file mode 100644
--- /dev/null
+++ b/tests/utils/kill_process_group.py
@@ -0,0 +1,132 @@
+"""Kill the process group whose id is stored in a PID file.
+
+The CI workflows run this script between retries so that a hung test run does
+not leak processes into the next attempt:
+
+    python tests/utils/kill_process_group.py .ignite_testing.pid
+"""
+
+import os
+import signal
+import sys
+import time
+
+POLL_INTERVAL = 0.5  # seconds between two liveness checks
+INTERRUPT_POLLS = 9  # grace period after SIGINT: 9 * 0.5s = 4.5s
+KILL_POLLS = 4  # grace period after SIGKILL: 4 * 0.5s = 2s
+
+
+def _read_pid(pid_file):
+    """Return the id stored in ``pid_file``, or None (after printing why) if unusable."""
+    if not os.path.isfile(pid_file) or os.stat(pid_file).st_size == 0:
+        print(f"File {pid_file} does not exist or is empty")
+        return None
+    with open(pid_file, "r") as file:
+        content = file.read().strip()
+    try:
+        pid = int(content)
+    except ValueError:
+        pid = 0
+    # ids <= 1 are never valid targets: killpg(0) would signal our own group
+    # and killpg(1) every process we are allowed to signal
+    if pid <= 1:
+        print(f"File {pid_file} does not contain a valid process id")
+        return None
+    return pid
+
+
+def _group_exited(pgid, polls):
+    """Poll up to ``polls`` times and return True as soon as the group is gone."""
+    for _ in range(polls):
+        time.sleep(POLL_INTERVAL)
+        try:
+            # signal 0 delivers nothing, it only checks that the group exists
+            os.killpg(pgid, 0)
+        except ProcessLookupError:
+            return True
+    return False
+
+
+def kill_process_group(pid_file):
+    """Interrupt, then forcefully kill, the process group recorded in ``pid_file``.
+
+    Returns 0 if the group was terminated and 1 otherwise, so that the result
+    can be used directly as the exit code of the script.
+    """
+    if os.name != "posix":
+        # os.killpg does not exist on Windows
+        return kill_process_group_win(pid_file)
+
+    pgid = _read_pid(pid_file)
+    if pgid is None:
+        return 1
+
+    try:
+        os.killpg(pgid, signal.SIGINT)
+    except ProcessLookupError:
+        print(f"Process group {pgid} does not exist")
+        return 1
+    if _group_exited(pgid, INTERRUPT_POLLS):
+        print("Process group killed successfully with interrupt signal")
+        return 0
+
+    try:
+        os.killpg(pgid, signal.SIGKILL)
+    except ProcessLookupError:
+        # the group exited between the last poll and the escalation
+        print("Process group killed successfully with interrupt signal")
+        return 0
+    if _group_exited(pgid, KILL_POLLS):
+        print("Process group killed successfully with kill signal")
+        return 0
+
+    print(f"Failed to kill process group {pgid}")
+    return 1
+
+
+def kill_process_group_win(pid_file):
+    """Terminate, then kill, the single process recorded in ``pid_file``.
+
+    Fallback for platforms without POSIX process groups. Returns 0 on success
+    and 1 otherwise.
+    """
+    # imported lazily so that the POSIX code path does not depend on psutil
+    import psutil
+
+    pid = _read_pid(pid_file)
+    if pid is None:
+        return 1
+
+    try:
+        proc = psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        print(f"Process {pid} does not exist")
+        return 1
+
+    try:
+        proc.terminate()  # Sends SIGTERM on Unix, TerminateProcess on Windows
+        proc.wait(timeout=3)  # Wait up to 3 seconds for the process to terminate
+    except psutil.TimeoutExpired:
+        # Process did not terminate in time, kill it
+        try:
+            proc.kill()
+            proc.wait(timeout=1)  # Wait for the process to be killed
+        except Exception as e:
+            print(f"Failed to kill process {pid}: {e}")
+            return 1
+        print("Process killed successfully with kill signal")
+        return 0
+    except Exception as e:
+        print(f"Error when attempting to terminate process {pid}: {e}")
+        return 1
+
+    print("Process terminated successfully with terminate signal")
+    return 0
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Please provide the path to the PID file as a command line argument.")
+        print("Example: python kill_process_group.py /path/to/pid_file")
+        sys.exit(2)
+    sys.exit(kill_process_group(sys.argv[1]))
diff --git a/tests/utils/test_kill_process_group.py b/tests/utils/test_kill_process_group.py
new file mode 100644
--- /dev/null
+++ b/tests/utils/test_kill_process_group.py
@@ -0,0 +1,35 @@
+import os
+import time
+from unittest.mock import patch
+
+from .kill_process_group import kill_process_group
+
+
+# Attempt at creating interruptible command failed
+# command = """setsid sh -c 'trap "" SIGINT SIGTERM; sleep 1000' >/dev/null 2>&1 & echo $! > .kpg_testing.pid"""
+def test_kill_process_group_existing_persistent():
+    pid_file = ".kpg_testing.pid"
+    os.system("setsid sh -c 'sleep 1000' >/dev/null 2>&1 & echo $! > .kpg_testing.pid")
+    time.sleep(1)
+    message = "Process group killed successfully with kill signal"
+    with patch("builtins.print") as mock_print:
+        assert kill_process_group(pid_file) == 0
+        mock_print.assert_called_with(message)
+
+
+def test_kill_process_group_nonexistent_process():
+    pid_file = ".kpg_testing.pid"
+    with open(pid_file, "w") as file:
+        file.write("12345")
+    message = "Process group 12345 does not exist"
+    with patch("builtins.print") as mock_print:
+        assert kill_process_group(pid_file) == 1
+        mock_print.assert_called_with(message)
+
+
+def test_kill_process_group_no_file():
+    pid_file = ".kpg_testing_nonexistent.pid"
+    message = "File .kpg_testing_nonexistent.pid does not exist or is empty"
+    with patch("builtins.print") as mock_print:
+        assert kill_process_group(pid_file) == 1
+        mock_print.assert_called_with(message)