Skip to content

Commit

Permalink
add hack to make sure testing session is killed
Browse files Browse the repository at this point in the history
Currently tests are interrupted between retries using SIGTERM. This is not always
successful and results in simultaneous runs of the tests.

expand use of on_retry_command
  • Loading branch information
leej3 committed May 13, 2024
1 parent f32a215 commit f37e188
Show file tree
Hide file tree
Showing 9 changed files with 157 additions and 12 deletions.
19 changes: 7 additions & 12 deletions .github/workflows/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,18 +121,13 @@ jobs:
- name: Run GPU Unit Tests
continue-on-error: false
run: |
script=$(cat << EOF
set -xe
bash tests/run_gpu_tests.sh 2
EOF
)
docker exec -t pthd /bin/bash -c "${script}"
uses: nick-fields/retry@v3
with:
max_attempts: 5
timeout_minutes: 30
shell: bash
command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/hvd-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ jobs:
timeout_minutes: 25
shell: bash
command: bash tests/run_cpu_tests.sh
on_retry_command: |
  # Before retrying, make sure the previous test session is really gone so two
  # runs never overlap. Uses the same Python helper as the other workflows.
  if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
    echo sending kill signal until process group no longer exists...
    python tests/utils/kill_process_group.py .ignite_testing.pid
  fi
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/pytorch-version-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@ jobs:
timeout_minutes: 25
shell: bash
command: bash tests/run_cpu_tests.sh "not test_time_profilers"
on_retry_command: |
if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
echo sending kill signal until process group no longer exists...
python tests/utils/kill_process_group.py .ignite_testing.pid
fi
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"

# create-issue:
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/tpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ jobs:
command: |
python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
bash tests/run_tpu_tests.sh
on_retry_command: |
if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
echo sending kill signal until process group no longer exists...
python tests/utils/kill_process_group.py .ignite_testing.pid
fi
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
env:
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ jobs:
timeout_minutes: 25
shell: bash
command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
on_retry_command: |
if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
echo sending kill signal until process group no longer exists...
python tests/utils/kill_process_group.py .ignite_testing.pid
fi
new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
Expand Down
2 changes: 2 additions & 0 deletions tests/common-test-functionality.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ run_tests() {
local world_size=0
# Always clean up pytest.ini
trap 'rm -f pytest.ini' RETURN
# write out the process id so that we can send signals...
echo $$ > .ignite_testing.pid
# Parse arguments
while [[ $# -gt 0 ]]
do
Expand Down
Empty file added tests/utils/__init__.py
Empty file.
82 changes: 82 additions & 0 deletions tests/utils/kill_process_group.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os
import signal
import time

import psutil


def _process_group_gone(pgid, polls, interval=0.5):
    """Return True as soon as process group ``pgid`` has no members left.

    The group is probed with signal 0 at most ``polls`` times, sleeping
    ``interval`` seconds before each probe. Returns False if the group is
    still present after the last probe.
    """
    for _ in range(polls):
        time.sleep(interval)
        try:
            os.killpg(pgid, 0)
        except ProcessLookupError:
            return True
    return False


def kill_process_group(pid_file):
    """Stop the process group whose id is stored in ``pid_file``.

    The group is first sent SIGINT; if it is still present after 9 polls
    (0.5 s apart) it is sent SIGKILL and polled 4 more times.

    Args:
        pid_file: path to a text file containing a single integer id.

    Returns:
        0 if the group disappeared after being signalled, 1 otherwise
        (missing/empty/malformed file, invalid id, no such group, not
        permitted to signal it, or the group survived SIGKILL).
        On non-POSIX platforms the result of ``kill_process_group_win``.
    """
    if not os.path.isfile(pid_file) or os.stat(pid_file).st_size == 0:
        print(f"File {pid_file} does not exist or is empty")
        return 1

    with open(pid_file, "r") as file:
        content = file.read().strip()

    try:
        pgid = int(content)
    except ValueError:
        print(f"File {pid_file} does not contain a valid process id: {content!r}")
        return 1

    # killpg(0, sig) targets the *caller's* own process group, so a stale or
    # corrupted pid file must never be allowed to reach os.killpg.
    if pgid <= 0:
        print(f"Refusing to signal invalid process group id {pgid}")
        return 1

    if os.name != "posix":
        return kill_process_group_win(pid_file)

    try:
        # Check if the process group exists by sending a dummy signal (0)
        os.killpg(pgid, 0)
        # Ask politely first
        os.killpg(pgid, signal.SIGINT)
    except ProcessLookupError:
        print(f"Process group {pgid} does not exist")
        return 1
    except PermissionError:
        print(f"Not permitted to signal process group {pgid}")
        return 1

    if _process_group_gone(pgid, polls=9):
        print("Process group killed successfully with interrupt signal")
        return 0

    try:
        os.killpg(pgid, signal.SIGKILL)
    except ProcessLookupError:
        # The group exited between the last poll and the SIGKILL attempt,
        # so it was the interrupt that stopped it.
        print("Process group killed successfully with interrupt signal")
        return 0

    if _process_group_gone(pgid, polls=4):
        print("Process group killed successfully with kill signal")
        return 0

    print(f"Failed to kill process group {pgid}")
    return 1


def kill_process_group_win(pid_file):
    """Terminate the process whose id is stored in ``pid_file`` using psutil.

    Only the process itself is signalled here; its children are not touched.
    The process is first asked to terminate and, if it has not exited within
    3 seconds, it is killed and given 1 more second to disappear.

    Args:
        pid_file: path to a text file containing a single integer process id.

    Returns:
        0 if the process was terminated or killed, 1 otherwise
        (missing/empty/malformed file, invalid id, no such process, or an
        error while terminating/killing it).
    """
    if not os.path.isfile(pid_file) or os.stat(pid_file).st_size == 0:
        print(f"File {pid_file} does not exist or is empty")
        return 1

    with open(pid_file, "r") as file:
        content = file.read().strip()

    try:
        pid = int(content)
    except ValueError:
        print(f"File {pid_file} does not contain a valid process id: {content!r}")
        return 1

    # A negative number is never a valid process id; reject it here rather
    # than letting the lookup below fail with an unhandled error.
    if pid < 0:
        print(f"Refusing to signal invalid process id {pid}")
        return 1

    try:
        proc = psutil.Process(pid)
    except psutil.NoSuchProcess:
        print(f"Process {pid} does not exist")
        return 1

    # Try to terminate the process
    try:
        proc.terminate()  # Sends SIGTERM on Unix, TerminateProcess on Windows
        proc.wait(timeout=3)  # Wait up to 3 seconds for the process to terminate
    except psutil.TimeoutExpired:
        # Process did not terminate in time, kill it
        try:
            proc.kill()
            proc.wait(timeout=1)  # Wait for the process to be killed
            print("Process killed successfully with kill signal")
            return 0
        except Exception as e:
            print(f"Failed to kill process {pid}: {e}")
            return 1
    except Exception as e:
        print(f"Error when attempting to terminate process {pid}: {e}")
        return 1

    print("Process terminated successfully with terminate signal")
    return 0
46 changes: 46 additions & 0 deletions tests/utils/test_kill_process_group.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import sys
import time
from unittest.mock import patch

from .kill_process_group import kill_process_group


# An attempt to start a command that explicitly ignores SIGINT/SIGTERM did not
# work; the command is kept here for reference:
# command = """setsid sh -c 'trap "" SIGINT SIGTERM; sleep 1000' >/dev/null 2>&1 & echo $! > .kpg_testing.pid"""
def test_kill_process_group_existing_persistent():
    """A long-running process in its own session is stopped via the kill signal.

    The test expects the SIGKILL path to be taken, i.e. the spawned ``sleep``
    is expected to survive the initial interrupt (presumably because the
    backgrounded job ignores SIGINT -- TODO confirm).
    """
    pid_file = ".kpg_testing.pid"
    os.system("setsid sh -c 'sleep 1000' >/dev/null 2>&1 & echo $! > .kpg_testing.pid")
    # Give the shell time to spawn the session and write the pid file
    time.sleep(1)
    message = "Process group killed successfully with kill signal"
    try:
        with patch("builtins.print") as mock_print:
            assert kill_process_group(pid_file) == 0
        mock_print.assert_called_with(message)
    finally:
        # Do not leave the scratch pid file behind in the working directory
        if os.path.exists(pid_file):
            os.remove(pid_file)


def test_kill_process_group_nonexistent_process():
    """A pid file naming a process group that no longer exists yields status 1.

    Rather than hard-coding an id that might belong to a live process group
    on the machine running the tests, a short-lived child is started in its
    own session and reaped; its id is then used as the "dead" group id.
    """
    import subprocess

    finished = subprocess.Popen([sys.executable, "-c", "pass"], start_new_session=True)
    finished.wait()
    dead_pgid = finished.pid

    pid_file = ".kpg_testing.pid"
    with open(pid_file, "w") as file:
        file.write(str(dead_pgid))
    message = f"Process group {dead_pgid} does not exist"
    try:
        with patch("builtins.print") as mock_print:
            assert kill_process_group(pid_file) == 1
        mock_print.assert_called_with(message)
    finally:
        # Do not leave the scratch pid file behind in the working directory
        os.remove(pid_file)


def test_kill_process_group_no_file():
    """A path that does not exist is reported and yields status 1."""
    missing_path = ".kpg_testing_nonexistent.pid"
    expected_output = "File .kpg_testing_nonexistent.pid does not exist or is empty"

    with patch("builtins.print") as printed:
        status = kill_process_group(missing_path)
        assert status == 1

    # The mock keeps its call record after the patch is undone
    printed.assert_called_with(expected_output)


if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the pid file.
    if len(sys.argv) != 2:
        print("Please provide the path to the PID file as a command line argument.")
        print("Example: python kill_process_group.py /path/to/pid_file")
        # Exit non-zero so a wrong invocation is not mistaken for success
        sys.exit(2)

    # The exit status mirrors kill_process_group's return value
    sys.exit(kill_process_group(sys.argv[1]))

0 comments on commit f37e188

Please sign in to comment.