Debug container issue #1

Workflow file for this run

.github/workflows/_linux-test-arc.yml at 5f22ba0

	# Reusable workflow (invoked through `workflow_call`) that runs one test job
	# per entry of the caller-supplied test matrix.
	name: linux-test

	on:
	  workflow_call:
	    inputs:
	      build-environment:
	        required: true
	        type: string
	        description: Top-level label for what's being built/tested.
	      test-matrix:
	        required: true
	        type: string
	        description: JSON description of what test configs to run.
	      docker-image:
	        required: true
	        type: string
	        description: Docker image to run in.
	      sync-tag:
	        required: false
	        type: string
	        default: ""
	        description: |
	          If this is set, our linter will use this to make sure that every other
	          job with the same `sync-tag` is identical.
	      timeout-minutes:
	        required: false
	        type: number
	        default: 240
	        description: |
	          Set the maximum (in minutes) how long the workflow should take to finish
	      use-gha:
	        required: false
	        type: string
	        default: ""
	        description: If set to any value, upload to GHA. Otherwise upload to S3.
	      # Forwarded to the test container as the DASHBOARD_TAG environment variable.
	      dashboard-tag:
	        required: false
	        type: string
	        default: ""
	      s3-bucket:
	        description: S3 bucket to download artifact
	        required: false
	        type: string
	        default: "gha-artifacts"
	      aws-role-to-assume:
	        description: role to assume for downloading artifacts
	        required: false
	        type: string
	        default: ""
	    secrets:
	      HUGGING_FACE_HUB_TOKEN:
	        required: false
	        description: |
	          HF Auth token to avoid rate limits when downloading models or datasets from hub

	env:
	  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

	jobs:
	  test:
	    # Don't run on forked repos or empty test matrix
	    if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
	    strategy:
	      # One job per entry of the JSON matrix; a failing entry does not cancel the others.
	      matrix: ${{ fromJSON(inputs.test-matrix) }}
	      fail-fast: false
	    runs-on:
	      group: ${{ matrix.runner }}
	    # 600 minutes when the matrix entry sets mem_leak_check to 'mem_leak_check',
	    # otherwise the caller-supplied timeout-minutes input.
	    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
	    steps:
	# Skipped on runners whose name contains 'gcp.a100'.
	- name: Setup SSH (Click me for login details)
	  uses: pytorch/test-infra/.github/actions/setup-ssh@main
	  if: ${{ !contains(matrix.runner, 'gcp.a100') }}
	  with:
	    github-secret: ${{ secrets.GITHUB_TOKEN }}
	    instructions: |
	      All testing is done inside the container, to start an interactive session run:
	      docker exec -it $(docker container ps --format '{{.ID}}') bash

	# Check out the repository through the shared checkout-pytorch action.
	- name: Checkout PyTorch
	  uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

	# Repository-local action; it needs the checkout above to be present on disk.
	- name: Setup Linux
	  uses: ./.github/actions/setup-linux

	# Only runs when the caller passed a non-empty aws-role-to-assume input.
	- name: Configure aws credentials
	  if: ${{ inputs.aws-role-to-assume != '' }}
	  uses: aws-actions/configure-aws-credentials@v3
	  with:
	    aws-region: us-east-1
	    role-session-name: gha-linux-test
	    role-to-assume: ${{ inputs.aws-role-to-assume }}

	# Later steps read this step's `docker-image` output.
	- name: Calculate docker image
	  id: calculate-docker-image
	  uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
	  with:
	    docker-image-name: ${{ inputs.docker-image }}

	# Prints a `docker pull` command for the ghcr.io copy of the image.
	- name: Use following to pull public copy of the image
	  id: print-ghcr-mirror
	  env:
	    ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
	  shell: bash
	  run: |
	    # Keep only the part after the last '/', i.e. "name:tag".
	    tag=${ECR_DOCKER_IMAGE##*/}
	    # Replace the first ':' with '-' to form the mirror tag.
	    echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

	# Pulls the image resolved by the calculate-docker-image step.
	- name: Pull docker image
	  uses: pytorch/test-infra/.github/actions/pull-docker-image@main
	  with:
	    docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

	# Runs only for build environments containing 'cuda' whose matrix config does
	# not contain 'nogpu'.
	- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
	  id: install-nvidia-driver
	  if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')
	  uses: pytorch/test-infra/.github/actions/setup-nvidia@main

	# Runs only on runners whose name contains 'a100'.
	- name: Lock NVIDIA A100 40GB Frequency
	  if: contains(matrix.runner, 'a100')
	  run: |
	    sudo nvidia-smi -pm 1
	    sudo nvidia-smi -ac 1215,1410
	    nvidia-smi

	# Best-effort (continue-on-error): a failure here must not fail the job.
	- name: Start monitoring script
	  id: monitor-script
	  shell: bash
	  continue-on-error: true
	  run: |
	    python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
	    # Run the monitor in the background, logging to usage_log.txt.
	    python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
	    # Publish the background PID so the "Stop monitoring script" step can kill it.
	    echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

	# Fetch the build output for this build environment from the given S3 bucket.
	- name: Download build artifacts
	  uses: ./.github/actions/download-build-artifacts
	  with:
	    s3-bucket: ${{ inputs.s3-bucket }}
	    name: ${{ inputs.build-environment }}

	# Best-effort: the job proceeds even when this download fails.
	- name: Download TD artifacts
	  uses: ./.github/actions/download-td-artifacts
	  continue-on-error: true

	# The Test step reads this step's `branch` output.
	- name: Parse ref
	  id: parse-ref
	  run: .github/scripts/parse_ref.py

	# Runs even after earlier failures; later steps read its `job-id` and
	# `job-name` outputs.
	- name: Get workflow job id
	  id: get-job-id
	  if: always()
	  uses: ./.github/actions/get-workflow-job-id
	  with:
	    github-token: ${{ secrets.GITHUB_TOKEN }}

	- name: Check for keep-going label and re-enabled test issues
	  # This uses the filter-test-configs action because it conveniently
	  # checks for labels and re-enabled test issues. It does not actually do
	  # any filtering. All filtering is done in the build step.
	  id: keep-going
	  uses: ./.github/actions/filter-test-configs
	  with:
	    job-name: ${{ steps.get-job-id.outputs.job-name }}
	    test-matrix: ${{ inputs.test-matrix }}
	    github-token: ${{ secrets.GITHUB_TOKEN }}

	# Computes the Test step's timeout: the job timeout minus 30 minutes.
	- name: Set Test step time
	  id: test-timeout
	  shell: bash
	  env:
	    # Same expression as the job-level timeout-minutes.
	    JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
	  run: |
	    echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

Check failure on line 173 in .github/workflows/_linux-test-arc.yml View workflow run for this annotation GitHub Actions / .github/workflows/_linux-test-arc.yml Invalid workflow file `You have an error in your yaml syntax on line 173`
	# Starts a detached container from the CI image, then installs the built wheel
	# and runs the selected test script inside it.
	- name: Test
	  id: test
	  # Budget comes from the `timeout` output of the "Set Test step time" step.
	  timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
	  env:
	    BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
	    PR_NUMBER: ${{ github.event.pull_request.number }}
	    GITHUB_REPOSITORY: ${{ github.repository }}
	    GITHUB_WORKFLOW: ${{ github.workflow }}
	    GITHUB_JOB: ${{ github.job }}
	    GITHUB_RUN_ID: ${{ github.run_id }}
	    GITHUB_RUN_NUMBER: ${{ github.run_number }}
	    GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
	    JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
	    JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
	    BRANCH: ${{ steps.parse-ref.outputs.branch }}
	    # Fall back to the workflow SHA when the event is not a pull request.
	    SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
	    BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
	    TEST_CONFIG: ${{ matrix.config }}
	    SHARD_NUMBER: ${{ matrix.shard }}
	    NUM_TEST_SHARDS: ${{ matrix.num_shards }}
	    REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
	    CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
	    VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
	    NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
	    NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
	    SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
	    SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
	    SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
	    DOCKER_IMAGE: ${{ inputs.docker-image }}
	    XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
	    XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
	    PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
	    PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
	    DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
	    HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
	    DOCKER_HOST: unix:///run/docker/docker.sock
	  run: |
	    set -x

	    # Pick the test entry point. The onnx check is a substring (glob) match
	    # against the build environment, not an exact comparison.
	    if [[ $TEST_CONFIG == 'multigpu' ]]; then
	      TEST_COMMAND=.ci/pytorch/multigpu-test.sh
	    elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
	      TEST_COMMAND=.ci/onnx/test.sh
	    else
	      TEST_COMMAND=.ci/pytorch/test.sh
	    fi

	    # Show which Docker daemon is in use before starting the container.
	    echo "${DOCKER_HOST}"
	    docker info

	    # detached container should get cleaned up by teardown_ec2_linux
	    # TODO: Stop building test binaries as part of the build phase
	    # Used for GPU_FLAG since that doesn't play nice
	    #
	    # Do not pass --name="${container_name}" to this command: the variable is
	    # only assigned once the command substitution returns, so it would expand
	    # to an empty string. With --detach, `docker run` prints the container ID,
	    # which is what container_name ends up holding.
	    # shellcheck disable=SC2086,SC2090
	    container_name=$(docker run \
	      ${GPU_FLAG:-} \
	      -e BUILD_ENVIRONMENT \
	      -e PR_NUMBER \
	      -e GITHUB_ACTIONS \
	      -e GITHUB_REPOSITORY \
	      -e GITHUB_WORKFLOW \
	      -e GITHUB_JOB \
	      -e GITHUB_RUN_ID \
	      -e GITHUB_RUN_NUMBER \
	      -e GITHUB_RUN_ATTEMPT \
	      -e JOB_ID \
	      -e JOB_NAME \
	      -e BASE_SHA \
	      -e BRANCH \
	      -e SHA1 \
	      -e AWS_DEFAULT_REGION \
	      -e IN_WHEEL_TEST \
	      -e SHARD_NUMBER \
	      -e TEST_CONFIG \
	      -e NUM_TEST_SHARDS \
	      -e REENABLED_ISSUES \
	      -e CONTINUE_THROUGH_ERROR \
	      -e VERBOSE_TEST_LOGS \
	      -e NO_TEST_TIMEOUT \
	      -e NO_TD \
	      -e PR_LABELS \
	      -e MAX_JOBS="$(nproc --ignore=2)" \
	      -e SCCACHE_BUCKET \
	      -e SCCACHE_S3_KEY_PREFIX \
	      -e XLA_CUDA \
	      -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
	      -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
	      -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
	      -e SKIP_SCCACHE_INITIALIZATION=1 \
	      -e HUGGING_FACE_HUB_TOKEN \
	      -e DASHBOARD_TAG \
	      --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
	      --ulimit stack=10485760:83886080 \
	      --security-opt seccomp=unconfined \
	      --cap-add=SYS_PTRACE \
	      --ipc=host \
	      --shm-size="${SHM_SIZE}" \
	      --tty \
	      --detach \
	      --user jenkins \
	      --privileged \
	      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
	      -w /var/lib/jenkins/workspace \
	      "${DOCKER_IMAGE}"
	    )

	    # Propagate download.pytorch.org IP to container
	    grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
	    # Export the container ID so later steps can `docker exec` into it.
	    echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
	    docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

	# Best-effort upload of .pytest_cache, only when the Test step itself failed.
	- name: Upload pytest cache if tests failed
	  uses: ./.github/actions/pytest-cache-upload
	  continue-on-error: true
	  if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
	  with:
	    cache_dir: .pytest_cache
	    shard: ${{ matrix.shard }}
	    # Fall back to the workflow SHA when the event is not a pull request.
	    sha: ${{ github.event.pull_request.head.sha || github.sha }}
	    test_config: ${{ matrix.config }}
	    job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

	# Runs whenever the Test step reached a conclusion, regardless of job status.
	- name: Print remaining test logs
	  shell: bash
	  if: always() && steps.test.conclusion
	  run: |
	    # `|| true` keeps the step green when no matching log file exists.
	    cat test/**/*_toprint.log || true

	# Kills the background monitor using the PID published by the monitor-script
	# step; skipped when that output is empty. Best-effort (continue-on-error).
	- name: Stop monitoring script
	  if: always() && steps.monitor-script.outputs.monitor-script-pid
	  shell: bash
	  continue-on-error: true
	  env:
	    MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
	  run: |
	    kill "$MONITOR_SCRIPT_PID"

	# Runs whenever the Test step has a conclusion other than 'skipped'.
	- name: Upload test artifacts
	  if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
	  uses: ./.github/actions/upload-test-artifacts
	  with:
	    use-gha: ${{ inputs.use-gha }}
	    file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}

	# For each core.<pid> file under the workspace, print a gdb backtrace from
	# inside the test container (DOCKER_CONTAINER_ID is exported by the Test step).
	- name: Collect backtraces from coredumps (if any)
	  if: always()
	  run: |
	    # shellcheck disable=SC2156
	    find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

	# Uploads core files only when the job failed; finding none is not an error.
	- name: Store Core dumps on S3
	  uses: seemethere/upload-artifact-s3@v5
	  if: failure()
	  with:
	    name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
	    retention-days: 14
	    if-no-files-found: ignore
	    # Same core.<pid> pattern as the backtrace step's `find`, at any depth.
	    path: ./**/core.[1-9]*

	# Always runs, whether earlier steps passed or failed.
	- name: Teardown Linux
	  if: always()
	  uses: pytorch/test-infra/.github/actions/teardown-linux@main

	# NB: We are currently having an intermittent GPU-related issue on G5 runners with
	# A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
	# not seem to help. Here are some symptoms:
	# * Calling nvidia-smi timeouts after 60 second
	# * Fail to run nvidia-smi with an unable to determine the device handle for GPU
	# unknown error
	# * Test fails with a missing CUDA GPU error when initializing CUDA in PyTorch
	# * Run docker --gpus all fails with error response from daemon
	#
	# As both the root cause and recovery path are unclear, let's take the runner out of
	# service so that it doesn't get any more jobs
	- name: Check NVIDIA driver installation step
	  # Only after a failure, and only if the driver installation step actually ran.
	  if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
	  shell: bash
	  env:
	    RUNNER_WORKSPACE: ${{ runner.workspace }}
	  run: |
	    # Keep going after a failing command so its exit status can be inspected.
	    set +e
	    set -x

	    nvidia-smi
	    # NB: Surprisingly, nvidia-smi command returns successfully with return code 0 even in
	    # the case where the driver has already crashed as it still can get the driver version
	    # and some basic information like the bus ID. However, the rest of the information
	    # would be missing (ERR!), for example:
	    #
	    # +-----------------------------------------------------------------------------+
	    # | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 |
	    # |-------------------------------+----------------------+----------------------+
	    # | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
	    # | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
	    # | | | MIG M. |
	    # |===============================+======================+======================|
	    # | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! |
	    # |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default |
	    # | | | ERR! |
	    # +-------------------------------+----------------------+----------------------+
	    #
	    # +-----------------------------------------------------------------------------+
	    # | Processes: |
	    # | GPU GI CI PID Type Process name GPU Memory |
	    # | ID ID Usage |
	    # |=============================================================================|
	    # +-----------------------------------------------------------------------------+
	    #
	    # This should be reported as a failure instead as it will guarantee to fail when
	    # Docker tries to run with --gpus all
	    #
	    # So, the correct check here is to query one of the missing piece of info like
	    # GPU name, so that the command can fail accordingly
	    nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
	    NVIDIA_SMI_STATUS=$?

	    # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
	    if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
	      echo "NVIDIA driver installation has failed, shutting down the runner..."
	      .github/scripts/stop_runner_service.sh
	    fi

	    # For runner with multiple GPUs, we also want to confirm that the number of GPUs are the
	    # power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issue when one GPU fails
	    # https://github.com/pytorch/test-infra/issues/4000
	    GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
	    NVIDIA_SMI_STATUS=$?

	    # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
	    if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
	      echo "NVIDIA driver installation has failed, shutting down the runner..."
	      .github/scripts/stop_runner_service.sh
	    fi

	    # Check the GPU count to be a power of 2
	    if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
	      echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
	      .github/scripts/stop_runner_service.sh
	    fi

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Debug container issue #1

Workflow file

Debug container issue #1

Jobs

Run details

Workflow file for this run

GitHub Actions / .github/workflows/_linux-test-arc.yml