-
Notifications
You must be signed in to change notification settings - Fork 21.3k
244 lines (222 loc) · 9.52 KB
/
_rocm-test.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether
name: test
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
test-matrix:
required: true
type: string
description: JSON description of what test configs to run.
docker-image:
required: true
type: string
description: Docker image to run in.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
secrets:
AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID:
required: true
description: access key id for test stats upload
AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY:
required: true
description: secret acess key for test stats upload
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
test:
# Don't run on forked repos.
if: github.repository_owner == 'pytorch'
timeout-minutes: 300
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
with:
no-sudo: true
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ inputs.docker-image }}
- name: Start monitoring script
id: monitor-script
shell: bash
run: |
python3 -m pip install psutil==5.9.1
python3 -m pip install pynvml==11.4.1
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download build artifacts
uses: ./.github/actions/download-build-artifacts
with:
name: ${{ inputs.build-environment }}
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Test
id: test
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
PR_NUMBER: ${{ github.event.pull_request.number }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
PYTORCH_RETRY_TEST_CASES: 1
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
PR_BODY: ${{ github.event.pull_request.body }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_JIT_ENABLE_NVFUSER: 1
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
timeout-minutes: 270
run: |
set -x
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
TEST_COMMAND=.jenkins/caffe2/test.sh
else
TEST_COMMAND=.jenkins/pytorch/test.sh
fi
COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
# sanitize the input commit message and PR body here:
#
# trim all new lines from commit messages + PR_BODY to avoid issues with batch environment
# variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028
COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}"
PR_BODY="${PR_BODY//[$'\n\r']}"
# then trim all special characters like single and double quotes to avoid unescaped inputs to
# wreak havoc internally
export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
export PR_BODY="${PR_BODY//[\'\"]}"
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e GITHUB_ACTIONS \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
-e IN_WHEEL_TEST \
-e SHARD_NUMBER \
-e TEST_CONFIG \
-e NUM_TEST_SHARDS \
-e PR_BODY \
-e COMMIT_MESSAGES \
-e PYTORCH_RETRY_TEST_CASES \
-e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="8g" \
--tty \
--detach \
--name="${container_name}" \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
# save container name for later step
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
# jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}"
- name: Save test results
if: always()
run: |
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Stop monitoring script
if: always() && steps.monitor-script.outputs.monitor-script-pid
shell: bash
continue-on-error: true
env:
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
run: |
kill "$MONITOR_SCRIPT_PID"
- name: Upload test artifacts
uses: ./.github/actions/upload-test-artifacts
if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')
with:
use-gha: true
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
- name: Upload test statistics
if: always()
env:
AWS_DEFAULT_REGION: us-east-1
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_RETRY_TEST_CASES: 1
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
TAG: ${{ steps.parse-ref.outputs.tag }}
WORKFLOW_ID: ${{ github.run_id }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
shell: bash
run: |
set -x
python3 -m pip install -r requirements.txt
python3 -m pip install boto3==1.19.12
python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
- name: Teardown ROCm
if: always()
shell: bash
run: |
# Only stop the docker container we started since there might be multiple runners on this host.
docker stop "${{ env.CONTAINER_NAME }}" || true
# Prune all of the docker containers.
# Might fail if a prune is already in progress by another runner.
docker container prune -f || true
# Prune everything docker if there are more than 10 images (~200GB).
# This is easier than using a time filter, e.g., "until=24h".
# Might fail if a prune is already in progress by another runner.
image_count=$(docker images | wc -l)
if [[ ${image_count} -gt 10 ]]; then
echo "Purging all docker caches"
docker system prune -af || true
else
echo "Will not purge docker, only ${image_count} images found"
fi