
Commit

Actionize it - take 1
ZainRizvi committed May 3, 2023
1 parent c4186e6 commit 5c0f1d8
Showing 5 changed files with 96 additions and 53 deletions.
12 changes: 11 additions & 1 deletion .github/actions/pytest-cache-upload/action.yml
@@ -3,6 +3,9 @@ name: Upload PyTest cache
description: Uploads the pytest cache to S3

inputs:
cache_dir:
description: Path to the folder containing the pytest cache
required: true
pr-identifier:
description: Unique identifier that only repeats when the same workflow is being run against the same PR
required: true
@@ -23,9 +26,16 @@ runs:
id: get-id
shell: bash
env:
CACHE_DIR: ${{ inputs.cache_dir }}
PR_IDENTIFIER: ${{ inputs.pr-identifier }}
WORKFLOW: ${{ github.workflow }}
JOB: ${{ github.job }}
SHARD: ${{ inputs.shard }}
run: |
python3 .github/scripts/pytest_upload_cache.py --pr_identifier $PR_IDENTIFIER
python3 .github/scripts/pytest_upload_cache.py \
--cache_dir $CACHE_DIR \
--pr_identifier $PR_IDENTIFIER \
--workflow $WORKFLOW \
--job $JOB \
--shard $SHARD \
--temp_dir $RUNNER_TEMP
79 changes: 38 additions & 41 deletions .github/scripts/pytest_caching_utils.py
@@ -5,6 +5,8 @@

PYTEST_CACHE_KEY_PREFIX = "pytest_cache"
PYTEST_CACHE_DIR_NAME = ".pytest_cache"
BUCKET = "pytest_cache"
TEMP_DIR = "/tmp" # a backup location in case one isn't provided

def get_sanitized_pr_identifier(pr_identifier):
import hashlib
@@ -29,59 +31,57 @@ def create_test_files_in_folder(folder, num_files):
print("done")
print("Created file {}".format(file_path))

def upload_pytest_cache(pr_identifier, bucket, job, shard):
def get_s3_key_prefix(pr_identifier, workflow, job, shard=None):
"""
The prefix to any S3 object key for a pytest cache. It's only a prefix though, not a full path to an object.
For example, it won't include the file extension.
"""
prefix = f"{PYTEST_CACHE_KEY_PREFIX}/{pr_identifier}/{sanitize_for_s3(workflow)}/{sanitize_for_s3(job)}"

if shard:
prefix += f"/{shard}"

return prefix
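
# For example (the workflow and job names pass through sanitize_for_s3; the values
# below are placeholders, not taken from this commit):
#
#   get_s3_key_prefix("abc123", "pull", "linux-test", shard="1")
#   -> "pytest_cache/abc123/pull/linux-test/1"
#
#   get_s3_key_prefix("abc123", "pull", "linux-test")
#   -> "pytest_cache/abc123/pull/linux-test"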

# TODO: After uploading the cache, delete the old S3 uploads so that we don't keep downloading unnecessary files.
# Since we want the cache to contain a union of all jobs that failed in this PR, before
# uploading the cache we should first combine the resulting pytest cache after tests
# with the downloaded/merged cache from before the tests.
# However, in the short term the extra downloads are okay since they aren't that big
def upload_pytest_cache(pr_identifier, workflow, job, shard, pytest_cache_dir, bucket=BUCKET, temp_dir=TEMP_DIR):
"""
Uploads the pytest cache to S3
Args:
pr_identifier: A unique, human readable identifier for the PR
workflow: The name of the workflow that is uploading the cache
job: The name of the job that is uploading the cache
shard: The id of the shard that is uploading the cache
pytest_cache_dir: Path to the pytest cache folder being uploaded
bucket: The S3 bucket to upload the cache to
temp_dir: Directory to stage the zip file in before uploading
"""
obj_key_prefix = f"{PYTEST_CACHE_KEY_PREFIX}/{pr_identifier}/{job}/{shard}"
tmp_zip_file_path_base = f"tmp/zip-upload/{obj_key_prefix}"
obj_key_prefix = get_s3_key_prefix(pr_identifier, workflow, job, shard)
zip_file_path_base = f"{temp_dir}/zip-upload/{obj_key_prefix}" # doesn't include the extension

try:
tmp_zip_file_path = zip_folder(pytest_cache_dir, tmp_zip_file_path_base)
obj_key = f"{obj_key_prefix}{os.path.splitext(tmp_zip_file_path)[1]}" # Keep the new file extension
upload_file_to_s3(tmp_zip_file_path, bucket, obj_key)
zip_file_path = zip_folder(pytest_cache_dir, zip_file_path_base)
obj_key = f"{obj_key_prefix}{os.path.splitext(zip_file_path)[1]}" # Keep the new file extension
upload_file_to_s3(zip_file_path, bucket, obj_key)
finally:
print(f"Deleting {tmp_zip_file_path}")
os.remove(tmp_zip_file_path)
print(f"Deleting {zip_file_path}")
os.remove(zip_file_path)

def extract_identifiers_from_download_path(download_path):
"""
download_path is expected to have the format:
<download_folder_root>/<pr_identifier>/<job_name>/<shard_id>.zip
We want to extract the pr_identifier, job_name, and shard_id from this path
so we can unzip the cache folder to the correct location
"""

download_path_parts = download_path.split("/")
assert len(download_path_parts) >= 3
def download_pytest_cache(pr_identifier, workflow, job_name, temp_dir, pytest_cache_dir_new, bucket=BUCKET):

pr_identifier = download_path_parts[-3]
job_name = download_path_parts[-2]
shard_id = download_path_parts[-1].split(".")[0]
return pr_identifier, job_name, shard_id

def download_pytest_cache(pr_identifier, bucket, job_name, temp_dir, pytest_cache_dir_new):
obj_key_prefix = f"{PYTEST_CACHE_KEY_PREFIX}/{pr_identifier}/{job_name}/"
obj_key_prefix = get_s3_key_prefix(pr_identifier, workflow, job_name)

zip_download_dir = f"{temp_dir}/cache-zip-downloads/{pr_identifier}_{job_name}"
zip_download_dir = f"{temp_dir}/cache-zip-downloads/{obj_key_prefix}"
# do the following in a try/finally block so we can clean up the temp files if something goes wrong
try:
# downloads the cache zips for all shards
downloads = download_s3_objects_with_prefix(bucket, obj_key_prefix, zip_download_dir)

# unzip all the pytest caches
pytest_caches = []
for download in downloads:
(pr_identifier, job_name, shard_id) = extract_identifiers_from_download_path(download)
pytest_cache_dir_for_shard = os.path.join(f"{temp_dir}/unzipped-caches", pr_identifier, job_name, shard_id, PYTEST_CACHE_DIR_NAME)
for downloaded_zip_path in downloads:
shard_id = os.path.splitext(os.path.basename(downloaded_zip_path))[0] # the file name of the zip is the shard id
pytest_cache_dir_for_shard = os.path.join(f"{temp_dir}/unzipped-caches", get_s3_key_prefix(pr_identifier, workflow, job_name, shard_id), PYTEST_CACHE_DIR_NAME)

try:
unzip_folder(download, pytest_cache_dir_for_shard)
unzip_folder(downloaded_zip_path, pytest_cache_dir_for_shard)
print(f"Merging cache for job {job_name} shard {shard_id} into {pytest_cache_dir_new}")
merge_pytest_caches(pytest_cache_dir_for_shard, pytest_cache_dir_new)
finally:
@@ -158,9 +158,6 @@ def merged_lastfailed_content(from_lastfailed, to_lastfailed):


if __name__ == '__main__':

# get bucket from env var "MY_BUCKET"
bucket = os.environ.get('MY_BUCKET')
id = "b"
folder = f"/Users/zainr/deleteme/{id}test-files"
subfolder = f"{folder}/subfolder"
@@ -174,9 +171,9 @@ def merged_lastfailed_content(from_lastfailed, to_lastfailed):
packed_file = os.path.relpath(packed_file, os.getcwd())
print(packed_file)

upload_file_to_s3(packed_file, bucket, packed_file)
upload_file_to_s3(packed_file, BUCKET, packed_file)

download_s3_objects_with_prefix(bucket, "zipped_file/ff", "downloaded_files")
download_s3_objects_with_prefix(BUCKET, "zipped_file/ff", "downloaded_files")

unzip_folder("downloaded_files/zipped_file/ffzsome-job-btest-files.zip", "/Users/zainr/deleteme/ffunzip")

@@ -185,12 +182,12 @@ def merged_lastfailed_content(from_lastfailed, to_lastfailed):
job = "test-job-name"
shard = "shard-3"
pytest_cache_dir = f"/Users/zainr/test-infra/{PYTEST_CACHE_DIR_NAME}"
upload_pytest_cache(pr_identifier, bucket, job, shard, pytest_cache_dir)
upload_pytest_cache(pr_identifier, BUCKET, job, shard, pytest_cache_dir)

temp_dir = "/Users/zainr/deleteme/tmp"

pytest_cache_dir_new = f"/Users/zainr/deleteme/test_pytest_cache"
download_pytest_cache(pr_identifier, bucket, job, temp_dir, pytest_cache_dir_new)
download_pytest_cache(pr_identifier, BUCKET, job, temp_dir, pytest_cache_dir_new)

# parser = argparse.ArgumentParser(description='')
# parser.add_argument('--pr_identifier', type=str, help='A unique identifier for the PR')
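For reference, a minimal sketch of how the reworked helpers above would be called with their new signatures, assuming pytest_caching_utils is importable from .github/scripts; the PR ref, workflow, job, and paths are placeholder values, not taken from this commit:

from pytest_caching_utils import (
    download_pytest_cache,
    get_sanitized_pr_identifier,
    upload_pytest_cache,
)

# Placeholder inputs for illustration only
pr_id = get_sanitized_pr_identifier("refs/pull/12345/merge")

# Zip this shard's pytest cache folder and upload it to S3
upload_pytest_cache(
    pr_identifier=pr_id,
    workflow="pull",
    job="linux-test",
    shard="1",
    pytest_cache_dir=".pytest_cache",
)

# Download the caches previously uploaded for every shard of this job and merge them
download_pytest_cache(
    pr_identifier=pr_id,
    workflow="pull",
    job_name="linux-test",
    temp_dir="/tmp",
    pytest_cache_dir_new=".pytest_cache_merged",
)

The bucket and temp_dir arguments fall back to the BUCKET and TEMP_DIR defaults when omitted.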
21 changes: 16 additions & 5 deletions .github/scripts/pytest_upload_cache.py
@@ -4,18 +4,29 @@

def main():
parser = argparse.ArgumentParser(description="Upload this job's pytest cache to S3")
parser.add_argument('--cache_dir', required=True, help='Path to the folder containing the pytest cache')
parser.add_argument('--pr_identifier', required=True, help='A unique PR identifier')
parser.add_argument('--workflow', required=True, help='The workflow name')
parser.add_argument('--job', required=True, help='The job name')
parser.add_argument('--shard', required=True, help='The shard id')
parser.add_argument('--temp_dir', required=False, help='Directory to store temp files in')
parser.add_argument('--bucket', required=False, help='The S3 bucket to upload the cache to')
args = parser.parse_args()

# Example invocation (matching the pytest-cache-upload action in this commit):
# python3 .github/scripts/pytest_upload_cache.py --cache_dir $CACHE_DIR --pr_identifier $PR_IDENTIFIER --workflow $WORKFLOW --job $JOB --shard $SHARD --temp_dir $RUNNER_TEMP

print(args)
#download_pytest_cache(args.pr_identifier, args.bucket, args.job_name, args.temp_dir, args.pytest_cache_dir_new)

# TODO: First check if it's even worth uploading a new cache:
# Does the cache even mark any failed tests?

upload_pytest_cache(
pr_identifier=args.pr_identifier,
workflow=args.workflow,
job=args.job,
shard=args.shard,
pytest_cache_dir=args.cache_dir,
bucket=args.bucket,
temp_dir=args.temp_dir,
)


if __name__ == '__main__':
main()
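The TODO in main() above (skip the upload when the cache records no failures) could be implemented by inspecting pytest's lastfailed cache file. A minimal sketch, assuming the standard pytest cache layout where failed test ids are stored as a JSON dict in v/cache/lastfailed; this helper is illustrative and not part of this commit:

import json
import os


def cache_has_failed_tests(cache_dir):
    """Returns True if pytest's cache records any failed tests."""
    # pytest stores failed test ids in <cache_dir>/v/cache/lastfailed as a JSON dict
    lastfailed_path = os.path.join(cache_dir, "v", "cache", "lastfailed")
    if not os.path.isfile(lastfailed_path):
        return False
    with open(lastfailed_path) as f:
        return bool(json.load(f))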
8 changes: 8 additions & 0 deletions .github/workflows/_linux-test.yml
@@ -214,6 +214,14 @@ jobs:
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
- name: Upload pytest cache
uses: ./.github/actions/pytest-cache-upload
if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
with:
cache_dir: ${{ github.workspace }}/.pytest_cache
pr-identifier: ${{ steps.test.outputs.pr-identifier }}
shard: ${{ matrix.shard }}

- name: Print remaining test logs
shell: bash
if: always() && steps.test.conclusion
29 changes: 23 additions & 6 deletions tools/shared/s3_upload_utils.py
@@ -1,17 +1,23 @@
import boto3
import os
import json
import re
import shutil

def zip_folder(folder_to_zip, zip_file_path):
import shutil
print(f"Zipping {folder_to_zip} to {zip_file_path}")
return shutil.make_archive(zip_file_path, 'zip', folder_to_zip)
def zip_folder(folder_to_zip, zip_file_base_name):
"""
Returns the path to the resulting zip file
"""
print(f"Zipping {folder_to_zip} to {zip_file_base_name}")
return shutil.make_archive(zip_file_base_name, 'zip', folder_to_zip)
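
# Note: shutil.make_archive appends the archive extension to the base name, e.g.
# (placeholder paths, not from this commit):
#
#   zip_folder("/workspace/.pytest_cache", "/tmp/zip-upload/pytest_cache/abc123/pull/linux-test/1")
#   -> "/tmp/zip-upload/pytest_cache/abc123/pull/linux-test/1.zip"
#
# which is why upload_pytest_cache recovers the extension with os.path.splitext.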


def unzip_folder(zip_file_path, unzip_to_folder):
import shutil
"""
Unzips the given zip file into unzip_to_folder
"""
print(f"Unzipping {zip_file_path} to {unzip_to_folder}")
return shutil.unpack_archive(zip_file_path, unzip_to_folder, 'zip')
shutil.unpack_archive(zip_file_path, unzip_to_folder, 'zip')


def ensure_dir_exists(dir):
@@ -20,6 +26,9 @@ def ensure_dir_exists(dir):


def load_json_file(file_path):
"""
Returns the deserialized json object
"""
with open(file_path, 'r') as f:
return json.load(f)

@@ -32,6 +41,14 @@ def write_json_file(file_path, content):
json.dump(content, f, indent=2)


def sanitize_for_s3(text):
"""
Restricts a key segment to alphanumeric characters, underscores, and dashes
by replacing every other character with an underscore.
"""
return re.sub(r"[^a-zA-Z0-9_-]", "_", text)
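
# For example (a placeholder job name, not from this commit):
#
#   sanitize_for_s3("pull / linux-test")  ->  "pull___linux-test"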


def upload_file_to_s3(file_name, bucket, key):
print(f"Uploading {file_name} to s3://{bucket}/{key}...", end="")

