diff --git a/s3_management/Dockerfile b/s3_management/Dockerfile new file mode 100644 index 000000000..172f60af6 --- /dev/null +++ b/s3_management/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.8 +WORKDIR /work +ADD requirements.txt . +RUN pip install -r requirements.txt +ADD manage.py . +ENTRYPOINT ["python", "manage.py"] diff --git a/s3_management/Makefile b/s3_management/Makefile new file mode 100644 index 000000000..e9e4699d1 --- /dev/null +++ b/s3_management/Makefile @@ -0,0 +1,9 @@ +DIR:=$(strip $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))) + +.PHONY: build-image +build-image: requirements.txt manage.py + docker build -t pytorch/manage_s3_html "$(DIR)" + +.PHONY: push-image +push-image: build-image + docker push pytorch/manage_s3_html diff --git a/s3_management/README.md b/s3_management/README.md new file mode 100644 index 000000000..edc87691b --- /dev/null +++ b/s3_management/README.md @@ -0,0 +1,21 @@ +# s3_management + +This directory houses scripts to maintain the s3 HTML indices for https://download.pytorch.org/whl + +## Building the image + +``` +make build-image +``` + +## Pushing the image + +``` +make push-image +``` + +## Running the image + +``` +docker run --rm -it -e AWS_SECRET_ACCESS_KEY -e AWS_ACCESS_KEY_ID pytorch/manage_s3_html all +``` diff --git a/s3_management/manage.py b/s3_management/manage.py new file mode 100644 index 000000000..9c17b3e28 --- /dev/null +++ b/s3_management/manage.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python + +import argparse +import tempfile + +from os import path +from collections import defaultdict +from typing import List, Type, Dict, Set, TypeVar, Generator, Optional +from re import sub, match + +import botocore +import boto3 + + +S3 = boto3.resource('s3') +CLIENT = boto3.client('s3') +BUCKET = S3.Bucket('pytorch') + +ACCEPTED_FILE_EXTENSIONS = ("whl", "zip") +ACCEPTED_SUBDIR_PATTERNS = [ + r"cu[0-9]+", # for cuda + r"rocm[0-9]+\.[0-9]+", # for rocm + "cpu", +] +PREFIXES_WITH_HTML = { + "whl": "torch_stable.html", + "whl/test": "torch_test.html", + "whl/nightly": "torch_nightly.html" +} + +# How many packages should we keep of a specific package? +KEEP_THRESHOLD = 60 + +S3IndexType = TypeVar('S3IndexType', bound='S3Index') + + +class S3Index: + def __init__(self: S3IndexType, objects: List[str], prefix: str) -> None: + self.objects = objects + self.prefix = prefix.rstrip("/") + self.html_name = PREFIXES_WITH_HTML[self.prefix] + # should dynamically grab subdirectories like whl/test/cu101 + # so we don't need to add them manually anymore + self.subdirs = { + path.dirname(obj) for obj in objects if path.dirname != prefix + } + + def nightly_packages_to_show(self: S3IndexType) -> Set[str]: + """Finding packages to show based on a threshold we specify + + Basically takes our S3 packages, normalizes the version for easier + comparisons, then iterates over normalized versions untill we reach a + threshold and then starts adding package to delete after that threshold + has been reached + + After figuring out what versions we'd like to hide we iterate over + our original object list again and pick out the full paths to the + packages that are included in the list of versions to delete + """ + # also includes versions without GPU specifier (i.e. cu102) for easier + # sorting, sorts in reverse to put the most recent versions first + all_sorted_packages = sorted( + {self.normalize_package_version(obj) for obj in self.objects}, + reverse=True, + ) + packages: Dict[str, int] = defaultdict(int) + to_hide: Set[str] = set() + for obj in all_sorted_packages: + package_name = path.basename(obj).split('-')[0] + if packages[package_name] >= KEEP_THRESHOLD: + to_hide.add(obj) + else: + packages[package_name] += 1 + return set(self.objects).difference({ + obj for obj in self.objects + if self.normalize_package_version(obj) in to_hide + }) + + def normalize_package_version(self: S3IndexType, obj: str) -> str: + # removes the GPU specifier from the package name as well as + # unnecessary things like the file extension, architecture name, etc. + return sub( + r"%2B.*", + "", + "-".join(path.basename(obj).split("-")[:2]) + ) + + def to_legacy_html( + self, + subdir: Optional[str]=None + ) -> str: + """Generates a string that can be used as the HTML index + + Takes our objects and transforms them into HTML that have historically + been used by pip for installing pytorch. + + NOTE: These are not PEP 503 compliant but are here for legacy purposes + """ + objects = ( + self.nightly_packages_to_show() if self.prefix == 'whl/nightly' + else self.objects + ) + out: List[str] = [] + if not subdir: + subdir = self.prefix + # make sure we strip any trailing slashes + subdir = subdir.rstrip("/") + is_root = subdir == self.prefix + for obj in objects: + obj_at_root = path.dirname(obj) == self.prefix + if not obj_at_root and not obj.startswith(subdir): + continue + # Strip our prefix + sanitized_obj = obj.replace(subdir, "", 1) + # we include objects at our root prefix so that users can still + # install packages like torchaudio / torchtext even if they want + # to install a specific GPU arch of torch / torchvision + if not is_root and obj_at_root: + # strip root prefix + sanitized_obj = obj.replace(self.prefix, "", 1).lstrip("/") + sanitized_obj = f"/../{sanitized_obj}" + out.append(f"
") + return "\n".join(sorted(out)) + + def upload_legacy_html(self) -> None: + for subdir in self.subdirs: + BUCKET.Object( + key=f"{subdir}/{self.html_name}" + ).put( + ACL='public-read', + Body=self.to_legacy_html(subdir=subdir) + ) + + + @classmethod + def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: + objects = [] + prefix = prefix.rstrip("/") + for obj in BUCKET.objects.filter(Prefix=prefix): + is_acceptable = any([path.dirname(obj.key) == prefix] + [ + match( + f"{prefix}/{pattern}", + path.dirname(obj.key) + ) + for pattern in ACCEPTED_SUBDIR_PATTERNS + ]) and obj.key.endswith(ACCEPTED_FILE_EXTENSIONS) + if is_acceptable: + sanitized_key = obj.key.replace("+", "%2B") + objects.append(sanitized_key) + return cls(objects, prefix) + +def create_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser("Manage S3 HTML indices for PyTorch") + parser.add_argument( + "prefix", + type=str, + choices=list(PREFIXES_WITH_HTML.keys()) + ["all"] + ) + return parser + +def main(): + parser = create_parser() + args = parser.parse_args() + if args.prefix == 'all': + for prefix in PREFIXES_WITH_HTML.keys(): + print(f"INFO: Uploading indices for '{prefix}'") + idx = S3Index.from_S3(prefix=prefix) + idx.upload_legacy_html() + else: + print(f"INFO: Uploading indices for '{args.prefix}'") + idx = S3Index.from_S3(prefix=args.prefix) + idx.upload_legacy_html() + +if __name__ == "__main__": + main() diff --git a/s3_management/requirements.txt b/s3_management/requirements.txt new file mode 100644 index 000000000..30ddf823b --- /dev/null +++ b/s3_management/requirements.txt @@ -0,0 +1 @@ +boto3