diff --git a/.gitignore b/.gitignore index b0efde0c43..550ce2ab59 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,8 @@ dist/ .github/.DS_Store .DS_Store frontend/server/src/main/java/org/pytorch/serve/grpc/ +*.pem +*.backup # Postman files test/artifacts/ @@ -18,5 +20,10 @@ test/model_store/ test/ts_console.log test/config.properties + .vscode .scratch/ + +# Custom benchmark artifacts +instances.yaml +instances.yaml.backup diff --git a/docker/Dockerfile.neuron.dev b/docker/Dockerfile.neuron.dev new file mode 100644 index 0000000000..ce31c434c0 --- /dev/null +++ b/docker/Dockerfile.neuron.dev @@ -0,0 +1,109 @@ +# syntax = docker/dockerfile:experimental +# +# Following comments have been shamelessly copied from https://github.com/pytorch/pytorch/blob/master/Dockerfile +# +# NOTE: To build this you will need a docker version > 18.06 with +# experimental enabled and DOCKER_BUILDKIT=1 +# +# If you do not use buildkit you are not going to have a good time +# +# For reference: +# https://docs.docker.com/develop/develop-images/build_enhancements/ + +ARG BASE_IMAGE=ubuntu:18.04 +ARG BUILD_TYPE=dev +FROM ${BASE_IMAGE} AS compile-image + +ARG BASE_IMAGE +ARG BRANCH_NAME=master +ARG MACHINE_TYPE=cpu +ARG CUDA_VERSION + +ENV PYTHONUNBUFFERED TRUE + +RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + fakeroot \ + ca-certificates \ + dpkg-dev \ + sudo \ + g++ \ + git \ + python3-dev \ + build-essential \ + openjdk-11-jdk \ + curl \ + wget \ + vim \ + && rm -rf /var/lib/apt/lists/* \ + && cd /tmp \ + && curl -O https://bootstrap.pypa.io/get-pip.py \ + && python3 get-pip.py + +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 \ + && update-alternatives --install /usr/local/bin/pip pip /usr/local/bin/pip3 1 + +RUN pip install -U pip setuptools + +RUN echo "deb https://apt.repos.neuron.amazonaws.com bionic main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +RUN apt-get update \ + && apt-get install -y \ + aws-neuron-runtime \ + aws-neuron-tools \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +# Build Dev Image +FROM compile-image AS dev-image +ARG MACHINE_TYPE=cpu +ARG CUDA_VERSION +RUN if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ + && git clone https://github.com/pytorch/serve.git \ + && cd serve \ + && git checkout --track ${BRANCH_NAME} \ + && if [ -z "$CUDA_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev; else python ts_scripts/install_dependencies.py --environment=dev --cuda $CUDA_VERSION; fi \ + && python ts_scripts/install_from_src.py \ + && useradd -m model-server \ + && mkdir -p /home/model-server/tmp \ + && cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh \ + && chmod +x /usr/local/bin/dockerd-entrypoint.sh \ + && chown -R model-server /home/model-server \ + && cp docker/config.properties /home/model-server/config.properties \ + && mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store \ + && pip install torch-neuron 'neuron-cc[tensorflow]' --extra-index-url=https://pip.repos.neuron.amazonaws.com + +EXPOSE 8080 8081 8082 7070 7071 +USER model-server +WORKDIR /home/model-server +ENV TEMP=/home/model-server/tmp +ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] +CMD ["serve"] + +# Build CodeBuild Image +FROM compile-image AS 
codebuild-image +ENV JAVA_VERSION=11 \ + JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" \ + JDK_HOME="/usr/lib/jvm/java-11-openjdk-amd64" \ + JRE_HOME="/usr/lib/jvm/java-11-openjdk-amd64" \ + ANT_VERSION=1.10.3 \ + MAVEN_HOME="/opt/maven" \ + MAVEN_VERSION=3.5.4 \ + MAVEN_CONFIG="/root/.m2" \ + MAVEN_DOWNLOAD_SHA1="22cac91b3557586bb1eba326f2f7727543ff15e3" + +# Install Maven +RUN set -ex \ + && mkdir -p $MAVEN_HOME \ + && curl -LSso /var/tmp/apache-maven-$MAVEN_VERSION-bin.tar.gz https://apache.org/dist/maven/maven-3/$MAVEN_VERSION/binaries/apache-maven-$MAVEN_VERSION-bin.tar.gz \ + && echo "$MAVEN_DOWNLOAD_SHA1 /var/tmp/apache-maven-$MAVEN_VERSION-bin.tar.gz" | sha1sum -c - \ + && tar xzvf /var/tmp/apache-maven-$MAVEN_VERSION-bin.tar.gz -C $MAVEN_HOME --strip-components=1 \ + && update-alternatives --install /usr/bin/mvn mvn /opt/maven/bin/mvn 10000 \ + && mkdir -p $MAVEN_CONFIG + +FROM ${BUILD_TYPE}-image AS final-image +ARG BUILD_TYPE +RUN echo "${BUILD_TYPE} image creation completed" diff --git a/docker/build_image.sh b/docker/build_image.sh index 5ff4d38b21..48b80a1d15 100755 --- a/docker/build_image.sh +++ b/docker/build_image.sh @@ -89,5 +89,5 @@ if [ $BUILD_TYPE == "production" ] then DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg CUDA_VERSION=$CUDA_VERSION -t $DOCKER_TAG . else - DOCKER_BUILDKIT=1 docker build --file Dockerfile.dev -t $DOCKER_TAG --build-arg BUILD_TYPE=$BUILD_TYPE --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BRANCH_NAME=$BRANCH_NAME --build-arg CUDA_VERSION=$CUDA_VERSION --build-arg MACHINE_TYPE=$MACHINE . + DOCKER_BUILDKIT=1 docker build --pull --file Dockerfile.dev -t $DOCKER_TAG --build-arg BUILD_TYPE=$BUILD_TYPE --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BRANCH_NAME=$BRANCH_NAME --build-arg CUDA_VERSION=$CUDA_VERSION --build-arg MACHINE_TYPE=$MACHINE . fi diff --git a/test/benchmark/README.md b/test/benchmark/README.md index 4ac3b99e5a..7ef9aa7b75 100644 --- a/test/benchmark/README.md +++ b/test/benchmark/README.md @@ -21,8 +21,45 @@ If you'd like to use your own repo, edit the __init__.py under `serve/test/bench * Ensure you have [docker](https://docs.docker.com/get-docker/) client set-up on your system - osx/ec2 * Adjust the following global variables to your preference in the file `serve/test/benchmark/tests/utils/__init__.py`
-- IAM_INSTANCE_PROFILE :this role is attached to all ec2 instances created as part of the benchmarking process. Create this as described [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html#create-iam-role). Default role name is 'EC2Admin'.
+Use the following commands to create a new role if you don't have one you can use. +1. Create the trust policy file `ec2-admin-trust-policy.json` and add the following content: +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +2. Create the EC2 role as follows: +``` +aws iam create-role --role-name EC2Admin --assume-role-policy-document file://ec2-admin-trust-policy.json +``` +3. Add the permissions to the role as follows: +``` +aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/IAMFullAccess --role-name EC2Admin +aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonEC2FullAccess --role-name EC2Admin +aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess --role-name EC2Admin +aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess --role-name EC2Admin +``` -- S3_BUCKET_BENCHMARK_ARTIFACTS :all temporary benchmarking artifacts including server logs will be stored in this bucket:
+Use the following command to create a new S3 bucket if you don't have one you can use. +``` +aws s3api create-bucket --bucket <your-bucket-name> --region us-west-2 --create-bucket-configuration LocationConstraint=us-west-2 +``` -- DEFAULT_DOCKER_DEV_ECR_REPO :docker image used for benchmarking will be pushed to this repo
+Use the following command to create a new ECR repo if you don't have one you can use. +``` +aws ecr create-repository --repository-name torchserve-benchmark --region us-west-2 +``` * If you're running this setup on an EC2 instance, please ensure that the instance's security group settings 'allow' inbound ssh port 22. Refer [docs](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/security-group-rules.html). *The following steps assume that the current working directory is serve/.* @@ -32,6 +69,8 @@ If you'd like to use your own repo, edit the __init__.py under `serve/test/bench sudo apt-get install python3-venv python3 -m venv bvenv source bvenv/bin/activate +# Ensure you have the latest pip +pip3 install -U pip ``` 2. Install requirements for the benchmarking ``` @@ -57,7 +96,7 @@ python report.py ``` The final benchmark report will be available in markdown format as `report.md` in the `serve/` folder. -**Example report for vgg16 model** +**Example report for vgg11 model** ### Benchmark report @@ -103,3 +142,37 @@ The final benchmark report will be available in markdown format as `report.md` i | AB | vgg11 | 100 | 1000 | 0 | 3.47 | 28765 | 29849 | 30488 | 28781.227 | 0.0 | 1576.24 | 1758.28 | 1758.28 | 2249.52 | 2249.34 | 25210.43 | 46.77 | +## Features of the automation: +1. To save time during local testing by *not* creating new instances for every benchmark run, use the '--do-not-terminate' flag. This will automatically create a file called 'instances.yaml' and write instance-related data into the file so that it may be re-used next time. +``` +python test/benchmark/run_benchmark.py --do-not-terminate +``` + +2. To re-use an instance already recorded in `instances.yaml`, use the '--use-instances' flag: +``` +python test/benchmark/run_benchmark.py --use-instances /instances.yaml --do-not-terminate +``` +`Note: Use the --do-not-terminate flag to keep re-using the instances; otherwise, they will be terminated`. + +3. To run a test containing a specific string, use the `--run-only` flag. Note that the argument is 'string matched', i.e., if the test name contains the supplied argument as a substring, the test will run. +``` +# To run mnist test +python test/benchmark/run_benchmark.py --run-only mnist + +# To run fastrcnn test +python test/benchmark/run_benchmark.py --run-only fastrcnn + +# To run bert_neuron and bert +python test/benchmark/run_benchmark.py --run-only bert + +# To run vgg11 test +python test/benchmark/run_benchmark.py --run-only vgg11 + +# To run vgg16 test +python test/benchmark/run_benchmark.py --run-only vgg16 +``` + +4. 
You can benchmark a specific branch of the TorchServe GitHub repo by specifying the flag `--use-torchserve-branch` e.g., +``` +python test/benchmark/run_benchmark.py --use-torchserve-branch issue_1115 +``` \ No newline at end of file diff --git a/test/benchmark/requirements.txt b/test/benchmark/requirements.txt index 8fdd36f95c..fc06d1f3d6 100644 --- a/test/benchmark/requirements.txt +++ b/test/benchmark/requirements.txt @@ -11,4 +11,5 @@ gitpython docker pandas matplotlib -pyyaml \ No newline at end of file +pyyaml +cryptography==3.4.7 \ No newline at end of file diff --git a/test/benchmark/run_benchmark.py b/test/benchmark/run_benchmark.py index ebee2cac5c..bdf4777eb4 100644 --- a/test/benchmark/run_benchmark.py +++ b/test/benchmark/run_benchmark.py @@ -1,3 +1,4 @@ +import argparse import os import random import sys @@ -5,18 +6,105 @@ import re import uuid + import boto3 import pytest from invoke import run from invoke.context import Context + +from tests.utils.report import Report +from tests.utils import ( + S3_BUCKET_BENCHMARK_ARTIFACTS, + DEFAULT_REGION, + DEFAULT_DOCKER_DEV_ECR_REPO, + YamlHandler, + DockerImageHandler, +) + LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.DEBUG) LOGGER.addHandler(logging.StreamHandler(sys.stdout)) +def build_docker_container(torchserve_branch="master"): + LOGGER.info(f"Setting up docker image to be used") + + docker_dev_image_config_path = os.path.join(os.getcwd(), "test", "benchmark", "tests", "suite", "docker", "docker.yaml") + + docker_config = YamlHandler.load_yaml(docker_dev_image_config_path) + YamlHandler.validate_docker_yaml(docker_config) + + account_id = run("aws sts get-caller-identity --query Account --output text").stdout.strip() + + for processor, config in docker_config.items(): + docker_tag = None + cuda_version = None + for config_key, config_value in config.items(): + if processor == "gpu" and config_key == "cuda_version": + cuda_version = config_value + if config_key == "docker_tag": + docker_tag = config_value + dockerImageHandler = DockerImageHandler(docker_tag, cuda_version, torchserve_branch) + dockerImageHandler.build_image() + dockerImageHandler.push_docker_image_to_ecr( + account_id, DEFAULT_REGION, f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{docker_tag}" + ) + + def main(): + + LOGGER.info(f"sys.path: {sys.path}") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--use-instances", + action="store", + help="Supply a .yaml file with test_name, instance_id, and key_filename to re-use already-running instances", + ) + parser.add_argument( + "--do-not-terminate", + action="store_true", + default=False, + help="Use with caution: does not terminate instances, instead saves the list to a file in order to re-use", + ) + + parser.add_argument( + "--run-only", default=None, help="Runs the tests that contain the supplied keyword as a substring" + ) + + parser.add_argument( + "--use-torchserve-branch", + default="master", + help="Specify a specific torchserve branch to benchmark on, else uses 'master' by default" + ) + + parser.add_argument( + "--skip-docker-build", + action="store_true", + default=False, + help="Use if you already have a docker image built and available locally and have specified it in docker.yaml" + ) + + arguments = parser.parse_args() + do_not_terminate_string = "" if not arguments.do_not_terminate else "--do-not-terminate" + use_instances_arg_list = ["--use-instances", f"{arguments.use_instances}"] if arguments.use_instances else [] + run_only_test = arguments.run_only + + if run_only_test: 
run_only_string = f"-k {run_only_test}" + LOGGER.info(f"Note: running only the tests that have the name '{run_only_test}'.") + else: + run_only_string = "" + + torchserve_branch = arguments.use_torchserve_branch + + # Build docker containers as specified in docker.yaml + if not arguments.skip_docker_build: + build_docker_container(torchserve_branch=torchserve_branch) + # Run this script from the root directory 'serve', it changes directory below as required os.chdir(os.path.join(os.getcwd(), "test", "benchmark")) @@ -25,12 +113,30 @@ def main(): test_path = os.path.join(os.getcwd(), "tests") LOGGER.info(f"Running tests from directory: {test_path}") - pytest_args = ["-s", "-rA", test_path, "-n=4", "--disable-warnings", "-v", "--execution-id", execution_id] + pytest_args = [ + "-s", + run_only_string, + "-rA", + test_path, + "-n=4", + "--disable-warnings", + "-v", + "--execution-id", + execution_id, + do_not_terminate_string, + ] + use_instances_arg_list LOGGER.info(f"Running pytest") pytest.main(pytest_args) + # Generate report + s3_results_uri = f"{S3_BUCKET_BENCHMARK_ARTIFACTS}/{execution_id}" + + report = Report() + report.download_benchmark_results_from_s3(s3_results_uri) + report.generate_comprehensive_report() + if __name__ == "__main__": main() diff --git a/test/benchmark/tests/__init__.py b/test/benchmark/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/benchmark/tests/conftest.py b/test/benchmark/tests/conftest.py index 9cf11acacf..ebb49d250f 100644 --- a/test/benchmark/tests/conftest.py +++ b/test/benchmark/tests/conftest.py @@ -4,6 +4,7 @@ import random import re import sys +import yaml import boto3 import pytest @@ -37,28 +38,19 @@ def pytest_addoption(parser): help="execution id that is used to keep all artifacts together", ) + parser.addoption( + "--use-instances", + default=False, + action="store", + help="Supply a .yaml file with test_name, instance_id, and key_filename to re-use already-running instances", + ) -@pytest.fixture(scope="session", autouse=True) -def build_docker_container(request, docker_dev_image_config_path): - LOGGER.info(f"Setting up docker image to be used") - docker_config = YamlHandler.load_yaml(docker_dev_image_config_path) - YamlHandler.validate_docker_yaml(docker_config) - - account_id = run("aws sts get-caller-identity --query Account --output text").stdout.strip() - - for processor, config in docker_config.items(): - docker_tag = None - cuda_version = None - for config_key, config_value in config.items(): - if processor == "gpu" and config_key == "cuda_version": - cuda_version = config_value - if config_key == "docker_tag": - docker_tag = config_value - dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) - dockerImageHandler.build_image() - dockerImageHandler.push_docker_image_to_ecr( - account_id, DEFAULT_REGION, f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{docker_tag}" - ) + parser.addoption( + "--do-not-terminate", + action="store_true", + default=False, + help="Use with caution: does not terminate instances, instead saves the list to a file in order to re-use", + ) @pytest.fixture(scope="session") @@ -75,6 +67,10 @@ def benchmark_execution_id(request): return execution_id +@pytest.fixture(scope="function") +def bert_neuron_config_file_path(request): + return os.path.join(os.getcwd(), "tests", "suite", "bert_neuron.yaml") + @pytest.fixture(scope="function") def vgg11_config_file_path(request): return os.path.join(os.getcwd(), "tests", "suite", "vgg11.yaml") @@ -162,12 +158,29 @@ def ec2_instance( 
ec2_instance_ami, region, ): - key_filename = ec2_utils.generate_ssh_keypair(ec2_client, ec2_key_name) - def delete_ssh_keypair(): - ec2_utils.destroy_ssh_keypair(ec2_client, key_filename) + use_instances_flag = request.config.getoption("--use-instances") if request.config.getoption("--use-instances") else None - request.addfinalizer(delete_ssh_keypair) + if use_instances_flag: + instances_file = request.config.getoption("--use-instances") + run(f"touch {instances_file}", warn=True) + instances_dict = YamlHandler.load_yaml(instances_file) + LOGGER.info(f"instances_dict: {instances_dict}") + instances = instances_dict.get(request.node.name.split("[")[0], "") + LOGGER.info(f"instances: {instances}") + assert instances != "", f"Could not find instance details corresponding to test: {request.node.name.split('[')[0]}" + instance_details = instances.get(ec2_instance_type, "") + assert instance_details != "", f"Could not obtain details for instance type: {ec2_instance_type}" + instance_id = instance_details.get("instance_id", "") + assert instance_id != "", f"Missing instance_id" + key_filename = instance_details.get("key_filename", "") + assert key_filename != "", f"Missing key_filename" + + LOGGER.info(f"For test: {request.node.name}; Using instance_id: {instance_id} and key_filename: {key_filename}") + + return instance_id, key_filename + + key_filename = ec2_utils.generate_ssh_keypair(ec2_client, ec2_key_name) params = { "KeyName": ec2_key_name, @@ -179,7 +192,7 @@ def delete_ssh_keypair(): ], "MaxCount": 1, "MinCount": 1, - "BlockDeviceMappings": [{"DeviceName": "/dev/sda1", "Ebs": {"VolumeSize": 120}}], + "BlockDeviceMappings": [{"DeviceName": "/dev/sda1", "Ebs": {"VolumeSize": 220}}], } try: @@ -196,10 +209,34 @@ def delete_ssh_keypair(): def terminate_ec2_instance(): ec2_client.terminate_instances(InstanceIds=[instance_id]) - request.addfinalizer(terminate_ec2_instance) + def delete_ssh_keypair(): + ec2_utils.destroy_ssh_keypair(ec2_client, key_filename) + + do_not_terminate_flag = request.config.getoption("--do-not-terminate") + + LOGGER.info(f"do_not_terminate_flag: {do_not_terminate_flag}") + + instances_file = os.path.join(os.getcwd(), "instances.yaml") + run(f"touch {instances_file}", warn=True) + + if not do_not_terminate_flag: + request.addfinalizer(terminate_ec2_instance) + request.addfinalizer(delete_ssh_keypair) + + if do_not_terminate_flag and not use_instances_flag: + instances_dict = YamlHandler.load_yaml(instances_file) + if not instances_dict: + instances_dict = {} + + update_dictionary = {request.node.name.split("[")[0]: {ec2_instance_type: {"instance_id": instance_id, "key_filename": key_filename}}} + + instances_dict.update(update_dictionary) + + YamlHandler.write_yaml(instances_file, instances_dict) ec2_utils.check_instance_state(instance_id, state="running", region=region) ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region) + return instance_id, key_filename @@ -232,6 +269,4 @@ def delete_s3_artifact_copy(): request.addfinalizer(delete_s3_artifact_copy) - return conn - diff --git a/test/benchmark/tests/resources/neuron-bert/compile_bert.py b/test/benchmark/tests/resources/neuron-bert/compile_bert.py new file mode 100644 index 0000000000..8f0e30968a --- /dev/null +++ b/test/benchmark/tests/resources/neuron-bert/compile_bert.py @@ -0,0 +1,65 @@ +import tensorflow # to workaround a protobuf version conflict issue +import torch +import torch.neuron +from transformers import AutoTokenizer, AutoModelForSequenceClassification 
+import logging +import argparse + +## Enable logging so we can see any important warnings +logger = logging.getLogger('Neuron') +logger.setLevel(logging.INFO) + +parser = argparse.ArgumentParser() + +parser.add_argument( +"--batch-size", +action="store", +help="Batch size for which to compile the Neuron-traced BERT model", +) + +arguments = parser.parse_args() + +batch_size = int(arguments.batch_size) + +# Build tokenizer and model +tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False) +model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False) + +# Setup some example inputs +sequence_0 = "The company HuggingFace is based in New York City" +sequence_1 = "Apples are especially bad for your health" +sequence_2 = "HuggingFace's headquarters are situated in Manhattan" +paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=128, pad_to_max_length=True, return_tensors="pt") +not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, max_length=128, pad_to_max_length=True, return_tensors="pt") + +# Run the original PyTorch model on both example inputs +paraphrase_classification_logits = model(**paraphrase)[0] +not_paraphrase_classification_logits = model(**not_paraphrase)[0] + +max_length=128 +# Convert example inputs to a format that is compatible with TorchScript tracing +input_ids = paraphrase['input_ids'] # type:torch.Tensor +token_type_ids = paraphrase['token_type_ids'] # type:torch.Tensor +attention_mask = paraphrase['attention_mask'] # type:torch.Tensor +input_ids = input_ids.expand(batch_size, max_length) +token_type_ids = token_type_ids.expand(batch_size, max_length) +attention_mask = attention_mask.expand(batch_size, max_length) +example_inputs_paraphrase = input_ids, attention_mask, token_type_ids + +input_ids = not_paraphrase['input_ids'] # type:torch.Tensor +token_type_ids = not_paraphrase['token_type_ids'] # type:torch.Tensor +attention_mask = not_paraphrase['attention_mask'] # type:torch.Tensor +input_ids = input_ids.expand(batch_size, max_length) +token_type_ids = token_type_ids.expand(batch_size, max_length) +attention_mask = attention_mask.expand(batch_size, max_length) +example_inputs_not_paraphrase = input_ids, attention_mask, token_type_ids + +# Run torch.neuron.trace to generate a TorchScript that is optimized by AWS Neuron, using optimization level -O2 +model_neuron = torch.neuron.trace(model, example_inputs_paraphrase, compiler_args=['-O2']) + +# Verify the TorchScript works on both example inputs +paraphrase_classification_logits_neuron = model_neuron(*example_inputs_paraphrase) +not_paraphrase_classification_logits_neuron = model_neuron(*example_inputs_not_paraphrase) + +# Save the TorchScript for later use +model_neuron.save(f"bert_neuron_{batch_size}.pt") \ No newline at end of file diff --git a/test/benchmark/tests/resources/neuron-bert/config.py b/test/benchmark/tests/resources/neuron-bert/config.py new file mode 100644 index 0000000000..e21697aadd --- /dev/null +++ b/test/benchmark/tests/resources/neuron-bert/config.py @@ -0,0 +1,3 @@ +model_name='bert-base-cased-finetuned-mrpc' +max_length=128 +batch_size=1 \ No newline at end of file diff --git a/test/benchmark/tests/resources/neuron-bert/handler_bert.py b/test/benchmark/tests/resources/neuron-bert/handler_bert.py new file mode 100644 index 0000000000..9166fdd505 --- /dev/null +++ b/test/benchmark/tests/resources/neuron-bert/handler_bert.py @@ -0,0 +1,110 @@ 
+import os +import json +import sys +import logging + +import torch, torch_neuron +from transformers import AutoTokenizer +from abc import ABC +from ts.torch_handler.base_handler import BaseHandler + +# one core per worker +os.environ['NEURONCORE_GROUP_SIZES'] = '1' + +logger = logging.getLogger(__name__) + +class BertEmbeddingHandler(BaseHandler, ABC): + """ + Handler class for Bert Embedding computations. + """ + def __init__(self): + super(BertEmbeddingHandler, self).__init__() + self.initialized = False + + def initialize(self, ctx): + self.manifest = ctx.manifest + properties = ctx.system_properties + self.device = 'cpu' + model_dir = properties.get('model_dir') + serialized_file = self.manifest['model']['serializedFile'] + model_pt_path = os.path.join(model_dir, serialized_file) + + # point sys.path to our config file + sys.path.append(model_dir) + import config + self.max_length = config.max_length + self.batch_size = config.batch_size + self.classes = ['not paraphrase', 'paraphrase'] + + self.model = torch.jit.load(model_pt_path) + logger.debug(f'Model loaded from {model_dir}') + self.model.to(self.device) + self.model.eval() + + self.tokenizer = AutoTokenizer.from_pretrained(config.model_name) + self.initialized = True + + def preprocess(self, input_data): + """ + Tokenization pre-processing + """ + + input_ids = [] + attention_masks = [] + token_type_ids = [] + + for row in input_data: + #seq_0 = row['body']['seq_0'].decode('utf-8') + #seq_1 = row['body']['seq_1'].decode('utf-8') + + json_data = json.loads(row['body'].decode('utf-8')) + + seq_0 = json_data['seq_0'] + seq_1 = json_data['seq_1'] + logger.debug(f'Received text: "{seq_0}", "{seq_1}"') + + inputs = self.tokenizer.encode_plus( + seq_0, + seq_1, + max_length=self.max_length, + padding='max_length', + truncation=True, + return_tensors='pt' + ) + + input_ids.append(inputs['input_ids']) + attention_masks.append(inputs['attention_mask']) + token_type_ids.append(inputs['token_type_ids']) + + batch = (torch.cat(input_ids, 0), + torch.cat(attention_masks, 0), + torch.cat(token_type_ids, 0)) + + return batch + + def inference(self, inputs): + """ + Predict the class of a text using a trained transformer model. 
+ """ + + # sanity check dimensions + assert(len(inputs) == 3) + num_inferences = len(inputs[0]) + assert(num_inferences <= self.batch_size) + + # insert padding if we received a partial batch + padding = self.batch_size - num_inferences + if padding > 0: + pad = torch.nn.ConstantPad1d((0, 0, 0, padding), value=0) + inputs = [pad(x) for x in inputs] + + outputs = self.model(*inputs)[0] + predictions = [] + for i in range(num_inferences): + prediction = self.classes[outputs[i].argmax().item()] + predictions.append([prediction]) + logger.debug("Model predicted: '%s'", prediction) + return predictions + + def postprocess(self, inference_output): + return inference_output \ No newline at end of file diff --git a/test/benchmark/tests/resources/neuron-bert/input b/test/benchmark/tests/resources/neuron-bert/input new file mode 100644 index 0000000000..e8a5324c7a --- /dev/null +++ b/test/benchmark/tests/resources/neuron-bert/input @@ -0,0 +1 @@ +{"seq_0": "HuggingFace's headquarters are situated in Manhattan", "seq_1": "This is total nonsense."} \ No newline at end of file diff --git a/test/benchmark/tests/suite/bert.yaml b/test/benchmark/tests/suite/bert.yaml index 40a447aa75..7173ac1bf3 100644 --- a/test/benchmark/tests/suite/bert.yaml +++ b/test/benchmark/tests/suite/bert.yaml @@ -2,7 +2,7 @@ bert: scripted_mode: benchmark_engine: "ab" - url: "https://torchserve.s3.amazonaws.com/mar_files/BERTSeqClassification_Torchscript_batch.mar" + url: "https://s3.us-west-2.amazonaws.com/ts0.4.1-marfiles/BERTSeqClassification_torchscript.mar" #for CPU: https://torchserve.s3.amazonaws.com/mar_files/BERTSeqClassification_Torchscript_batch.mar workers: 4 batch_delay: 100 batch_size: @@ -10,10 +10,10 @@ bert: - 2 - 4 - 8 - input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" + input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" requests: 10000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" diff --git a/test/benchmark/tests/suite/bert_neuron.yaml b/test/benchmark/tests/suite/bert_neuron.yaml new file mode 100644 index 0000000000..a80ad24e5c --- /dev/null +++ b/test/benchmark/tests/suite/bert_neuron.yaml @@ -0,0 +1,19 @@ +--- +bert_inf1: + scripted_mode: + benchmark_engine: "ab" + compile_per_batch_size: True + workers: 4 + batch_delay: 100 + batch_size: + - 1 + - 2 + - 4 + - 8 + input: "/home/ubuntu/serve/test/benchmark/tests/resources/neuron-bert/input" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "inferentia" \ No newline at end of file diff --git a/test/benchmark/tests/suite/fastrcnn.yaml b/test/benchmark/tests/suite/fastrcnn.yaml index 647e2725ae..52f19c0947 100644 --- a/test/benchmark/tests/suite/fastrcnn.yaml +++ b/test/benchmark/tests/suite/fastrcnn.yaml @@ -10,10 +10,10 @@ fastrcnn: - 2 - 4 - 8 - input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/object_detector/persons.jpg" - requests: 10000 + input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" + requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" diff --git a/test/benchmark/tests/suite/mnist.yaml b/test/benchmark/tests/suite/mnist.yaml index bfe42b6123..b360a724d8 100644 --- a/test/benchmark/tests/suite/mnist.yaml +++ b/test/benchmark/tests/suite/mnist.yaml @@ 
-10,10 +10,10 @@ mnist: - 2 - 4 - 8 - requests: 10000 + requests: 1000 concurrency: 10 input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/mnist/test_data/0.png" - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" diff --git a/test/benchmark/tests/suite/vgg11.yaml b/test/benchmark/tests/suite/vgg11.yaml index 7305e7de2e..f378ab18de 100644 --- a/test/benchmark/tests/suite/vgg11.yaml +++ b/test/benchmark/tests/suite/vgg11.yaml @@ -13,7 +13,7 @@ vgg11: input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" @@ -31,7 +31,7 @@ vgg11: input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" dockerhub_image: "pytorch/torchserve:latest" processors: diff --git a/test/benchmark/tests/suite/vgg16.yaml b/test/benchmark/tests/suite/vgg16.yaml index 021925aa58..1700e9362e 100644 --- a/test/benchmark/tests/suite/vgg16.yaml +++ b/test/benchmark/tests/suite/vgg16.yaml @@ -13,7 +13,7 @@ vgg16: input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" @@ -31,7 +31,7 @@ vgg16: input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" dockerhub_image: "pytorch/torchserve:latest" processors: diff --git a/test/benchmark/tests/test_bert.py b/test/benchmark/tests/test_bert.py index 5ccd5346f6..e866fed0ca 100644 --- a/test/benchmark/tests/test_bert.py +++ b/test/benchmark/tests/test_bert.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( @@ -22,10 +22,10 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) -INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] +# Add/remove from the following list to benchmark on the instance of your choice +INSTANCE_TYPES_TO_TEST = ["c4.4xlarge"] -@pytest.mark.skip() @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_bert_benchmark( ec2_connection, ec2_instance_type, bert_config_file_path, docker_dev_image_config_path, benchmark_execution_id @@ -71,7 +71,7 @@ def test_bert_benchmark( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] @@ -131,7 +131,7 @@ def test_bert_benchmark( torchserveHandler.unregister_model() # Stop torchserve - torchserveHandler.stop_torchserve() + torchserveHandler.stop_torchserve(exec_env="docker") # Generate report (note: needs to happen after torchserve has stopped) apacheBenchHandler.generate_report(requests=requests, concurrency=concurrency, connection=ec2_connection) diff --git a/test/benchmark/tests/test_bert_neuron.py 
b/test/benchmark/tests/test_bert_neuron.py new file mode 100644 index 0000000000..efb25d018b --- /dev/null +++ b/test/benchmark/tests/test_bert_neuron.py @@ -0,0 +1,171 @@ +import os +import pprint + +import pytest +import time +from invoke import run +from invoke.context import Context + +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils +import tests.utils.neuron as neuron_utils + +from tests.utils import ( + DEFAULT_DOCKER_DEV_ECR_REPO, + DEFAULT_REGION, + GPU_INSTANCES, + LOGGER, + DockerImageHandler, + YamlHandler, + S3_BUCKET_BENCHMARK_ARTIFACTS, +) + +# Add/remove from the following list to benchmark on the instance of your choice +INSTANCE_TYPES_TO_TEST = ["inf1.6xlarge"] + +@pytest.mark.skip(reason="Skipping neuron test, manually unskip if you need to benchmark") +@pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) +def test_neuron_benchmark( + ec2_connection, ec2_instance_type, bert_neuron_config_file_path, docker_dev_image_config_path, benchmark_execution_id +): + + test_config = YamlHandler.load_yaml(bert_neuron_config_file_path) + + model_name = bert_neuron_config_file_path.split("/")[-1].split(".")[0] + + LOGGER.info("Validating yaml contents") + + LOGGER.info(YamlHandler.validate_benchmark_yaml(test_config)) + + docker_config = YamlHandler.load_yaml(docker_dev_image_config_path) + + docker_repo_tag_for_current_instance = "" + cuda_version_for_instance = "" + account_id = run("aws sts get-caller-identity --query Account --output text").stdout.strip() + + for processor, config in docker_config.items(): + docker_tag = None + cuda_version = None + for config_key, config_value in config.items(): + if processor == "gpu" and config_key == "cuda_version": + cuda_version = config_value + if config_key == "docker_tag": + docker_tag = config_value + # TODO: Improve logic that selectively pulls CPU image on CPU instances and likewise for GPU. 
+ + docker_repo_tag = f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{docker_tag}" + + if ec2_instance_type[:2] in GPU_INSTANCES and ("gpu" in docker_tag or "neuron" in docker_tag): + dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) + dockerImageHandler.pull_docker_image_from_ecr( + account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection + ) + docker_repo_tag_for_current_instance = docker_repo_tag + cuda_version_for_instance = cuda_version + break + if ec2_instance_type[:2] not in GPU_INSTANCES and ("cpu" in docker_tag or "neuron" in docker_tag): + dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) + dockerImageHandler.pull_docker_image_from_ecr( + account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection + ) + docker_repo_tag_for_current_instance = docker_repo_tag + cuda_version_for_instance = None + break + + mode_list = [] + config_list = [] + batch_size_list = [] + processor_list = [] + + apacheBenchHandler = ab_utils.ApacheBenchHandler(model_name=model_name, connection=ec2_connection) + + for model, config in test_config.items(): + for mode, mode_config in config.items(): + mode_list.append(mode) + benchmark_engine = mode_config.get("benchmark_engine") + workers = mode_config.get("workers") + batch_delay = mode_config.get("batch_delay") + batch_sizes = mode_config.get("batch_size") + input_file = mode_config.get("input") + requests = mode_config.get("requests") + concurrency = mode_config.get("concurrency") + backend_profiling = mode_config.get("backend_profiling") + exec_env = mode_config.get("exec_env") + processors = mode_config.get("processors") + gpus = None + if len(processors) == 2: + gpus = processors[1].get("gpus") + LOGGER.info(f"processors: {processors[1]}") + LOGGER.info(f"gpus: {gpus}") + + LOGGER.info( + f"\n benchmark_engine: {benchmark_engine}\n workers: {workers}\n batch_delay: {batch_delay}\n batch_size:{batch_sizes}\n input_file: {input_file}\n requests: {requests}\n concurrency: {concurrency}\n backend_profiling: {backend_profiling}\n exec_env: {exec_env}\n processors: {processors}" + ) + + torchserveHandler = ts_utils.TorchServeHandler( + exec_env=exec_env, + cuda_version=cuda_version_for_instance, + gpus=gpus, + torchserve_docker_image=docker_repo_tag_for_current_instance, + backend_profiling=backend_profiling, + connection=ec2_connection, + ) + + # Note: Assumes a DLAMI (conda-based) is being used + torchserveHandler.setup_torchserve(virtual_env_name="aws_neuron_pytorch_p36") + + for batch_size in batch_sizes: + url = f"benchmark_{batch_size}.mar" + LOGGER.info(f"Running benchmark for model archive: {url}") + + # Stop torchserve + torchserveHandler.stop_torchserve(exec_env="local", virtual_env_name="aws_neuron_pytorch_p36") + + # Generate bert inf model + neuron_utils.setup_neuron_mar_files(connection=ec2_connection, virtual_env_name="aws_neuron_pytorch_p36", batch_size=batch_size) + + # Start torchserve + torchserveHandler.start_torchserve_local(virtual_env_name="aws_neuron_pytorch_p36", stop_torchserve=False) + + # Register + torchserveHandler.register_model( + url=url, workers=workers, batch_delay=batch_delay, batch_size=batch_size + ) + + # Run benchmark + apacheBenchHandler.run_apache_bench(requests=requests, concurrency=concurrency, input_file=input_file) + + # Unregister + torchserveHandler.unregister_model() + + # Stop torchserve + torchserveHandler.stop_torchserve(exec_env="local", virtual_env_name="aws_neuron_pytorch_p36") + + # Generate report (note: needs to happen after torchserve has stopped) + 
apacheBenchHandler.generate_report( + requests=requests, concurrency=concurrency, connection=ec2_connection + ) + + # Move artifacts into a common folder. + remote_artifact_folder = ( + f"/home/ubuntu/{benchmark_execution_id}/{model_name}/{ec2_instance_type}/{mode}/{batch_size}" + ) + + ec2_connection.run(f"mkdir -p {remote_artifact_folder}") + ec2_connection.run(f"cp -R /home/ubuntu/benchmark/* {remote_artifact_folder}") + + # Upload artifacts to s3 bucket + ec2_connection.run( + f"aws s3 cp --recursive /home/ubuntu/{benchmark_execution_id}/ {S3_BUCKET_BENCHMARK_ARTIFACTS}/{benchmark_execution_id}/" + ) + + time.sleep(3) + + run( + f"aws s3 cp --recursive /tmp/{model_name}/ {S3_BUCKET_BENCHMARK_ARTIFACTS}/{benchmark_execution_id}/{model_name}/{ec2_instance_type}/{mode}/{batch_size}" + ) + + run(f"rm -rf /tmp/{model_name}") + apacheBenchHandler.clean_up() diff --git a/test/benchmark/tests/test_fastrcnn.py b/test/benchmark/tests/test_fastrcnn.py index 2fe2cba0c0..401f0c59d3 100644 --- a/test/benchmark/tests/test_fastrcnn.py +++ b/test/benchmark/tests/test_fastrcnn.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( DEFAULT_DOCKER_DEV_ECR_REPO, @@ -21,9 +21,9 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) -INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] +# Add/remove from the following list to benchmark on the instance of your choice +INSTANCE_TYPES_TO_TEST = ["c4.4xlarge", "p3.8xlarge"] -@pytest.mark.skip() @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_fastrcnn_benchmark( ec2_connection, ec2_instance_type, fastrcnn_config_file_path, docker_dev_image_config_path, benchmark_execution_id @@ -69,7 +69,7 @@ def test_fastrcnn_benchmark( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] diff --git a/test/benchmark/tests/test_mnist.py b/test/benchmark/tests/test_mnist.py index b6024f8834..0a1a8e8ced 100644 --- a/test/benchmark/tests/test_mnist.py +++ b/test/benchmark/tests/test_mnist.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( DEFAULT_DOCKER_DEV_ECR_REPO, @@ -21,9 +21,9 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) +# Add/remove from the following list to benchmark on the instance of your choice INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] -@pytest.mark.skip() @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_mnist_benchmark( ec2_connection, ec2_instance_type, mnist_config_file_path, docker_dev_image_config_path, benchmark_execution_id @@ -69,7 +69,7 @@ def test_mnist_benchmark( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] diff 
--git a/test/benchmark/tests/test_vgg11.py b/test/benchmark/tests/test_vgg11.py index d2f41bbc38..9549ec7a50 100644 --- a/test/benchmark/tests/test_vgg11.py +++ b/test/benchmark/tests/test_vgg11.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( DEFAULT_DOCKER_DEV_ECR_REPO, @@ -21,9 +21,9 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) +# Add/remove from the following list to benchmark on the instance of your choice INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] -@pytest.mark.skip() @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_vgg11_benchmark( ec2_connection, ec2_instance_type, vgg11_config_file_path, docker_dev_image_config_path, benchmark_execution_id @@ -69,7 +69,7 @@ def test_vgg11_benchmark( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] @@ -129,7 +129,7 @@ def test_vgg11_benchmark( torchserveHandler.unregister_model() # Stop torchserve - torchserveHandler.stop_torchserve() + torchserveHandler.stop_torchserve(exec_env="docker") # Generate report (note: needs to happen after torchserve has stopped) apacheBenchHandler.generate_report(requests=requests, concurrency=concurrency, connection=ec2_connection) diff --git a/test/benchmark/tests/test_vgg16.py b/test/benchmark/tests/test_vgg16.py index cf9f8e7997..b7116eb701 100644 --- a/test/benchmark/tests/test_vgg16.py +++ b/test/benchmark/tests/test_vgg16.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( DEFAULT_DOCKER_DEV_ECR_REPO, @@ -21,7 +21,8 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) -INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] +# Add/remove from the following list to benchmark on the instance of your choice +INSTANCE_TYPES_TO_TEST = ["c4.4xlarge"] @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_vgg16_benchmark( @@ -54,7 +55,7 @@ def test_vgg16_benchmark( docker_repo_tag = f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{docker_tag}" - if ec2_instance_type[:2] in GPU_INSTANCES and "gpu" in docker_tag: + if ec2_instance_type[:2] in GPU_INSTANCES and ("gpu" in docker_tag or "neuron" in docker_tag): dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) dockerImageHandler.pull_docker_image_from_ecr( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection @@ -62,13 +63,13 @@ def test_vgg16_benchmark( docker_repo_tag_for_current_instance = docker_repo_tag cuda_version_for_instance = cuda_version break - if ec2_instance_type[:2] not in GPU_INSTANCES and "cpu" in docker_tag: + if ec2_instance_type[:2] not in GPU_INSTANCES and ("cpu" in docker_tag or "neuron" in docker_tag): dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) dockerImageHandler.pull_docker_image_from_ecr( account_id, DEFAULT_REGION, docker_repo_tag, 
connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] @@ -104,7 +105,7 @@ def test_vgg16_benchmark( torchserveHandler = ts_utils.TorchServeHandler( exec_env=exec_env, - cuda_version=cuda_version, + cuda_version=cuda_version_for_instance, gpus=gpus, torchserve_docker_image=docker_repo_tag_for_current_instance, backend_profiling=backend_profiling, @@ -128,7 +129,7 @@ def test_vgg16_benchmark( torchserveHandler.unregister_model() # Stop torchserve - torchserveHandler.stop_torchserve() + torchserveHandler.stop_torchserve(exec_env="docker") # Generate report (note: needs to happen after torchserve has stopped) apacheBenchHandler.generate_report(requests=requests, concurrency=concurrency, connection=ec2_connection) diff --git a/test/benchmark/tests/utils/__init__.py b/test/benchmark/tests/utils/__init__.py index 5c45d6d8a6..9fd2571599 100644 --- a/test/benchmark/tests/utils/__init__.py +++ b/test/benchmark/tests/utils/__init__.py @@ -1,5 +1,8 @@ +from __future__ import absolute_import + import json import logging +import fcntl import os import re import subprocess @@ -27,8 +30,9 @@ GPU_INSTANCES = ["p2", "p3", "p4", "g2", "g3", "g4"] # DLAMI with nVidia Driver ver. 450.119.03 (support upto CUDA 11.2), Ubuntu 18.04 -AMI_ID = "ami-0ff137c06803a8bb7" -# AMI_ID = "ami-0198925303105158c", with apache2-utils installed +# AMI_ID = "ami-064696901389beb84" +# AMI_ID = "ami-0198925303105158c", Base DLAMI 37.0 with apache2-utils installed +AMI_ID = "ami-00c5ebd9076702cbe"#, DLAMI 43.0 with apache2-utils installed LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.INFO) @@ -50,10 +54,10 @@ def build_image(self): os.chdir(torch_serve_docker_directory) if self.cuda_version: run_out = run( - f"./build_image.sh -bt dev -g -cv {self.cuda_version} -t {DEFAULT_DOCKER_DEV_ECR_REPO}:{self.docker_tag}" + f"./build_image.sh -b {self.branch} -bt dev -g -cv {self.cuda_version} -t {DEFAULT_DOCKER_DEV_ECR_REPO}:{self.docker_tag}" ) else: - run_out = run(f"./build_image.sh -bt dev -t {DEFAULT_DOCKER_DEV_ECR_REPO}:{self.docker_tag}") + run_out = run(f"./build_image.sh -b {self.branch} -bt dev -t {DEFAULT_DOCKER_DEV_ECR_REPO}:{self.docker_tag}") # Switch back to original directory os.chdir(current_working_directory) @@ -140,11 +144,10 @@ class YamlHandler(object): "input", "processors", "requests", - "url", "workers", ] - optional_config_keys = ["dockerhub_image", "docker_dev_image"] + optional_config_keys = ["url", "dockerhub_image", "docker_dev_image", "compile_per_batch_size"] valid_config_keys = mandatory_config_keys + optional_config_keys @@ -154,7 +157,7 @@ class YamlHandler(object): valid_processors = ["cpu", "gpus"] - valid_docker_processors = ["cpu", "gpu"] + valid_docker_processors = ["cpu", "gpu", "inferentia"] mandatory_docker_config_keys = ["docker_tag"] @@ -179,8 +182,10 @@ def write_yaml(file_path, dictionary_object): :param dictionary_object: dictionary with content that needs to be written to a yaml file :return None """ - with open(file_path) as f: - yaml.dump(f, dictionary_object) + with open(file_path, "a") as f: + fcntl.flock(f, fcntl.LOCK_EX) + yaml.dump(dictionary_object, f) + fcntl.flock(f, fcntl.LOCK_UN) @staticmethod def validate_benchmark_yaml(yaml_content): diff --git a/test/benchmark/tests/utils/apache_bench.py b/test/benchmark/tests/utils/apache_bench.py index 27b9a2bc9a..2c0c2af47a 100644 --- a/test/benchmark/tests/utils/apache_bench.py +++ 
b/test/benchmark/tests/utils/apache_bench.py @@ -47,7 +47,7 @@ def install_dependencies(self): """ Installs apache2-utils, assuming it's an Ubuntu instance """ - run_out = self.connection.sudo(f"apt install -y apache2-utils") + run_out = self.connection.sudo(f"apt install -y apache2-utils", pty=True) return run_out.return_code def run_apache_bench(self, requests, concurrency, input_file): @@ -58,14 +58,15 @@ def run_apache_bench(self, requests, concurrency, input_file): """ self.connection.run(f"mkdir -p {TMP_DIR}/benchmark") - self.connection.run(f"wget {input_file}") + if input_file.startswith("https://") or input_file.startswith("http://"): + self.connection.run(f"wget {input_file}", warn=True) + file_name = self.connection.run(f"basename {input_file}").stdout.strip() + # Copy to the directory with other benchmark artifacts + self.connection.run(f"cp {file_name} {os.path.join(TMP_DIR, 'benchmark/input')}") + else: + self.connection.run(f"cp {input_file} {os.path.join(TMP_DIR, 'benchmark/input')}") - file_name = self.connection.run(f"basename {input_file}").stdout.strip() - - # Copy to the directory with other benchmark artifacts - self.connection.run(f"cp {file_name} {os.path.join(TMP_DIR, 'benchmark/input')}") - - apache_bench_command = f"ab -c {concurrency} -n {requests} -k -p {TMP_DIR}/benchmark/input -T application/png {self.inference_url}/predictions/benchmark > {self.result_file}" + apache_bench_command = f"ab -c {concurrency} -n {requests} -k -p {TMP_DIR}/benchmark/input -T application/jpg {self.inference_url}/predictions/benchmark > {self.result_file}" # Run apache bench run_out = self.connection.run( diff --git a/test/benchmark/tests/utils/neuron.py b/test/benchmark/tests/utils/neuron.py new file mode 100644 index 0000000000..96eb1763b9 --- /dev/null +++ b/test/benchmark/tests/utils/neuron.py @@ -0,0 +1,53 @@ +import subprocess +import time +import glob +import os +import requests +import tempfile + +import invoke +import pandas as pd + +from io import StringIO +from urllib.parse import urlparse +from invoke import run +from invoke.context import Context + +from . import DEFAULT_REGION, IAM_INSTANCE_PROFILE, AMI_ID, LOGGER, S3_BUCKET_BENCHMARK_ARTIFACTS + +# Assumes the functions from this file execute on an Ubuntu ec2 instance +ROOT_DIR = f"/home/ubuntu" +TORCHSERVE_DIR = os.path.join(ROOT_DIR, "serve") +MODEL_STORE = os.path.join(TORCHSERVE_DIR, "model_store") +LOCAL_TMP_DIR = "/tmp" +TMP_DIR = "/home/ubuntu" +NEURON_RESOURCES_FOLDER = os.path.join(TORCHSERVE_DIR, "test", "benchmark", "tests", "resources", "neuron-bert") + +def setup_neuron_mar_files(connection=None, virtual_env_name=None, batch_size=1): + activation_command = "" + + if virtual_env_name: + activation_command = f"cd /home/ubuntu/serve/test/benchmark/tests/resources/neuron-bert && source activate {virtual_env_name} && " + + # Note: change version here to make sure the torch version compatible with neuron is being used. 
+ connection.run(f"{activation_command}pip3 install -U --ignore-installed torch==1.7.1", warn=True) + connection.run(f"{activation_command}pip3 install -U --ignore-installed torch-neuron 'neuron-cc[tensorflow]' --extra-index-url=https://pip.repos.neuron.amazonaws.com", warn=True) + + connection.run(f"{activation_command}python3 compile_bert.py --batch-size {batch_size}", warn=True) + time.sleep(5) + run_out_sed = connection.run(f"{activation_command}sed -i 's/batch_size=[[:digit:]]\+/batch_size={batch_size}/g' config.py", warn=True) + LOGGER.info(f"run_out_sed: {run_out_sed.stdout}, run_out_return: {run_out_sed.return_code}") + run_out_mkdir = connection.run(f"mkdir -p /home/ubuntu/benchmark/model_store") + LOGGER.info(f"run_out_mkdir: {run_out_mkdir.stdout}, run_out_return: {run_out_mkdir.return_code}") + run_out_archiver = connection.run(f"{activation_command}torch-model-archiver --model-name 'benchmark_{batch_size}' --version 1.0 --serialized-file ./bert_neuron_{batch_size}.pt --handler './handler_bert.py' --extra-files './config.py' -f", warn=True) + LOGGER.info(f"run_out_archiver: {run_out_archiver.stdout}, run_out_return: {run_out_archiver.return_code}") + + LOGGER.info(f"Running copy command") + connection.run(f"cp /home/ubuntu/serve/test/benchmark/tests/resources/neuron-bert/benchmark_{batch_size}.mar /home/ubuntu/benchmark/model_store") + run_out = connection.run(f"test -e /home/ubuntu/benchmark/model_store/benchmark_{batch_size}.mar") + if run_out.return_code == 0: + LOGGER.info(f"mar file available at location /home/ubuntu/benchmark/model_store/benchmark_{batch_size}.mar") + else: + LOGGER.info(f"mar file NOT available at location /home/ubuntu/benchmark/model_store/benchmark_{batch_size}.mar") + + time.sleep(5) \ No newline at end of file diff --git a/test/benchmark/tests/utils/report.py b/test/benchmark/tests/utils/report.py index f09c01ce9e..ca5bb9ec2a 100644 --- a/test/benchmark/tests/utils/report.py +++ b/test/benchmark/tests/utils/report.py @@ -18,7 +18,7 @@ TMP_DIR = "/tmp" - +from . 
import LOGGER class MarkdownTable: def __init__(self): @@ -81,10 +81,6 @@ def add_markdown_from_csv(self, file_path, delimiter): md_string += item + " | " md_string += "\n" - # writing md_string to the output_file - # file = open(output_file, "w", encoding="UTF-8") - # file.write(md_string) - # file.close() self.markdown_content += md_string print("The markdown file has been created!!!") @@ -99,7 +95,7 @@ def add_code_block(self, content: str, newline=True): newline_modifier = "\n" if newline else "" backticks_modifier = "```" if newline else "`" - self.markdown_content += str(f"{newline_modifier}{backticks_modifier}{newline_modifier}{content}\n{backticks_modifier}") + self.markdown_content += str(f"{newline_modifier}{backticks_modifier}\n{content}\n{backticks_modifier}{newline_modifier}") def add_paragraph(self, content: str, bold=False, italics=False, newline=True): """ @@ -123,45 +119,56 @@ def add_newline(self): def get_document(self): return self.markdown_content -def main(s3_bucket_uri): - """ - Compile a markdown file with different csv files as input - """ - # Download the s3 files - run(f"mkdir -p /tmp/report") - run(f"aws s3 cp --recursive {s3_bucket_uri} /tmp/report") - - csv_files = [] +class Report: + def __init__(self): + self.tmp_report_dir = os.path.join("/tmp", "report") - for root, dirs, files in os.walk("/tmp/report/"): - for name in files: - csv_files.append(os.path.join(root, name)) if "ab_report" in name else None - - markdownDocument = MarkdownDocument("Benchmark report") - markdownDocument.add_newline() - # Assume model configuration starts from /tmp/report - for report_path in csv_files: - split_path = report_path.split("/") - print(split_path) - model = split_path[3] - instance_type = split_path[4] - mode = split_path[5] - batch_size = split_path[6] + def download_benchmark_results_from_s3(self, s3_uri): + """ + Download benchmark results of various runs from s3 + """ + # Cleanup any previous folder + run(f"rm -rf {self.tmp_report_dir}") - config_header = f"{model} | {mode} | {instance_type} | batch size {batch_size}" + # Create a tmp folder + run(f"mkdir -p {self.tmp_report_dir}") - markdownDocument.add_paragraph(config_header, bold=True, newline=True) + run(f"aws s3 cp --recursive {s3_uri} {self.tmp_report_dir}") - print(f"Updating data from file: {report_path}") - markdownDocument.add_markdown_from_csv(report_path, delimiter=" ") - - with open("report.md", "w") as f: - f.write(markdownDocument.get_document()) - # Clean up - run(f"rm -rf /tmp/report") + def generate_comprehensive_report(self): + """ + Compile a markdown file with different csv files as input + """ + csv_files = [] + for root, dirs, files in os.walk("/tmp/report/"): + for name in files: + csv_files.append(os.path.join(root, name)) if "ab_report" in name else None + + csv_files = sorted(csv_files) + + markdownDocument = MarkdownDocument("Benchmark report") + markdownDocument.add_newline() + + # Assume model configuration starts from /tmp/report + for report_path in csv_files: + split_path = report_path.split("/") + print(split_path) + model = split_path[3] + instance_type = split_path[4] + mode = split_path[5] + batch_size = split_path[6] + + config_header = f"{model} | {mode} | {instance_type} | batch size {batch_size}" + + markdownDocument.add_code_block(config_header, newline=True) + + print(f"Updating data from file: {report_path}") + markdownDocument.add_markdown_from_csv(report_path, delimiter=" ") + + with open("report.md", "w") as f: + f.write(markdownDocument.get_document()) -if __name__ 
== "__main__": - generate_comprehensive_report("s3_bucket_uri") \ No newline at end of file + LOGGER.info(f"Benchmark report generated at: {os.path.join(os.getcwd(), 'report.md')}") \ No newline at end of file diff --git a/test/benchmark/tests/utils/ts.py b/test/benchmark/tests/utils/ts.py index 490755403e..605bb61a42 100644 --- a/test/benchmark/tests/utils/ts.py +++ b/test/benchmark/tests/utils/ts.py @@ -26,7 +26,7 @@ class TorchServeHandler(object): def __init__( self, - exec_env="local", + exec_env="docker", cuda_version="cu102", gpus=None, torchserve_docker_image=None, @@ -50,18 +50,31 @@ def __init__( # self.prepare_common_dependency() # self.getAPIS() - def setup_torchserve(self): + def setup_torchserve(self, virtual_env_name=None): """ Set up torchserve dependencies, and install torchserve """ - pass + activation_command = "" + self.connection.run(f"chmod +x -R /home/ubuntu/serve") + if virtual_env_name: + activation_command = f"cd /home/ubuntu/serve && source activate {virtual_env_name} && " + + if self.connection.run(f"{activation_command}torchserve --version", warn=True).return_code == 0: + return + + self.connection.run(f"{activation_command}python3 ./ts_scripts/install_dependencies.py --environment=dev", warn=True) + self.connection.run(f"{activation_command}pip3 install pygit2", warn=True) + self.connection.run(f"{activation_command}python3 ./ts_scripts/install_from_src.py", warn=True) + self.connection.run(f"{activation_command}torchserve --version") + def prepare_common_dependency(self): - # Note: the following command cleans up any previous run logs - self.connection.run(f"rm -rf {os.path.join(TMP_DIR, 'benchmark')}") + # Note: the following command cleans up any previous run logs, except any *.mar files generated to avoid re-creation + self.connection.run(f"find {os.path.join(TMP_DIR, 'benchmark')} ! 
 
         # Recreate required folders
         self.connection.run(f"mkdir -p {os.path.join(TMP_DIR, 'benchmark', 'conf')}")
         self.connection.run(f"mkdir -p {os.path.join(TMP_DIR, 'benchmark', 'logs')}")
+        self.connection.run(f"mkdir -p {os.path.join(TMP_DIR, 'benchmark', 'model_store')}")
 
         # Use config from benchmarks/ folder
         self.connection.run(
@@ -86,10 +99,29 @@ def getAPIS(self):
         self.management_port = urlparse(management_api).port
         self.inference_api = urlparse(inference_api).port
 
-    def start_torchserve_local(self):
-        pass
+    def start_torchserve_local(self, virtual_env_name=None, stop_torchserve=True):
+
+        self.prepare_common_dependency()
+        self.getAPIS()
+
+        activation_command = ""
+        if virtual_env_name:
+            activation_command = f"cd /home/ubuntu/serve && source activate {virtual_env_name} && "
+        if self.backend_profiling:
+            activation_command = f"{activation_command}export TS_BENCHMARK=True && "
+
+        if stop_torchserve:
+            LOGGER.info(f"Stopping any existing torchserve instance")
+            self.connection.run(f"{activation_command}torchserve --stop", warn=True)
+
+        self.connection.run(f"{activation_command}torchserve --start --model-store /home/ubuntu/benchmark/model_store/ --ts-config {TMP_DIR}/benchmark/conf/config.properties > {TMP_DIR}/benchmark/logs/model_metrics.log", warn=True)
+        LOGGER.info(f"Started torchserve using command:")
+        LOGGER.info(f"{activation_command}torchserve --start --model-store /home/ubuntu/benchmark/model_store/ --ts-config {TMP_DIR}/benchmark/conf/config.properties > {TMP_DIR}/benchmark/logs/model_metrics.log")
+
+        time.sleep(10)
 
-    def start_torchserve_docker(self):
+
+    def start_torchserve_docker(self, stop_torchserve=True):
         self.prepare_common_dependency()
         self.getAPIS()
@@ -101,8 +133,9 @@ def start_torchserve_docker(self):
         if self.backend_profiling:
             backend_profiling = f"-e TS_BENCHMARK=True"
 
-        LOGGER.info(f"Removing existing TS container instance...")
-        self.connection.run("docker rm -f ts")
+        if stop_torchserve:
+            LOGGER.info(f"Removing existing TS container instance...")
+            self.connection.run("docker rm -f ts")
 
         LOGGER.info(f"Starting docker container on the instance from image: {self.torchserve_docker_image}")
         docker_run_cmd = (
@@ -127,16 +160,17 @@ def register_model(self, url, workers, batch_delay, batch_size, model_name="benc
         :param batch_size: max number of requests allowed to be batched
         """
         run_out = self.connection.run(
-            f'curl -X POST "http://localhost:8081/models?url={url}&initial_workers={workers}&batch_delay={batch_delay}&batch_size={batch_size}&synchronous=true&model_name=benchmark"'
+            f'curl -X POST "http://localhost:8081/models?url={url}&initial_workers={workers}&batch_delay={batch_delay}&batch_size={batch_size}&synchronous=true&model_name=benchmark"', warn=True
         )
 
         LOGGER.info(
             f'curl -X POST "http://localhost:8081/models?url={url}&initial_workers={workers}&batch_delay={batch_delay}&batch_size={batch_size}&synchronous=true&model_name=benchmark"'
         )
 
-        time.sleep(5)
+        time.sleep(40)
 
-        assert run_out.return_code == 0, f"Failed to register model {model_name} sourced from url: {url}"
+        if run_out.return_code != 0:
+            LOGGER.error(f"Failed to register model {model_name} sourced from url: {url}")
 
     def unregister_model(self, model_name="benchmark"):
         """
@@ -148,18 +182,23 @@ def unregister_model(self, model_name="benchmark"):
         LOGGER.info(f'curl -X DELETE "http://localhost:8081/models/{model_name}/1.0"')
         LOGGER.info(f"stdout: {run_out.stdout}")
 
-        time.sleep(5)
+        time.sleep(10)
 
         if run_out.return_code == 0:
             LOGGER.error(f"Failed to unregister model {model_name}")
 
-    def stop_torchserve(self, exec_env="local"):
+    def stop_torchserve(self, exec_env="docker", virtual_env_name=None):
        """
        Stops torchserve depending on the exec_env
        :param exec_env: either 'local' or 'docker'
        """
        if exec_env == "docker":
-            self.connection.run(f"docker rm -f ts")
+            self.connection.run(f"docker rm -f ts", warn=True)
+        else:
+            activation_command = ""
+            if virtual_env_name:
+                activation_command = f"cd /home/ubuntu/serve/test/benchmark/tests/resources/neuron-bert && source activate {virtual_env_name} && "
+            self.connection.run(f"{activation_command}torchserve --stop", warn=True)
 
        time.sleep(5)
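Reviewer note (not part of the patch): the snippet below is a minimal sketch of how the new bare-metal flow added above might be driven — `setup_torchserve` / `start_torchserve_local` / `register_model` / `stop_torchserve`, plus the new `Report` class for collating results. It assumes a Fabric-style connection object passed as a `connection` keyword argument, placeholder virtual-env and S3 names, and import paths inferred from the file locations in this diff; none of those details are confirmed by the patch itself.

```python
# Illustrative usage sketch only. Assumed/hypothetical: the `connection` kwarg,
# the virtual env name, the S3 URI, and the exact import paths.
from fabric2 import Connection

from tests.utils.ts import TorchServeHandler   # path assumed from test/benchmark/tests/utils/ts.py
from tests.utils.report import Report          # module path assumed for the Report class above

connection = Connection(host="ec2-benchmark-host", user="ubuntu")

handler = TorchServeHandler(exec_env="local", connection=connection)
handler.setup_torchserve(virtual_env_name="benchmark-venv")        # installs from source if torchserve is missing
handler.start_torchserve_local(virtual_env_name="benchmark-venv")  # stops any running instance first
handler.register_model(url="bert.mar", workers=4, batch_delay=200, batch_size=8)

# ... run the benchmark workload here ...

handler.unregister_model()
handler.stop_torchserve(exec_env="local", virtual_env_name="benchmark-venv")

# Collate the per-run ab_report CSVs uploaded to S3 into a single report.md
report = Report()
report.download_benchmark_results_from_s3("s3://my-benchmark-bucket/run-id")
report.generate_comprehensive_report()
```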