diff --git a/.gitignore b/.gitignore index b0efde0c43..550ce2ab59 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,8 @@ dist/ .github/.DS_Store .DS_Store frontend/server/src/main/java/org/pytorch/serve/grpc/ +*.pem +*.backup # Postman files test/artifacts/ @@ -18,5 +20,10 @@ test/model_store/ test/ts_console.log test/config.properties + .vscode .scratch/ + +# Custom benchmark artifacts +instances.yaml +instances.yaml.backup diff --git a/docker/Dockerfile.neuron.dev b/docker/Dockerfile.neuron.dev new file mode 100644 index 0000000000..ce31c434c0 --- /dev/null +++ b/docker/Dockerfile.neuron.dev @@ -0,0 +1,109 @@ +# syntax = docker/dockerfile:experimental +# +# Following comments have been shamelessly copied from https://github.com/pytorch/pytorch/blob/master/Dockerfile +# +# NOTE: To build this you will need a docker version > 18.06 with +# experimental enabled and DOCKER_BUILDKIT=1 +# +# If you do not use buildkit you are not going to have a good time +# +# For reference: +# https://docs.docker.com/develop/develop-images/build_enhancements/ + +ARG BASE_IMAGE=ubuntu:18.04 +ARG BUILD_TYPE=dev +FROM ${BASE_IMAGE} AS compile-image + +ARG BASE_IMAGE +ARG BRANCH_NAME=master +ARG MACHINE_TYPE=cpu +ARG CUDA_VERSION + +ENV PYTHONUNBUFFERED TRUE + +RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + fakeroot \ + ca-certificates \ + dpkg-dev \ + sudo \ + g++ \ + git \ + python3-dev \ + build-essential \ + openjdk-11-jdk \ + curl \ + wget \ + vim \ + && rm -rf /var/lib/apt/lists/* \ + && cd /tmp \ + && curl -O https://bootstrap.pypa.io/get-pip.py \ + && python3 get-pip.py + +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 \ + && update-alternatives --install /usr/local/bin/pip pip /usr/local/bin/pip3 1 + +RUN pip install -U pip setuptools + +RUN echo "deb https://apt.repos.neuron.amazonaws.com bionic main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +RUN apt-get update \ + && apt-get install -y \ + aws-neuron-runtime \ + aws-neuron-tools \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +# Build Dev Image +FROM compile-image AS dev-image +ARG MACHINE_TYPE=cpu +ARG CUDA_VERSION +RUN if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ + && git clone https://github.com/pytorch/serve.git \ + && cd serve \ + && git checkout --track ${BRANCH_NAME} \ + && if [ -z "$CUDA_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev; else python ts_scripts/install_dependencies.py --environment=dev --cuda $CUDA_VERSION; fi \ + && python ts_scripts/install_from_src.py \ + && useradd -m model-server \ + && mkdir -p /home/model-server/tmp \ + && cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh \ + && chmod +x /usr/local/bin/dockerd-entrypoint.sh \ + && chown -R model-server /home/model-server \ + && cp docker/config.properties /home/model-server/config.properties \ + && mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store \ + && pip install torch-neuron 'neuron-cc[tensorflow]' --extra-index-url=https://pip.repos.neuron.amazonaws.com + +EXPOSE 8080 8081 8082 7070 7071 +USER model-server +WORKDIR /home/model-server +ENV TEMP=/home/model-server/tmp +ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] +CMD ["serve"] + +# Build CodeBuild Image +FROM compile-image AS 
codebuild-image +ENV JAVA_VERSION=11 \ + JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" \ + JDK_HOME="/usr/lib/jvm/java-11-openjdk-amd64" \ + JRE_HOME="/usr/lib/jvm/java-11-openjdk-amd64" \ + ANT_VERSION=1.10.3 \ + MAVEN_HOME="/opt/maven" \ + MAVEN_VERSION=3.5.4 \ + MAVEN_CONFIG="/root/.m2" \ + MAVEN_DOWNLOAD_SHA1="22cac91b3557586bb1eba326f2f7727543ff15e3" + +# Install Maven +RUN set -ex \ + && mkdir -p $MAVEN_HOME \ + && curl -LSso /var/tmp/apache-maven-$MAVEN_VERSION-bin.tar.gz https://apache.org/dist/maven/maven-3/$MAVEN_VERSION/binaries/apache-maven-$MAVEN_VERSION-bin.tar.gz \ + && echo "$MAVEN_DOWNLOAD_SHA1 /var/tmp/apache-maven-$MAVEN_VERSION-bin.tar.gz" | sha1sum -c - \ + && tar xzvf /var/tmp/apache-maven-$MAVEN_VERSION-bin.tar.gz -C $MAVEN_HOME --strip-components=1 \ + && update-alternatives --install /usr/bin/mvn mvn /opt/maven/bin/mvn 10000 \ + && mkdir -p $MAVEN_CONFIG + +FROM ${BUILD_TYPE}-image AS final-image +ARG BUILD_TYPE +RUN echo "${BUILD_TYPE} image creation completed" diff --git a/docker/build_image.sh b/docker/build_image.sh index 5ff4d38b21..48b80a1d15 100755 --- a/docker/build_image.sh +++ b/docker/build_image.sh @@ -89,5 +89,5 @@ if [ $BUILD_TYPE == "production" ] then DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg CUDA_VERSION=$CUDA_VERSION -t $DOCKER_TAG . else - DOCKER_BUILDKIT=1 docker build --file Dockerfile.dev -t $DOCKER_TAG --build-arg BUILD_TYPE=$BUILD_TYPE --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BRANCH_NAME=$BRANCH_NAME --build-arg CUDA_VERSION=$CUDA_VERSION --build-arg MACHINE_TYPE=$MACHINE . + DOCKER_BUILDKIT=1 docker build --pull --file Dockerfile.dev -t $DOCKER_TAG --build-arg BUILD_TYPE=$BUILD_TYPE --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BRANCH_NAME=$BRANCH_NAME --build-arg CUDA_VERSION=$CUDA_VERSION --build-arg MACHINE_TYPE=$MACHINE . fi diff --git a/test/benchmark/README.md b/test/benchmark/README.md index 4ac3b99e5a..7ef9aa7b75 100644 --- a/test/benchmark/README.md +++ b/test/benchmark/README.md @@ -21,8 +21,45 @@ If you'd like to use your own repo, edit the __init__.py under `serve/test/bench * Ensure you have [docker](https://docs.docker.com/get-docker/) client set-up on your system - osx/ec2 * Adjust the following global variables to your preference in the file `serve/test/benchmark/tests/utils/__init__.py`
-- IAM_INSTANCE_PROFILE :this role is attached to all ec2 instances created as part of the benchmarking process. Create this as described [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html#create-iam-role). Default role name is 'EC2Admin'.
+Use the following commands to create a new role if you don't have one you can use. +1. Create the trust policy file `ec2-admin-trust-policy.json` and add the following content: +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +2. Create the EC2 role as follows: +``` +aws iam create-role --role-name EC2Admin --assume-role-policy-document file://ec2-admin-trust-policy.json +``` +3. Add the permissions to the role as follows: +``` +aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/IAMFullAccess --role-name EC2Admin +aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonEC2FullAccess --role-name EC2Admin +aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess --role-name EC2Admin +aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess --role-name EC2Admin +``` -- S3_BUCKET_BENCHMARK_ARTIFACTS :all temporary benchmarking artifacts including server logs will be stored in this bucket:
+Use the following command to create a new S3 bucket if you don't have one you can use. +``` +aws s3api create-bucket --bucket <your-bucket-name> --region us-west-2 --create-bucket-configuration LocationConstraint=us-west-2 +``` -- DEFAULT_DOCKER_DEV_ECR_REPO :docker image used for benchmarking will be pushed to this repo
+Use the following command to create a new ECR repo if you don't have one you can use. +``` +aws ecr create-repository --repository-name torchserve-benchmark --region us-west-2 +``` * If you're running this setup on an EC2 instance, please ensure that the instance's security group settings 'allow' inbound ssh port 22. Refer [docs](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/security-group-rules.html). *The following steps assume that the current working directory is serve/.* @@ -32,6 +69,8 @@ If you'd like to use your own repo, edit the __init__.py under `serve/test/bench sudo apt-get install python3-venv python3 -m venv bvenv source bvenv/bin/activate +# Ensure you have the latest pip +pip3 install -U pip ``` 2. Install requirements for the benchmarking ``` @@ -57,7 +96,7 @@ python report.py ``` The final benchmark report will be available in markdown format as `report.md` in the `serve/` folder. -**Example report for vgg16 model** +**Example report for vgg11 model** ### Benchmark report @@ -103,3 +142,37 @@ The final benchmark report will be available in markdown format as `report.md` i | AB | vgg11 | 100 | 1000 | 0 | 3.47 | 28765 | 29849 | 30488 | 28781.227 | 0.0 | 1576.24 | 1758.28 | 1758.28 | 2249.52 | 2249.34 | 25210.43 | 46.77 | +## Features of the automation: +1. To save time during local testing by *not* creating new instances for every benchmark run, use the '--do-not-terminate' flag. This will automatically create a file called 'instances.yaml' and write instance-related data into the file so that it may be re-used next time. +``` +python test/benchmark/run_benchmark.py --do-not-terminate +``` + +2. To re-use an instance already recorded in `instances.yaml`, use the '--use-instances' flag: +``` +python test/benchmark/run_benchmark.py --use-instances /instances.yaml --do-not-terminate +``` +`Note: Use the --do-not-terminate flag to keep re-using the instances; otherwise, they will be terminated`. + +3. To run a test containing a specific string, use the `--run-only` flag. Note that the argument is 'string matched', i.e., if the test name contains the supplied argument as a substring, the test will run. +``` +# To run mnist test +python test/benchmark/run_benchmark.py --run-only mnist + +# To run fastrcnn test +python test/benchmark/run_benchmark.py --run-only fastrcnn + +# To run bert_neuron and bert +python test/benchmark/run_benchmark.py --run-only bert + +# To run vgg11 test +python test/benchmark/run_benchmark.py --run-only vgg11 + +# To run vgg16 test +python test/benchmark/run_benchmark.py --run-only vgg16 +``` + +4. 
You can benchmark a specific branch of the TorchServe GitHub repo by specifying the flag `--use-torchserve-branch` e.g., +``` +python test/benchmark/run_benchmark.py --use-torchserve-branch issue_1115 +``` \ No newline at end of file diff --git a/test/benchmark/requirements.txt b/test/benchmark/requirements.txt index 8fdd36f95c..fc06d1f3d6 100644 --- a/test/benchmark/requirements.txt +++ b/test/benchmark/requirements.txt @@ -11,4 +11,5 @@ gitpython docker pandas matplotlib -pyyaml \ No newline at end of file +pyyaml +cryptography==3.4.7 \ No newline at end of file diff --git a/test/benchmark/run_benchmark.py b/test/benchmark/run_benchmark.py index ebee2cac5c..bdf4777eb4 100644 --- a/test/benchmark/run_benchmark.py +++ b/test/benchmark/run_benchmark.py @@ -1,3 +1,4 @@ +import argparse import os import random import sys @@ -5,18 +6,105 @@ import re import uuid + import boto3 import pytest from invoke import run from invoke.context import Context + +from tests.utils.report import Report +from tests.utils import ( + S3_BUCKET_BENCHMARK_ARTIFACTS, + DEFAULT_REGION, + DEFAULT_DOCKER_DEV_ECR_REPO, + YamlHandler, + DockerImageHandler, +) + LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.DEBUG) LOGGER.addHandler(logging.StreamHandler(sys.stdout)) +def build_docker_container(torchserve_branch="master"): + LOGGER.info(f"Setting up docker image to be used") + + docker_dev_image_config_path = os.path.join(os.getcwd(), "test", "benchmark", "tests", "suite", "docker", "docker.yaml") + + docker_config = YamlHandler.load_yaml(docker_dev_image_config_path) + YamlHandler.validate_docker_yaml(docker_config) + + account_id = run("aws sts get-caller-identity --query Account --output text").stdout.strip() + + for processor, config in docker_config.items(): + docker_tag = None + cuda_version = None + for config_key, config_value in config.items(): + if processor == "gpu" and config_key == "cuda_version": + cuda_version = config_value + if config_key == "docker_tag": + docker_tag = config_value + dockerImageHandler = DockerImageHandler(docker_tag, cuda_version, torchserve_branch) + dockerImageHandler.build_image() + dockerImageHandler.push_docker_image_to_ecr( + account_id, DEFAULT_REGION, f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{docker_tag}" + ) + + def main(): + + LOGGER.info(f"sys.path: {sys.path}") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--use-instances", + action="store", + help="Supply a .yaml file with test_name, instance_id, and key_filename to re-use already-running instances", + ) + parser.add_argument( + "--do-not-terminate", + action="store_true", + default=False, + help="Use with caution: does not terminate instances, instead saves the list to a file in order to re-use", + ) + + parser.add_argument( + "--run-only", default=None, help="Runs the tests that contain the supplied keyword as a substring" + ) + + parser.add_argument( + "--use-torchserve-branch", + default="master", + help="Specify a specific torchserve branch to benchmark on, else uses 'master' by default" + ) + + parser.add_argument( + "--skip-docker-build", + action="store_true", + default=False, + help="Use if you already have a docker image built and available locally and have specified it in docker.yaml" + ) + + arguments = parser.parse_args() + do_not_terminate_string = "" if not arguments.do_not_terminate else "--do-not-terminate" + use_instances_arg_list = ["--use-instances", f"{arguments.use_instances}"] if arguments.use_instances else [] + run_only_test = arguments.run_only + + if run_only_test: 
run_only_string = f"-k {run_only_test}" + LOGGER.info(f"Note: running only the tests that have the name '{run_only_test}'.") + else: + run_only_string = "" + + torchserve_branch = arguments.use_torchserve_branch + + # Build docker containers as specified in docker.yaml + if not arguments.skip_docker_build: + build_docker_container(torchserve_branch=torchserve_branch) + # Run this script from the root directory 'serve', it changes directory below as required os.chdir(os.path.join(os.getcwd(), "test", "benchmark")) @@ -25,12 +113,30 @@ def main(): test_path = os.path.join(os.getcwd(), "tests") LOGGER.info(f"Running tests from directory: {test_path}") - pytest_args = ["-s", "-rA", test_path, "-n=4", "--disable-warnings", "-v", "--execution-id", execution_id] + pytest_args = [ + "-s", + run_only_string, + "-rA", + test_path, + "-n=4", + "--disable-warnings", + "-v", + "--execution-id", + execution_id, + do_not_terminate_string, + ] + use_instances_arg_list LOGGER.info(f"Running pytest") pytest.main(pytest_args) + # Generate report + s3_results_uri = f"{S3_BUCKET_BENCHMARK_ARTIFACTS}/{execution_id}" + + report = Report() + report.download_benchmark_results_from_s3(s3_results_uri) + report.generate_comprehensive_report() + if __name__ == "__main__": main() diff --git a/test/benchmark/tests/__init__.py b/test/benchmark/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/benchmark/tests/conftest.py b/test/benchmark/tests/conftest.py index 9cf11acacf..ebb49d250f 100644 --- a/test/benchmark/tests/conftest.py +++ b/test/benchmark/tests/conftest.py @@ -4,6 +4,7 @@ import random import re import sys +import yaml import boto3 import pytest @@ -37,28 +38,19 @@ def pytest_addoption(parser): help="execution id that is used to keep all artifacts together", ) + parser.addoption( + "--use-instances", + default=False, + action="store", + help="Supply a .yaml file with test_name, instance_id, and key_filename to re-use already-running instances", + ) -@pytest.fixture(scope="session", autouse=True) -def build_docker_container(request, docker_dev_image_config_path): - LOGGER.info(f"Setting up docker image to be used") - docker_config = YamlHandler.load_yaml(docker_dev_image_config_path) - YamlHandler.validate_docker_yaml(docker_config) - - account_id = run("aws sts get-caller-identity --query Account --output text").stdout.strip() - - for processor, config in docker_config.items(): - docker_tag = None - cuda_version = None - for config_key, config_value in config.items(): - if processor == "gpu" and config_key == "cuda_version": - cuda_version = config_value - if config_key == "docker_tag": - docker_tag = config_value - dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) - dockerImageHandler.build_image() - dockerImageHandler.push_docker_image_to_ecr( - account_id, DEFAULT_REGION, f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{docker_tag}" - ) + parser.addoption( + "--do-not-terminate", + action="store_true", + default=False, + help="Use with caution: does not terminate instances, instead saves the list to a file in order to re-use", + ) @pytest.fixture(scope="session") @@ -75,6 +67,10 @@ def benchmark_execution_id(request): return execution_id +@pytest.fixture(scope="function") +def bert_neuron_config_file_path(request): + return os.path.join(os.getcwd(), "tests", "suite", "bert_neuron.yaml") + @pytest.fixture(scope="function") def vgg11_config_file_path(request): return os.path.join(os.getcwd(), "tests", "suite", "vgg11.yaml") @@ -162,12 +158,29 @@ def ec2_instance( 
ec2_instance_ami, region, ): - key_filename = ec2_utils.generate_ssh_keypair(ec2_client, ec2_key_name) - def delete_ssh_keypair(): - ec2_utils.destroy_ssh_keypair(ec2_client, key_filename) + use_instances_flag = request.config.getoption("--use-instances") if request.config.getoption("--use-instances") else None - request.addfinalizer(delete_ssh_keypair) + if use_instances_flag: + instances_file = request.config.getoption("--use-instances") + run(f"touch {instances_file}", warn=True) + instances_dict = YamlHandler.load_yaml(instances_file) + LOGGER.info(f"instances_dict: {instances_dict}") + instances = instances_dict.get(request.node.name.split("[")[0], "") + LOGGER.info(f"instances: {instances}") + assert instances != "", f"Could not find instance details corresponding to test: {request.node.name.split('[')[0]}" + instance_details = instances.get(ec2_instance_type, "") + assert instance_details != "", f"Could not obtain details for instance type: {ec2_instance_type}" + instance_id = instance_details.get("instance_id", "") + assert instance_id != "", f"Missing instance_id" + key_filename = instance_details.get("key_filename", "") + assert key_filename != "", f"Missing key_filename" + + LOGGER.info(f"For test: {request.node.name}; Using instance_id: {instance_id} and key_filename: {key_filename}") + + return instance_id, key_filename + + key_filename = ec2_utils.generate_ssh_keypair(ec2_client, ec2_key_name) params = { "KeyName": ec2_key_name, @@ -179,7 +192,7 @@ def delete_ssh_keypair(): ], "MaxCount": 1, "MinCount": 1, - "BlockDeviceMappings": [{"DeviceName": "/dev/sda1", "Ebs": {"VolumeSize": 120}}], + "BlockDeviceMappings": [{"DeviceName": "/dev/sda1", "Ebs": {"VolumeSize": 220}}], } try: @@ -196,10 +209,34 @@ def delete_ssh_keypair(): def terminate_ec2_instance(): ec2_client.terminate_instances(InstanceIds=[instance_id]) - request.addfinalizer(terminate_ec2_instance) + def delete_ssh_keypair(): + ec2_utils.destroy_ssh_keypair(ec2_client, key_filename) + + do_not_terminate_flag = request.config.getoption("--do-not-terminate") + + LOGGER.info(f"do_not_terminate_flag: {do_not_terminate_flag}") + + instances_file = os.path.join(os.getcwd(), "instances.yaml") + run(f"touch {instances_file}", warn=True) + + if not do_not_terminate_flag: + request.addfinalizer(terminate_ec2_instance) + request.addfinalizer(delete_ssh_keypair) + + if do_not_terminate_flag and not use_instances_flag: + instances_dict = YamlHandler.load_yaml(instances_file) + if not instances_dict: + instances_dict = {} + + update_dictionary = {request.node.name.split("[")[0]: {ec2_instance_type: {"instance_id": instance_id, "key_filename": key_filename}}} + + instances_dict.update(update_dictionary) + + YamlHandler.write_yaml(instances_file, instances_dict) ec2_utils.check_instance_state(instance_id, state="running", region=region) ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region) + return instance_id, key_filename @@ -232,6 +269,4 @@ def delete_s3_artifact_copy(): request.addfinalizer(delete_s3_artifact_copy) - return conn - diff --git a/test/benchmark/tests/resources/neuron-bert/compile_bert.py b/test/benchmark/tests/resources/neuron-bert/compile_bert.py new file mode 100644 index 0000000000..8f0e30968a --- /dev/null +++ b/test/benchmark/tests/resources/neuron-bert/compile_bert.py @@ -0,0 +1,65 @@ +import tensorflow # to workaround a protobuf version conflict issue +import torch +import torch.neuron +from transformers import AutoTokenizer, AutoModelForSequenceClassification 
+import logging +import argparse + +## Enable logging so we can see any important warnings +logger = logging.getLogger('Neuron') +logger.setLevel(logging.INFO) + +parser = argparse.ArgumentParser() + +parser.add_argument( +"--batch-size", +action="store", +help="Batch size for which to compile the Neuron-traced BERT model", +) + +arguments = parser.parse_args() + +batch_size = int(arguments.batch_size) + +# Build tokenizer and model +tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False) +model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False) + +# Setup some example inputs +sequence_0 = "The company HuggingFace is based in New York City" +sequence_1 = "Apples are especially bad for your health" +sequence_2 = "HuggingFace's headquarters are situated in Manhattan" +paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=128, pad_to_max_length=True, return_tensors="pt") +not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, max_length=128, pad_to_max_length=True, return_tensors="pt") + +# Run the original PyTorch model on both example inputs +paraphrase_classification_logits = model(**paraphrase)[0] +not_paraphrase_classification_logits = model(**not_paraphrase)[0] + +max_length=128 +# Convert example inputs to a format that is compatible with TorchScript tracing +input_ids = paraphrase['input_ids'] # type:torch.Tensor +token_type_ids = paraphrase['token_type_ids'] # type:torch.Tensor +attention_mask = paraphrase['attention_mask'] # type:torch.Tensor +input_ids = input_ids.expand(batch_size, max_length) +token_type_ids = token_type_ids.expand(batch_size, max_length) +attention_mask = attention_mask.expand(batch_size, max_length) +example_inputs_paraphrase = input_ids, attention_mask, token_type_ids + +input_ids = not_paraphrase['input_ids'] # type:torch.Tensor +token_type_ids = not_paraphrase['token_type_ids'] # type:torch.Tensor +attention_mask = not_paraphrase['attention_mask'] # type:torch.Tensor +input_ids = input_ids.expand(batch_size, max_length) +token_type_ids = token_type_ids.expand(batch_size, max_length) +attention_mask = attention_mask.expand(batch_size, max_length) +example_inputs_not_paraphrase = input_ids, attention_mask, token_type_ids + +# Run torch.neuron.trace to generate a TorchScript that is optimized by AWS Neuron, using optimization level -O2 +model_neuron = torch.neuron.trace(model, example_inputs_paraphrase, compiler_args=['-O2']) + +# Verify the TorchScript works on both example inputs +paraphrase_classification_logits_neuron = model_neuron(*example_inputs_paraphrase) +not_paraphrase_classification_logits_neuron = model_neuron(*example_inputs_not_paraphrase) + +# Save the TorchScript for later use +model_neuron.save(f"bert_neuron_{batch_size}.pt") \ No newline at end of file diff --git a/test/benchmark/tests/resources/neuron-bert/config.py b/test/benchmark/tests/resources/neuron-bert/config.py new file mode 100644 index 0000000000..e21697aadd --- /dev/null +++ b/test/benchmark/tests/resources/neuron-bert/config.py @@ -0,0 +1,3 @@ +model_name='bert-base-cased-finetuned-mrpc' +max_length=128 +batch_size=1 \ No newline at end of file diff --git a/test/benchmark/tests/resources/neuron-bert/handler_bert.py b/test/benchmark/tests/resources/neuron-bert/handler_bert.py new file mode 100644 index 0000000000..9166fdd505 --- /dev/null +++ b/test/benchmark/tests/resources/neuron-bert/handler_bert.py @@ -0,0 +1,110 @@ 
+import os +import json +import sys +import logging + +import torch, torch_neuron +from transformers import AutoTokenizer +from abc import ABC +from ts.torch_handler.base_handler import BaseHandler + +# one core per worker +os.environ['NEURONCORE_GROUP_SIZES'] = '1' + +logger = logging.getLogger(__name__) + +class BertEmbeddingHandler(BaseHandler, ABC): + """ + Handler class for Bert Embedding computations. + """ + def __init__(self): + super(BertEmbeddingHandler, self).__init__() + self.initialized = False + + def initialize(self, ctx): + self.manifest = ctx.manifest + properties = ctx.system_properties + self.device = 'cpu' + model_dir = properties.get('model_dir') + serialized_file = self.manifest['model']['serializedFile'] + model_pt_path = os.path.join(model_dir, serialized_file) + + # point sys.path to our config file + sys.path.append(model_dir) + import config + self.max_length = config.max_length + self.batch_size = config.batch_size + self.classes = ['not paraphrase', 'paraphrase'] + + self.model = torch.jit.load(model_pt_path) + logger.debug(f'Model loaded from {model_dir}') + self.model.to(self.device) + self.model.eval() + + self.tokenizer = AutoTokenizer.from_pretrained(config.model_name) + self.initialized = True + + def preprocess(self, input_data): + """ + Tokenization pre-processing + """ + + input_ids = [] + attention_masks = [] + token_type_ids = [] + + for row in input_data: + #seq_0 = row['body']['seq_0'].decode('utf-8') + #seq_1 = row['body']['seq_1'].decode('utf-8') + + json_data = json.loads(row['body'].decode('utf-8')) + + seq_0 = json_data['seq_0'] + seq_1 = json_data['seq_1'] + logger.debug(f'Received text: "{seq_0}", "{seq_1}"') + + inputs = self.tokenizer.encode_plus( + seq_0, + seq_1, + max_length=self.max_length, + padding='max_length', + truncation=True, + return_tensors='pt' + ) + + input_ids.append(inputs['input_ids']) + attention_masks.append(inputs['attention_mask']) + token_type_ids.append(inputs['token_type_ids']) + + batch = (torch.cat(input_ids, 0), + torch.cat(attention_masks, 0), + torch.cat(token_type_ids, 0)) + + return batch + + def inference(self, inputs): + """ + Predict the class of a text using a trained transformer model. 
+ """ + + # sanity check dimensions + assert(len(inputs) == 3) + num_inferences = len(inputs[0]) + assert(num_inferences <= self.batch_size) + + # insert padding if we received a partial batch + padding = self.batch_size - num_inferences + if padding > 0: + pad = torch.nn.ConstantPad1d((0, 0, 0, padding), value=0) + inputs = [pad(x) for x in inputs] + + outputs = self.model(*inputs)[0] + predictions = [] + for i in range(num_inferences): + prediction = self.classes[outputs[i].argmax().item()] + predictions.append([prediction]) + logger.debug("Model predicted: '%s'", prediction) + return predictions + + def postprocess(self, inference_output): + return inference_output \ No newline at end of file diff --git a/test/benchmark/tests/resources/neuron-bert/input b/test/benchmark/tests/resources/neuron-bert/input new file mode 100644 index 0000000000..e8a5324c7a --- /dev/null +++ b/test/benchmark/tests/resources/neuron-bert/input @@ -0,0 +1 @@ +{"seq_0": "HuggingFace's headquarters are situated in Manhattan", "seq_1": "This is total nonsense."} \ No newline at end of file diff --git a/test/benchmark/tests/suite/bert.yaml b/test/benchmark/tests/suite/bert.yaml index 40a447aa75..7173ac1bf3 100644 --- a/test/benchmark/tests/suite/bert.yaml +++ b/test/benchmark/tests/suite/bert.yaml @@ -2,7 +2,7 @@ bert: scripted_mode: benchmark_engine: "ab" - url: "https://torchserve.s3.amazonaws.com/mar_files/BERTSeqClassification_Torchscript_batch.mar" + url: "https://s3.us-west-2.amazonaws.com/ts0.4.1-marfiles/BERTSeqClassification_torchscript.mar" #for CPU: https://torchserve.s3.amazonaws.com/mar_files/BERTSeqClassification_Torchscript_batch.mar workers: 4 batch_delay: 100 batch_size: @@ -10,10 +10,10 @@ bert: - 2 - 4 - 8 - input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" + input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" requests: 10000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" diff --git a/test/benchmark/tests/suite/bert_neuron.yaml b/test/benchmark/tests/suite/bert_neuron.yaml new file mode 100644 index 0000000000..a80ad24e5c --- /dev/null +++ b/test/benchmark/tests/suite/bert_neuron.yaml @@ -0,0 +1,19 @@ +--- +bert_inf1: + scripted_mode: + benchmark_engine: "ab" + compile_per_batch_size: True + workers: 4 + batch_delay: 100 + batch_size: + - 1 + - 2 + - 4 + - 8 + input: "/home/ubuntu/serve/test/benchmark/tests/resources/neuron-bert/input" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "inferentia" \ No newline at end of file diff --git a/test/benchmark/tests/suite/fastrcnn.yaml b/test/benchmark/tests/suite/fastrcnn.yaml index 647e2725ae..52f19c0947 100644 --- a/test/benchmark/tests/suite/fastrcnn.yaml +++ b/test/benchmark/tests/suite/fastrcnn.yaml @@ -10,10 +10,10 @@ fastrcnn: - 2 - 4 - 8 - input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/object_detector/persons.jpg" - requests: 10000 + input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" + requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" diff --git a/test/benchmark/tests/suite/mnist.yaml b/test/benchmark/tests/suite/mnist.yaml index bfe42b6123..b360a724d8 100644 --- a/test/benchmark/tests/suite/mnist.yaml +++ b/test/benchmark/tests/suite/mnist.yaml @@ 
-10,10 +10,10 @@ mnist: - 2 - 4 - 8 - requests: 10000 + requests: 1000 concurrency: 10 input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/mnist/test_data/0.png" - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" diff --git a/test/benchmark/tests/suite/vgg11.yaml b/test/benchmark/tests/suite/vgg11.yaml index 7305e7de2e..f378ab18de 100644 --- a/test/benchmark/tests/suite/vgg11.yaml +++ b/test/benchmark/tests/suite/vgg11.yaml @@ -13,7 +13,7 @@ vgg11: input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" @@ -31,7 +31,7 @@ vgg11: input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" dockerhub_image: "pytorch/torchserve:latest" processors: diff --git a/test/benchmark/tests/suite/vgg16.yaml b/test/benchmark/tests/suite/vgg16.yaml index 021925aa58..1700e9362e 100644 --- a/test/benchmark/tests/suite/vgg16.yaml +++ b/test/benchmark/tests/suite/vgg16.yaml @@ -13,7 +13,7 @@ vgg16: input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" processors: - "cpu" @@ -31,7 +31,7 @@ vgg16: input: "https://raw.githubusercontent.com/pytorch/serve/master/examples/image_classifier/kitten.jpg" requests: 1000 concurrency: 100 - backend_profiling: True + backend_profiling: False exec_env: "docker" dockerhub_image: "pytorch/torchserve:latest" processors: diff --git a/test/benchmark/tests/test_bert.py b/test/benchmark/tests/test_bert.py index 5ccd5346f6..e866fed0ca 100644 --- a/test/benchmark/tests/test_bert.py +++ b/test/benchmark/tests/test_bert.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( @@ -22,10 +22,10 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) -INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] +# Add/remove from the following list to benchmark on the instance of your choice +INSTANCE_TYPES_TO_TEST = ["c4.4xlarge"] -@pytest.mark.skip() @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_bert_benchmark( ec2_connection, ec2_instance_type, bert_config_file_path, docker_dev_image_config_path, benchmark_execution_id @@ -71,7 +71,7 @@ def test_bert_benchmark( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] @@ -131,7 +131,7 @@ def test_bert_benchmark( torchserveHandler.unregister_model() # Stop torchserve - torchserveHandler.stop_torchserve() + torchserveHandler.stop_torchserve(exec_env="docker") # Generate report (note: needs to happen after torchserve has stopped) apacheBenchHandler.generate_report(requests=requests, concurrency=concurrency, connection=ec2_connection) diff --git a/test/benchmark/tests/test_bert_neuron.py 
b/test/benchmark/tests/test_bert_neuron.py new file mode 100644 index 0000000000..efb25d018b --- /dev/null +++ b/test/benchmark/tests/test_bert_neuron.py @@ -0,0 +1,171 @@ +import os +import pprint + +import pytest +import time +from invoke import run +from invoke.context import Context + +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils +import tests.utils.neuron as neuron_utils + +from tests.utils import ( + DEFAULT_DOCKER_DEV_ECR_REPO, + DEFAULT_REGION, + GPU_INSTANCES, + LOGGER, + DockerImageHandler, + YamlHandler, + S3_BUCKET_BENCHMARK_ARTIFACTS, +) + +# Add/remove from the following list to benchmark on the instance of your choice +INSTANCE_TYPES_TO_TEST = ["inf1.6xlarge"] + +@pytest.mark.skip(reason="Skipping neuron test, manually unskip if you need to benchmark") +@pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) +def test_neuron_benchmark( + ec2_connection, ec2_instance_type, bert_neuron_config_file_path, docker_dev_image_config_path, benchmark_execution_id +): + + test_config = YamlHandler.load_yaml(bert_neuron_config_file_path) + + model_name = bert_neuron_config_file_path.split("/")[-1].split(".")[0] + + LOGGER.info("Validating yaml contents") + + LOGGER.info(YamlHandler.validate_benchmark_yaml(test_config)) + + docker_config = YamlHandler.load_yaml(docker_dev_image_config_path) + + docker_repo_tag_for_current_instance = "" + cuda_version_for_instance = "" + account_id = run("aws sts get-caller-identity --query Account --output text").stdout.strip() + + for processor, config in docker_config.items(): + docker_tag = None + cuda_version = None + for config_key, config_value in config.items(): + if processor == "gpu" and config_key == "cuda_version": + cuda_version = config_value + if config_key == "docker_tag": + docker_tag = config_value + # TODO: Improve logic that selectively pulls CPU image on CPU instances and likewise for GPU. 
+ + docker_repo_tag = f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{docker_tag}" + + if ec2_instance_type[:2] in GPU_INSTANCES and ("gpu" in docker_tag or "neuron" in docker_tag): + dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) + dockerImageHandler.pull_docker_image_from_ecr( + account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection + ) + docker_repo_tag_for_current_instance = docker_repo_tag + cuda_version_for_instance = cuda_version + break + if ec2_instance_type[:2] not in GPU_INSTANCES and ("cpu" in docker_tag or "neuron" in docker_tag): + dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) + dockerImageHandler.pull_docker_image_from_ecr( + account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection + ) + docker_repo_tag_for_current_instance = docker_repo_tag + cuda_version_for_instance = None + break + + mode_list = [] + config_list = [] + batch_size_list = [] + processor_list = [] + + apacheBenchHandler = ab_utils.ApacheBenchHandler(model_name=model_name, connection=ec2_connection) + + for model, config in test_config.items(): + for mode, mode_config in config.items(): + mode_list.append(mode) + benchmark_engine = mode_config.get("benchmark_engine") + workers = mode_config.get("workers") + batch_delay = mode_config.get("batch_delay") + batch_sizes = mode_config.get("batch_size") + input_file = mode_config.get("input") + requests = mode_config.get("requests") + concurrency = mode_config.get("concurrency") + backend_profiling = mode_config.get("backend_profiling") + exec_env = mode_config.get("exec_env") + processors = mode_config.get("processors") + gpus = None + if len(processors) == 2: + gpus = processors[1].get("gpus") + LOGGER.info(f"processors: {processors[1]}") + LOGGER.info(f"gpus: {gpus}") + + LOGGER.info( + f"\n benchmark_engine: {benchmark_engine}\n workers: {workers}\n batch_delay: {batch_delay}\n batch_size:{batch_sizes}\n input_file: {input_file}\n requests: {requests}\n concurrency: {concurrency}\n backend_profiling: {backend_profiling}\n exec_env: {exec_env}\n processors: {processors}" + ) + + torchserveHandler = ts_utils.TorchServeHandler( + exec_env=exec_env, + cuda_version=cuda_version_for_instance, + gpus=gpus, + torchserve_docker_image=docker_repo_tag_for_current_instance, + backend_profiling=backend_profiling, + connection=ec2_connection, + ) + + # Note: Assumes a DLAMI (conda-based) is being used + torchserveHandler.setup_torchserve(virtual_env_name="aws_neuron_pytorch_p36") + + for batch_size in batch_sizes: + url = f"benchmark_{batch_size}.mar" + LOGGER.info(f"Running benchmark for model archive: {url}") + + # Stop torchserve + torchserveHandler.stop_torchserve(exec_env="local", virtual_env_name="aws_neuron_pytorch_p36") + + # Generate bert inf model + neuron_utils.setup_neuron_mar_files(connection=ec2_connection, virtual_env_name="aws_neuron_pytorch_p36", batch_size=batch_size) + + # Start torchserve + torchserveHandler.start_torchserve_local(virtual_env_name="aws_neuron_pytorch_p36", stop_torchserve=False) + + # Register + torchserveHandler.register_model( + url=url, workers=workers, batch_delay=batch_delay, batch_size=batch_size + ) + + # Run benchmark + apacheBenchHandler.run_apache_bench(requests=requests, concurrency=concurrency, input_file=input_file) + + # Unregister + torchserveHandler.unregister_model() + + # Stop torchserve + torchserveHandler.stop_torchserve(exec_env="local", virtual_env_name="aws_neuron_pytorch_p36") + + # Generate report (note: needs to happen after torchserve has stopped) + 
apacheBenchHandler.generate_report( + requests=requests, concurrency=concurrency, connection=ec2_connection + ) + + # Move artifacts into a common folder. + remote_artifact_folder = ( + f"/home/ubuntu/{benchmark_execution_id}/{model_name}/{ec2_instance_type}/{mode}/{batch_size}" + ) + + ec2_connection.run(f"mkdir -p {remote_artifact_folder}") + ec2_connection.run(f"cp -R /home/ubuntu/benchmark/* {remote_artifact_folder}") + + # Upload artifacts to s3 bucket + ec2_connection.run( + f"aws s3 cp --recursive /home/ubuntu/{benchmark_execution_id}/ {S3_BUCKET_BENCHMARK_ARTIFACTS}/{benchmark_execution_id}/" + ) + + time.sleep(3) + + run( + f"aws s3 cp --recursive /tmp/{model_name}/ {S3_BUCKET_BENCHMARK_ARTIFACTS}/{benchmark_execution_id}/{model_name}/{ec2_instance_type}/{mode}/{batch_size}" + ) + + run(f"rm -rf /tmp/{model_name}") + apacheBenchHandler.clean_up() diff --git a/test/benchmark/tests/test_fastrcnn.py b/test/benchmark/tests/test_fastrcnn.py index 2fe2cba0c0..401f0c59d3 100644 --- a/test/benchmark/tests/test_fastrcnn.py +++ b/test/benchmark/tests/test_fastrcnn.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( DEFAULT_DOCKER_DEV_ECR_REPO, @@ -21,9 +21,9 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) -INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] +# Add/remove from the following list to benchmark on the instance of your choice +INSTANCE_TYPES_TO_TEST = ["c4.4xlarge", "p3.8xlarge"] -@pytest.mark.skip() @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_fastrcnn_benchmark( ec2_connection, ec2_instance_type, fastrcnn_config_file_path, docker_dev_image_config_path, benchmark_execution_id @@ -69,7 +69,7 @@ def test_fastrcnn_benchmark( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] diff --git a/test/benchmark/tests/test_mnist.py b/test/benchmark/tests/test_mnist.py index b6024f8834..0a1a8e8ced 100644 --- a/test/benchmark/tests/test_mnist.py +++ b/test/benchmark/tests/test_mnist.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( DEFAULT_DOCKER_DEV_ECR_REPO, @@ -21,9 +21,9 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) +# Add/remove from the following list to benchmark on the instance of your choice INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] -@pytest.mark.skip() @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_mnist_benchmark( ec2_connection, ec2_instance_type, mnist_config_file_path, docker_dev_image_config_path, benchmark_execution_id @@ -69,7 +69,7 @@ def test_mnist_benchmark( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] diff 
--git a/test/benchmark/tests/test_vgg11.py b/test/benchmark/tests/test_vgg11.py index d2f41bbc38..9549ec7a50 100644 --- a/test/benchmark/tests/test_vgg11.py +++ b/test/benchmark/tests/test_vgg11.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( DEFAULT_DOCKER_DEV_ECR_REPO, @@ -21,9 +21,9 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) +# Add/remove from the following list to benchmark on the instance of your choice INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] -@pytest.mark.skip() @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_vgg11_benchmark( ec2_connection, ec2_instance_type, vgg11_config_file_path, docker_dev_image_config_path, benchmark_execution_id @@ -69,7 +69,7 @@ def test_vgg11_benchmark( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] @@ -129,7 +129,7 @@ def test_vgg11_benchmark( torchserveHandler.unregister_model() # Stop torchserve - torchserveHandler.stop_torchserve() + torchserveHandler.stop_torchserve(exec_env="docker") # Generate report (note: needs to happen after torchserve has stopped) apacheBenchHandler.generate_report(requests=requests, concurrency=concurrency, connection=ec2_connection) diff --git a/test/benchmark/tests/test_vgg16.py b/test/benchmark/tests/test_vgg16.py index cf9f8e7997..b7116eb701 100644 --- a/test/benchmark/tests/test_vgg16.py +++ b/test/benchmark/tests/test_vgg16.py @@ -6,10 +6,10 @@ from invoke import run from invoke.context import Context -import utils.ec2 as ec2_utils -import utils.s3 as s3_utils -import utils.ts as ts_utils -import utils.apache_bench as ab_utils +import tests.utils.ec2 as ec2_utils +import tests.utils.s3 as s3_utils +import tests.utils.ts as ts_utils +import tests.utils.apache_bench as ab_utils from tests.utils import ( DEFAULT_DOCKER_DEV_ECR_REPO, @@ -21,7 +21,8 @@ S3_BUCKET_BENCHMARK_ARTIFACTS, ) -INSTANCE_TYPES_TO_TEST = ["p3.8xlarge"] +# Add/remove from the following list to benchmark on the instance of your choice +INSTANCE_TYPES_TO_TEST = ["c4.4xlarge"] @pytest.mark.parametrize("ec2_instance_type", INSTANCE_TYPES_TO_TEST, indirect=True) def test_vgg16_benchmark( @@ -54,7 +55,7 @@ def test_vgg16_benchmark( docker_repo_tag = f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{docker_tag}" - if ec2_instance_type[:2] in GPU_INSTANCES and "gpu" in docker_tag: + if ec2_instance_type[:2] in GPU_INSTANCES and ("gpu" in docker_tag or "neuron" in docker_tag): dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) dockerImageHandler.pull_docker_image_from_ecr( account_id, DEFAULT_REGION, docker_repo_tag, connection=ec2_connection @@ -62,13 +63,13 @@ def test_vgg16_benchmark( docker_repo_tag_for_current_instance = docker_repo_tag cuda_version_for_instance = cuda_version break - if ec2_instance_type[:2] not in GPU_INSTANCES and "cpu" in docker_tag: + if ec2_instance_type[:2] not in GPU_INSTANCES and ("cpu" in docker_tag or "neuron" in docker_tag): dockerImageHandler = DockerImageHandler(docker_tag, cuda_version) dockerImageHandler.pull_docker_image_from_ecr( account_id, DEFAULT_REGION, docker_repo_tag, 
connection=ec2_connection ) docker_repo_tag_for_current_instance = docker_repo_tag - cuda_version_for_instance = cuda_version + cuda_version_for_instance = None break mode_list = [] @@ -104,7 +105,7 @@ def test_vgg16_benchmark( torchserveHandler = ts_utils.TorchServeHandler( exec_env=exec_env, - cuda_version=cuda_version, + cuda_version=cuda_version_for_instance, gpus=gpus, torchserve_docker_image=docker_repo_tag_for_current_instance, backend_profiling=backend_profiling, @@ -128,7 +129,7 @@ def test_vgg16_benchmark( torchserveHandler.unregister_model() # Stop torchserve - torchserveHandler.stop_torchserve() + torchserveHandler.stop_torchserve(exec_env="docker") # Generate report (note: needs to happen after torchserve has stopped) apacheBenchHandler.generate_report(requests=requests, concurrency=concurrency, connection=ec2_connection) diff --git a/test/benchmark/tests/utils/__init__.py b/test/benchmark/tests/utils/__init__.py index 5c45d6d8a6..9fd2571599 100644 --- a/test/benchmark/tests/utils/__init__.py +++ b/test/benchmark/tests/utils/__init__.py @@ -1,5 +1,8 @@ +from __future__ import absolute_import + import json import logging +import fcntl import os import re import subprocess @@ -27,8 +30,9 @@ GPU_INSTANCES = ["p2", "p3", "p4", "g2", "g3", "g4"] # DLAMI with nVidia Driver ver. 450.119.03 (support upto CUDA 11.2), Ubuntu 18.04 -AMI_ID = "ami-0ff137c06803a8bb7" -# AMI_ID = "ami-0198925303105158c", with apache2-utils installed +# AMI_ID = "ami-064696901389beb84" +# AMI_ID = "ami-0198925303105158c", Base DLAMI 37.0 with apache2-utils installed +AMI_ID = "ami-00c5ebd9076702cbe"#, DLAMI 43.0 with apache2-utils installed LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.INFO) @@ -50,10 +54,10 @@ def build_image(self): os.chdir(torch_serve_docker_directory) if self.cuda_version: run_out = run( - f"./build_image.sh -bt dev -g -cv {self.cuda_version} -t {DEFAULT_DOCKER_DEV_ECR_REPO}:{self.docker_tag}" + f"./build_image.sh -b {self.branch} -bt dev -g -cv {self.cuda_version} -t {DEFAULT_DOCKER_DEV_ECR_REPO}:{self.docker_tag}" ) else: - run_out = run(f"./build_image.sh -bt dev -t {DEFAULT_DOCKER_DEV_ECR_REPO}:{self.docker_tag}") + run_out = run(f"./build_image.sh -b {self.branch} -bt dev -t {DEFAULT_DOCKER_DEV_ECR_REPO}:{self.docker_tag}") # Switch back to original directory os.chdir(current_working_directory) @@ -140,11 +144,10 @@ class YamlHandler(object): "input", "processors", "requests", - "url", "workers", ] - optional_config_keys = ["dockerhub_image", "docker_dev_image"] + optional_config_keys = ["url", "dockerhub_image", "docker_dev_image", "compile_per_batch_size"] valid_config_keys = mandatory_config_keys + optional_config_keys @@ -154,7 +157,7 @@ class YamlHandler(object): valid_processors = ["cpu", "gpus"] - valid_docker_processors = ["cpu", "gpu"] + valid_docker_processors = ["cpu", "gpu", "inferentia"] mandatory_docker_config_keys = ["docker_tag"] @@ -179,8 +182,10 @@ def write_yaml(file_path, dictionary_object): :param dictionary_object: dictionary with content that needs to be written to a yaml file :return None """ - with open(file_path) as f: - yaml.dump(f, dictionary_object) + with open(file_path, "a") as f: + fcntl.flock(f, fcntl.LOCK_EX) + yaml.dump(dictionary_object, f) + fcntl.flock(f, fcntl.LOCK_UN) @staticmethod def validate_benchmark_yaml(yaml_content): diff --git a/test/benchmark/tests/utils/apache_bench.py b/test/benchmark/tests/utils/apache_bench.py index 27b9a2bc9a..2c0c2af47a 100644 --- a/test/benchmark/tests/utils/apache_bench.py +++ 
b/test/benchmark/tests/utils/apache_bench.py @@ -47,7 +47,7 @@ def install_dependencies(self): """ Installs apache2-utils, assuming it's an Ubuntu instance """ - run_out = self.connection.sudo(f"apt install -y apache2-utils") + run_out = self.connection.sudo(f"apt install -y apache2-utils", pty=True) return run_out.return_code def run_apache_bench(self, requests, concurrency, input_file): @@ -58,14 +58,15 @@ def run_apache_bench(self, requests, concurrency, input_file): """ self.connection.run(f"mkdir -p {TMP_DIR}/benchmark") - self.connection.run(f"wget {input_file}") + if input_file.startswith("https://") or input_file.startswith("http://"): + self.connection.run(f"wget {input_file}", warn=True) + file_name = self.connection.run(f"basename {input_file}").stdout.strip() + # Copy to the directory with other benchmark artifacts + self.connection.run(f"cp {file_name} {os.path.join(TMP_DIR, 'benchmark/input')}") + else: + self.connection.run(f"cp {input_file} {os.path.join(TMP_DIR, 'benchmark/input')}") - file_name = self.connection.run(f"basename {input_file}").stdout.strip() - - # Copy to the directory with other benchmark artifacts - self.connection.run(f"cp {file_name} {os.path.join(TMP_DIR, 'benchmark/input')}") - - apache_bench_command = f"ab -c {concurrency} -n {requests} -k -p {TMP_DIR}/benchmark/input -T application/png {self.inference_url}/predictions/benchmark > {self.result_file}" + apache_bench_command = f"ab -c {concurrency} -n {requests} -k -p {TMP_DIR}/benchmark/input -T application/jpg {self.inference_url}/predictions/benchmark > {self.result_file}" # Run apache bench run_out = self.connection.run( diff --git a/test/benchmark/tests/utils/neuron.py b/test/benchmark/tests/utils/neuron.py new file mode 100644 index 0000000000..96eb1763b9 --- /dev/null +++ b/test/benchmark/tests/utils/neuron.py @@ -0,0 +1,53 @@ +import subprocess +import time +import glob +import os +import requests +import tempfile + +import invoke +import pandas as pd + +from io import StringIO +from urllib.parse import urlparse +from invoke import run +from invoke.context import Context + +from . import DEFAULT_REGION, IAM_INSTANCE_PROFILE, AMI_ID, LOGGER, S3_BUCKET_BENCHMARK_ARTIFACTS + +# Assumes the functions from this file execute on an Ubuntu ec2 instance +ROOT_DIR = f"/home/ubuntu" +TORCHSERVE_DIR = os.path.join(ROOT_DIR, "serve") +MODEL_STORE = os.path.join(TORCHSERVE_DIR, "model_store") +LOCAL_TMP_DIR = "/tmp" +TMP_DIR = "/home/ubuntu" +NEURON_RESOURCES_FOLDER = os.path.join(TORCHSERVE_DIR, "test", "benchmark", "tests", "resources", "neuron-bert") + +def setup_neuron_mar_files(connection=None, virtual_env_name=None, batch_size=1): + activation_command = "" + + if virtual_env_name: + activation_command = f"cd /home/ubuntu/serve/test/benchmark/tests/resources/neuron-bert && source activate {virtual_env_name} && " + + # Note: change version here to make sure the torch version compatible with neuron is being used. 
+ connection.run(f"{activation_command}pip3 install -U --ignore-installed torch==1.7.1", warn=True) + connection.run(f"{activation_command}pip3 install -U --ignore-installed torch-neuron 'neuron-cc[tensorflow]' --extra-index-url=https://pip.repos.neuron.amazonaws.com", warn=True) + + connection.run(f"{activation_command}python3 compile_bert.py --batch-size {batch_size}", warn=True) + time.sleep(5) + run_out_sed = connection.run(f"{activation_command}sed -i 's/batch_size=[[:digit:]]\+/batch_size={batch_size}/g' config.py", warn=True) + LOGGER.info(f"run_out_sed: {run_out_sed.stdout}, run_out_return: {run_out_sed.return_code}") + run_out_mkdir = connection.run(f"mkdir -p /home/ubuntu/benchmark/model_store") + LOGGER.info(f"run_out_mkdir: {run_out_mkdir.stdout}, run_out_return: {run_out_mkdir.return_code}") + run_out_archiver = connection.run(f"{activation_command}torch-model-archiver --model-name 'benchmark_{batch_size}' --version 1.0 --serialized-file ./bert_neuron_{batch_size}.pt --handler './handler_bert.py' --extra-files './config.py' -f", warn=True) + LOGGER.info(f"run_out_archiver: {run_out_archiver.stdout}, run_out_return: {run_out_archiver.return_code}") + + LOGGER.info(f"Running copy command") + connection.run(f"cp /home/ubuntu/serve/test/benchmark/tests/resources/neuron-bert/benchmark_{batch_size}.mar /home/ubuntu/benchmark/model_store") + run_out = connection.run(f"test -e /home/ubuntu/benchmark/model_store/benchmark_{batch_size}.mar") + if run_out.return_code == 0: + LOGGER.info(f"mar file available at location /home/ubuntu/benchmark/model_store/benchmark_{batch_size}.mar") + else: + LOGGER.info(f"mar file NOT available at location /home/ubuntu/benchmark/model_store/benchmark_{batch_size}.mar") + + time.sleep(5) \ No newline at end of file diff --git a/test/benchmark/tests/utils/report.py b/test/benchmark/tests/utils/report.py index f09c01ce9e..ca5bb9ec2a 100644 --- a/test/benchmark/tests/utils/report.py +++ b/test/benchmark/tests/utils/report.py @@ -18,7 +18,7 @@ TMP_DIR = "/tmp" - +from . 
import LOGGER class MarkdownTable: def __init__(self): @@ -81,10 +81,6 @@ def add_markdown_from_csv(self, file_path, delimiter): md_string += item + " | " md_string += "\n" - # writing md_string to the output_file - # file = open(output_file, "w", encoding="UTF-8") - # file.write(md_string) - # file.close() self.markdown_content += md_string print("The markdown file has been created!!!") @@ -99,7 +95,7 @@ def add_code_block(self, content: str, newline=True): newline_modifier = "\n" if newline else "" backticks_modifier = "```" if newline else "`" - self.markdown_content += str(f"{newline_modifier}{backticks_modifier}{newline_modifier}{content}\n{backticks_modifier}") + self.markdown_content += str(f"{newline_modifier}{backticks_modifier}\n{content}\n{backticks_modifier}{newline_modifier}") def add_paragraph(self, content: str, bold=False, italics=False, newline=True): """ @@ -123,45 +119,56 @@ def add_newline(self): def get_document(self): return self.markdown_content -def main(s3_bucket_uri): - """ - Compile a markdown file with different csv files as input - """ - # Download the s3 files - run(f"mkdir -p /tmp/report") - run(f"aws s3 cp --recursive {s3_bucket_uri} /tmp/report") - - csv_files = [] +class Report: + def __init__(self): + self.tmp_report_dir = os.path.join("/tmp", "report") - for root, dirs, files in os.walk("/tmp/report/"): - for name in files: - csv_files.append(os.path.join(root, name)) if "ab_report" in name else None - - markdownDocument = MarkdownDocument("Benchmark report") - markdownDocument.add_newline() - # Assume model configuration starts from /tmp/report - for report_path in csv_files: - split_path = report_path.split("/") - print(split_path) - model = split_path[3] - instance_type = split_path[4] - mode = split_path[5] - batch_size = split_path[6] + def download_benchmark_results_from_s3(self, s3_uri): + """ + Download benchmark results of various runs from s3 + """ + # Cleanup any previous folder + run(f"rm -rf {self.tmp_report_dir}") - config_header = f"{model} | {mode} | {instance_type} | batch size {batch_size}" + # Create a tmp folder + run(f"mkdir -p {self.tmp_report_dir}") - markdownDocument.add_paragraph(config_header, bold=True, newline=True) + run(f"aws s3 cp --recursive {s3_uri} {self.tmp_report_dir}") - print(f"Updating data from file: {report_path}") - markdownDocument.add_markdown_from_csv(report_path, delimiter=" ") - - with open("report.md", "w") as f: - f.write(markdownDocument.get_document()) - # Clean up - run(f"rm -rf /tmp/report") + def generate_comprehensive_report(self): + """ + Compile a markdown file with different csv files as input + """ + csv_files = [] + for root, dirs, files in os.walk("/tmp/report/"): + for name in files: + csv_files.append(os.path.join(root, name)) if "ab_report" in name else None + + csv_files = sorted(csv_files) + + markdownDocument = MarkdownDocument("Benchmark report") + markdownDocument.add_newline() + + # Assume model configuration starts from /tmp/report + for report_path in csv_files: + split_path = report_path.split("/") + print(split_path) + model = split_path[3] + instance_type = split_path[4] + mode = split_path[5] + batch_size = split_path[6] + + config_header = f"{model} | {mode} | {instance_type} | batch size {batch_size}" + + markdownDocument.add_code_block(config_header, newline=True) + + print(f"Updating data from file: {report_path}") + markdownDocument.add_markdown_from_csv(report_path, delimiter=" ") + + with open("report.md", "w") as f: + f.write(markdownDocument.get_document()) -if __name__ 
== "__main__": - generate_comprehensive_report("s3_bucket_uri") \ No newline at end of file + LOGGER.info(f"Benchmark report generated at: {os.path.join(os.getcwd(), 'report.md')}") \ No newline at end of file diff --git a/test/benchmark/tests/utils/ts.py b/test/benchmark/tests/utils/ts.py index 490755403e..605bb61a42 100644 --- a/test/benchmark/tests/utils/ts.py +++ b/test/benchmark/tests/utils/ts.py @@ -26,7 +26,7 @@ class TorchServeHandler(object): def __init__( self, - exec_env="local", + exec_env="docker", cuda_version="cu102", gpus=None, torchserve_docker_image=None, @@ -50,18 +50,31 @@ def __init__( # self.prepare_common_dependency() # self.getAPIS() - def setup_torchserve(self): + def setup_torchserve(self, virtual_env_name=None): """ Set up torchserve dependencies, and install torchserve """ - pass + activation_command = "" + self.connection.run(f"chmod +x -R /home/ubuntu/serve") + if virtual_env_name: + activation_command = f"cd /home/ubuntu/serve && source activate {virtual_env_name} && " + + if self.connection.run(f"{activation_command}torchserve --version", warn=True).return_code == 0: + return + + self.connection.run(f"{activation_command}python3 ./ts_scripts/install_dependencies.py --environment=dev", warn=True) + self.connection.run(f"{activation_command}pip3 install pygit2", warn=True) + self.connection.run(f"{activation_command}python3 ./ts_scripts/install_from_src.py", warn=True) + self.connection.run(f"{activation_command}torchserve --version") + def prepare_common_dependency(self): - # Note: the following command cleans up any previous run logs - self.connection.run(f"rm -rf {os.path.join(TMP_DIR, 'benchmark')}") + # Note: the following command cleans up any previous run logs, except any *.mar files generated to avoid re-creation + self.connection.run(f"find {os.path.join(TMP_DIR, 'benchmark')} ! 
 
         # Recreate required folders
         self.connection.run(f"mkdir -p {os.path.join(TMP_DIR, 'benchmark', 'conf')}")
         self.connection.run(f"mkdir -p {os.path.join(TMP_DIR, 'benchmark', 'logs')}")
+        self.connection.run(f"mkdir -p {os.path.join(TMP_DIR, 'benchmark', 'model_store')}")
 
         # Use config from benchmarks/ folder
         self.connection.run(
@@ -86,10 +99,29 @@ def getAPIS(self):
         self.management_port = urlparse(management_api).port
         self.inference_api = urlparse(inference_api).port
 
-    def start_torchserve_local(self):
-        pass
+    def start_torchserve_local(self, virtual_env_name=None, stop_torchserve=True):
+
+        self.prepare_common_dependency()
+        self.getAPIS()
+
+        activation_command = ""
+        if virtual_env_name:
+            activation_command = f"cd /home/ubuntu/serve && source activate {virtual_env_name} && "
+        if self.backend_profiling:
+            activation_command = f"{activation_command}export TS_BENCHMARK=True && "
+
+        if stop_torchserve:
+            LOGGER.info(f"Stopping any existing torchserve instance")
+            self.connection.run(f"{activation_command}torchserve --stop", warn=True)
+
+        self.connection.run(f"{activation_command}torchserve --start --model-store /home/ubuntu/benchmark/model_store/ --ts-config {TMP_DIR}/benchmark/conf/config.properties > {TMP_DIR}/benchmark/logs/model_metrics.log", warn=True)
+        LOGGER.info(f"Started torchserve using command:")
+        LOGGER.info(f"{activation_command}torchserve --start --model-store /home/ubuntu/benchmark/model_store/ --ts-config {TMP_DIR}/benchmark/conf/config.properties > {TMP_DIR}/benchmark/logs/model_metrics.log")
+
+        time.sleep(10)
 
-    def start_torchserve_docker(self):
+
+    def start_torchserve_docker(self, stop_torchserve=True):
         self.prepare_common_dependency()
         self.getAPIS()
@@ -101,8 +133,9 @@ def start_torchserve_docker(self):
         if self.backend_profiling:
             backend_profiling = f"-e TS_BENCHMARK=True"
 
-        LOGGER.info(f"Removing existing TS container instance...")
-        self.connection.run("docker rm -f ts")
+        if stop_torchserve:
+            LOGGER.info(f"Removing existing TS container instance...")
+            self.connection.run("docker rm -f ts")
 
         LOGGER.info(f"Starting docker container on the instance from image: {self.torchserve_docker_image}")
         docker_run_cmd = (
@@ -127,16 +160,17 @@ def register_model(self, url, workers, batch_delay, batch_size, model_name="benc
         :param batch_size: max number of requests allowed to be batched
         """
         run_out = self.connection.run(
-            f'curl -X POST "http://localhost:8081/models?url={url}&initial_workers={workers}&batch_delay={batch_delay}&batch_size={batch_size}&synchronous=true&model_name=benchmark"'
+            f'curl -X POST "http://localhost:8081/models?url={url}&initial_workers={workers}&batch_delay={batch_delay}&batch_size={batch_size}&synchronous=true&model_name=benchmark"', warn=True
         )
 
         LOGGER.info(
             f'curl -X POST "http://localhost:8081/models?url={url}&initial_workers={workers}&batch_delay={batch_delay}&batch_size={batch_size}&synchronous=true&model_name=benchmark"'
         )
 
-        time.sleep(5)
+        time.sleep(40)
 
-        assert run_out.return_code == 0, f"Failed to register model {model_name} sourced from url: {url}"
+        if run_out.return_code != 0:
+            LOGGER.error(f"Failed to register model {model_name} sourced from url: {url}")
 
     def unregister_model(self, model_name="benchmark"):
         """
@@ -148,18 +182,23 @@ def unregister_model(self, model_name="benchmark"):
         LOGGER.info(f'curl -X DELETE "http://localhost:8081/models/{model_name}/1.0"')
         LOGGER.info(f"stdout: {run_out.stdout}")
 
-        time.sleep(5)
+        time.sleep(10)
 
         if run_out.return_code == 0:
             LOGGER.error(f"Failed to unregister model {model_name}")
 
-    def stop_torchserve(self, exec_env="local"):
+    def stop_torchserve(self, exec_env="docker", virtual_env_name=None):
        """
        Stops torchserve depending on the exec_env
        :param exec_env: either 'local' or 'docker'
        """
        if exec_env == "docker":
-            self.connection.run(f"docker rm -f ts")
+            self.connection.run(f"docker rm -f ts", warn=True)
+        else:
+            activation_command = ""
+            if virtual_env_name:
+                activation_command = f"cd /home/ubuntu/serve/test/benchmark/tests/resources/neuron-bert && source activate {virtual_env_name} && "
+            self.connection.run(f"{activation_command}torchserve --stop", warn=True)
 
        time.sleep(5)
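Reviewer note (not part of the patch): the snippet below is a minimal sketch of how the new bare-metal flow added above might be driven — `setup_torchserve` / `start_torchserve_local` / `register_model` / `stop_torchserve`, plus the new `Report` class for collating results. It assumes a Fabric-style connection object passed as a `connection` keyword argument, placeholder virtual-env and S3 names, and import paths inferred from the file locations in this diff; none of those details are confirmed by the patch itself.

```python
# Illustrative usage sketch only. Assumed/hypothetical: the `connection` kwarg,
# the virtual env name, the S3 URI, and the exact import paths.
from fabric2 import Connection

from tests.utils.ts import TorchServeHandler   # path assumed from test/benchmark/tests/utils/ts.py
from tests.utils.report import Report          # module path assumed for the Report class above

connection = Connection(host="ec2-benchmark-host", user="ubuntu")

handler = TorchServeHandler(exec_env="local", connection=connection)
handler.setup_torchserve(virtual_env_name="benchmark-venv")        # installs from source if torchserve is missing
handler.start_torchserve_local(virtual_env_name="benchmark-venv")  # stops any running instance first
handler.register_model(url="bert.mar", workers=4, batch_delay=200, batch_size=8)

# ... run the benchmark workload here ...

handler.unregister_model()
handler.stop_torchserve(exec_env="local", virtual_env_name="benchmark-venv")

# Collate the per-run ab_report CSVs uploaded to S3 into a single report.md
report = Report()
report.download_benchmark_results_from_s3("s3://my-benchmark-bucket/run-id")
report.generate_comprehensive_report()
```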