pytorch · lxning · Nov 8, 2023 · Aug 1, 2023 · Aug 5, 2023 · Aug 7, 2023
diff --git a/.github/workflows/benchmark_nightly.yml b/.github/workflows/benchmark_nightly.yml
@@ -36,6 +36,8 @@ jobs:
           java-version: '17'
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Install dependencies
         run: |
           sudo apt-get update -y

diff --git a/.github/workflows/ci_cpu.yml b/.github/workflows/ci_cpu.yml
@@ -35,6 +35,8 @@ jobs:
           java-version: '17'
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Install dependencies
         run: |
           python ts_scripts/install_dependencies.py --environment=dev

diff --git a/.github/workflows/ci_gpu.yml b/.github/workflows/ci_gpu.yml
@@ -39,6 +39,8 @@ jobs:
           java-version: '17'
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Install dependencies
         run: |
           python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -34,6 +34,8 @@ jobs:
     steps:
     - name: Checkout repository
       uses: actions/checkout@v3
+      with:
+        submodules: recursive
 
     - name: Setup Python 3.8
       uses: actions/setup-python@v4

diff --git a/.github/workflows/docker-ci.yaml b/.github/workflows/docker-ci.yaml
@@ -17,6 +17,8 @@ jobs:
         python-version: ["3.8", "3.9", "3.10"]
     steps:
       - uses: actions/checkout@v3
+        with:
+          submodules: recursive
 
       - name: Test build_image.sh script with custom tagging and gpu flag
         working-directory: docker

diff --git a/.github/workflows/docker-nightly-build.yml b/.github/workflows/docker-nightly-build.yml
@@ -22,6 +22,8 @@ jobs:
           architecture: x64
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Login to Docker
         env:
           DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}}

diff --git a/.github/workflows/regression_tests_cpu.yml b/.github/workflows/regression_tests_cpu.yml
@@ -34,6 +34,8 @@ jobs:
           java-version: '17'
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Install dependencies
         run: |
           python ts_scripts/install_dependencies.py --environment=dev

diff --git a/.github/workflows/regression_tests_cpu_binaries.yml b/.github/workflows/regression_tests_cpu_binaries.yml
@@ -21,6 +21,8 @@ jobs:
         binaries: ["pypi", "conda"]
     steps:
       - uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Setup conda with Python ${{ matrix.python-version }}
         uses: s-weigand/setup-conda@v1
         with:

diff --git a/.github/workflows/regression_tests_docker.yml b/.github/workflows/regression_tests_docker.yml
@@ -29,6 +29,8 @@ jobs:
           docker system prune --all --volumes -f
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Branch name
         run: |
           echo $GITHUB_REF_NAME

diff --git a/.github/workflows/regression_tests_gpu.yml b/.github/workflows/regression_tests_gpu.yml
@@ -42,6 +42,8 @@ jobs:
           java-version: '17'
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Install dependencies
         run: |
           python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121

diff --git a/.github/workflows/regression_tests_gpu_binaries.yml b/.github/workflows/regression_tests_gpu_binaries.yml
@@ -32,6 +32,8 @@ jobs:
           java-version: '17'
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - uses: conda-incubator/setup-miniconda@v2
         with:
           miniconda-version: "latest"

diff --git a/.github/workflows/torchserve-nightly-build.yml b/.github/workflows/torchserve-nightly-build.yml
@@ -14,6 +14,8 @@ jobs:
       - run: conda install -y conda-build anaconda-client
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Install dependencies
         run: |
           python ts_scripts/install_dependencies.py --environment=dev

diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "third_party/google/rpc"]
+	path = third_party/google/rpc
+	url = https://github.com/googleapis/googleapis.git
diff --git a/docs/grpc_api.md b/docs/grpc_api.md
@@ -38,7 +38,7 @@ cd serve
  - Install gRPC python dependencies
 
 ```bash
-pip install -U grpcio protobuf grpcio-tools
+pip install -U grpcio protobuf grpcio-tools googleapis-common-protos
 ```
 
  - Start torchServe
@@ -51,7 +51,7 @@ torchserve --start --model-store models/
  - Generate python gRPC client stub using the proto files
 
 ```bash
-python -m grpc_tools.protoc --proto_path=frontend/server/src/main/resources/proto/ --python_out=ts_scripts --grpc_python_out=ts_scripts frontend/server/src/main/resources/proto/inference.proto frontend/server/src/main/resources/proto/management.proto
+python -m grpc_tools.protoc -I third_party/google/rpc --proto_path=frontend/server/src/main/resources/proto/ --python_out=ts_scripts --grpc_python_out=ts_scripts frontend/server/src/main/resources/proto/inference.proto frontend/server/src/main/resources/proto/management.proto
 ```
 
  - Register densenet161 model
@@ -95,4 +95,4 @@ def handle(data, context):
         for i in range (3):
             send_intermediate_predict_response(["intermediate_response"], context.request_ids, "Intermediate Prediction success", 200, context)
         return ["hello world "]
-```
+```
diff --git a/docs/images/stateful_batch.jpg b/docs/images/stateful_batch.jpg
diff --git a/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py b/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py
@@ -0,0 +1,140 @@
+import logging
+from abc import ABC
+
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from ts.context import Context
+from ts.torch_handler.base_handler import BaseHandler
+
+logger = logging.getLogger(__name__)
+logger.info("Transformers version %s", transformers.__version__)
+
+
+class LlamaHandler(BaseHandler, ABC):
+    """
+    Transformers handler class for sequence, token classification and question answering.
+    """
+
+    def __init__(self):
+        super(LlamaHandler, self).__init__()
+        self.max_length = None
+        self.max_new_tokens = None
+        self.tokenizer = None
+        self.initialized = False
+
+    def initialize(self, ctx: Context):
+        """In this initialize function, the HF large model is loaded and
+        partitioned using DeepSpeed.
+        Args:
+            ctx (context): It is a JSON Object containing information
+            pertaining to the model artifacts parameters.
+        """
+        model_dir = ctx.system_properties.get("model_dir")
+        self.max_length = int(ctx.model_yaml_config["handler"]["max_length"])
+        self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"])
+        model_name = ctx.model_yaml_config["handler"]["model_name"]
+        model_path = f'{model_dir}/{ctx.model_yaml_config["handler"]["model_path"]}'
+        seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
+        torch.manual_seed(seed)
+
+        logger.info("Model %s loading tokenizer", ctx.model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            device_map="balanced",
+            low_cpu_mem_usage=True,
+            torch_dtype=torch.float16,
+            load_in_8bit=True,
+            trust_remote_code=True,
+        )
+        if ctx.model_yaml_config["handler"]["fast_kernels"]:
+            from optimum.bettertransformer import BetterTransformer
+
+            try:
+                self.model = BetterTransformer.transform(self.model)
+            except RuntimeError as error:
+                logger.warning(
+                    "HuggingFace Optimum is not supporting this model,for the list of supported models, please refer to this doc,https://huggingface.co/docs/optimum/bettertransformer/overview"
+                )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+        logger.info("Model %s loaded successfully", ctx.model_name)
+        self.initialized = True
+
+    def preprocess(self, requests):
+        """
+        Basic text preprocessing, based on the user's choice of application mode.
+        Args:
+            requests (list): A list of dictionaries with a "data" or "body" field, each
+                            containing the input text to be processed.
+        Returns:
+            tuple: A tuple with two tensors: the batch of input ids and the batch of
+                attention masks.
+        """
+        input_texts = [data.get("data") or data.get("body") for data in requests]
+        input_ids_batch, attention_mask_batch = [], []
+        for input_text in input_texts:
+            input_ids, attention_mask = self.encode_input_text(input_text)
+            input_ids_batch.append(input_ids)
+            attention_mask_batch.append(attention_mask)
+        input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.model.device)
+        attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(self.device)
+        return input_ids_batch, attention_mask_batch
+
+    def encode_input_text(self, input_text):
+        """
+        Encodes a single input text using the tokenizer.
+        Args:
+            input_text (str): The input text to be encoded.
+        Returns:
+            tuple: A tuple with two tensors: the encoded input ids and the attention mask.
+        """
+        if isinstance(input_text, (bytes, bytearray)):
+            input_text = input_text.decode("utf-8")
+        logger.info("Received text: '%s'", input_text)
+        inputs = self.tokenizer.encode_plus(
+            input_text,
+            max_length=self.max_length,
+            padding=False,
+            add_special_tokens=True,
+            return_tensors="pt",
+            truncation=True,
+        )
+        input_ids = inputs["input_ids"]
+        attention_mask = inputs["attention_mask"]
+        return input_ids, attention_mask
+
+    def inference(self, input_batch):
+        """
+        Predicts the class (or classes) of the received text using the serialized transformers
+        checkpoint.
+        Args:
+            input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch
+                                of attention masks, as returned by the preprocess function.
+        Returns:
+            list: A list of strings with the predicted values for each input text in the batch.
+        """
+        input_ids_batch, attention_mask_batch = input_batch
+        input_ids_batch = input_ids_batch.to(self.device)
+        outputs = self.model.generate(
+            input_ids_batch,
+            attention_mask=attention_mask_batch,
+            max_length=self.max_new_tokens,
+        )
+
+        inferences = self.tokenizer.batch_decode(
+            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+        logger.info("Generated text: %s", inferences)
+        return inferences
+
+    def postprocess(self, inference_output):
+        """Post Process Function converts the predicted response into Torchserve readable format.
+        Args:
+            inference_output (list): It contains the predicted response of the input text.
+        Returns:
+            (list): Returns a list of the Predictions and Explanations.
+        """
+        return inference_output
diff --git a/examples/stateful/Readme.md b/examples/stateful/Readme.md
@@ -0,0 +1,119 @@
+# Stateful Inference
+
+A stateful model possesses the ability to detect interdependencies between successive inference requests. This type of model maintains a persistent state across inference requests, thereby establishing a linkage between the outcomes of prior inquiries and those that follow. Notable illustrations of stateful models encompass online speech recognition systems, such as the Long Short-Term Memory (LSTM) model. Employing stateful inference mandates that the model server adheres to the sequential order of inference requests, ensuring predictions build upon the previous outcomes.
+
+Within this context, TorchServe offers a mechanism known as sequence batching. This approach involves the retrieval of an individual inference request from a particular sequence, followed by the combination of multiple requests originating from different sequences into a unified batch. Each request is associated with a unique sequence ID, which can be extracted using the `get_sequence_id` function of context.py. This `sequence ID` serves as a key employed by custom handlers to store and retrieve values within the backend cache store, fostering efficient management of stateful inference processes. A client can also reuse the `sequence ID` when a connection resumes, as long as the sequence has not expired on the TorchServe side.
+
+The following picture shows the workflow of stateful inference. A job group has a job queue which stores incoming inference requests from a stream. The max capacity of a job queue is defined by `maxSequenceJobQueueSize`. A sequence batch aggregator polls an inference request from each job group. A batch of requests is then sent to the backend.
+
+![sequence batch](../../docs/images/stateful_batch.jpg)
+
+This example serves as a practical showcase of employing stateful inference. Underneath the surface, the backend leverages an [LRU dictionary](https://github.com/amitdev/lru-dict), functioning as a caching layer. Users can choose a different caching library in the handler implementation based on their own use cases.
+
+### Step 1: Implement handler
+
+`stateful_handler.py` is an example of a stateful handler. It creates a cache `self.cache` by calling [`LRU`](https://github.com/amitdev/lru-dict).
+
+```python
+    def initialize(self, ctx: Context):
+        """
+        Loads the model and Initializes the necessary artifacts
+        """
+
+        super().initialize(ctx)
+        if self.context.model_yaml_config["handler"] is not None:
+            try:
+                self.cache = LRU(
+                    int(self.context.model_yaml_config["handler"]["cache"]["capacity"]))
+            except KeyError:
+                logger.warn("No cache capacity was set! Using default value.")
+                self.cache = LRU(StatefulHandler.DEFAULT_CAPACITY)
+
+        self.initialized = True
+```
+
+The handler uses the sequence ID (i.e., `sequence_id = self.context.get_sequence_id(idx)`) as the key to store and fetch values from `self.cache`.
+
+```python
+    def preprocess(self, data):
+        """
+        Preprocess function to convert the request input to a tensor(Torchserve supported format).
+        The user needs to override to customize the pre-processing
+
+        Args :
+            data (list): List of the data from the request input.
+
+        Returns:
+            tensor: Returns the tensor data of the input
+        """
+
+        self.sequence_ids = {}
+        results = []
+        for idx, row in enumerate(data):
+            sequence_id = self.context.get_sequence_id(idx)
+
+            prev = int(0)
+            if self.cache.has_key(sequence_id):
+                prev = int(self.cache[sequence_id])
+
+            request = row.get("data") or row.get("body")
+            if isinstance(request, (bytes, bytearray)):
+                request = request.decode("utf-8")
+
+            val = prev + int(request)
+            self.cache[sequence_id] = val
+            results.append(val)
+
+        return results
+```
+
+### Step 2: Model configuration
+
+Stateful inference has two parameters. TorchServe is able to process (maxWorkers * batchSize) sequences of inference requests of a model in parallel.
+* sequenceMaxIdleMSec: the maximum idle time, in milliseconds, of a sequence inference request of this stateful model. The default value is 0 (i.e., this is not a stateful model). TorchServe does not process a new inference request of a sequence once its max idle timeout has been exceeded.
+* maxSequenceJobQueueSize: the job queue size of an inference sequence of this stateful model. The default value is 1.
+
+
+```yaml
+#cat model-config.yaml
+
+minWorkers: 2
+maxWorkers: 2
+batchSize: 4
+sequenceMaxIdleMSec: 60000
+maxSequenceJobQueueSize: 10
+
+handler:
+  cache:
+    capacity: 4
+```
+
+### Step 3: Generate mar or tgz file
+
+```bash
+torch-model-archiver --model-name stateful --version 1.0 --model-file model.py --serialized-file model_cnn.pt --handler stateful_handler.py -r requirements.txt --config-file model-config.yaml
+```
+
+### Step 4: Start torchserve
+
+```bash
+torchserve --start --ncs --model-store model_store --models stateful.mar
+```
+
+### Step 5: Build GRPC Client
+The details can be found [here](https://github.com/pytorch/serve/blob/master/docs/grpc_api.md).
+* Install gRPC python dependencies
+* Generate python gRPC client stub using the proto files
+
+### Step 6: Run inference
+* Start TorchServe
+
+```bash
+torchserve --ncs --start --model-store models --model stateful.mar --ts-config config.properties
+```
+
+* Run sequence inference
+```bash
+cd ../../
+python ts_scripts/torchserve_grpc_client.py  infer_stream2 stateful seq_0 examples/stateful/sample/sample1.txt,examples/stateful/sample/sample2.txt,examples/stateful/sample/sample3.txt
+```