Llama2 chat deployment in kserve #5

Open · wants to merge 4 commits into main

14 changes: 14 additions & 0 deletions .gitlab-ci.yml
@@ -52,3 +52,17 @@ mnsist-katib-build:
  rules:
    - changes:
        - images/minimal-mnist/**/*

llama2-chat-predictor-build:
  stage: build-image
  script:
    - >
      /kaniko/executor
      --context "${CI_PROJECT_DIR}/images/llama2-chat-predictor/"
      --dockerfile "${CI_PROJECT_DIR}/images/llama2-chat-predictor/Dockerfile"
      --destination "${CI_REGISTRY}/${CI_PROJECT_PATH}/llama2-chat-predictor:latest"
      --destination "${CI_REGISTRY}/${CI_PROJECT_PATH}/llama2-chat-predictor:${CI_COMMIT_SHORT_SHA}"

  rules:
    - changes:
        - images/llama2-chat-predictor/**/*
10 changes: 10 additions & 0 deletions images/llama2-chat-predictor/Dockerfile
@@ -0,0 +1,10 @@
FROM nvcr.io/nvidia/pytorch:23.09-py3
RUN apt-get update && apt-get install -y libglib2.0-0
RUN mkdir /app
WORKDIR /app
COPY pyproject.toml /app/
COPY llama2_chat /app/llama2_chat
RUN python -m pip install .
ADD main.py ./main.py
ENV PORT=8080
EXPOSE 8080
76 changes: 76 additions & 0 deletions images/llama2-chat-predictor/README.md
@@ -0,0 +1,76 @@
# Llama2 Deployment

To deploy the Llama2 model, we will create a custom predictor using KServe.
This directory holds all the code as well as the Dockerfile used to build
the image for the Llama2 deployment in KServe.
The steps for the actual deployment in KServe can also be found in this repository
(`/serving/llama2-chat`).

If you are new to creating custom predictors with KServe, you may read more about it in the [KServe documentation](https://kserve.github.io/website/0.8/modelserving/v1beta1/custom/custom_model/).

For a gentle introduction and detailed guide on working with Llama2, consider reading this blog post: [How to prompt Llama](https://replicate.com/blog/how-to-prompt-llama).

## Hugging Face Token

You'll need a Hugging Face token so the `transformers` library can download the
model weights. Additionally, you'll need to ask Meta for permission to use the
Llama model itself. You can read more about that [here](https://huggingface.co/blog/llama2).

The code expects an environment variable named `HF_ACCESS_TOKEN` that holds the token as a string.

### Setting the Variable Locally

To set this variable for a local deployment, run:
```sh
export HF_ACCESS_TOKEN=<example-token>
```

## Run/develop Locally

To run the predictor locally, first install the Python dependencies and then start
the server. From this directory, run:
```sh
poetry install
export HF_ACCESS_TOKEN=<example-token>
poetry run python main.py --name "llama2-chat" --device "cpu" --hf-model-string "meta-llama/Llama-2-13b-chat-hf"
```

### Run Query Against Local Deployment

Once the server is up and running, you might try something like:
```sh
curl localhost:8080/v1/models/llama2-chat:predict -d '{"top_k": 3, "max_length": 200, "instances": ["Why is MLOps so important?"]}'
```
We decided to let the user set the inference options `top_k` (the number of
highest-scoring tokens to sample from) and `max_length` in the API query, along with the prompt.
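
The same request can also be sent from Python. The snippet below is a minimal
sketch using the `requests` package (not a project dependency), pointed at the
local server started above:
```python
# Minimal sketch: query the local predictor from Python.
# Assumes the server from the previous section is running on localhost:8080
# and that the `requests` package is installed.
import requests

payload = {
    "top_k": 3,
    "max_length": 200,
    "instances": ["Why is MLOps so important?"],
}
response = requests.post(
    "http://localhost:8080/v1/models/llama2-chat:predict",
    json=payload,
    timeout=600,  # CPU generation can take a while
)
print(response.status_code)
print(response.text)  # the generated text returned by the predictor
```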

## Build and Push Image

### Using Docker
To build the image using Docker, run:
```sh
docker build -t <your-image-registry-name>/llama2-chat:<TAG> .
```

To use this image in KServe, we'll also need to push it to an image registry.
First, log in:
```sh
docker login <example.registry.com>:4567 -u <token name> -p <token>
```

Note: This may require setting up a project access token in the GitLab UI first.
Navigate to: GitLab UI -> this Repository -> Settings -> Access Token -> Add a project access token.

Now, you can push the image using Docker:
```sh
docker push <your-image-registry-name>/llama2-chat:<TAG>
```

### Using GitLab Runner with Kaniko
Automated build: trigger a build automatically by committing a change to
any file in or under the directory where this README lives. This activates
a GitLab CI/CD pipeline that builds the image with Kaniko. The built image receives two tags:

- `latest`: always points to the most recent build.
- `<SHORT_COMMIT_HASH>`: associates the image with the specific Git commit that triggered the build.
Empty file.
97 changes: 97 additions & 0 deletions images/llama2-chat-predictor/llama2_chat/predictor.py
@@ -0,0 +1,97 @@
from transformers import AutoTokenizer
import transformers
import torch
import os
import json
from loguru import logger
from huggingface_hub import login
from typing import Dict, Union
from kserve import (
    Model,
    InferRequest,
    InferResponse,
)


class Llama2ChatPredictor(Model):
    def __init__(
        self,
        name: str,
        hf_model_string: str = "meta-llama/Llama-2-7b-chat-hf",
        loguru_loglevel: str = "INFO",
        device: Union[int, str] = "auto",
    ):
        """
        Initialize the Llama2ChatPredictor.

        Parameters:
            name: The name of the model.
            hf_model_string: The identifier of the Hugging Face model.
            loguru_loglevel: Logging level for loguru.
            device: The device to run the model on (0, 1, ..., 'cpu', 'cuda').
        """
        super().__init__(name=name)
        os.environ["LOGURU_LEVEL"] = loguru_loglevel
        self.device = device
        self.hf_model_string = hf_model_string
        self.load()
        self.ready = True

    def load(self):
        """
        Load the model weights and tokenizer from Hugging Face and create the pipeline.
        """
        login(token=os.environ["HF_ACCESS_TOKEN"])
        logger.info(f"Loading weights and tokenizer for model: {self.hf_model_string}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model_string)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.pipeline = transformers.pipeline(
            "text-generation",
            model=self.hf_model_string,
            torch_dtype=torch.float32
            if not torch.cuda.is_available()
            else torch.float16,
            device=self.device,
        )

    def preprocess(
        self,
        payload: Union[bytes, InferRequest],
        headers: Dict[str, str] = None,
    ) -> Dict:
        """
        Preprocess the payload into the format expected by the predict method.

        Parameters:
            payload: The input payload, as a bytes string or InferRequest.
            headers: Additional headers (unused here, but expected in the base class).

        Returns:
            The payload as a dict.
        """
        return payload if isinstance(payload, dict) else json.loads(payload)

    def predict(
        self,
        payload: Dict,
        headers: Dict[str, str] = None,
    ) -> Union[str, InferResponse]:
        """
        Generate predictions using the input payload.

        Parameters:
            payload: The input payload, preprocessed to dict format.
            headers: Additional headers (unused here, but expected in the base class).

        Returns:
            A string containing the generated text.
        """
        sequences = self.pipeline(
            f"{payload['instances']}\n",
            do_sample=True,
            top_k=payload["top_k"],
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            max_length=payload["max_length"],
        )
        return sequences[0]["generated_text"]
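
For quick local testing, the predictor class can also be exercised directly,
without starting the model server. A minimal sketch (assuming `HF_ACCESS_TOKEN`
is exported and access to the gated Llama2 weights has been granted):
```python
# Minimal sketch: drive Llama2ChatPredictor directly, bypassing ModelServer.
# Assumes HF_ACCESS_TOKEN is exported and the Llama-2-7b-chat-hf weights
# can be downloaded (gated access granted on Hugging Face).
from llama2_chat.predictor import Llama2ChatPredictor

predictor = Llama2ChatPredictor(
    name="llama2-chat",
    hf_model_string="meta-llama/Llama-2-7b-chat-hf",
    device="cpu",
)
payload = {
    "top_k": 3,
    "max_length": 200,
    "instances": ["Why is MLOps so important?"],
}
print(predictor.predict(predictor.preprocess(payload)))
```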
47 changes: 47 additions & 0 deletions images/llama2-chat-predictor/main.py
@@ -0,0 +1,47 @@
import click
import torch
from typing import Union
from loguru import logger
from kserve import (
    ModelServer,
)
from llama2_chat.predictor import Llama2ChatPredictor


@click.command()
@click.option("--name", type=str, default="llama2-chat", help="Model name.")
@click.option("--loglevel", type=str, default="INFO", help="Log level for loguru.")
@click.option(
    "--hf-model-string",
    type=str,
    default="meta-llama/Llama-2-13b-chat-hf",
    help="Identifier of the model on the Hugging Face Hub.",
)
@click.option(
    "--device",
    default="cpu",
    help="Device (0, 1, ..., 'cpu', 'cuda', 'mps').",
)
def main(name: str, loglevel: str, hf_model_string: str, device: Union[str, int]):
    """
    Main function to initiate and start the KServe Model Server.

    Parameters:
        name: The name of the model.
        loglevel: The logging level for loguru.
        hf_model_string: Identifier for the Hugging Face model.
        device: Computational device to utilize (0, 1, 2, ..., 'cpu', 'cuda', 'mps').
    """
    logger.info(f"CUDA is available? -> {torch.cuda.is_available()}")
    device = int(device) if device not in ("cpu", "cuda", "mps") else device
    kserve_model = Llama2ChatPredictor(
        name=name,
        hf_model_string=hf_model_string,
        loguru_loglevel=loglevel,
        device=device,
    )
    ModelServer().start([kserve_model])


if __name__ == "__main__":
    main()