From 60d48cde96005326ed05a4a00146c70fd6e88b5f Mon Sep 17 00:00:00 2001
From: sd109
Date: Tue, 30 Jan 2024 14:23:01 +0000
Subject: [PATCH 1/6] Handle required Mistral chat format explicitly

---
 chart/web-app/app.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/chart/web-app/app.py b/chart/web-app/app.py
index 6fe59cd..7c625ad 100644
--- a/chart/web-app/app.py
+++ b/chart/web-app/app.py
@@ -1,5 +1,6 @@
 import requests
 import warnings
+import re
 import rich
 import gradio as gr
 from urllib.parse import urljoin
@@ -17,6 +18,16 @@
 backend_health_endpoint = urljoin(backend_url, "/health")
 backend_initialised = False
 
+# NOTE(sd109): The Mistral family of models explicitly requires a chat
+# history of the form user -> ai -> user -> ... and so doesn't like having
+# a SystemPrompt at the beginning. Since these models seem to be the
+# best around right now, it makes sense to treat them as special and make
+# sure the web app works correctly with them. To do so, we detect when a
+# Mistral model is specified using this regex and then handle it explicitly
+# when constructing the `context` list in the `inference` function below.
+MISTRAL_REGEX = re.compile(r"mi(s|x)tral", re.IGNORECASE)
+IS_MISTRAL_MODEL = (MISTRAL_REGEX.match(settings.model_name) is not None)
+
 llm = ChatOpenAI(
     base_url=urljoin(backend_url, "v1"),
     model = settings.model_name,
@@ -57,9 +68,17 @@ def inference(latest_message, history):
     try:
-        context = [SystemMessage(content=settings.model_instruction)]
-        for human, ai in history:
-            context.append(HumanMessage(content=human))
+        # To handle Mistral models we have to add the model instruction to
+        # the first user message, since Mistral requires a user -> ai -> user
+        # chat format and therefore doesn't allow system prompts.
+        context = []
+        if not IS_MISTRAL_MODEL:
+            context.append(SystemMessage(content=settings.model_instruction))
+        for i, (human, ai) in enumerate(history):
+            if IS_MISTRAL_MODEL and i == 0:
+                context.append(HumanMessage(content=f"{settings.model_instruction}\n\n{human}"))
+            else:
+                context.append(HumanMessage(content=human))
             context.append(AIMessage(content=ai))
         context.append(HumanMessage(content=latest_message))
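To make the intent of this change concrete, here is a minimal standalone sketch of the `context` list it produces for a Mistral-family model. It is not part of the patch: the instruction text, history, and `is_mistral_model` flag are hypothetical stand-ins for the app's settings, and it assumes the LangChain message classes used by the app are importable from `langchain_core.messages`.

```python
# Minimal sketch of the context-building logic introduced in PATCH 1/6.
# All values below are hypothetical stand-ins for the app's settings.
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

model_instruction = "You are a helpful assistant."  # stands in for settings.model_instruction
history = [("What is vLLM?", "vLLM is an engine for serving LLMs.")]
latest_message = "How do I deploy it on Kubernetes?"
is_mistral_model = True  # what the MISTRAL_REGEX detection would produce

context = []
if not is_mistral_model:
    # Non-Mistral models accept a leading system prompt as before.
    context.append(SystemMessage(content=model_instruction))
for i, (human, ai) in enumerate(history):
    if is_mistral_model and i == 0:
        # Fold the instruction into the first user turn so the conversation
        # stays strictly user -> ai -> user -> ... with no system message.
        context.append(HumanMessage(content=f"{model_instruction}\n\n{human}"))
    else:
        context.append(HumanMessage(content=human))
    context.append(AIMessage(content=ai))
context.append(HumanMessage(content=latest_message))

# For a Mistral model, context is now:
#   [HumanMessage(instruction + first question), AIMessage(answer), HumanMessage(latest)]
```

Note that the fold happens inside the loop over past turns, so the sketch (like the patch) only attaches the instruction when the history is non-empty; the very first message of a conversation is sent without it.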
From 75ca0cd7adebf62c1e0cccf79686323a2fafb3ef Mon Sep 17 00:00:00 2001
From: sd109
Date: Tue, 30 Jan 2024 14:35:25 +0000
Subject: [PATCH 2/6] Fix typo in Notes

---
 chart/templates/NOTES.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chart/templates/NOTES.txt b/chart/templates/NOTES.txt
index d28f460..475f481 100644
--- a/chart/templates/NOTES.txt
+++ b/chart/templates/NOTES.txt
@@ -6,6 +6,6 @@ On deployment of a new model, the app must first download the model's weights fr
 This can take a significant amount of time depending on model choice and network speeds.
 Download progress can be monitored by inspecting the logs for the LLM API pod(s) via the Kubernetes Dashboard for the target cluster.
 
-The app uses [vLLM](https://docs.vllm.ai/en/latest/) as a model serving backend and [gradio](https://github.com/gradio-app/gradio) + [LangChain](https://python.langchain.com/docs/get_started/introduction) to provide the web interface.
+The app uses [vLLM](https://docs.vllm.ai/en/latest/) as a model serving backend and [Gradio](https://github.com/gradio-app/gradio) + [LangChain](https://python.langchain.com/docs/get_started/introduction) to provide the web interface.
 The official list of HuggingFace models supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/models/supported_models.html), though some of these may not be compatible with the LangChain prompt format. See [this documentation](https://github.com/stackhpc/azimuth-llm/) for a non-exhaustive list of language models against which the app has been tested.
\ No newline at end of file

From 0ea1cb79a46cb77c4dc7bdf91fb62eed09961efa Mon Sep 17 00:00:00 2001
From: sd109
Date: Tue, 30 Jan 2024 14:40:31 +0000
Subject: [PATCH 3/6] Add log message for Mistral models

---
 chart/web-app/app.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/chart/web-app/app.py b/chart/web-app/app.py
index 7c625ad..72593d3 100644
--- a/chart/web-app/app.py
+++ b/chart/web-app/app.py
@@ -27,6 +27,8 @@
 # when constructing the `context` list in the `inference` function below.
 MISTRAL_REGEX = re.compile(r"mi(s|x)tral", re.IGNORECASE)
 IS_MISTRAL_MODEL = (MISTRAL_REGEX.match(settings.model_name) is not None)
+if IS_MISTRAL_MODEL:
+    print("Detected Mistral model - will alter LangChain conversation format appropriately.")
 
 llm = ChatOpenAI(
     base_url=urljoin(backend_url, "v1"),

From a24e9ca518317dd31dd493d95ce5d79de47a78b6 Mon Sep 17 00:00:00 2001
From: sd109
Date: Tue, 30 Jan 2024 14:58:36 +0000
Subject: [PATCH 4/6] Add Mistral 7B to README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 50f7bb0..1d418be 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,8 @@ The following is a non-exhaustive list of models which have been tested with thi
 - [Llama 2 7B chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
 - [AWQ Quantized Llama 2 70B](https://huggingface.co/TheBloke/Llama-2-70B-Chat-AWQ)
 - [Magicoder 6.7B](https://huggingface.co/ise-uiuc/Magicoder-S-DS-6.7B)
+- [Mistral 7B Instruct v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+
 
 Due to the combination of [components](##Components) used in this app, some Huggingface models may not work as expected (usually due to the way in which LangChain formats the prompt messages). Any errors when using new model will appear in the pod logs for either the web-app deployment the backend API deployment.

From 69c79173068e75c0e2740757cafb1e7462cc4daf Mon Sep 17 00:00:00 2001
From: sd109
Date: Tue, 30 Jan 2024 14:59:28 +0000
Subject: [PATCH 5/6] Fix Mistral regex pattern

---
 chart/web-app/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chart/web-app/app.py b/chart/web-app/app.py
index 72593d3..0a1ec23 100644
--- a/chart/web-app/app.py
+++ b/chart/web-app/app.py
@@ -25,7 +25,7 @@
 # sure the web app works correctly with them. To do so, we detect when a
 # Mistral model is specified using this regex and then handle it explicitly
 # when constructing the `context` list in the `inference` function below.
-MISTRAL_REGEX = re.compile(r"mi(s|x)tral", re.IGNORECASE)
+MISTRAL_REGEX = re.compile(r".*mi(s|x)tral.*", re.IGNORECASE)
 IS_MISTRAL_MODEL = (MISTRAL_REGEX.match(settings.model_name) is not None)
 if IS_MISTRAL_MODEL:
     print("Detected Mistral model - will alter LangChain conversation format appropriately.")
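To illustrate why the pattern needed widening (this check is not part of the patches, and the first model name below is just a plausible example of a namespaced HuggingFace identifier): `re.match` only matches at the start of the string, so the original anchored pattern missed names where "mistral"/"mixtral" appears after a repository prefix.

```python
import re

OLD_REGEX = re.compile(r"mi(s|x)tral", re.IGNORECASE)
NEW_REGEX = re.compile(r".*mi(s|x)tral.*", re.IGNORECASE)

# Hypothetical example names: only the second begins with "mistral"/"mixtral".
for name in ["TheBloke/Mixtral-8x7B-Instruct-AWQ", "mistralai/Mistral-7B-Instruct-v0.2"]:
    # re.match only succeeds at position 0, so the old pattern misses
    # names with a repository prefix before "mistral"/"mixtral".
    print(name, bool(OLD_REGEX.match(name)), bool(NEW_REGEX.match(name)))

# TheBloke/Mixtral-8x7B-Instruct-AWQ False True
# mistralai/Mistral-7B-Instruct-v0.2 True True
```

Using `OLD_REGEX.search(name)` would have behaved equivalently; wrapping the pattern in `.*` instead keeps the existing `match` call unchanged.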
From 445ec79020e6911760bc0f79a77487c84b30a5d7 Mon Sep 17 00:00:00 2001
From: sd109
Date: Tue, 30 Jan 2024 15:03:57 +0000
Subject: [PATCH 6/6] Update README

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 1d418be..c58ad59 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ This repository contains a Helm chart for deploying Large Language Models (LLMs)
 
 ## Azimuth App
 
-This app ~~is~~ will soon be provided as part of a standard deployment Azimuth so no specific steps are required to use this app other than access to an up to date Azimuth deployment.
+This app ~~is~~ will soon be provided as part of a standard Azimuth deployment, so no specific steps are required to use this app other than access to an up-to-date Azimuth deployment.
 
 ## Manual Deployment
 
@@ -16,7 +16,7 @@ helm repo update
 helm install <release-name> <repo-name>/azimuth-llm --version <version>
 ```
 
-where version is the full published version for the specified commit (e.g. `0.1.0-dev.0.main.125`). To see the latest published version, see [this page](https://github.com/stackhpc/azimuth-llm/tree/gh-pages).
+where `version` is the full name of the published version for the specified commit (e.g. `0.1.0-dev.0.main.125`). To see the latest published version, see [this page](https://github.com/stackhpc/azimuth-llm/tree/gh-pages).
 
 ### Customisation
 
@@ -42,7 +42,7 @@ The following is a non-exhaustive list of models which have been tested with thi
 - [Mistral 7B Instruct v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
 
 
-Due to the combination of [components](##Components) used in this app, some Huggingface models may not work as expected (usually due to the way in which LangChain formats the prompt messages). Any errors when using new model will appear in the pod logs for either the web-app deployment the backend API deployment.
+Due to the combination of [components](##Components) used in this app, some HuggingFace models may not work as expected (usually due to the way in which LangChain formats the prompt messages). Any errors when using a new model will appear in the pod logs for either the web-app deployment or the backend API deployment.
 
 ## Components
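For reference, a hypothetical end-to-end run of the manual deployment commands touched in PATCH 6/6 might look like the following. The release and repo names are made up, the chart repository URL is assumed to be the project's GitHub Pages site, and the version is the example value from the README itself.

```sh
# Hypothetical walkthrough of the README's manual deployment steps.
helm repo add azimuth-llm https://stackhpc.github.io/azimuth-llm/  # assumed chart index URL
helm repo update
# Substitute a real published version from the gh-pages branch listing, e.g.:
helm install my-llm azimuth-llm/azimuth-llm --version 0.1.0-dev.0.main.125
```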