diff --git a/.github/workflows/test-pr.yml b/.github/workflows/test-pr.yml index 2b16474..dec9f48 100644 --- a/.github/workflows/test-pr.yml +++ b/.github/workflows/test-pr.yml @@ -66,7 +66,7 @@ jobs: with: cluster_name: ${{ env.CLUSTER_NAME }} - # NOTE(scott): Since the local Chart.yaml uses "appVersion: latest" and this + # NOTE(scott): Since the local Chart.yaml uses "appVersion: latest" and this # only gets overwritten to the correct commit SHA during Helm chart build, # we need to pull these published images and load them into the kind cluster # with the tag correct tag. diff --git a/charts/azimuth-chat/ci/ui-only-values.yaml b/charts/azimuth-chat/ci/test-values.yaml similarity index 50% rename from charts/azimuth-chat/ci/ui-only-values.yaml rename to charts/azimuth-chat/ci/test-values.yaml index b66347d..98e3530 100644 --- a/charts/azimuth-chat/ci/ui-only-values.yaml +++ b/charts/azimuth-chat/ci/test-values.yaml @@ -1,16 +1,24 @@ azimuth-llm: + huggingface: + # Use the smallest LLM we can find + model: &model HuggingFaceTB/SmolLM2-135M-Instruct api: - enabled: false + # CI Kind cluster doesn't have kube-prometheus-stack + monitoring: + enabled: false + # No GPUs in CI runners + gpus: 0 ui: service: zenith: enabled: false appSettings: + model_name: *model # Verify that we can set non-standard LLM params llm_params: max_tokens: 101 temperature: 0.1 + top_k: 2 top_p: 0.15 - top_k: 1 presence_penalty: 0.9 frequency_penalty: 1 diff --git a/charts/azimuth-image-analysis/ci/test-values.yaml b/charts/azimuth-image-analysis/ci/test-values.yaml new file mode 100644 index 0000000..a1ca32f --- /dev/null +++ b/charts/azimuth-image-analysis/ci/test-values.yaml @@ -0,0 +1,23 @@ +azimuth-llm: + huggingface: + # Use the smallest vision model we can find + model: &model HuggingFaceTB/SmolVLM-256M-Instruct + api: + # CI Kind cluster doesn't have kube-prometheus-stack + monitoring: + enabled: false + # No GPUs in CI runners + gpus: 0 + ui: + service: + zenith: + enabled: false + 
appSettings: + model_name: *model + # Verify that we can set non-standard LLM params + llm_params: + max_tokens: 10 # Constrain response tokens to speed up CI test + temperature: 0.1 + top_p: 0.15 + presence_penalty: 0.9 + frequency_penalty: 1 diff --git a/charts/azimuth-image-analysis/ci/ui-only-values.yaml b/charts/azimuth-image-analysis/ci/ui-only-values.yaml deleted file mode 100644 index 96f716d..0000000 --- a/charts/azimuth-image-analysis/ci/ui-only-values.yaml +++ /dev/null @@ -1,15 +0,0 @@ -azimuth-llm: - api: - enabled: false - ui: - service: - zenith: - enabled: false - appSettings: - # Verify that we can set non-standard LLM params - llm_params: - max_tokens: 101 - temperature: 0.1 - top_p: 0.15 - presence_penalty: 0.9 - frequency_penalty: 1 diff --git a/charts/azimuth-llm/ci/default-values.yaml b/charts/azimuth-llm/ci/default-values.yaml new file mode 100644 index 0000000..40dbbde --- /dev/null +++ b/charts/azimuth-llm/ci/default-values.yaml @@ -0,0 +1,18 @@ +# This is intended to test the default chart values +# as close as possible given the constraints of running +# inside a Kind cluster within a CI runner +huggingface: + # Use the smallest LLM we can find + model: &model HuggingFaceTB/SmolLM2-135M-Instruct +api: + # CI Kind cluster doesn't have kube-prometheus-stack + monitoring: + enabled: false + # No GPUs in CI runners + gpus: 0 +ui: + service: + zenith: + enabled: false + appSettings: + model_name: *model diff --git a/charts/azimuth-llm/ci/no-api-values.yaml b/charts/azimuth-llm/ci/no-api-values.yaml deleted file mode 100644 index e455af3..0000000 --- a/charts/azimuth-llm/ci/no-api-values.yaml +++ /dev/null @@ -1,6 +0,0 @@ -api: - enabled: false -ui: - service: - zenith: - enabled: false diff --git a/charts/azimuth-llm/templates/api/deployment.yml b/charts/azimuth-llm/templates/api/deployment.yml index dc41a5f..e3b37fe 100644 --- a/charts/azimuth-llm/templates/api/deployment.yml +++ b/charts/azimuth-llm/templates/api/deployment.yml @@ -19,7 
+19,8 @@ spec: spec: containers: - name: {{ .Release.Name }}-api - image: {{ printf "%s:%s" .Values.api.image.repository .Values.api.image.version }} + {{ $imageRepo := .Values.api.image.repository | default (ternary "ghcr.io/stackhpc/vllm-cpu" "vllm/vllm-openai" (eq (.Values.api.gpus | int) 0)) -}} + image: {{ printf "%s:%s" $imageRepo .Values.api.image.version }} ports: - name: api containerPort: 8000 @@ -29,7 +30,7 @@ spec: args: - --model - {{ .Values.huggingface.model }} - {{- include "azimuth-llm.chatTemplate" . | nindent 10 }} + {{- include "azimuth-llm.chatTemplate" . | nindent 10 -}} {{- if .Values.api.modelMaxContextLength -}} - --max-model-len - {{ .Values.api.modelMaxContextLength | quote }} @@ -41,7 +42,7 @@ spec: {{- if .Values.api.extraArgs -}} {{- .Values.api.extraArgs | toYaml | nindent 10 }} {{- end -}} - {{- if .Values.huggingface.secretName }} + {{- if .Values.huggingface.secretName -}} envFrom: - secretRef: name: {{ .Values.huggingface.secretName }} diff --git a/charts/azimuth-llm/templates/api/ingress.yml b/charts/azimuth-llm/templates/api/ingress.yml index 14c13b1..d0ba22c 100644 --- a/charts/azimuth-llm/templates/api/ingress.yml +++ b/charts/azimuth-llm/templates/api/ingress.yml @@ -16,7 +16,7 @@ spec: pathType: Prefix backend: service: - name: {{ .Values.api.service.name }} + name: {{ .Release.Name }}-api port: # Must match Service resource number: 80 diff --git a/charts/azimuth-llm/templates/api/service.yml b/charts/azimuth-llm/templates/api/service.yml index e4e31c6..0400545 100644 --- a/charts/azimuth-llm/templates/api/service.yml +++ b/charts/azimuth-llm/templates/api/service.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: Service metadata: - name: {{ .Values.api.service.name }} + name: {{ .Release.Name }}-api labels: {{- include "azimuth-llm.api-selectorLabels" . 
| nindent 4 }} spec: diff --git a/charts/azimuth-llm/templates/api/zenith-client.yml b/charts/azimuth-llm/templates/api/zenith-client.yml index ae20b2a..d2832bc 100644 --- a/charts/azimuth-llm/templates/api/zenith-client.yml +++ b/charts/azimuth-llm/templates/api/zenith-client.yml @@ -8,7 +8,7 @@ metadata: spec: reservationName: {{ .Release.Name }}-api upstream: - serviceName: {{ .Values.api.service.name }} + serviceName: {{ .Release.Name }}-api auth: skip: {{ .Values.api.service.zenith.skipAuth }} {{- end -}} diff --git a/charts/azimuth-llm/templates/test/end-to-end.yml b/charts/azimuth-llm/templates/test/end-to-end.yml index e94d4d2..6aecf38 100644 --- a/charts/azimuth-llm/templates/test/end-to-end.yml +++ b/charts/azimuth-llm/templates/test/end-to-end.yml @@ -10,10 +10,9 @@ spec: spec: containers: - name: gradio-client-test - {{- /* - Use the chat image since we know this contains the gradio_client package - */}} - image: {{ printf "ghcr.io/stackhpc/azimuth-llm-chat-ui:%s" (default .Chart.AppVersion .Values.ui.image.tag) }} + # Assumes that one of the in-repo Gradio apps is used and that + # the app includes a `gradio-client-test.py` script. + image: {{ printf "%s:%s" .Values.ui.image.repository (default .Chart.AppVersion .Values.ui.image.tag) }} imagePullPolicy: IfNotPresent command: - python @@ -21,10 +20,13 @@ spec: {{- if .Values.ingress.ui.enabled }} - {{ .Values.ingress.host }}{{ .Values.ingress.ui.path }} {{- else }} - - http://{{ .Values.ui.service.name }}.{{ .Release.Namespace }}.svc + - http://{{ .Release.Name }}-ui.{{ .Release.Namespace }}.svc {{- end }} + env: + - name: PYTHONUNBUFFERED + value: "1" + tty: true # Make stdout from python visible in k8s logs restartPolicy: Never - # Allow plenty of retries since downloading - # model weights can take a long time. 
- backoffLimit: 10 + # Handle retries within gradio-client-test script + backoffLimit: 1 {{- end -}} diff --git a/charts/azimuth-llm/templates/test/web-app.yml b/charts/azimuth-llm/templates/test/web-app.yml index 65dbb90..3193fb3 100644 --- a/charts/azimuth-llm/templates/test/web-app.yml +++ b/charts/azimuth-llm/templates/test/web-app.yml @@ -18,7 +18,7 @@ spec: {{- if .Values.ingress.ui.enabled }} - {{ .Values.ingress.host | trimPrefix "http://" | trimPrefix "https://" }}{{ .Values.ingress.ui.path }} {{- else }} - - {{ .Values.ui.service.name }}.{{ .Release.Namespace }}.svc + - {{ .Release.Name }}-ui.{{ .Release.Namespace }}.svc {{- end }} - "80" restartPolicy: Never diff --git a/charts/azimuth-llm/templates/ui/app-config-map.yml b/charts/azimuth-llm/templates/ui/app-config-map.yml index e77da18..7a7b9e6 100644 --- a/charts/azimuth-llm/templates/ui/app-config-map.yml +++ b/charts/azimuth-llm/templates/ui/app-config-map.yml @@ -7,5 +7,6 @@ metadata: {{- include "azimuth-llm.labels" . | nindent 4 }} data: overrides.yml: | + {{- $_ := set .Values.ui.appSettings "backend_url" (printf "http://%s-api.%s.svc" .Release.Name .Release.Namespace) }} {{- .Values.ui.appSettings | toYaml | nindent 4 }} {{- end -}} diff --git a/charts/azimuth-llm/templates/ui/ingress.yml b/charts/azimuth-llm/templates/ui/ingress.yml index e9e6e02..c2cc8be 100644 --- a/charts/azimuth-llm/templates/ui/ingress.yml +++ b/charts/azimuth-llm/templates/ui/ingress.yml @@ -16,7 +16,7 @@ spec: pathType: Prefix backend: service: - name: {{ .Values.ui.service.name }} + name: {{ .Release.Name }}-ui port: # Must match Service resource number: 80 diff --git a/charts/azimuth-llm/templates/ui/service.yml b/charts/azimuth-llm/templates/ui/service.yml index 1de5c17..c199a1d 100644 --- a/charts/azimuth-llm/templates/ui/service.yml +++ b/charts/azimuth-llm/templates/ui/service.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: Service metadata: - name: {{ .Values.ui.service.name }} + name: {{ .Release.Name }}-ui labels: {{- 
include "azimuth-llm.labels" . | nindent 4 }} spec: diff --git a/charts/azimuth-llm/templates/ui/ui-zenith-client.yml b/charts/azimuth-llm/templates/ui/ui-zenith-client.yml index cbdbe95..ce4b5ad 100644 --- a/charts/azimuth-llm/templates/ui/ui-zenith-client.yml +++ b/charts/azimuth-llm/templates/ui/ui-zenith-client.yml @@ -9,7 +9,7 @@ metadata: spec: reservationName: {{ .Release.Name }}-ui upstream: - serviceName: {{ .Values.ui.service.name }} + serviceName: {{ .Release.Name }}-ui auth: skip: {{ .Values.ui.service.zenith.skipAuth }} {{- end -}} diff --git a/charts/azimuth-llm/values.yaml b/charts/azimuth-llm/values.yaml index b311c9f..e693bbf 100644 --- a/charts/azimuth-llm/values.yaml +++ b/charts/azimuth-llm/values.yaml @@ -33,7 +33,9 @@ api: enabled: true # Container image config image: - repository: vllm/vllm-openai + # Defaults to vllm/vllm-openai when api.gpus > 0 + # or ghcr.io/stackhpc/vllm-cpu when api.gpus == 0 + repository: version: v0.7.2 monitoring: enabled: true @@ -41,7 +43,6 @@ api: replicas: 1 # Service config service: - name: llm-backend type: ClusterIP zenith: enabled: false @@ -56,8 +57,7 @@ api: hostPath: path: /tmp/llm/huggingface-cache # Number of gpus to requests for each api pod instance - # NOTE: This must be in the range 1 <= value <= N, where - # 'N' is the number of GPUs available in a single + # NOTE: This must not exceed the number of GPUs available in a single # worker node on the target Kubernetes cluster. # NOTE: According to the vLLM docs found here # https://docs.vllm.ai/en/latest/serving/distributed_serving.html @@ -83,7 +83,7 @@ ui: # Container image config image: repository: ghcr.io/stackhpc/azimuth-llm-chat-ui - tag: # Defaults to chart's appVersion + tag: # Defaults to chart's appVersion imagePullPolicy: # The settings to be passed to the frontend web app. # Format depends on the chosen UI image above. 
For each of the UIs @@ -105,7 +105,6 @@ ui: - Arial # Service config service: - name: web-app type: ClusterIP zenith: enabled: true diff --git a/ct.yaml b/ct.yaml index f5fada9..3fa795d 100644 --- a/ct.yaml +++ b/ct.yaml @@ -6,3 +6,6 @@ validate-maintainers: false all: true # Split output to make it look nice in GitHub Actions tab github-groups: true +# Allow for long running install and test processes +# (e.g. downloading container images and model weights) +helm-extra-args: --timeout 1200s diff --git a/web-apps/chat/app.py b/web-apps/chat/app.py index df369ec..cee141f 100644 --- a/web-apps/chat/app.py +++ b/web-apps/chat/app.py @@ -61,7 +61,6 @@ class PossibleSystemPromptException(Exception): streaming=True, ) - def inference(latest_message, history): # Allow mutating global variable global BACKEND_INITIALISED diff --git a/web-apps/chat/defaults.yml b/web-apps/chat/defaults.yml index b0260a9..8a18976 100644 --- a/web-apps/chat/defaults.yml +++ b/web-apps/chat/defaults.yml @@ -30,6 +30,6 @@ theme_params: {} theme_params_extended: {} # Additional CSS and JS overrides -# See https://www.gradio.app/guides/custom-CSS-and-JS +# See https://www.gradio.app/guides/custom-CSS-and-JS css_overrides: custom_javascript: diff --git a/web-apps/chat/gradio-client-test.py b/web-apps/chat/gradio-client-test.py index ddf245c..723852d 100644 --- a/web-apps/chat/gradio-client-test.py +++ b/web-apps/chat/gradio-client-test.py @@ -1,7 +1,21 @@ import sys +import time + from gradio_client import Client gradio_host = sys.argv[1] -client = Client(gradio_host) -result = client.predict("Hi", api_name="/chat") -print(result) + +retries = 60 +for n in range(1, retries+1): + try: + client = Client(gradio_host) + result = client.predict("Hi", api_name="/chat") + print(result) + break + except Exception as err: + msg = f"Attempt {n} / {retries} encounter error: {err}" + if n < retries: + print(msg, "- waiting 10 seconds before retrying") + time.sleep(10) + else: + sys.exit(f"{msg} - no more retries 
left") diff --git a/web-apps/image-analysis/gradio-client-test.py b/web-apps/image-analysis/gradio-client-test.py new file mode 100644 index 0000000..56e333e --- /dev/null +++ b/web-apps/image-analysis/gradio-client-test.py @@ -0,0 +1,25 @@ +import sys +import time + +from gradio_client import Client + +gradio_host = sys.argv[1] + +retries = 60 +for n in range(1, retries+1): + try: + client = Client(gradio_host) + result = client.predict( + image_url="https://media.licdn.com/dms/image/v2/D4D0BAQHyxNra6_PoUQ/company-logo_200_200/company-logo_200_200/0/1704365018113/stackhpc_ltd_logo?e=1747872000&v=beta&t=Ed3-KZS-sHlg-ne1KC0YjI4Ez7yVvJzWr103nm5eVK0", + prompt="Hi", + api_name="/predict" + ) + print(result) + break + except Exception as err: + msg = f"Attempt {n} / {retries} encounter error: {err}" + if n < retries: + print(msg, "- waiting 10 seconds before retrying") + time.sleep(10) + else: + sys.exit(f"{msg} - no more retries left") diff --git a/web-apps/utils/utils.py b/web-apps/utils/utils.py index 28e7a90..cb99776 100644 --- a/web-apps/utils/utils.py +++ b/web-apps/utils/utils.py @@ -43,28 +43,6 @@ class LLMParams(BaseModel): model_config = ConfigDict(extra="forbid") -NAMESPACE_FILE_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" - - -def get_k8s_namespace(): - try: - current_k8s_namespace = open(NAMESPACE_FILE_PATH).read() - return current_k8s_namespace - except FileNotFoundError: - return None - - -def api_address_in_cluster(): - k8s_ns = get_k8s_namespace() - if k8s_ns: - return f"http://llm-backend.{k8s_ns}.svc" - else: - log.warning( - "Failed to determine k8s namespace from %s - assuming non-kubernetes environment.", - NAMESPACE_FILE_PATH, - ) - - # Method for loading settings from files def load_yaml(file_path: str) -> dict: with open(file_path, "r") as file: @@ -95,12 +73,9 @@ def load_settings() -> dict: "Please check for typos" ) settings = {**defaults, **overrides} - if "backend_url" not in settings or not settings["backend_url"]: - # Try 
to detect in-cluster address - in_cluster_backend = api_address_in_cluster() - if not in_cluster_backend: - raise Exception( - "Backend URL must be provided in settings when running outside of Kubernetes." - ) - settings["backend_url"] = in_cluster_backend + + if "backend_url" not in settings: + raise Exception( + "Backend URL must be provided in settings when running outside of Kubernetes." + ) return settings