2 changes: 1 addition & 1 deletion .github/workflows/test-pr.yml
@@ -66,7 +66,7 @@ jobs:
with:
cluster_name: ${{ env.CLUSTER_NAME }}

# NOTE(scott): Since the local Chart.yaml uses "appVersion: latest" and this
# NOTE(scott): Since the local Chart.yaml uses "appVersion: latest" and this
# only gets overwritten to the correct commit SHA during Helm chart build,
# we need to pull these published images and load them into the kind cluster
# with the correct tag.
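For context, a hedged sketch of what pulling and side-loading those images amounts to (the actual workflow step sits below the fold of this diff; the image list and cluster name here are placeholders, not values taken from the workflow):

import subprocess

# Hypothetical stand-ins: the real step knows the full image list and reads the
# cluster name from the CLUSTER_NAME environment variable used above.
images = ["ghcr.io/stackhpc/azimuth-llm-chat-ui:latest"]
cluster_name = "kind-ci"

for image in images:
    # Pull the published image locally, then load it onto the kind cluster's nodes
    # so the in-cluster deployment finds it under the expected tag.
    subprocess.run(["docker", "pull", image], check=True)
    subprocess.run(["kind", "load", "docker-image", image, "--name", cluster_name], check=True)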
@@ -1,16 +1,24 @@
azimuth-llm:
huggingface:
# Use the smallest LLM we can find
model: &model HuggingFaceTB/SmolLM2-135M-Instruct
api:
enabled: false
# CI Kind cluster doesn't have kube-prometheus-stack
monitoring:
enabled: false
# No GPUs in CI runners
gpus: 0
ui:
service:
zenith:
enabled: false
appSettings:
model_name: *model
# Verify that we can set non-standard LLM params
llm_params:
max_tokens: 101
temperature: 0.1
top_k: 2
top_p: 0.15
top_k: 1
presence_penalty: 0.9
frequency_penalty: 1
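To make these non-standard parameters concrete, here is a minimal sketch of the equivalent request against vLLM's OpenAI-compatible API (this is not the UI's actual code path; the endpoint URL, API key, and parameter values are illustrative):

from openai import OpenAI

# Placeholder endpoint: in-cluster this would be the chart's <release>-api service.
client = OpenAI(base_url="http://llm-api.example.svc/v1", api_key="not-required")

response = client.chat.completions.create(
    model="HuggingFaceTB/SmolLM2-135M-Instruct",
    messages=[{"role": "user", "content": "Hi"}],
    max_tokens=101,
    temperature=0.1,
    top_p=0.15,
    presence_penalty=0.9,
    frequency_penalty=1,
    # top_k is not part of the OpenAI schema, so it travels as an extra body field,
    # which vLLM's server accepts as a sampling parameter.
    extra_body={"top_k": 2},
)
print(response.choices[0].message.content)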
23 changes: 23 additions & 0 deletions charts/azimuth-image-analysis/ci/test-values.yaml
@@ -0,0 +1,23 @@
azimuth-llm:
huggingface:
# Use the smallest vision model we can find
model: &model HuggingFaceTB/SmolVLM-256M-Instruct
api:
# CI Kind cluster doesn't have kube-prometheus-stack
monitoring:
enabled: false
# No GPUs in CI runners
gpus: 0
ui:
service:
zenith:
enabled: false
appSettings:
model_name: *model
# Verify that we can set non-standard LLM params
llm_params:
max_tokens: 10 # Constrain response tokens to speed up CI test
temperature: 0.1
top_p: 0.15
presence_penalty: 0.9
frequency_penalty: 1
15 changes: 0 additions & 15 deletions charts/azimuth-image-analysis/ci/ui-only-values.yaml

This file was deleted.

18 changes: 18 additions & 0 deletions charts/azimuth-llm/ci/default-values.yaml
@@ -0,0 +1,18 @@
# This is intended to test the default chart values
# as close as possible given the constraints of running
# inside a Kind cluster within a CI runner
huggingface:
# Use the smallest LLM we can find
model: &model HuggingFaceTB/SmolLM2-135M-Instruct
api:
# CI Kind cluster doesn't have kube-prometheus-stack
monitoring:
enabled: false
# No GPUs in CI runners
gpus: 0
ui:
service:
zenith:
enabled: false
appSettings:
model_name: *model
6 changes: 0 additions & 6 deletions charts/azimuth-llm/ci/no-api-values.yaml

This file was deleted.

7 changes: 4 additions & 3 deletions charts/azimuth-llm/templates/api/deployment.yml
@@ -19,7 +19,8 @@ spec:
spec:
containers:
- name: {{ .Release.Name }}-api
image: {{ printf "%s:%s" .Values.api.image.repository .Values.api.image.version }}
{{ $imageRepo := .Values.api.image.repository | default (ternary "ghcr.io/stackhpc/vllm-cpu" "vllm-project/vllm" (eq (.Values.api.gpus | int) 0)) -}}
image: {{ printf "%s:%s" $imageRepo .Values.api.image.version }}
ports:
- name: api
containerPort: 8000
@@ -29,7 +30,7 @@ spec:
args:
- --model
- {{ .Values.huggingface.model }}
{{- include "azimuth-llm.chatTemplate" . | nindent 10 }}
{{- include "azimuth-llm.chatTemplate" . | nindent 10 -}}
{{- if .Values.api.modelMaxContextLength -}}
- --max-model-len
- {{ .Values.api.modelMaxContextLength | quote }}
@@ -41,7 +42,7 @@
{{- if .Values.api.extraArgs -}}
{{- .Values.api.extraArgs | toYaml | nindent 10 }}
{{- end -}}
{{- if .Values.huggingface.secretName }}
{{- if .Values.huggingface.secretName -}}
envFrom:
- secretRef:
name: {{ .Values.huggingface.secretName }}
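The repository-selection expression introduced above can be read as this plain-Python equivalent (a sketch only; the function name and the assertions are illustrative):

def resolve_api_image_repo(repository, gpus):
    # Mirrors: .Values.api.image.repository | default
    #   (ternary "ghcr.io/stackhpc/vllm-cpu" "vllm-project/vllm" (eq (.Values.api.gpus | int) 0))
    if repository:
        return repository
    return "ghcr.io/stackhpc/vllm-cpu" if int(gpus) == 0 else "vllm-project/vllm"

assert resolve_api_image_repo(None, 0) == "ghcr.io/stackhpc/vllm-cpu"   # CPU-only CI clusters
assert resolve_api_image_repo(None, 1) == "vllm-project/vllm"           # GPU-backed clusters
assert resolve_api_image_repo("registry.example.com/vllm", 0) == "registry.example.com/vllm"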
2 changes: 1 addition & 1 deletion charts/azimuth-llm/templates/api/ingress.yml
@@ -16,7 +16,7 @@ spec:
pathType: Prefix
backend:
service:
name: {{ .Values.api.service.name }}
name: {{ .Release.Name }}-api
port:
# Must match Service resource
number: 80
2 changes: 1 addition & 1 deletion charts/azimuth-llm/templates/api/service.yml
@@ -2,7 +2,7 @@
apiVersion: v1
kind: Service
metadata:
name: {{ .Values.api.service.name }}
name: {{ .Release.Name }}-api
labels:
{{- include "azimuth-llm.api-selectorLabels" . | nindent 4 }}
spec:
2 changes: 1 addition & 1 deletion charts/azimuth-llm/templates/api/zenith-client.yml
@@ -8,7 +8,7 @@ metadata:
spec:
reservationName: {{ .Release.Name }}-api
upstream:
serviceName: {{ .Values.api.service.name }}
serviceName: {{ .Release.Name }}-api
auth:
skip: {{ .Values.api.service.zenith.skipAuth }}
{{- end -}}
18 changes: 10 additions & 8 deletions charts/azimuth-llm/templates/test/end-to-end.yml
@@ -10,21 +10,23 @@ spec:
spec:
containers:
- name: gradio-client-test
{{- /*
Use the chat image since we know this contains the gradio_client package
*/}}
image: {{ printf "ghcr.io/stackhpc/azimuth-llm-chat-ui:%s" (default .Chart.AppVersion .Values.ui.image.tag) }}
# Assumes that one of the in-repo Gradio apps is used and that
# the app includes a `gradio-test-client.py` script.
image: {{ printf "%s:%s" .Values.ui.image.repository (default .Chart.AppVersion .Values.ui.image.tag) }}
imagePullPolicy: IfNotPresent
command:
- python
- gradio-client-test.py
{{- if .Values.ingress.ui.enabled }}
- {{ .Values.ingress.host }}{{ .Values.ingress.ui.path }}
{{- else }}
- http://{{ .Values.ui.service.name }}.{{ .Release.Namespace }}.svc
- http://{{ .Release.Name }}-ui.{{ .Release.Namespace }}.svc
{{- end }}
env:
- name: PYTHONUNBUFFERED
value: "1"
tty: true # Make stdout from python visible in k8s logs
restartPolicy: Never
# Allow plenty of retries since downloading
# model weights can take a long time.
backoffLimit: 10
# Handle retries within gradio-test-client script
backoffLimit: 1
{{- end -}}
2 changes: 1 addition & 1 deletion charts/azimuth-llm/templates/test/web-app.yml
@@ -18,7 +18,7 @@ spec:
{{- if .Values.ingress.ui.enabled }}
- {{ .Values.ingress.host | trimPrefix "http://" | trimPrefix "https://" }}{{ .Values.ingress.ui.path }}
{{- else }}
- {{ .Values.ui.service.name }}.{{ .Release.Namespace }}.svc
- {{ .Release.Name }}-ui.{{ .Release.Namespace }}.svc
{{- end }}
- "80"
restartPolicy: Never
1 change: 1 addition & 0 deletions charts/azimuth-llm/templates/ui/app-config-map.yml
@@ -7,5 +7,6 @@ metadata:
{{- include "azimuth-llm.labels" . | nindent 4 }}
data:
overrides.yml: |
{{- $_ := set .Values.ui.appSettings "backend_url" (printf "http://%s-api.%s.svc" .Release.Name .Release.Namespace) }}
{{- .Values.ui.appSettings | toYaml | nindent 4 }}
{{- end -}}
2 changes: 1 addition & 1 deletion charts/azimuth-llm/templates/ui/ingress.yml
@@ -16,7 +16,7 @@ spec:
pathType: Prefix
backend:
service:
name: {{ .Values.ui.service.name }}
name: {{ .Release.Name }}-ui
port:
# Must match Service resource
number: 80
2 changes: 1 addition & 1 deletion charts/azimuth-llm/templates/ui/service.yml
@@ -2,7 +2,7 @@
apiVersion: v1
kind: Service
metadata:
name: {{ .Values.ui.service.name }}
name: {{ .Release.Name }}-ui
labels:
{{- include "azimuth-llm.labels" . | nindent 4 }}
spec:
2 changes: 1 addition & 1 deletion charts/azimuth-llm/templates/ui/ui-zenith-client.yml
@@ -9,7 +9,7 @@ metadata:
spec:
reservationName: {{ .Release.Name }}-ui
upstream:
serviceName: {{ .Values.ui.service.name }}
serviceName: {{ .Release.Name }}-ui
auth:
skip: {{ .Values.ui.service.zenith.skipAuth }}
{{- end -}}
11 changes: 5 additions & 6 deletions charts/azimuth-llm/values.yaml
@@ -33,15 +33,16 @@ api:
enabled: true
# Container image config
image:
repository: vllm/vllm-openai
# Defaults to vllm/vllm-openai when api.gpus > 0
# or ghcr.io/stackhpc/vllm-cpu when api.gpus == 0
repository:
version: v0.7.2
monitoring:
enabled: true
# The number of replicas for the backend deployment
replicas: 1
# Service config
service:
name: llm-backend
type: ClusterIP
zenith:
enabled: false
@@ -56,8 +57,7 @@ api:
hostPath:
path: /tmp/llm/huggingface-cache
# Number of gpus to request for each api pod instance
# NOTE: This must be in the range 1 <= value <= N, where
# 'N' is the number of GPUs available in a single
# NOTE: This must be less than the number of GPUs available in a single
# worker node on the target Kubernetes cluster.
# NOTE: According to the vLLM docs found here
# https://docs.vllm.ai/en/latest/serving/distributed_serving.html
@@ -83,7 +83,7 @@ ui:
# Container image config
image:
repository: ghcr.io/stackhpc/azimuth-llm-chat-ui
tag: # Defaults to chart's appVersion
tag: # Defaults to chart's appVersion
imagePullPolicy:
# The settings to be passed to the frontend web app.
# Format depends on the chosen UI image above. For each of the UIs
@@ -105,7 +105,6 @@ ui:
- Arial
# Service config
service:
name: web-app
type: ClusterIP
zenith:
enabled: true
3 changes: 3 additions & 0 deletions ct.yaml
@@ -6,3 +6,6 @@ validate-maintainers: false
all: true
# Split output to make it look nice in GitHub Actions tab
github-groups: true
# Allow for long running install and test processes
# (e.g. downloading container images and model weights)
helm-extra-args: --timeout 1200s
1 change: 0 additions & 1 deletion web-apps/chat/app.py
@@ -61,7 +61,6 @@ class PossibleSystemPromptException(Exception):
streaming=True,
)


def inference(latest_message, history):
# Allow mutating global variable
global BACKEND_INITIALISED
2 changes: 1 addition & 1 deletion web-apps/chat/defaults.yml
@@ -30,6 +30,6 @@ theme_params: {}
theme_params_extended: {}

# Additional CSS and JS overrides
# See https://www.gradio.app/guides/custom-CSS-and-JS
# See https://www.gradio.app/guides/custom-CSS-and-JS
css_overrides:
custom_javascript:
20 changes: 17 additions & 3 deletions web-apps/chat/gradio-client-test.py
@@ -1,7 +1,21 @@
import sys
import time

from gradio_client import Client

gradio_host = sys.argv[1]
client = Client(gradio_host)
result = client.predict("Hi", api_name="/chat")
print(result)

retries = 60
for n in range(1, retries+1):
try:
client = Client(gradio_host)
result = client.predict("Hi", api_name="/chat")
print(result)
break
except Exception as err:
msg = f"Attempt {n} / {retries} encountered an error: {err}"
if n < retries:
print(msg, "- waiting 10 seconds before retrying")
time.sleep(10)
else:
print(msg, "- no more retries left")
25 changes: 25 additions & 0 deletions web-apps/image-analysis/gradio-client-test.py
@@ -0,0 +1,25 @@
import sys
import time

from gradio_client import Client

gradio_host = sys.argv[1]

retries = 60
for n in range(1, retries+1):
try:
client = Client(gradio_host)
result = client.predict(
image_url="https://media.licdn.com/dms/image/v2/D4D0BAQHyxNra6_PoUQ/company-logo_200_200/company-logo_200_200/0/1704365018113/stackhpc_ltd_logo?e=1747872000&v=beta&t=Ed3-KZS-sHlg-ne1KC0YjI4Ez7yVvJzWr103nm5eVK0",
prompt="Hi",
api_name="/predict"
)
print(result)
break
except Exception as err:
msg = f"Attempt {n} / {retries} encountered an error: {err}"
if n < retries:
print(msg, "- waiting 10 seconds before retrying")
time.sleep(10)
else:
print(msg, "- no more retries left")
35 changes: 5 additions & 30 deletions web-apps/utils/utils.py
@@ -43,28 +43,6 @@ class LLMParams(BaseModel):
model_config = ConfigDict(extra="forbid")


NAMESPACE_FILE_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"


def get_k8s_namespace():
try:
current_k8s_namespace = open(NAMESPACE_FILE_PATH).read()
return current_k8s_namespace
except FileNotFoundError:
return None


def api_address_in_cluster():
k8s_ns = get_k8s_namespace()
if k8s_ns:
return f"http://llm-backend.{k8s_ns}.svc"
else:
log.warning(
"Failed to determine k8s namespace from %s - assuming non-kubernetes environment.",
NAMESPACE_FILE_PATH,
)


# Method for loading settings from files
def load_yaml(file_path: str) -> dict:
with open(file_path, "r") as file:
@@ -95,12 +73,9 @@ def load_settings() -> dict:
"Please check for typos"
)
settings = {**defaults, **overrides}
if "backend_url" not in settings or not settings["backend_url"]:
# Try to detect in-cluster address
in_cluster_backend = api_address_in_cluster()
if not in_cluster_backend:
raise Exception(
"Backend URL must be provided in settings when running outside of Kubernetes."
)
settings["backend_url"] = in_cluster_backend

if "backend_url" not in settings:
raise Exception(
"Backend URL must be provided in settings when running outside of Kubernetes."
)
return settings
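Taken together with the app-config-map.yml change above, the new settings contract can be sketched as follows (values are illustrative, not the repo's exact code): the chart renders backend_url into overrides.yml, and load_settings simply requires it instead of inferring an in-cluster address from the service-account namespace.

# Sketch only: illustrates the merge that load_settings() performs.
defaults = {"model_name": "HuggingFaceTB/SmolLM2-135M-Instruct"}        # e.g. from defaults.yml
overrides = {"backend_url": "http://my-release-api.my-namespace.svc"}   # rendered by app-config-map.yml
settings = {**defaults, **overrides}
assert "backend_url" in settings   # load_settings() now raises if this key is missing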