# Multi-Turn Chat Benchmark

```bash
# Create the GKE cluster used for the benchmark.
gcloud container clusters create-auto cluster-1 \
    --location=us-central1

# Install KubeAI from its Helm repository using the GKE values file.
helm repo add kubeai https://www.kubeai.org
helm repo update
# -f: fail on an HTTP error instead of saving the error page as values-gke.yaml.
curl -fLO https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml
# The token expansion is quoted so it always reaches helm as a single argument.
helm upgrade --install kubeai kubeai/kubeai \
    -f values-gke.yaml \
    --set "secrets.huggingface.token=${HUGGING_FACE_HUB_TOKEN}" \
    --set metrics.prometheusOperator.vLLMPodMonitor.enabled=true \
    --set open-webui.enabled=false \
    --wait
```

In [None]:
from subprocess import run, PIPE
import json
from kubernetes import config, dynamic
from kubernetes.client import api_client
from copy import deepcopy

In [None]:
# Build a dynamic Kubernetes client. The ApiClient receives whatever
# config.load_kube_config() returns as its configuration.
kube_api = api_client.ApiClient(configuration=config.load_kube_config())
k8s_client = dynamic.DynamicClient(kube_api)

# Resource handle for the kubeai.org/v1 "Model" kind, used to apply and delete Models.
models_client = k8s_client.resources.get(kind="Model", api_version="kubeai.org/v1")

In [114]:
# Base Model manifest shared by every benchmark run. Each run deep-copies it
# and overlays replica / load-balancing settings onto "spec".
#
# Alternative CPU-only variant, kept for reference:
#base_model = {
#    "apiVersion": "kubeai.org/v1",
#    "kind": "Model",
#    "metadata": {
#        "name": "bench",
#        "namespace": "default",
#    },
#    "spec": {
#        "features": ["TextGeneration"],
#        "url": "ollama://qwen2:0.5b",
#        "engine": "OLlama",
#        "resourceProfile": "cpu:2"
#    },
#}
base_model = {
    "apiVersion": "kubeai.org/v1",
    "kind": "Model",
    "metadata": {
        "name": "bench",
        "namespace": "default",
    },
    "spec": {
        "features": ["TextGeneration"],
        "url": "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8",
        "engine": "VLLM",
        # Flags passed to the engine through spec.args.
        "args": [
            "--enable-prefix-caching",
            "--max-model-len=16384",
            # Spelled out in full ("tokens"); the previous "--max-num-batched-token"
            # form presumably only worked through option-prefix abbreviation.
            "--max-num-batched-tokens=16384",
            "--gpu-memory-utilization=0.8",
            "--disable-log-requests"
        ],
        "resourceProfile": "nvidia-gpu-l4:1"
    },
}

In [None]:
# Benchmark load settings. The same configuration is listed twice, so every
# Model spec is benchmarked two times with identical parameters.
benches = [
    {"thread_count": 2000, "max_concurrent_threads": 400}
    for _ in range(2)
]

# One Model spec overlay per load-balancing strategy under test. Both overlays
# pin the replica count to 2 and carry the same prefixHash settings, including
# the LeastLoad entry (presumably unused by that strategy; kept as in the
# original configuration).
specs = [
    {
        "minReplicas": 2,
        "maxReplicas": 2,
        "loadBalancing": {
            "strategy": strategy,
            "prefixHash": {
                "meanLoadFactor": 115,
                "prefixCharLength": 100,
                "replication": 1000,
            },
        },
    }
    for strategy in ("PrefixHash", "LeastLoad")
]


In [None]:
all_results = []

i = 0
for spec in specs:
    for  bench in benches:
        print(bench)
        try:
            # Start a fresh instance of the benchmark Pod.
            !kubectl apply -f ./bench-pod.yaml
            !kubectl wait --timeout 10m --for=condition=Ready -f ./bench-pod.yaml

            model = deepcopy(base_model)
            model["spec"].update(spec)
            #model["metadata"]["name"] = model["metadata"]["name"] + f'-{i}'
            #models_client.create(body=model)
            models_client.patch(
                body=model,
                content_type="application/apply-patch+yaml",
                field_manager="benchmark",
            )


            model_name = model.get("metadata").get("name")
            model_replicas = model.get("spec").get("minReplicas")

            !kubectl wait --timeout 30m --for=jsonpath='.status.replicas.ready'={model_replicas} model/{model_name}

            thread_count = bench.get("thread_count")
            max_concurrent_threads = bench.get("max_concurrent_threads")
            cmd = f'kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count={thread_count} --max-concurrent-threads={max_concurrent_threads} --request-model={model_name} --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json'
            print(cmd)

            output = run(cmd, shell=True, stdout=PIPE, encoding='utf8')
            result = json.loads(output.stdout)
            print(result)
            all_results.append({
                "spec": spec,
                "bench": bench,
                "result": result
            }) 
        finally:
            models_client.delete(name=model_name, namespace="default")
            !kubectl delete --now -f ./bench-pod.yaml
            i+=1

all_results

{'thread_count': 1800, 'max_concurrent_threads': 260}
pod/bench created
pod/bench condition met
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=1800 --max-concurrent-threads=260 --request-model=bench --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json


2025/02/24 03:07:18 Shuffling dataset threads
2025/02/24 03:07:18 Trimming dataset threads (9204) to specified thread count (1800)
2025/02/24 03:07:18 Starting run...
2025/02/24 03:12:48 Run completed, starting summarization...


{'input_thread_count': 1800, 'input_messages_per_thread': {'mean': 7.486111111111111}, 'duration': '5m29.873137922s', 'request_count': 13475, 'request_duration': {'mean': '5.516706449s'}, 'chunks_per_request': {'mean': 38.93298701298701}, 'failed_threads': 0, 'run_output_throughput': 1590.3750250923713, 'run_total_throughput': 17611.515859086707, 'ttft': {'mean': '374.11382ms'}, 'itl': {'mean': '131.588506ms'}, 'prompt_tokens': 5284944, 'cached_prompt_tokens': 0, 'completion_tokens': 524622, 'total_tokens': 5809566}
pod "bench" deleted
{'thread_count': 1800, 'max_concurrent_threads': 260}
pod/bench created
pod/bench condition met
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=1800 --max-concurrent-threads=260 --request-model=bench --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json


2025/02/24 03:14:36 Shuffling dataset threads
2025/02/24 03:14:36 Trimming dataset threads (9204) to specified thread count (1800)
2025/02/24 03:14:36 Starting run...


In [None]:
# Print one summary line per collected run: the load-balancing strategy, mean
# time-to-first-token ("ttft"), mean "itl", and total throughput.
for r in all_results:
    print(
        "Strategy {}: TTFT={} ITL={} TPS(total)={}".format(
            r["spec"]["loadBalancing"]["strategy"],
            r["result"]["ttft"]["mean"],
            r["result"]["itl"]["mean"],
            r["result"]["run_total_throughput"],
        )
    )

Strategy PrefixHash: TTFT=1.311553724s ITL=147.76313ms TPS(total)=15412.22594145517
