# Multi-Turn Chat Benchmark

Assumes:

* Kubernetes cluster exists and KUBECONFIG is already configured.
* KubeAI is already installed.

In [22]:
# Create or update the resources declared in ./manifests.yaml, then block
# until those same resources report the Ready condition, so that later cells
# can rely on them being up.
!kubectl apply -f ./manifests.yaml
!kubectl wait --for=condition=Ready -f ./manifests.yaml

pod/bench configured
pod/bench condition met


In [163]:
from subprocess import run, PIPE
import json
from kubernetes import config, dynamic
from kubernetes.client import api_client
from copy import deepcopy

In [164]:
# Build the Kubernetes clients step by step instead of nesting the calls.
#
# load_kube_config() reads the local kubeconfig; its return value is forwarded
# unchanged as the ApiClient configuration.
# NOTE(review): load_kube_config() appears to return None and to register the
# loaded settings as the default configuration, which ApiClient then falls
# back to -- confirm against the kubernetes client documentation.
kube_configuration = config.load_kube_config()
core_api_client = api_client.ApiClient(configuration=kube_configuration)
k8s_client = dynamic.DynamicClient(core_api_client)

# Dynamic resource handle for the kubeai.org/v1 "Model" kind; used to create
# and delete Model objects.
models_client = k8s_client.resources.get(
    api_version="kubeai.org/v1",
    kind="Model",
)

In [174]:
# Template for the Model object under test. Each benchmark run deep-copies
# this mapping and overlays its own "spec" overrides on top.
base_model = dict(
    apiVersion="kubeai.org/v1",
    kind="Model",
    metadata=dict(
        name="bench",
        namespace="default",
    ),
    spec=dict(
        features=["TextGeneration"],
        url="ollama://qwen2:0.5b",
        engine="OLlama",
        resourceProfile="cpu:2",
    ),
)

In [180]:
# Benchmark load shapes, written as (thread_count, max_concurrent_threads).
benches = [
    {"thread_count": threads, "max_concurrent_threads": concurrency}
    for threads, concurrency in [(4, 1), (3, 2)]
]

# Model spec overrides: min and max replicas are set to the same value for
# each entry.
specs = [
    {"minReplicas": replicas, "maxReplicas": replicas}
    for replicas in (4, 5)
]


In [181]:
results = []

for spec in specs:
    for  bench in benches:
        print(bench)
        try:
            model = deepcopy(base_model)
            model["spec"].update(spec)
            models_client.create(body=model)

            model_name = model.get("metadata").get("name")
            model_replicas = model.get("spec").get("minReplicas")

            !kubectl wait --timeout 30m --for=jsonpath='.status.replicas.ready'={model_replicas} model/{model_name}

            thread_count = bench.get("thread_count")
            max_concurrent_threads = bench.get("max_concurrent_threads")
            cmd = f'kubectl exec bench -- bench --threads=./data/tiny.json --thread-count={thread_count} --max-concurrent-threads={max_concurrent_threads} --request-model={model_name} --max-completion-tokens=10 --request-timeout=2m --format=json'
            print(cmd)

            output = run(cmd, shell=True, stdout=PIPE, encoding='utf8')
            result = json.loads(output.stdout)
            print(result)
            results.append({
                "spec": spec,
                "bench": bench,
                "results": results
            }) 
        finally:
            models_client.delete(name=model_name, namespace="default")

results

{'thread_count': 4, 'max_concurrent_threads': 1}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/tiny.json --thread-count=4 --max-concurrent-threads=1 --request-model=bench --max-completion-tokens=10 --request-timeout=2m --format=json


2025/02/23 01:22:58 Shuffling dataset threads
2025/02/23 01:22:58 Trimming dataset threads (6) to specified thread count (4)
2025/02/23 01:22:58 Starting run...
2025/02/23 01:23:00 Thread[1/4]: Finished
2025/02/23 01:23:01 Thread[2/4]: Finished
2025/02/23 01:23:04 Thread[3/4]: Finished
2025/02/23 01:23:06 Thread[4/4]: Finished
2025/02/23 01:23:06 Run completed, starting summarization...


{'input_thread_count': 4, 'input_messages_per_thread': {'mean': 4.75}, 'duration': '8.106102814s', 'request_count': 19, 'request_duration': {'mean': '426.569957ms'}, 'chunks_per_request': {'mean': 10}, 'failed_threads': 0, 'run_output_throughput': 23.43913028981722, 'run_total_throughput': 379.9606383823002, 'ttft': {'mean': '327.766701ms'}, 'itl': {'mean': '10.972985ms'}, 'prompt_tokens': 2890, 'cached_prompt_tokens': 0, 'completion_tokens': 190, 'total_tokens': 3080}
{'thread_count': 3, 'max_concurrent_threads': 2}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/tiny.json --thread-count=3 --max-concurrent-threads=2 --request-model=bench --max-completion-tokens=10 --request-timeout=2m --format=json


2025/02/23 01:23:25 Shuffling dataset threads
2025/02/23 01:23:25 Trimming dataset threads (6) to specified thread count (3)
2025/02/23 01:23:25 Starting run...
2025/02/23 01:23:27 Thread[2/3]: Finished
2025/02/23 01:23:27 Thread[1/3]: Finished
2025/02/23 01:23:30 Thread[3/3]: Finished
2025/02/23 01:23:30 Run completed, starting summarization...


{'input_thread_count': 3, 'input_messages_per_thread': {'mean': 4.333333333333333}, 'duration': '4.751332399s', 'request_count': 13, 'request_duration': {'mean': '504.801459ms'}, 'chunks_per_request': {'mean': 10}, 'failed_threads': 0, 'run_output_throughput': 27.360746225071676, 'run_total_throughput': 437.7719396011468, 'ttft': {'mean': '393.493204ms'}, 'itl': {'mean': '12.361448ms'}, 'prompt_tokens': 1950, 'cached_prompt_tokens': 0, 'completion_tokens': 130, 'total_tokens': 2080}
{'thread_count': 4, 'max_concurrent_threads': 1}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/tiny.json --thread-count=4 --max-concurrent-threads=1 --request-model=bench --max-completion-tokens=10 --request-timeout=2m --format=json


2025/02/23 01:23:50 Shuffling dataset threads
2025/02/23 01:23:50 Trimming dataset threads (6) to specified thread count (4)
2025/02/23 01:23:50 Starting run...
2025/02/23 01:23:52 Thread[1/4]: Finished
2025/02/23 01:23:53 Thread[2/4]: Finished
2025/02/23 01:23:55 Thread[3/4]: Finished
2025/02/23 01:23:58 Thread[4/4]: Finished
2025/02/23 01:23:58 Run completed, starting summarization...


{'input_thread_count': 4, 'input_messages_per_thread': {'mean': 4.75}, 'duration': '7.773385523s', 'request_count': 19, 'request_duration': {'mean': '409.066906ms'}, 'chunks_per_request': {'mean': 10}, 'failed_threads': 0, 'run_output_throughput': 24.442374488930902, 'run_total_throughput': 396.2237548731957, 'ttft': {'mean': '308.845968ms'}, 'itl': {'mean': '11.131069ms'}, 'prompt_tokens': 2890, 'cached_prompt_tokens': 0, 'completion_tokens': 190, 'total_tokens': 3080}
{'thread_count': 3, 'max_concurrent_threads': 2}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/tiny.json --thread-count=3 --max-concurrent-threads=2 --request-model=bench --max-completion-tokens=10 --request-timeout=2m --format=json


2025/02/23 01:24:17 Shuffling dataset threads
2025/02/23 01:24:17 Trimming dataset threads (6) to specified thread count (3)
2025/02/23 01:24:17 Starting run...
2025/02/23 01:24:18 Thread[2/3]: Finished
2025/02/23 01:24:19 Thread[1/3]: Finished


{'input_thread_count': 3, 'input_messages_per_thread': {'mean': 4.333333333333333}, 'duration': '4.369495056s', 'request_count': 13, 'request_duration': {'mean': '485.955919ms'}, 'chunks_per_request': {'mean': 10}, 'failed_threads': 0, 'run_output_throughput': 29.751721499602038, 'run_total_throughput': 476.0275439936326, 'ttft': {'mean': '385.445057ms'}, 'itl': {'mean': '11.163695ms'}, 'prompt_tokens': 1950, 'cached_prompt_tokens': 0, 'completion_tokens': 130, 'total_tokens': 2080}


2025/02/23 01:24:21 Thread[3/3]: Finished
2025/02/23 01:24:21 Run completed, starting summarization...


[{'spec': {'minReplicas': 4, 'maxReplicas': 4},
  'bench': {'thread_count': 4, 'max_concurrent_threads': 1},
  'results': [...]},
 {'spec': {'minReplicas': 4, 'maxReplicas': 4},
  'bench': {'thread_count': 3, 'max_concurrent_threads': 2},
  'results': [...]},
 {'spec': {'minReplicas': 5, 'maxReplicas': 5},
  'bench': {'thread_count': 4, 'max_concurrent_threads': 1},
  'results': [...]},
 {'spec': {'minReplicas': 5, 'maxReplicas': 5},
  'bench': {'thread_count': 3, 'max_concurrent_threads': 2},
  'results': [...]}]