In [1]:
import requests
import time
import statistics
import pandas as pd
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Sample images sent to the model; the list is persisted to image_urls.txt,
# one URL per line, so it can be reloaded from disk later.
image_urls = [
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png",
    "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Golde33443.jpg/640px-Golde33443.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Fronalpstock_big.jpg/640px-Fronalpstock_big.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/Example.jpg/640px-Example.jpg",
]

with open("image_urls.txt", "w") as url_file:
    # Every entry, including the last one, is terminated by a newline.
    url_file.writelines(f"{link}\n" for link in image_urls)

In [3]:
# Reload the URL list from disk, trimming surrounding whitespace and
# discarding lines that are empty after trimming.
with open("image_urls.txt") as url_file:
    trimmed_lines = (raw_line.strip() for raw_line in url_file)
    IMAGE_URLS = [entry for entry in trimmed_lines if entry]

len(IMAGE_URLS)


5

In [4]:
# Chat-completions endpoint of the model server and the headers sent with each request.
URL = "http://qwen3-vl-4b-instruct-predictor.qwen3.svc.cluster.local:8080/v1/chat/completions"
HEADERS = {"Content-Type": "application/json"}

TIMEOUT = 180  # passed as `timeout` to requests.post
REQUESTS_PER_ROUND = 50  # number of requests submitted for each concurrency level
CONCURRENCY_STEPS = [1, 2, 4, 8, 16, 32]  # thread-pool sizes benchmarked, in order

# Portuguese prompt: "Describe the image in Portuguese in as much detail as possible."
# The accented characters must be real UTF-8 text, not mis-decoded byte sequences,
# otherwise the model receives a corrupted instruction.
BASE_PROMPT = "Descreva a imagem em português com o máximo de detalhes possíveis."
MAX_TOKENS = 512  # sent as `max_tokens` in the request payload


In [5]:
def run_inference():
    """Send one chat-completion request with a randomly chosen image and time it.

    The image URL is picked at random from ``IMAGE_URLS`` and sent together with
    ``BASE_PROMPT`` as a single user message.

    Returns:
        tuple[float, bool]: ``(latency, error)``. ``latency`` is the elapsed time
        in seconds around the POST call. ``error`` is True when the response
        status code is not 200, or when the request itself failed (timeout,
        connection error, or any other ``requests.RequestException``).
    """
    image_url = random.choice(IMAGE_URLS)

    payload = {
        "model": "qwen3-vl-4b-instruct",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": BASE_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        }
                    }
                ]
            }
        ],
        "max_tokens": MAX_TOKENS
    }

    # perf_counter is monotonic, so the measurement is not affected by
    # wall-clock adjustments that can happen during a long benchmark.
    start = time.perf_counter()
    try:
        response = requests.post(URL, json=payload, headers=HEADERS, timeout=TIMEOUT)
    except requests.RequestException:
        # Transport-level failures are reported as errors instead of propagating
        # out of the worker thread and aborting the caller via future.result().
        return time.perf_counter() - start, True
    latency = time.perf_counter() - start

    return latency, response.status_code != 200


In [6]:
# Seconds to wait between rounds so the GPU can settle before the next level.
COOLDOWN_S = 10


def _p95(samples):
    """Return the 95th percentile of samples (the largest value when fewer than two)."""
    # statistics.quantiles raises StatisticsError for fewer than two data points.
    if len(samples) < 2:
        return max(samples)
    return statistics.quantiles(samples, n=20)[18]


# One summary dict per concurrency level; rebuilt from scratch on every run of this cell.
results = []

for round_index, concurrency in enumerate(CONCURRENCY_STEPS):
    print(f"🚀 Testando concorrência = {concurrency}")

    latencies = []
    errors = 0
    # Monotonic clock: round duration is unaffected by wall-clock adjustments.
    start_round = time.perf_counter()

    # The pool size is the concurrency under test; all requests for the round
    # are queued up front and drained by at most `concurrency` workers.
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [executor.submit(run_inference) for _ in range(REQUESTS_PER_ROUND)]

        for future in as_completed(futures):
            latency, error = future.result()
            # Latencies of failed requests are kept in the statistics as well.
            latencies.append(latency)
            if error:
                errors += 1

    total_time = time.perf_counter() - start_round

    results.append({
        "concurrency": concurrency,
        "requests": REQUESTS_PER_ROUND,
        "total_time_s": round(total_time, 2),
        "throughput_req_s": round(REQUESTS_PER_ROUND / total_time, 2),
        "latency_avg_s": round(statistics.mean(latencies), 2),
        "latency_p95_s": round(_p95(latencies), 2),
        "latency_max_s": round(max(latencies), 2),
        "errors": errors
    })

    # Rest between rounds to let the GPU stabilize; nothing follows the last
    # round, so the pause is skipped there.
    if round_index < len(CONCURRENCY_STEPS) - 1:
        time.sleep(COOLDOWN_S)


🚀 Testando concorrência = 1
🚀 Testando concorrência = 2
🚀 Testando concorrência = 4
🚀 Testando concorrência = 8
🚀 Testando concorrência = 16
🚀 Testando concorrência = 32


In [7]:
# Turn the per-round summary dicts into a table (one row per concurrency level)
# and show it as the cell output.
df = pd.DataFrame(data=results)
df

Unnamed: 0,concurrency,requests,total_time_s,throughput_req_s,latency_avg_s,latency_p95_s,latency_max_s,errors
0,1,50,809.87,0.06,16.2,18.56,21.26,0
1,2,50,433.92,0.12,17.35,18.95,18.98,0
2,4,50,229.97,0.22,17.66,19.46,19.56,0
3,8,50,128.29,0.39,18.37,20.2,20.26,0
4,16,50,71.37,0.7,20.27,22.08,22.13,0
5,32,50,48.67,1.03,23.22,26.93,26.95,0


In [8]:
# Persist the summary table; index=False keeps the row index out of the file.
csv_path = "benchmark_qwen3_vl_gpu_saturation_multi_images.csv"
df.to_csv(csv_path, index=False)