diff --git a/model-engine/model_engine_server/api/worker.py b/model-engine/model_engine_server/api/worker.py index 945614da..d08113b5 100644 --- a/model-engine/model_engine_server/api/worker.py +++ b/model-engine/model_engine_server/api/worker.py @@ -1,5 +1,7 @@ from uvicorn.workers import UvicornWorker +# Gunicorn returns 503 instead of 429 when concurrency exceeds the limit. Until rate limiting is added, just raise the concurrency limit. +# We'll autoscale at a much lower target concurrency (around 50); this high limit just makes sure we don't return 503s under bursty traffic. CONCURRENCY_LIMIT = 1000