diff --git a/charts/model-engine/templates/service_template_config_map.yaml b/charts/model-engine/templates/service_template_config_map.yaml index 4780e29d..f1cee8c4 100644 --- a/charts/model-engine/templates/service_template_config_map.yaml +++ b/charts/model-engine/templates/service_template_config_map.yaml @@ -181,10 +181,6 @@ data: - --set - "forwarder.stream.healthcheck_route=${HEALTHCHECK_ROUTE}" - --set - - "forwarder.sync.extra_routes=${FORWARDER_EXTRA_ROUTES}" - - --set - - "forwarder.stream.extra_routes=${FORWARDER_EXTRA_ROUTES}" - - --set - "forwarder.sync.forwarder_type=${FORWARDER_TYPE}" - --set - "forwarder.stream.forwarder_type=${FORWARDER_TYPE}" @@ -370,7 +366,7 @@ data: name: {{ $service_template_aws_config_map_name }} {{- else }} name: {{ $aws_config_map_name }} - {{- end }} + {{- end }} {{- end }} - name: user-config configMap: @@ -487,15 +483,15 @@ data: threshold: "${CONCURRENCY}" metricName: request_concurrency_average query: sum(rate(istio_request_duration_milliseconds_sum{destination_workload="${RESOURCE_NAME}"}[2m])) / 1000 - serverAddress: ${PROMETHEUS_SERVER_ADDRESS} + serverAddress: ${PROMETHEUS_SERVER_ADDRESS} {{- range $device := tuple "gpu" }} {{- range $mode := tuple "streaming"}} leader-worker-set-{{ $mode }}-{{ $device }}.yaml: |- apiVersion: leaderworkerset.x-k8s.io/v1 kind: LeaderWorkerSet metadata: - name: ${RESOURCE_NAME} - namespace: ${NAMESPACE} + name: ${RESOURCE_NAME} + namespace: ${NAMESPACE} labels: {{- $service_template_labels | nindent 8 }} spec: @@ -617,10 +613,6 @@ data: - --set - "forwarder.stream.healthcheck_route=${HEALTHCHECK_ROUTE}" - --set - - "forwarder.sync.extra_routes=${FORWARDER_EXTRA_ROUTES}" - - --set - - "forwarder.stream.extra_routes=${FORWARDER_EXTRA_ROUTES}" - - --set - "forwarder.sync.forwarder_type=${FORWARDER_TYPE}" - --set - "forwarder.stream.forwarder_type=${FORWARDER_TYPE}" @@ -748,7 +740,7 @@ data: name: {{ $service_template_aws_config_map_name }} {{- else }} name: {{ $aws_config_map_name }} - 
{{- end }} + {{- end }} {{- end }} - name: user-config configMap: @@ -856,7 +848,7 @@ data: name: {{ $service_template_aws_config_map_name }} {{- else }} name: {{ $aws_config_map_name }} - {{- end }} + {{- end }} {{- end }} - name: user-config configMap: diff --git a/model-engine/model_engine_server/db/migrations/README b/model-engine/model_engine_server/db/migrations/README index 34a0d901..7863d56d 100644 --- a/model-engine/model_engine_server/db/migrations/README +++ b/model-engine/model_engine_server/db/migrations/README @@ -4,7 +4,7 @@ We introduce alembic by 1. dumping the current db schemas into 'initial.sql' via pg_dump ``` -pg_dump -h $HOST -U postgres -O -s -d $DB_NAME -n hosted_model_inference -n model -f initial.sql +pg_dump -h $HOST -U postgres -O -s -d $DB_NAME -n hosted_model_inference -n model -f initial.sql ``` 2. writing an initial revision that reads and applies intial.sql script @@ -19,6 +19,9 @@ alembic revision -m “initial” alembic stamp fa3267c80731 ``` +# Steps to make generic database schema changes + +Steps can be found here: https://alembic.sqlalchemy.org/en/latest/tutorial.html#running-our-second-migration # Test db migration from scratch diff --git a/model-engine/model_engine_server/db/migrations/alembic/versions/2025_09_25_1940-221aa19d3f32_add_routes_column.py b/model-engine/model_engine_server/db/migrations/alembic/versions/2025_09_25_1940-221aa19d3f32_add_routes_column.py new file mode 100644 index 00000000..50de5f54 --- /dev/null +++ b/model-engine/model_engine_server/db/migrations/alembic/versions/2025_09_25_1940-221aa19d3f32_add_routes_column.py @@ -0,0 +1,31 @@ +"""add routes column + +Revision ID: 221aa19d3f32 +Revises: e580182d6bfd +Create Date: 2025-09-25 19:40:24.927198 + +""" +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = '221aa19d3f32' +down_revision = 'e580182d6bfd' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + 'bundles', + sa.Column('runnable_image_routes', sa.ARRAY(sa.Text), nullable=True), + schema='hosted_model_inference', + ) + + +def downgrade() -> None: + op.drop_column( + 'bundles', + 'runnable_image_routes', + schema='hosted_model_inference', + ) diff --git a/model-engine/model_engine_server/db/models/hosted_model_inference.py b/model-engine/model_engine_server/db/models/hosted_model_inference.py index 4076b129..c5b8247f 100644 --- a/model-engine/model_engine_server/db/models/hosted_model_inference.py +++ b/model-engine/model_engine_server/db/models/hosted_model_inference.py @@ -146,6 +146,7 @@ class Bundle(Base): runnable_image_env = Column(JSON, nullable=True) runnable_image_protocol = Column(Text, nullable=True) runnable_image_readiness_initial_delay_seconds = Column(Integer, nullable=True) + runnable_image_routes = Column(ARRAY(Text), nullable=True) runnable_image_extra_routes = Column(ARRAY(Text), nullable=True) runnable_image_forwarder_type = Column(Text, nullable=True) runnable_image_worker_command = Column(ARRAY(Text), nullable=True) @@ -209,6 +210,7 @@ def __init__( runnable_image_env: Optional[Dict[str, Any]] = None, runnable_image_protocol: Optional[str] = None, runnable_image_readiness_initial_delay_seconds: Optional[int] = None, + runnable_image_routes: Optional[List[str]] = None, runnable_image_extra_routes: Optional[List[str]] = None, runnable_image_forwarder_type: Optional[str] = None, runnable_image_worker_command: Optional[List[str]] = None, @@ -268,6 +270,7 @@ def __init__( self.runnable_image_healthcheck_route = runnable_image_healthcheck_route self.runnable_image_env = runnable_image_env self.runnable_image_protocol = runnable_image_protocol + self.runnable_image_routes = runnable_image_routes self.runnable_image_extra_routes = runnable_image_extra_routes self.runnable_image_forwarder_type = 
runnable_image_forwarder_type self.runnable_image_worker_command = runnable_image_worker_command diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 4cc69762..352b7a06 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -1019,7 +1019,7 @@ async def create_vllm_bundle( healthcheck_route="/health", predict_route="/predict", streaming_predict_route="/stream", - extra_routes=[ + routes=[ OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH, ], @@ -1101,7 +1101,7 @@ async def create_vllm_multinode_bundle( healthcheck_route="/health", predict_route="/predict", streaming_predict_route="/stream", - extra_routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH], + routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH], env=common_vllm_envs, worker_command=worker_command, worker_env=common_vllm_envs, diff --git a/model-engine/model_engine_server/inference/vllm/vllm_server.py b/model-engine/model_engine_server/inference/vllm/vllm_server.py index d7564042..c576a338 100644 --- a/model-engine/model_engine_server/inference/vllm/vllm_server.py +++ b/model-engine/model_engine_server/inference/vllm/vllm_server.py @@ -1,33 +1,14 @@ import asyncio import code -import json import os -import signal import subprocess import traceback from logging import Logger -from typing import AsyncGenerator, Dict, List, Optional -import vllm.envs as envs -from fastapi import APIRouter, BackgroundTasks, Request -from fastapi.responses import Response, StreamingResponse -from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.protocol import EngineClient -from vllm.entrypoints.launcher import serve_http -from vllm.entrypoints.openai.api_server import ( - build_app, - build_async_engine_client, - 
init_app_state, - load_log_config, - maybe_register_tokenizer_info_endpoint, - setup_server, -) +from vllm.entrypoints.openai.api_server import run_server from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.outputs import CompletionOutput -from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob -from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.utils import FlexibleArgumentParser logger = Logger("vllm_server") @@ -36,88 +17,8 @@ TIMEOUT_KEEP_ALIVE = 5 # seconds. TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds -router = APIRouter() - - -@router.post("/predict") -@router.post("/stream") -async def generate(request: Request) -> Response: - """Generate completion for the request. - - The request should be a JSON object with the following fields: - - prompt: the prompt to use for the generation. - - stream: whether to stream the results or not. - - other fields: the sampling parameters (See `SamplingParams` for details). 
- """ - # check health before accepting request and fail fast if engine isn't healthy - try: - await engine_client.check_health() - - request_dict = await request.json() - prompt = request_dict.pop("prompt") - stream = request_dict.pop("stream", False) - - sampling_params = SamplingParams(**request_dict) - - request_id = random_uuid() - - results_generator = engine_client.generate(prompt, sampling_params, request_id) - - async def abort_request() -> None: - await engine_client.abort(request_id) - - if stream: - # Streaming case - async def stream_results() -> AsyncGenerator[str, None]: - last_output_text = "" - async for request_output in results_generator: - log_probs = format_logprobs(request_output) - ret = { - "text": request_output.outputs[-1].text[len(last_output_text) :], - "count_prompt_tokens": len(request_output.prompt_token_ids), - "count_output_tokens": len(request_output.outputs[0].token_ids), - "log_probs": ( - log_probs[-1] if log_probs and sampling_params.logprobs else None - ), - "finished": request_output.finished, - } - last_output_text = request_output.outputs[-1].text - yield f"data:{json.dumps(ret)}\n\n" - - background_tasks = BackgroundTasks() - # Abort the request if the client disconnects. - background_tasks.add_task(abort_request) - - return StreamingResponse(stream_results(), background=background_tasks) - - # Non-streaming case - final_output = None - tokens = [] - last_output_text = "" - async for request_output in results_generator: - tokens.append(request_output.outputs[-1].text[len(last_output_text) :]) - last_output_text = request_output.outputs[-1].text - if await request.is_disconnected(): - # Abort the request if the client disconnects. 
- await engine_client.abort(request_id) - return Response(status_code=499) - final_output = request_output - - assert final_output is not None - prompt = final_output.prompt - ret = { - "text": final_output.outputs[0].text, - "count_prompt_tokens": len(final_output.prompt_token_ids), - "count_output_tokens": len(final_output.outputs[0].token_ids), - "log_probs": format_logprobs(final_output), - "tokens": tokens, - } - return Response(content=json.dumps(ret)) - - except AsyncEngineDeadError as e: - logger.error(f"The vllm engine is dead, exiting the pod: {e}") - os.kill(os.getpid(), signal.SIGINT) - raise e +# Legacy endpoints /predict and /stream removed - using vLLM's native OpenAI-compatible endpoints instead +# All requests now go through /v1/completions, /v1/chat/completions, etc. def get_gpu_free_memory(): @@ -171,85 +72,12 @@ def debug(sig, frame): i.interact(message) -def format_logprobs( - request_output: CompletionOutput, -) -> Optional[List[Dict[int, float]]]: - """Given a request output, format the logprobs if they exist.""" - output_logprobs = request_output.outputs[0].logprobs - if output_logprobs is None: - return None - - def extract_logprobs(logprobs: Dict[int, Logprob]) -> Dict[int, float]: - return {k: v.logprob for k, v in logprobs.items()} - - return [extract_logprobs(logprobs) for logprobs in output_logprobs] - - def parse_args(parser: FlexibleArgumentParser): parser = make_arg_parser(parser) parser.add_argument("--attention-backend", type=str, help="The attention backend to use") return parser.parse_args() -async def run_server(args, **uvicorn_kwargs) -> None: - """Run a single-worker API server.""" - listen_address, sock = setup_server(args) - await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) - - -async def run_server_worker( - listen_address, sock, args, client_config=None, **uvicorn_kwargs -) -> None: - """Run a single API server worker.""" - - if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: - 
ToolParserManager.import_tool_parser(args.tool_parser_plugin) - - server_index = client_config.get("client_index", 0) if client_config else 0 - - # Load logging config for uvicorn if specified - log_config = load_log_config(args.log_config_file) - if log_config is not None: - uvicorn_kwargs["log_config"] = log_config - - global engine_client - - async with build_async_engine_client(args, client_config=client_config) as engine_client: - maybe_register_tokenizer_info_endpoint(args) - app = build_app(args) - - vllm_config = await engine_client.get_vllm_config() - await init_app_state(engine_client, vllm_config, app.state, args) - app.include_router(router) - - logger.info("Starting vLLM API server %d on %s", server_index, listen_address) - shutdown_task = await serve_http( - app, - sock=sock, - enable_ssl_refresh=args.enable_ssl_refresh, - host=args.host, - port=args.port, - log_level=args.uvicorn_log_level, - # NOTE: When the 'disable_uvicorn_access_log' value is True, - # no access log will be output. 
- access_log=not args.disable_uvicorn_access_log, - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs, - h11_max_incomplete_event_size=args.h11_max_incomplete_event_size, - h11_max_header_count=args.h11_max_header_count, - **uvicorn_kwargs, - ) - - # NB: Await server shutdown only after the backend context is exited - try: - await shutdown_task - finally: - sock.close() - - if __name__ == "__main__": check_unknown_startup_memory_usage() @@ -257,4 +85,5 @@ async def run_server_worker( args = parse_args(parser) if args.attention_backend is not None: os.environ["VLLM_ATTENTION_BACKEND"] = args.attention_backend + # Using vllm's run_server asyncio.run(run_server(args)) diff --git a/model-engine/model_engine_server/infra/gateways/live_streaming_model_endpoint_inference_gateway.py b/model-engine/model_engine_server/infra/gateways/live_streaming_model_endpoint_inference_gateway.py index 91fc78ec..86151922 100644 --- a/model-engine/model_engine_server/infra/gateways/live_streaming_model_endpoint_inference_gateway.py +++ b/model-engine/model_engine_server/infra/gateways/live_streaming_model_endpoint_inference_gateway.py @@ -227,6 +227,7 @@ async def streaming_predict( if predict_request.num_retries is None else predict_request.num_retries ) + response = self.make_request_with_retries( request_url=deployment_url, payload_json=predict_request.model_dump(exclude_none=True), diff --git a/model-engine/model_engine_server/infra/gateways/live_sync_model_endpoint_inference_gateway.py b/model-engine/model_engine_server/infra/gateways/live_sync_model_endpoint_inference_gateway.py index 8274d5b8..e65c3044 100644 --- a/model-engine/model_engine_server/infra/gateways/live_sync_model_endpoint_inference_gateway.py +++ b/model-engine/model_engine_server/infra/gateways/live_sync_model_endpoint_inference_gateway.py @@ -253,6 +253,7 @@ async def predict( 
if predict_request.num_retries is None else predict_request.num_retries ) + response = await self.make_request_with_retries( request_url=deployment_url, payload_json=predict_request.model_dump(exclude_none=True), diff --git a/model-engine/model_engine_server/infra/gateways/resources/k8s_resource_types.py b/model-engine/model_engine_server/infra/gateways/resources/k8s_resource_types.py index 1f107537..6f0d3133 100644 --- a/model-engine/model_engine_server/infra/gateways/resources/k8s_resource_types.py +++ b/model-engine/model_engine_server/infra/gateways/resources/k8s_resource_types.py @@ -140,6 +140,7 @@ class _SyncRunnableImageDeploymentArguments(TypedDict): FORWARDER_PORT: int FORWARDER_WORKER_COUNT: int + FORWARDER_SYNC_ROUTES: List[str] class _StreamingDeploymentArguments(TypedDict): @@ -148,6 +149,8 @@ class _StreamingDeploymentArguments(TypedDict): FORWARDER_PORT: int STREAMING_PREDICT_ROUTE: str FORWARDER_WORKER_COUNT: int + FORWARDER_SYNC_ROUTES: List[str] + FORWARDER_STREAMING_ROUTES: List[str] class _RunnableImageDeploymentArguments(_BaseDeploymentArguments): @@ -163,7 +166,6 @@ class _RunnableImageDeploymentArguments(_BaseDeploymentArguments): FORWARDER_CPUS_LIMIT: float FORWARDER_MEMORY_LIMIT: str FORWARDER_STORAGE_LIMIT: str - FORWARDER_EXTRA_ROUTES: List[str] FORWARDER_TYPE: Optional[str] USER_CONTAINER_PORT: int @@ -664,7 +666,6 @@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Async Deployment Arguments CELERY_S3_BUCKET=s3_bucket, @@ -715,7 +716,6 @@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Async 
Deployment Arguments CELERY_S3_BUCKET=s3_bucket, @@ -760,7 +760,6 @@ def get_endpoint_resource_arguments_from_request( MAIN_ENV=main_env, COMMAND=flavor.streaming_command, PREDICT_ROUTE=flavor.predict_route, - STREAMING_PREDICT_ROUTE=flavor.streaming_predict_route, HEALTHCHECK_ROUTE=flavor.healthcheck_route, READINESS_INITIAL_DELAY=flavor.readiness_initial_delay_seconds, INFRA_SERVICE_CONFIG_VOLUME_MOUNT_PATH=infra_service_config_volume_mount_path, @@ -769,11 +768,15 @@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Streaming Deployment Arguments FORWARDER_PORT=FORWARDER_PORT, FORWARDER_WORKER_COUNT=FORWARDER_WORKER_COUNT, + STREAMING_PREDICT_ROUTE=flavor.streaming_predict_route, + FORWARDER_SYNC_ROUTES=[flavor.predict_route] + flavor.routes + flavor.extra_routes, + FORWARDER_STREAMING_ROUTES=[flavor.streaming_predict_route] + + flavor.routes + + flavor.extra_routes, ) elif endpoint_resource_name == "deployment-runnable-image-streaming-gpu": assert isinstance(flavor, StreamingEnhancedRunnableImageFlavor) @@ -808,7 +811,6 @@ def get_endpoint_resource_arguments_from_request( MAIN_ENV=main_env, COMMAND=flavor.streaming_command, PREDICT_ROUTE=flavor.predict_route, - STREAMING_PREDICT_ROUTE=flavor.streaming_predict_route, HEALTHCHECK_ROUTE=flavor.healthcheck_route, READINESS_INITIAL_DELAY=flavor.readiness_initial_delay_seconds, INFRA_SERVICE_CONFIG_VOLUME_MOUNT_PATH=infra_service_config_volume_mount_path, @@ -817,11 +819,15 @@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Streaming Deployment Arguments 
FORWARDER_PORT=FORWARDER_PORT, FORWARDER_WORKER_COUNT=FORWARDER_WORKER_COUNT, + STREAMING_PREDICT_ROUTE=flavor.streaming_predict_route, + FORWARDER_SYNC_ROUTES=[flavor.predict_route] + flavor.routes + flavor.extra_routes, + FORWARDER_STREAMING_ROUTES=[flavor.streaming_predict_route] + + flavor.routes + + flavor.extra_routes, # GPU Deployment Arguments GPU_TYPE=build_endpoint_request.gpu_type.value, GPUS=build_endpoint_request.gpus, @@ -866,11 +872,11 @@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Sync Deployment Arguments FORWARDER_PORT=FORWARDER_PORT, FORWARDER_WORKER_COUNT=FORWARDER_WORKER_COUNT, + FORWARDER_SYNC_ROUTES=[flavor.predict_route] + flavor.routes + flavor.extra_routes, ) elif endpoint_resource_name == "deployment-runnable-image-sync-gpu": assert isinstance(flavor, RunnableImageLike) @@ -913,11 +919,11 @@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Sync Deployment Arguments FORWARDER_PORT=FORWARDER_PORT, FORWARDER_WORKER_COUNT=FORWARDER_WORKER_COUNT, + FORWARDER_SYNC_ROUTES=[flavor.predict_route] + flavor.routes + flavor.extra_routes, # GPU Deployment Arguments GPU_TYPE=build_endpoint_request.gpu_type.value, GPUS=build_endpoint_request.gpus, @@ -962,7 +968,6 @@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Async Deployment Arguments CELERY_S3_BUCKET=s3_bucket, @@ -1021,7 +1026,6 
@@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Async Deployment Arguments CELERY_S3_BUCKET=s3_bucket, @@ -1082,11 +1086,11 @@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Sync Deployment Arguments FORWARDER_PORT=FORWARDER_PORT, FORWARDER_WORKER_COUNT=FORWARDER_WORKER_COUNT, + FORWARDER_SYNC_ROUTES=[flavor.predict_route] + flavor.routes + flavor.extra_routes, # Triton Deployment Arguments TRITON_MODEL_REPOSITORY=flavor.triton_model_repository, TRITON_CPUS=str(flavor.triton_num_cpu), @@ -1137,11 +1141,11 @@ def get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Sync Deployment Arguments FORWARDER_PORT=FORWARDER_PORT, FORWARDER_WORKER_COUNT=FORWARDER_WORKER_COUNT, + FORWARDER_SYNC_ROUTES=[flavor.predict_route] + flavor.routes + flavor.extra_routes, # GPU Deployment Arguments GPU_TYPE=build_endpoint_request.gpu_type.value, GPUS=build_endpoint_request.gpus, @@ -1189,7 +1193,6 @@ def get_endpoint_resource_arguments_from_request( MAIN_ENV=main_env, COMMAND=flavor.streaming_command, PREDICT_ROUTE=flavor.predict_route, - STREAMING_PREDICT_ROUTE=flavor.streaming_predict_route, HEALTHCHECK_ROUTE=flavor.healthcheck_route, READINESS_INITIAL_DELAY=flavor.readiness_initial_delay_seconds, INFRA_SERVICE_CONFIG_VOLUME_MOUNT_PATH=infra_service_config_volume_mount_path, @@ -1198,11 +1201,15 @@ def 
get_endpoint_resource_arguments_from_request( FORWARDER_MEMORY_LIMIT=FORWARDER_MEMORY_USAGE, FORWARDER_STORAGE_LIMIT=FORWARDER_STORAGE_USAGE, USER_CONTAINER_PORT=USER_CONTAINER_PORT, - FORWARDER_EXTRA_ROUTES=flavor.extra_routes, FORWARDER_TYPE=flavor.forwarder_type, # Streaming Arguments FORWARDER_PORT=FORWARDER_PORT, FORWARDER_WORKER_COUNT=FORWARDER_WORKER_COUNT, + STREAMING_PREDICT_ROUTE=flavor.streaming_predict_route, + FORWARDER_SYNC_ROUTES=[flavor.predict_route] + flavor.routes + flavor.extra_routes, + FORWARDER_STREAMING_ROUTES=[flavor.streaming_predict_route] + + flavor.routes + + flavor.extra_routes, # GPU Arguments GPU_TYPE=build_endpoint_request.gpu_type.value, GPUS=build_endpoint_request.gpus, diff --git a/model-engine/model_engine_server/infra/gateways/resources/templates/service_template_config_map_circleci.yaml b/model-engine/model_engine_server/infra/gateways/resources/templates/service_template_config_map_circleci.yaml index 6b561044..59aa6df0 100644 --- a/model-engine/model_engine_server/infra/gateways/resources/templates/service_template_config_map_circleci.yaml +++ b/model-engine/model_engine_server/infra/gateways/resources/templates/service_template_config_map_circleci.yaml @@ -167,8 +167,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config @@ -440,8 +440,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config @@ -660,8 +660,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config @@ -929,8 +929,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: 
config-volume mountPath: /opt/.aws/config @@ -1104,17 +1104,13 @@ data: - --num-workers - "${FORWARDER_WORKER_COUNT}" - --set - - "forwarder.sync.predict_route=${PREDICT_ROUTE}" - - --set - - "forwarder.stream.predict_route=${STREAMING_PREDICT_ROUTE}" - - --set - "forwarder.sync.healthcheck_route=${HEALTHCHECK_ROUTE}" - --set - "forwarder.stream.healthcheck_route=${HEALTHCHECK_ROUTE}" - --set - - "forwarder.sync.extra_routes=${FORWARDER_EXTRA_ROUTES}" + - "forwarder.sync.routes=${FORWARDER_SYNC_ROUTES}" - --set - - "forwarder.stream.extra_routes=${FORWARDER_EXTRA_ROUTES}" + - "forwarder.stream.routes=${FORWARDER_STREAMING_ROUTES}" - --set - "forwarder.sync.forwarder_type=${FORWARDER_TYPE}" - --set @@ -1162,8 +1158,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config @@ -1398,8 +1394,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config @@ -1679,8 +1675,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config @@ -1907,8 +1903,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config @@ -2184,8 +2180,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config @@ -2367,17 +2363,13 @@ data: - --num-workers - "${FORWARDER_WORKER_COUNT}" - --set - - "forwarder.sync.predict_route=${PREDICT_ROUTE}" - - --set - - "forwarder.stream.predict_route=${STREAMING_PREDICT_ROUTE}" - - --set - 
"forwarder.sync.healthcheck_route=${HEALTHCHECK_ROUTE}" - --set - "forwarder.stream.healthcheck_route=${HEALTHCHECK_ROUTE}" - --set - - "forwarder.sync.extra_routes=${FORWARDER_EXTRA_ROUTES}" + - "forwarder.sync.routes=${FORWARDER_SYNC_ROUTES}" - --set - - "forwarder.stream.extra_routes=${FORWARDER_EXTRA_ROUTES}" + - "forwarder.stream.routes=${FORWARDER_STREAMING_ROUTES}" - --set - "forwarder.sync.forwarder_type=${FORWARDER_TYPE}" - --set @@ -2425,8 +2417,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config @@ -2638,8 +2630,8 @@ data: apiVersion: leaderworkerset.x-k8s.io/v1 kind: LeaderWorkerSet metadata: - name: ${RESOURCE_NAME} - namespace: ${NAMESPACE} + name: ${RESOURCE_NAME} + namespace: ${NAMESPACE} labels: user_id: ${OWNER} team: ${TEAM} @@ -2780,8 +2772,8 @@ data: cpu: ${FORWARDER_CPUS_LIMIT} memory: ${FORWARDER_MEMORY_LIMIT} ephemeral-storage: ${FORWARDER_STORAGE_LIMIT} - - + + volumeMounts: - name: config-volume mountPath: /opt/.aws/config diff --git a/model-engine/model_engine_server/infra/repositories/db_model_bundle_repository.py b/model-engine/model_engine_server/infra/repositories/db_model_bundle_repository.py index 033b1971..7072d2ca 100644 --- a/model-engine/model_engine_server/infra/repositories/db_model_bundle_repository.py +++ b/model-engine/model_engine_server/infra/repositories/db_model_bundle_repository.py @@ -146,6 +146,7 @@ def translate_model_bundle_orm_to_model_bundle( env=model_bundle_orm.runnable_image_env, protocol=model_bundle_orm.runnable_image_protocol, readiness_initial_delay_seconds=model_bundle_orm.runnable_image_readiness_initial_delay_seconds, + routes=model_bundle_orm.runnable_image_routes, extra_routes=model_bundle_orm.runnable_image_extra_routes, forwarder_type=model_bundle_orm.runnable_image_forwarder_type, worker_command=model_bundle_orm.runnable_image_worker_command, @@ -218,6 +219,7 
@@ def translate_kwargs_to_model_bundle_orm( runnable_image_readiness_initial_delay_seconds=flavor_dict.get( "readiness_initial_delay_seconds" ), + runnable_image_routes=flavor_dict.get("routes"), runnable_image_extra_routes=flavor_dict.get("extra_routes"), runnable_image_forwarder_type=flavor_dict.get("forwarder_type"), runnable_image_worker_command=flavor_dict.get("worker_command"),