Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions charts/model-engine/templates/_helpers.tpl
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding this configuration because we are currently seeing error logs caused by Datadog remote configuration not being properly set up; the error logs go away when it is disabled. Ref: DataDog/dd-trace-py#5212

Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ podAffinity:
env:
- name: DATADOG_TRACE_ENABLED
value: "${DATADOG_TRACE_ENABLED}"
- name: DD_REMOTE_CONFIGURATION_ENABLED
value: "false"
- name: DD_SERVICE
value: "${ENDPOINT_NAME}"
- name: DD_ENV
Expand Down Expand Up @@ -184,6 +186,8 @@ env:
env:
- name: DATADOG_TRACE_ENABLED
value: "${DATADOG_TRACE_ENABLED}"
- name: DD_REMOTE_CONFIGURATION_ENABLED
value: "false"
- name: DD_SERVICE
value: "${ENDPOINT_NAME}"
- name: DD_ENV
Expand Down Expand Up @@ -230,6 +234,8 @@ env:
env:
- name: DATADOG_TRACE_ENABLED
value: "{{ .Values.datadog_trace_enabled }}"
- name: DD_REMOTE_CONFIGURATION_ENABLED
value: "false"
- name: DD_ENV
value: {{ .Values.context }}
- name: DD_AGENT_HOST
Expand Down
31 changes: 30 additions & 1 deletion model-engine/model_engine_server/api/app.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os
import traceback
from pathlib import Path

from fastapi import FastAPI, Response
from fastapi import FastAPI, Request, Response
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from model_engine_server.api.batch_jobs_v1 import batch_job_router_v1
from model_engine_server.api.dependencies import get_or_create_aioredis_pool
Expand All @@ -16,6 +18,9 @@
from model_engine_server.api.model_endpoints_v1 import model_endpoint_router_v1
from model_engine_server.api.tasks_v1 import inference_task_router_v1
from model_engine_server.api.triggers_v1 import trigger_router_v1
from model_engine_server.common.datadog_utils import get_request_id
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from starlette.middleware.base import BaseHTTPMiddleware

app = FastAPI(title="launch", version="1.0.0", redoc_url="/api")

Expand All @@ -30,6 +35,30 @@
app.include_router(file_router_v1)
app.include_router(trigger_router_v1)

logger = make_logger(filename_wo_ext(__name__))


class ExceptionLoggingMiddleware(BaseHTTPMiddleware):
    """Middleware that logs unhandled exceptions and returns a generic 500.

    Any exception that escapes the downstream application is logged together
    with its formatted traceback, and the client receives a JSON error body
    that carries the request id so the failure can be looked up later.
    """

    async def dispatch(self, request: Request, call_next):
        """Forward the request, converting unhandled exceptions to HTTP 500.

        Args:
            request: The incoming request.
            call_next: Callable that passes the request to the next handler
                and returns its response.

        Returns:
            The downstream response, or a ``JSONResponse`` with status code
            500 when the downstream handler raised.
        """
        try:
            return await call_next(request)
        except Exception as e:
            # Pass the arguments positionally: the ``etype`` keyword was
            # removed from traceback.format_exception in Python 3.10, and the
            # positional form works on every supported version.
            tb_str = traceback.format_exception(type(e), e, e.__traceback__)
            structured_log = {"error": str(e), "traceback": "".join(tb_str)}
            logger.error("Unhandled exception: %s", structured_log)
            request_id = get_request_id()
            # The status code must be an argument of the response, not a key
            # of the JSON body; otherwise the client sees HTTP 200.
            return JSONResponse(
                status_code=500,
                content={
                    "error": f"Internal error for request_id {request_id}. Our team has been notified."
                },
            )


# Install the middleware so exceptions that no route handler catches are
# logged and answered with a JSON error body instead of propagating.
app.add_middleware(ExceptionLoggingMiddleware)

# TODO: Remove this once we have a better way to serve internal docs
INTERNAL_DOCS_PATH = str(Path(__file__).parents[3] / "launch_internal/site")
if os.path.exists(INTERNAL_DOCS_PATH):
Expand Down
10 changes: 4 additions & 6 deletions model-engine/model_engine_server/api/batch_jobs_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,14 @@
UpdateDockerImageBatchJobV1Response,
)
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.core.domain_exceptions import (
DockerImageNotFoundException,
ObjectHasInvalidValueException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import (
DockerImageNotFoundException,
EndpointLabelsException,
EndpointResourceInvalidRequestException,
ObjectHasInvalidValueException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.domain.use_cases.batch_job_use_cases import (
CreateBatchJobV1UseCase,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
)
from model_engine_server.common.dtos.model_bundles import ModelBundleOrderBy
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.core.domain_exceptions import (
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import (
EndpointResourceInvalidRequestException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import EndpointResourceInvalidRequestException
from model_engine_server.domain.use_cases.docker_image_batch_job_bundle_use_cases import (
CreateDockerImageBatchJobBundleV1UseCase,
GetDockerImageBatchJobBundleByIdV1UseCase,
Expand Down
4 changes: 2 additions & 2 deletions model-engine/model_engine_server/api/files_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
UploadFileResponse,
)
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.core.domain_exceptions import (
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import (
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.use_cases.file_use_cases import (
DeleteFileUseCase,
GetFileContentUseCase,
Expand Down
26 changes: 12 additions & 14 deletions model-engine/model_engine_server/api/llms_v1.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""LLM Model Endpoint routes for the hosted model inference service.
"""
from typing import Optional
from uuid import uuid4

from fastapi import APIRouter, Depends, HTTPException, Query
from model_engine_server.api.dependencies import (
Expand All @@ -10,7 +9,7 @@
get_external_interfaces_read_only,
verify_authentication,
)
from model_engine_server.common.datadog_utils import add_trace_request_id, add_trace_resource_name
from model_engine_server.common.datadog_utils import add_trace_resource_name, get_request_id
from model_engine_server.common.dtos.llms import (
CancelFineTuneResponse,
CompletionStreamV1Request,
Expand All @@ -32,13 +31,6 @@
)
from model_engine_server.common.dtos.model_endpoints import ModelEndpointOrderBy
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.core.domain_exceptions import (
ObjectAlreadyExistsException,
ObjectHasInvalidValueException,
ObjectNotApprovedException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import (
EndpointDeleteFailedException,
Expand All @@ -49,6 +41,11 @@
InvalidRequestException,
LLMFineTuningMethodNotImplementedException,
LLMFineTuningQuotaReached,
ObjectAlreadyExistsException,
ObjectHasInvalidValueException,
ObjectNotApprovedException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
UpstreamServiceError,
)
from model_engine_server.domain.use_cases.llm_fine_tuning_use_cases import (
Expand Down Expand Up @@ -192,10 +189,12 @@ async def create_completion_sync_task(
user=auth, model_endpoint_name=model_endpoint_name, request=request
)
except UpstreamServiceError:
request_id = str(uuid4())
add_trace_request_id(request_id)
request_id = get_request_id()
logger.exception(f"Upstream service error for request {request_id}")
return CompletionSyncV1Response(request_id=request_id, output=None)
raise HTTPException(
status_code=500,
detail=f"Upstream service error for request_id {request_id}.",
)
except (ObjectNotFoundException, ObjectNotAuthorizedException) as exc:
raise HTTPException(
status_code=404,
Expand Down Expand Up @@ -245,8 +244,7 @@ async def event_generator():

return EventSourceResponse(event_generator())
except UpstreamServiceError:
request_id = str(uuid4())
add_trace_request_id(request_id)
request_id = get_request_id()
logger.exception(f"Upstream service error for request {request_id}")
return EventSourceResponse(
iter((CompletionStreamV1Response(request_id=request_id).json(),))
Expand Down
4 changes: 2 additions & 2 deletions model-engine/model_engine_server/api/model_bundles_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@
ModelBundleV1Response,
)
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.core.domain_exceptions import (
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import (
DockerImageNotFoundException,
ObjectHasInvalidValueException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.use_cases.model_bundle_use_cases import (
CloneModelBundleV1UseCase,
CreateModelBundleV1UseCase,
Expand Down
4 changes: 2 additions & 2 deletions model-engine/model_engine_server/api/model_bundles_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@
ModelBundleV2Response,
)
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.core.domain_exceptions import (
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import (
DockerImageNotFoundException,
ObjectHasInvalidValueException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.use_cases.model_bundle_use_cases import (
CloneModelBundleV2UseCase,
CreateModelBundleV2UseCase,
Expand Down
58 changes: 46 additions & 12 deletions model-engine/model_engine_server/api/model_endpoints_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
get_external_interfaces_read_only,
verify_authentication,
)
from model_engine_server.common.datadog_utils import add_trace_resource_name
from model_engine_server.common.datadog_utils import add_trace_resource_name, get_request_id
from model_engine_server.common.dtos.model_endpoints import (
CreateModelEndpointV1Request,
CreateModelEndpointV1Response,
Expand All @@ -24,19 +24,17 @@
UpdateModelEndpointV1Response,
)
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.core.domain_exceptions import (
ObjectAlreadyExistsException,
ObjectHasInvalidValueException,
ObjectNotApprovedException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import (
EndpointDeleteFailedException,
EndpointLabelsException,
EndpointResourceInvalidRequestException,
ExistingEndpointOperationInProgressException,
ObjectAlreadyExistsException,
ObjectHasInvalidValueException,
ObjectNotApprovedException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.domain.use_cases.model_endpoint_use_cases import (
CreateModelEndpointV1UseCase,
Expand Down Expand Up @@ -94,6 +92,13 @@ async def create_model_endpoint(
status_code=404,
detail="The specified model bundle could not be found.",
) from exc
except Exception as exc:
request_id = get_request_id()
logger.exception(f"Internal service error for request {request_id}: {exc}")
raise HTTPException(
status_code=500,
detail=f"Internal error for request_id {request_id}.",
)


@model_endpoint_router_v1.get("/model-endpoints", response_model=ListModelEndpointsV1Response)
Expand All @@ -108,10 +113,18 @@ async def list_model_endpoints(
"""
add_trace_resource_name("model_endpoints_get")
logger.info(f"GET /model-endpoints?name={name}&order_by={order_by} for {auth}")
use_case = ListModelEndpointsV1UseCase(
model_endpoint_service=external_interfaces.model_endpoint_service,
)
return await use_case.execute(user=auth, name=name, order_by=order_by)
try:
use_case = ListModelEndpointsV1UseCase(
model_endpoint_service=external_interfaces.model_endpoint_service,
)
return await use_case.execute(user=auth, name=name, order_by=order_by)
except Exception as exc:
request_id = get_request_id()
logger.exception(f"Internal service error for request {request_id}: {exc}")
raise HTTPException(
status_code=500,
detail=f"Internal error for request_id {request_id}.",
)


@model_endpoint_router_v1.get(
Expand All @@ -137,6 +150,13 @@ async def get_model_endpoint(
status_code=404,
detail=f"Model Endpoint {model_endpoint_id} was not found.",
) from exc
except Exception as exc:
request_id = get_request_id()
logger.exception(f"Internal service error for request {request_id}: {exc}")
raise HTTPException(
status_code=500,
detail=f"Internal error for request_id {request_id}.",
)


@model_endpoint_router_v1.put(
Expand Down Expand Up @@ -181,6 +201,13 @@ async def update_model_endpoint(
status_code=409,
detail="Existing operation on endpoint in progress, try again later.",
) from exc
except Exception as exc:
request_id = get_request_id()
logger.exception(f"Internal service error for request {request_id}: {exc}")
raise HTTPException(
status_code=500,
detail=f"Internal error for request_id {request_id}.",
)


@model_endpoint_router_v1.delete(
Expand Down Expand Up @@ -216,3 +243,10 @@ async def delete_model_endpoint(
status_code=500,
detail="deletion of endpoint failed, compute resources still exist.",
) from exc
except Exception as exc:
request_id = get_request_id()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Think this is fine for now, but as we revisit self-hosting in VPCs, we'll probably want some sort of Gateway abstraction for getting the request/trace ID.

logger.exception(f"Internal service error for request {request_id}: {exc}")
raise HTTPException(
status_code=500,
detail=f"Internal error for request_id {request_id}.",
)
6 changes: 2 additions & 4 deletions model-engine/model_engine_server/api/tasks_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,11 @@
TaskStatus,
)
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.core.domain_exceptions import (
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import (
EndpointUnsupportedInferenceTypeException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
UpstreamServiceError,
)
from model_engine_server.domain.use_cases.async_inference_use_cases import (
Expand Down
10 changes: 4 additions & 6 deletions model-engine/model_engine_server/api/triggers_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,15 @@
UpdateTriggerV1Response,
)
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.core.domain_exceptions import (
DockerImageNotFoundException,
ObjectHasInvalidValueException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
)
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.domain.exceptions import (
CronSyntaxException,
DockerImageNotFoundException,
EndpointLabelsException,
EndpointResourceInvalidRequestException,
ObjectHasInvalidValueException,
ObjectNotAuthorizedException,
ObjectNotFoundException,
TriggerNameAlreadyExistsException,
)
from model_engine_server.domain.use_cases.trigger_use_cases import (
Expand Down
6 changes: 6 additions & 0 deletions model-engine/model_engine_server/common/datadog_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,9 @@ def add_trace_request_id(request_id: str):
current_span = tracer.current_span()
if current_span:
current_span.set_tag("launch.request_id", request_id)


def get_request_id():
    """Return the id used to correlate an API call with its Datadog trace.

    The id is the ``trace_id`` of the tracer's current span, which makes the
    request easy to filter for in Datadog. Returns ``None`` when there is no
    current span.
    """
    span = tracer.current_span()
    if not span:
        return None
    return span.trace_id
Loading