From d27bedcd416ce5ac2eb2b6ab362ce9a8f4f4fde3 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Fri, 15 Sep 2023 17:01:07 +0000 Subject: [PATCH 01/12] changing exception location --- .../model_engine_server/api/batch_jobs_v1.py | 10 ++- .../api/docker_image_batch_job_bundles_v1.py | 6 +- .../model_engine_server/api/files_v1.py | 4 +- .../model_engine_server/api/llms_v1.py | 17 ++--- .../api/model_bundles_v1.py | 4 +- .../api/model_bundles_v2.py | 4 +- .../api/model_endpoints_v1.py | 12 ++-- .../model_engine_server/api/triggers_v1.py | 10 ++- .../common/datadog_utils.py | 6 ++ .../core/domain_exceptions.py | 59 ---------------- .../model_engine_server/domain/exceptions.py | 70 ++++++++++++++++++- .../use_cases/async_inference_use_cases.py | 10 +-- .../domain/use_cases/batch_job_use_cases.py | 12 ++-- ...docker_image_batch_job_bundle_use_cases.py | 8 +-- .../use_cases/llm_model_endpoint_use_cases.py | 48 +++++++------ .../use_cases/model_bundle_use_cases.py | 10 +-- .../use_cases/model_endpoint_use_cases.py | 8 +-- .../streaming_inference_use_cases.py | 10 +-- .../use_cases/sync_inference_use_cases.py | 10 +-- .../domain/use_cases/trigger_use_cases.py | 11 +-- .../services/live_model_endpoint_service.py | 10 +-- model-engine/tests/unit/api/test_tasks.py | 6 +- .../domain/test_async_inference_use_cases.py | 4 +- ...docker_image_batch_job_bundle_use_cases.py | 2 +- .../tests/unit/domain/test_llm_use_cases.py | 8 +-- .../domain/test_model_bundle_use_cases.py | 2 +- .../domain/test_model_endpoint_use_cases.py | 8 +-- .../test_streaming_inference_use_cases.py | 6 +- .../domain/test_sync_inference_use_cases.py | 4 +- .../test_live_model_endpoint_service.py | 6 +- 30 files changed, 195 insertions(+), 190 deletions(-) delete mode 100644 model-engine/model_engine_server/core/domain_exceptions.py diff --git a/model-engine/model_engine_server/api/batch_jobs_v1.py b/model-engine/model_engine_server/api/batch_jobs_v1.py index 7e939d9c..022b9dc8 100644 --- 
a/model-engine/model_engine_server/api/batch_jobs_v1.py +++ b/model-engine/model_engine_server/api/batch_jobs_v1.py @@ -22,16 +22,14 @@ UpdateDockerImageBatchJobV1Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - DockerImageNotFoundException, - ObjectHasInvalidValueException, - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.exceptions import ( + DockerImageNotFoundException, EndpointLabelsException, EndpointResourceInvalidRequestException, + ObjectHasInvalidValueException, + ObjectNotAuthorizedException, + ObjectNotFoundException, ) from model_engine_server.domain.use_cases.batch_job_use_cases import ( CreateBatchJobV1UseCase, diff --git a/model-engine/model_engine_server/api/docker_image_batch_job_bundles_v1.py b/model-engine/model_engine_server/api/docker_image_batch_job_bundles_v1.py index 96cc3d49..1444a39b 100644 --- a/model-engine/model_engine_server/api/docker_image_batch_job_bundles_v1.py +++ b/model-engine/model_engine_server/api/docker_image_batch_job_bundles_v1.py @@ -15,12 +15,12 @@ ) from model_engine_server.common.dtos.model_bundles import ModelBundleOrderBy from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.core.loggers import filename_wo_ext, make_logger +from model_engine_server.domain.exceptions import ( + EndpointResourceInvalidRequestException, ObjectNotAuthorizedException, ObjectNotFoundException, ) -from model_engine_server.core.loggers import filename_wo_ext, make_logger -from model_engine_server.domain.exceptions import EndpointResourceInvalidRequestException from model_engine_server.domain.use_cases.docker_image_batch_job_bundle_use_cases import ( CreateDockerImageBatchJobBundleV1UseCase, 
GetDockerImageBatchJobBundleByIdV1UseCase, diff --git a/model-engine/model_engine_server/api/files_v1.py b/model-engine/model_engine_server/api/files_v1.py index a2d23ba3..8c50cc53 100644 --- a/model-engine/model_engine_server/api/files_v1.py +++ b/model-engine/model_engine_server/api/files_v1.py @@ -16,11 +16,11 @@ UploadFileResponse, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.core.loggers import filename_wo_ext, make_logger +from model_engine_server.domain.exceptions import ( ObjectNotAuthorizedException, ObjectNotFoundException, ) -from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.use_cases.file_use_cases import ( DeleteFileUseCase, GetFileContentUseCase, diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index 3e7533da..41c2ff6b 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -31,13 +31,6 @@ ) from model_engine_server.common.dtos.model_endpoints import ModelEndpointOrderBy from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectAlreadyExistsException, - ObjectHasInvalidValueException, - ObjectNotApprovedException, - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.exceptions import ( EndpointLabelsException, @@ -46,6 +39,11 @@ InvalidRequestException, LLMFineTuningMethodNotImplementedException, LLMFineTuningQuotaReached, + ObjectAlreadyExistsException, + ObjectHasInvalidValueException, + ObjectNotApprovedException, + ObjectNotAuthorizedException, + ObjectNotFoundException, UpstreamServiceError, ) from model_engine_server.domain.use_cases.llm_fine_tuning_use_cases 
import ( @@ -191,7 +189,10 @@ async def create_completion_sync_task( request_id = str(uuid4()) add_trace_request_id(request_id) logger.exception(f"Upstream service error for request {request_id}") - return CompletionSyncV1Response(request_id=request_id, output=None) + raise HTTPException( + status_code=500, + detail=f"Upstream service error for request_id {request_id}.", + ) except (ObjectNotFoundException, ObjectNotAuthorizedException) as exc: raise HTTPException( status_code=404, diff --git a/model-engine/model_engine_server/api/model_bundles_v1.py b/model-engine/model_engine_server/api/model_bundles_v1.py index de24f860..e192af13 100644 --- a/model-engine/model_engine_server/api/model_bundles_v1.py +++ b/model-engine/model_engine_server/api/model_bundles_v1.py @@ -19,13 +19,13 @@ ModelBundleV1Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.core.loggers import filename_wo_ext, make_logger +from model_engine_server.domain.exceptions import ( DockerImageNotFoundException, ObjectHasInvalidValueException, ObjectNotAuthorizedException, ObjectNotFoundException, ) -from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.use_cases.model_bundle_use_cases import ( CloneModelBundleV1UseCase, CreateModelBundleV1UseCase, diff --git a/model-engine/model_engine_server/api/model_bundles_v2.py b/model-engine/model_engine_server/api/model_bundles_v2.py index 94801916..d35de5cf 100644 --- a/model-engine/model_engine_server/api/model_bundles_v2.py +++ b/model-engine/model_engine_server/api/model_bundles_v2.py @@ -19,13 +19,13 @@ ModelBundleV2Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.core.loggers import filename_wo_ext, make_logger +from model_engine_server.domain.exceptions import ( 
DockerImageNotFoundException, ObjectHasInvalidValueException, ObjectNotAuthorizedException, ObjectNotFoundException, ) -from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.use_cases.model_bundle_use_cases import ( CloneModelBundleV2UseCase, CreateModelBundleV2UseCase, diff --git a/model-engine/model_engine_server/api/model_endpoints_v1.py b/model-engine/model_engine_server/api/model_endpoints_v1.py index d37f8bf6..4bf3cf32 100644 --- a/model-engine/model_engine_server/api/model_endpoints_v1.py +++ b/model-engine/model_engine_server/api/model_endpoints_v1.py @@ -24,19 +24,17 @@ UpdateModelEndpointV1Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectAlreadyExistsException, - ObjectHasInvalidValueException, - ObjectNotApprovedException, - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.exceptions import ( EndpointDeleteFailedException, EndpointLabelsException, EndpointResourceInvalidRequestException, ExistingEndpointOperationInProgressException, + ObjectAlreadyExistsException, + ObjectHasInvalidValueException, + ObjectNotApprovedException, + ObjectNotAuthorizedException, + ObjectNotFoundException, ) from model_engine_server.domain.use_cases.model_endpoint_use_cases import ( CreateModelEndpointV1UseCase, diff --git a/model-engine/model_engine_server/api/triggers_v1.py b/model-engine/model_engine_server/api/triggers_v1.py index cc32180e..30f3310b 100644 --- a/model-engine/model_engine_server/api/triggers_v1.py +++ b/model-engine/model_engine_server/api/triggers_v1.py @@ -15,17 +15,15 @@ UpdateTriggerV1Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - DockerImageNotFoundException, - 
ObjectHasInvalidValueException, - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.exceptions import ( CronSyntaxException, + DockerImageNotFoundException, EndpointLabelsException, EndpointResourceInvalidRequestException, + ObjectHasInvalidValueException, + ObjectNotAuthorizedException, + ObjectNotFoundException, TriggerNameAlreadyExistsException, ) from model_engine_server.domain.use_cases.trigger_use_cases import ( diff --git a/model-engine/model_engine_server/common/datadog_utils.py b/model-engine/model_engine_server/common/datadog_utils.py index c73fa2f9..26152f03 100644 --- a/model-engine/model_engine_server/common/datadog_utils.py +++ b/model-engine/model_engine_server/common/datadog_utils.py @@ -17,3 +17,9 @@ def add_trace_request_id(request_id: str): current_span = tracer.current_span() if current_span: current_span.set_tag("launch.request_id", request_id) + + +def get_request_id(): + """Gets the request id for an api call (in our case, dd trace id) so that we can filter in Datadog easier""" + current_span = tracer.current_span() + return current_span.trace_id if current_span else None diff --git a/model-engine/model_engine_server/core/domain_exceptions.py b/model-engine/model_engine_server/core/domain_exceptions.py deleted file mode 100644 index 62068614..00000000 --- a/model-engine/model_engine_server/core/domain_exceptions.py +++ /dev/null @@ -1,59 +0,0 @@ -from dataclasses import dataclass - - -class DomainException(Exception): - """ - Base class for exceptions thrown for domain (business logic) errors. - """ - - -class ObjectAlreadyExistsException(DomainException): - """ - Thrown when the user tries to create a model with a name that already exists. - """ - - -class ObjectNotFoundException(DomainException): - """ - Thrown when a required object is not found, e.g. 
when creating a version for a nonexistent model - """ - - -class ObjectNotAuthorizedException(DomainException): - """ - Thrown when a user tries to access an object they don't own. - """ - - -class ObjectHasInvalidValueException(DomainException, ValueError): - """ - Thrown when a user tries to create an object with an invalid value. - """ - - -class ObjectNotApprovedException(DomainException): - """ - Thrown when a required object is not approved, e.g. for a Bundle in review. - """ - - -@dataclass -class DockerImageNotFoundException(DomainException): - """ - Thrown when a user tries to specify a custom Docker image that cannot be found. - """ - - repository: str - tag: str - - -class DockerBuildFailedException(DomainException): - """ - Thrown if the server failed to build a docker image. - """ - - -class ReadOnlyDatabaseException(DomainException): - """ - Thrown if the server attempted to write to a read-only database. - """ diff --git a/model-engine/model_engine_server/domain/exceptions.py b/model-engine/model_engine_server/domain/exceptions.py index c31eb0ad..a52f0be9 100644 --- a/model-engine/model_engine_server/domain/exceptions.py +++ b/model-engine/model_engine_server/domain/exceptions.py @@ -1,4 +1,62 @@ -from model_engine_server.core.domain_exceptions import DomainException +from dataclasses import dataclass + + +class DomainException(Exception): + """ + Base class for exceptions thrown for domain (business logic) errors. + """ + + +class ObjectAlreadyExistsException(DomainException): + """ + Thrown when the user tries to create a model with a name that already exists. + """ + + +class ObjectNotFoundException(DomainException): + """ + Thrown when a required object is not found, e.g. when creating a version for a nonexistent model + """ + + +class ObjectNotAuthorizedException(DomainException): + """ + Thrown when a user tries to access an object they don't own. 
+ """ + + +class ObjectHasInvalidValueException(DomainException, ValueError): + """ + Thrown when a user tries to create an object with an invalid value. + """ + + +class ObjectNotApprovedException(DomainException): + """ + Thrown when a required object is not approved, e.g. for a Bundle in review. + """ + + +@dataclass +class DockerImageNotFoundException(DomainException): + """ + Thrown when a user tries to specify a custom Docker image that cannot be found. + """ + + repository: str + tag: str + + +class DockerBuildFailedException(DomainException): + """ + Thrown if the server failed to build a docker image. + """ + + +class ReadOnlyDatabaseException(DomainException): + """ + Thrown if the server attempted to write to a read-only database. + """ class ExistingEndpointOperationInProgressException(DomainException): @@ -112,3 +170,13 @@ class TriggerNameAlreadyExistsException(DomainException): """ Thrown if the requested name already exists in the trigger repository """ + + +class InternalError(DomainException): + """ + Thrown as a catch-all for unhandled errors. 
+ """ + + def __init__(self, request_id, error): + self.request_id = request_id + self.error = error diff --git a/model-engine/model_engine_server/domain/use_cases/async_inference_use_cases.py b/model-engine/model_engine_server/domain/use_cases/async_inference_use_cases.py index 3b8a5ddf..647905f2 100644 --- a/model-engine/model_engine_server/domain/use_cases/async_inference_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/async_inference_use_cases.py @@ -4,15 +4,15 @@ GetAsyncTaskV1Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.domain.authorization.live_authorization_module import ( LiveAuthorizationModule, ) from model_engine_server.domain.entities import ModelEndpointType -from model_engine_server.domain.exceptions import EndpointUnsupportedInferenceTypeException +from model_engine_server.domain.exceptions import ( + EndpointUnsupportedInferenceTypeException, + ObjectNotAuthorizedException, + ObjectNotFoundException, +) from model_engine_server.domain.services.model_endpoint_service import ModelEndpointService DEFAULT_TASK_TIMEOUT_SECONDS = 86400 diff --git a/model-engine/model_engine_server/domain/use_cases/batch_job_use_cases.py b/model-engine/model_engine_server/domain/use_cases/batch_job_use_cases.py index 0a1bb1f5..7ea13e11 100644 --- a/model-engine/model_engine_server/domain/use_cases/batch_job_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/batch_job_use_cases.py @@ -17,17 +17,17 @@ ) from model_engine_server.common.resource_limits import validate_resource_requests from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - DockerImageNotFoundException, - ObjectHasInvalidValueException, - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from 
model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.authorization.live_authorization_module import ( LiveAuthorizationModule, ) from model_engine_server.domain.entities import ModelEndpointType +from model_engine_server.domain.exceptions import ( + DockerImageNotFoundException, + ObjectHasInvalidValueException, + ObjectNotAuthorizedException, + ObjectNotFoundException, +) from model_engine_server.domain.gateways import CronJobGateway, DockerImageBatchJobGateway from model_engine_server.domain.repositories import ( DockerImageBatchJobBundleRepository, diff --git a/model-engine/model_engine_server/domain/use_cases/docker_image_batch_job_bundle_use_cases.py b/model-engine/model_engine_server/domain/use_cases/docker_image_batch_job_bundle_use_cases.py index 29d40fe8..3767ffe5 100644 --- a/model-engine/model_engine_server/domain/use_cases/docker_image_batch_job_bundle_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/docker_image_batch_job_bundle_use_cases.py @@ -8,13 +8,13 @@ ) from model_engine_server.common.dtos.model_bundles import ModelBundleOrderBy from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.domain.authorization.live_authorization_module import ( LiveAuthorizationModule, ) +from model_engine_server.domain.exceptions import ( + ObjectNotAuthorizedException, + ObjectNotFoundException, +) from model_engine_server.domain.repositories import DockerImageBatchJobBundleRepository diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 92f9588a..89029a1c 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ 
b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -32,11 +32,6 @@ from model_engine_server.common.dtos.tasks import SyncEndpointPredictV1Request, TaskStatus from model_engine_server.common.resource_limits import validate_resource_requests from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectHasInvalidValueException, - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.entities import ( LLMInferenceFramework, @@ -53,7 +48,11 @@ from model_engine_server.domain.exceptions import ( EndpointLabelsException, EndpointUnsupportedInferenceTypeException, + InternalError, InvalidRequestException, + ObjectHasInvalidValueException, + ObjectNotAuthorizedException, + ObjectNotFoundException, UpstreamServiceError, ) from model_engine_server.domain.gateways.llm_artifact_gateway import LLMArtifactGateway @@ -61,7 +60,7 @@ from model_engine_server.domain.services import LLMModelEndpointService, ModelEndpointService from model_engine_server.infra.gateways.filesystem_gateway import FilesystemGateway -from ...common.datadog_utils import add_trace_request_id +from ...common.datadog_utils import add_trace_request_id, get_request_id from ..authorization.live_authorization_module import LiveAuthorizationModule from .model_bundle_use_cases import CreateModelBundleV2UseCase from .model_endpoint_use_cases import ( @@ -1373,22 +1372,25 @@ def __init__( self.llm_artifact_gateway = llm_artifact_gateway async def execute(self, user: User, request: ModelDownloadRequest) -> ModelDownloadResponse: - model_endpoints = await self.model_endpoint_service.list_model_endpoints( - owner=user.team_id, name=request.model_name, order_by=None - ) - if len(model_endpoints) == 0: - raise ObjectNotFoundException + try: + model_endpoints = await 
self.model_endpoint_service.list_model_endpoints( + owner=user.team_id, name=request.model_name, order_by=None + ) + if len(model_endpoints) == 0: + raise ObjectNotFoundException - if len(model_endpoints) > 1: - raise ObjectHasInvalidValueException( - f"Expected 1 LLM model endpoint for model name {request.model_name}, got {len(model_endpoints)}" + if len(model_endpoints) > 1: + raise ObjectHasInvalidValueException( + f"Expected 1 LLM model endpoint for model name {request.model_name}, got {len(model_endpoints)}" + ) + model_files = self.llm_artifact_gateway.get_model_weights_urls( + user.team_id, request.model_name ) - model_files = self.llm_artifact_gateway.get_model_weights_urls( - user.team_id, request.model_name - ) - urls = {} - for model_file in model_files: - # don't want to make s3 bucket full keys public, so trim to just keep file name - public_file_name = model_file.rsplit("/", 1)[-1] - urls[public_file_name] = self.filesystem_gateway.generate_signed_url(model_file) - return ModelDownloadResponse(urls=urls) + urls = {} + for model_file in model_files: + # don't want to make s3 bucket full keys public, so trim to just keep file name + public_file_name = model_file.rsplit("/", 1)[-1] + urls[public_file_name] = self.filesystem_gateway.generate_signed_url(model_file) + return ModelDownloadResponse(urls=urls) + except Exception as exc: + raise InternalError(request_id=get_request_id(), error=exc) diff --git a/model-engine/model_engine_server/domain/use_cases/model_bundle_use_cases.py b/model-engine/model_engine_server/domain/use_cases/model_bundle_use_cases.py index be75e695..d79b8793 100644 --- a/model-engine/model_engine_server/domain/use_cases/model_bundle_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/model_bundle_use_cases.py @@ -16,11 +16,6 @@ ModelBundleV2Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - DockerImageNotFoundException, - 
ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.domain.authorization.live_authorization_module import ( LiveAuthorizationModule, ) @@ -37,6 +32,11 @@ TensorflowFramework, ZipArtifactFlavor, ) +from model_engine_server.domain.exceptions import ( + DockerImageNotFoundException, + ObjectNotAuthorizedException, + ObjectNotFoundException, +) from model_engine_server.domain.gateways import ModelPrimitiveGateway from model_engine_server.domain.repositories import DockerRepository, ModelBundleRepository diff --git a/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py index 1633a72b..bab01204 100644 --- a/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py @@ -24,11 +24,6 @@ from model_engine_server.common.resource_limits import MAX_ENDPOINT_SIZE, validate_resource_requests from model_engine_server.common.settings import REQUIRED_ENDPOINT_LABELS, RESTRICTED_ENDPOINT_LABELS from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectHasInvalidValueException, - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.authorization.live_authorization_module import ( LiveAuthorizationModule, @@ -43,6 +38,9 @@ EndpointInfraStateNotFound, EndpointLabelsException, EndpointResourceInvalidRequestException, + ObjectHasInvalidValueException, + ObjectNotAuthorizedException, + ObjectNotFoundException, ) from model_engine_server.domain.repositories import ModelBundleRepository from model_engine_server.domain.services import ModelEndpointService diff --git a/model-engine/model_engine_server/domain/use_cases/streaming_inference_use_cases.py 
b/model-engine/model_engine_server/domain/use_cases/streaming_inference_use_cases.py index 1fb70023..e17b512e 100644 --- a/model-engine/model_engine_server/domain/use_cases/streaming_inference_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/streaming_inference_use_cases.py @@ -5,15 +5,15 @@ SyncEndpointPredictV1Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.domain.authorization.live_authorization_module import ( LiveAuthorizationModule, ) from model_engine_server.domain.entities import ModelEndpointType -from model_engine_server.domain.exceptions import EndpointUnsupportedInferenceTypeException +from model_engine_server.domain.exceptions import ( + EndpointUnsupportedInferenceTypeException, + ObjectNotAuthorizedException, + ObjectNotFoundException, +) from model_engine_server.domain.services.model_endpoint_service import ModelEndpointService diff --git a/model-engine/model_engine_server/domain/use_cases/sync_inference_use_cases.py b/model-engine/model_engine_server/domain/use_cases/sync_inference_use_cases.py index d785beed..16196ab6 100644 --- a/model-engine/model_engine_server/domain/use_cases/sync_inference_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/sync_inference_use_cases.py @@ -3,15 +3,15 @@ SyncEndpointPredictV1Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.domain.authorization.live_authorization_module import ( LiveAuthorizationModule, ) from model_engine_server.domain.entities import ModelEndpointType -from model_engine_server.domain.exceptions import EndpointUnsupportedInferenceTypeException +from model_engine_server.domain.exceptions import ( + 
EndpointUnsupportedInferenceTypeException, + ObjectNotAuthorizedException, + ObjectNotFoundException, +) from model_engine_server.domain.services.model_endpoint_service import ModelEndpointService diff --git a/model-engine/model_engine_server/domain/use_cases/trigger_use_cases.py b/model-engine/model_engine_server/domain/use_cases/trigger_use_cases.py index b616c299..a0bd1769 100644 --- a/model-engine/model_engine_server/domain/use_cases/trigger_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/trigger_use_cases.py @@ -14,16 +14,17 @@ from model_engine_server.common.settings import REQUIRED_ENDPOINT_LABELS from model_engine_server.core.auth.authentication_repository import User from model_engine_server.core.config import infra_config -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.domain.authorization.live_authorization_module import ( + LiveAuthorizationModule, +) +from model_engine_server.domain.exceptions import ( + CronSyntaxException, DockerImageNotFoundException, + EndpointLabelsException, ObjectHasInvalidValueException, ObjectNotAuthorizedException, ObjectNotFoundException, ) -from model_engine_server.domain.authorization.live_authorization_module import ( - LiveAuthorizationModule, -) -from model_engine_server.domain.exceptions import CronSyntaxException, EndpointLabelsException from model_engine_server.domain.gateways.cron_job_gateway import CronJobGateway from model_engine_server.domain.repositories import ( DockerImageBatchJobBundleRepository, diff --git a/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py b/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py index ab88886c..dba1a055 100644 --- a/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py +++ b/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py @@ -3,10 +3,6 @@ from datadog import statsd from 
model_engine_server.common.dtos.model_endpoints import ModelEndpointOrderBy from model_engine_server.common.settings import generate_deployment_name -from model_engine_server.core.domain_exceptions import ( - ObjectAlreadyExistsException, - ObjectNotFoundException, -) from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.entities import ( CallbackAuth, @@ -20,7 +16,11 @@ ModelEndpointType, StorageSpecificationType, ) -from model_engine_server.domain.exceptions import EndpointDeleteFailedException +from model_engine_server.domain.exceptions import ( + EndpointDeleteFailedException, + ObjectAlreadyExistsException, + ObjectNotFoundException, +) from model_engine_server.domain.gateways import ( AsyncModelEndpointInferenceGateway, ModelEndpointsSchemaGateway, diff --git a/model-engine/tests/unit/api/test_tasks.py b/model-engine/tests/unit/api/test_tasks.py index db65a80f..5192f025 100644 --- a/model-engine/tests/unit/api/test_tasks.py +++ b/model-engine/tests/unit/api/test_tasks.py @@ -2,12 +2,12 @@ from unittest.mock import AsyncMock, MagicMock, patch from model_engine_server.common.dtos.tasks import EndpointPredictV1Request -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.domain.entities import ModelBundle, ModelEndpoint +from model_engine_server.domain.exceptions import ( ObjectNotAuthorizedException, ObjectNotFoundException, + UpstreamServiceError, ) -from model_engine_server.domain.entities import ModelBundle, ModelEndpoint -from model_engine_server.domain.exceptions import UpstreamServiceError def test_create_async_task_success( diff --git a/model-engine/tests/unit/domain/test_async_inference_use_cases.py b/model-engine/tests/unit/domain/test_async_inference_use_cases.py index 7a122b3b..4334480d 100644 --- a/model-engine/tests/unit/domain/test_async_inference_use_cases.py +++ b/model-engine/tests/unit/domain/test_async_inference_use_cases.py @@ -3,11 +3,11 @@ import pytest from 
model_engine_server.common.dtos.tasks import EndpointPredictV1Request, TaskStatus from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.domain.entities import ModelEndpoint +from model_engine_server.domain.exceptions import ( ObjectNotAuthorizedException, ObjectNotFoundException, ) -from model_engine_server.domain.entities import ModelEndpoint from model_engine_server.domain.use_cases.async_inference_use_cases import ( CreateAsyncInferenceTaskV1UseCase, GetAsyncInferenceTaskV1UseCase, diff --git a/model-engine/tests/unit/domain/test_docker_image_batch_job_bundle_use_cases.py b/model-engine/tests/unit/domain/test_docker_image_batch_job_bundle_use_cases.py index 9522c9d5..4f62b79d 100644 --- a/model-engine/tests/unit/domain/test_docker_image_batch_job_bundle_use_cases.py +++ b/model-engine/tests/unit/domain/test_docker_image_batch_job_bundle_use_cases.py @@ -5,7 +5,7 @@ ) from model_engine_server.common.dtos.model_bundles import ModelBundleOrderBy from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.domain.exceptions import ( ObjectNotAuthorizedException, ObjectNotFoundException, ) diff --git a/model-engine/tests/unit/domain/test_llm_use_cases.py b/model-engine/tests/unit/domain/test_llm_use_cases.py index b841ba1b..58ecce52 100644 --- a/model-engine/tests/unit/domain/test_llm_use_cases.py +++ b/model-engine/tests/unit/domain/test_llm_use_cases.py @@ -14,16 +14,14 @@ ) from model_engine_server.common.dtos.tasks import SyncEndpointPredictV1Response, TaskStatus from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectHasInvalidValueException, - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.domain.entities import ModelEndpoint, ModelEndpointType 
from model_engine_server.domain.exceptions import ( EndpointUnsupportedInferenceTypeException, InvalidRequestException, LLMFineTuningQuotaReached, + ObjectHasInvalidValueException, + ObjectNotAuthorizedException, + ObjectNotFoundException, ) from model_engine_server.domain.use_cases.llm_fine_tuning_use_cases import ( MAX_LLM_ENDPOINTS_PER_INTERNAL_USER, diff --git a/model-engine/tests/unit/domain/test_model_bundle_use_cases.py b/model-engine/tests/unit/domain/test_model_bundle_use_cases.py index d9b4bc25..ae2bb7e2 100644 --- a/model-engine/tests/unit/domain/test_model_bundle_use_cases.py +++ b/model-engine/tests/unit/domain/test_model_bundle_use_cases.py @@ -10,7 +10,7 @@ ModelBundleV1Response, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.domain.exceptions import ( DockerImageNotFoundException, ObjectNotAuthorizedException, ObjectNotFoundException, diff --git a/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py b/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py index 95901f8a..49e017fa 100644 --- a/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py +++ b/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py @@ -17,16 +17,14 @@ STORAGE_LIMIT, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( - ObjectHasInvalidValueException, - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.domain.entities import ModelBundle, ModelEndpoint from model_engine_server.domain.exceptions import ( EndpointBillingTagsMalformedException, EndpointLabelsException, EndpointResourceInvalidRequestException, + ObjectHasInvalidValueException, + ObjectNotAuthorizedException, + ObjectNotFoundException, ) from model_engine_server.domain.use_cases.model_endpoint_use_cases import ( CreateModelEndpointV1UseCase, diff --git 
a/model-engine/tests/unit/domain/test_streaming_inference_use_cases.py b/model-engine/tests/unit/domain/test_streaming_inference_use_cases.py index 191fa0f4..9da48267 100644 --- a/model-engine/tests/unit/domain/test_streaming_inference_use_cases.py +++ b/model-engine/tests/unit/domain/test_streaming_inference_use_cases.py @@ -3,12 +3,12 @@ import pytest from model_engine_server.common.dtos.tasks import EndpointPredictV1Request from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.domain.entities import ModelEndpoint +from model_engine_server.domain.exceptions import ( + EndpointUnsupportedInferenceTypeException, ObjectNotAuthorizedException, ObjectNotFoundException, ) -from model_engine_server.domain.entities import ModelEndpoint -from model_engine_server.domain.exceptions import EndpointUnsupportedInferenceTypeException from model_engine_server.domain.use_cases.streaming_inference_use_cases import ( CreateStreamingInferenceTaskV1UseCase, ) diff --git a/model-engine/tests/unit/domain/test_sync_inference_use_cases.py b/model-engine/tests/unit/domain/test_sync_inference_use_cases.py index 879d5345..673cafa1 100644 --- a/model-engine/tests/unit/domain/test_sync_inference_use_cases.py +++ b/model-engine/tests/unit/domain/test_sync_inference_use_cases.py @@ -3,11 +3,11 @@ import pytest from model_engine_server.common.dtos.tasks import EndpointPredictV1Request from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.domain.entities import ModelEndpoint +from model_engine_server.domain.exceptions import ( ObjectNotAuthorizedException, ObjectNotFoundException, ) -from model_engine_server.domain.entities import ModelEndpoint from model_engine_server.domain.use_cases.sync_inference_use_cases import ( CreateSyncInferenceTaskV1UseCase, ) diff --git 
a/model-engine/tests/unit/infra/services/test_live_model_endpoint_service.py b/model-engine/tests/unit/infra/services/test_live_model_endpoint_service.py index 87cbab0f..66969005 100644 --- a/model-engine/tests/unit/infra/services/test_live_model_endpoint_service.py +++ b/model-engine/tests/unit/infra/services/test_live_model_endpoint_service.py @@ -2,10 +2,6 @@ from unittest.mock import AsyncMock import pytest -from model_engine_server.core.domain_exceptions import ( - ObjectAlreadyExistsException, - ObjectNotFoundException, -) from model_engine_server.domain.entities import ( ModelBundle, ModelEndpoint, @@ -15,6 +11,8 @@ from model_engine_server.domain.exceptions import ( EndpointDeleteFailedException, ExistingEndpointOperationInProgressException, + ObjectAlreadyExistsException, + ObjectNotFoundException, ) from model_engine_server.infra.services import LiveModelEndpointService From e5f448c8fbfaa85195effe7e6463778ada8c0155 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Mon, 18 Sep 2023 21:37:34 +0000 Subject: [PATCH 02/12] adding exception handling for generic exceptions --- .../model_engine_server/api/llms_v1.py | 111 +++++++++++++++--- .../api/model_endpoints_v1.py | 46 +++++++- .../common/datadog_utils.py | 9 -- .../model_engine_server/domain/exceptions.py | 10 -- 4 files changed, 138 insertions(+), 38 deletions(-) diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index 6b738e0f..83ef3277 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -1,7 +1,6 @@ """LLM Model Endpoint routes for the hosted model inference service. 
""" from typing import Optional -from uuid import uuid4 from fastapi import APIRouter, Depends, HTTPException, Query from model_engine_server.api.dependencies import ( @@ -10,7 +9,7 @@ get_external_interfaces_read_only, verify_authentication, ) -from model_engine_server.common.datadog_utils import add_trace_request_id, add_trace_resource_name +from model_engine_server.common.datadog_utils import add_trace_resource_name, get_request_id from model_engine_server.common.dtos.llms import ( CancelFineTuneResponse, CompletionStreamV1Request, @@ -122,6 +121,13 @@ async def create_model_endpoint( status_code=404, detail="The specified model bundle could not be found.", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.get("/model-endpoints", response_model=ListLLMModelEndpointsV1Response) @@ -136,10 +142,18 @@ async def list_model_endpoints( """ add_trace_resource_name("llm_model_endpoints_get") logger.info(f"GET /llm/model-endpoints?name={name}&order_by={order_by} for {auth}") - use_case = ListLLMModelEndpointsV1UseCase( - llm_model_endpoint_service=external_interfaces.llm_model_endpoint_service, - ) - return await use_case.execute(user=auth, name=name, order_by=order_by) + try: + use_case = ListLLMModelEndpointsV1UseCase( + llm_model_endpoint_service=external_interfaces.llm_model_endpoint_service, + ) + return await use_case.execute(user=auth, name=name, order_by=order_by) + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.get( @@ -165,6 +179,13 @@ async def get_model_endpoint( status_code=404, detail=f"Model Endpoint {model_endpoint_name} was not found.", ) 
from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.post("/completions-sync", response_model=CompletionSyncV1Response) @@ -190,8 +211,7 @@ async def create_completion_sync_task( user=auth, model_endpoint_name=model_endpoint_name, request=request ) except UpstreamServiceError: - request_id = str(uuid4()) - add_trace_request_id(request_id) + request_id = get_request_id() logger.exception(f"Upstream service error for request {request_id}") raise HTTPException( status_code=500, @@ -211,6 +231,13 @@ async def create_completion_sync_task( status_code=400, detail=f"Unsupported inference type: {str(exc)}", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.post("/completions-stream", response_model=CompletionStreamV1Response) @@ -246,8 +273,7 @@ async def event_generator(): return EventSourceResponse(event_generator()) except UpstreamServiceError: - request_id = str(uuid4()) - add_trace_request_id(request_id) + request_id = get_request_id() logger.exception(f"Upstream service error for request {request_id}") return EventSourceResponse( iter((CompletionStreamV1Response(request_id=request_id).json(),)) @@ -264,6 +290,13 @@ async def event_generator(): status_code=400, detail=f"Unsupported inference type: {str(exc)}", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.post("/fine-tunes", response_model=CreateFineTuneResponse) @@ -293,6 +326,13 @@ 
async def create_fine_tune( status_code=400, detail=str(exc), ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.get("/fine-tunes/{fine_tune_id}", response_model=GetFineTuneResponse) @@ -313,6 +353,13 @@ async def get_fine_tune( status_code=404, detail="The specified fine-tune job could not be found.", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.get("/fine-tunes", response_model=ListFineTunesResponse) @@ -322,10 +369,18 @@ async def list_fine_tunes( ) -> ListFineTunesResponse: add_trace_resource_name("fine_tunes_list") logger.info(f"GET /fine-tunes for {auth}") - use_case = ListFineTunesV1UseCase( - llm_fine_tuning_service=external_interfaces.llm_fine_tuning_service, - ) - return await use_case.execute(user=auth) + try: + use_case = ListFineTunesV1UseCase( + llm_fine_tuning_service=external_interfaces.llm_fine_tuning_service, + ) + return await use_case.execute(user=auth) + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.put("/fine-tunes/{fine_tune_id}/cancel", response_model=CancelFineTuneResponse) @@ -346,6 +401,13 @@ async def cancel_fine_tune( status_code=404, detail="The specified fine-tune job could not be found.", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error 
for request_id {request_id}.", + ) @llm_router_v1.get("/fine-tunes/{fine_tune_id}/events", response_model=GetFineTuneEventsResponse) @@ -367,6 +429,13 @@ async def get_fine_tune_events( status_code=404, detail="The specified fine-tune job's events could not be found.", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.post("/model-endpoints/download", response_model=ModelDownloadResponse) @@ -389,6 +458,13 @@ async def download_model_endpoint( status_code=404, detail="The requested fine-tuned model could not be found.", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @llm_router_v1.delete( @@ -427,3 +503,10 @@ async def delete_llm_model_endpoint( status_code=500, detail="deletion of endpoint failed.", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) diff --git a/model-engine/model_engine_server/api/model_endpoints_v1.py b/model-engine/model_engine_server/api/model_endpoints_v1.py index 4bf3cf32..9a6c9da4 100644 --- a/model-engine/model_engine_server/api/model_endpoints_v1.py +++ b/model-engine/model_engine_server/api/model_endpoints_v1.py @@ -12,7 +12,7 @@ get_external_interfaces_read_only, verify_authentication, ) -from model_engine_server.common.datadog_utils import add_trace_resource_name +from model_engine_server.common.datadog_utils import add_trace_resource_name, get_request_id from model_engine_server.common.dtos.model_endpoints import ( 
CreateModelEndpointV1Request, CreateModelEndpointV1Response, @@ -92,6 +92,13 @@ async def create_model_endpoint( status_code=404, detail="The specified model bundle could not be found.", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @model_endpoint_router_v1.get("/model-endpoints", response_model=ListModelEndpointsV1Response) @@ -106,10 +113,18 @@ async def list_model_endpoints( """ add_trace_resource_name("model_endpoints_get") logger.info(f"GET /model-endpoints?name={name}&order_by={order_by} for {auth}") - use_case = ListModelEndpointsV1UseCase( - model_endpoint_service=external_interfaces.model_endpoint_service, - ) - return await use_case.execute(user=auth, name=name, order_by=order_by) + try: + use_case = ListModelEndpointsV1UseCase( + model_endpoint_service=external_interfaces.model_endpoint_service, + ) + return await use_case.execute(user=auth, name=name, order_by=order_by) + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @model_endpoint_router_v1.get( @@ -135,6 +150,13 @@ async def get_model_endpoint( status_code=404, detail=f"Model Endpoint {model_endpoint_id} was not found.", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @model_endpoint_router_v1.put( @@ -179,6 +201,13 @@ async def update_model_endpoint( status_code=409, detail="Existing operation on endpoint in progress, try again later.", ) from exc + except Exception as exc: + request_id = get_request_id() + 
logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) @model_endpoint_router_v1.delete( @@ -214,3 +243,10 @@ async def delete_model_endpoint( status_code=500, detail="deletion of endpoint failed, compute resources still exist.", ) from exc + except Exception as exc: + request_id = get_request_id() + logger.exception(f"Internal service error for request {request_id}: {exc}") + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) diff --git a/model-engine/model_engine_server/common/datadog_utils.py b/model-engine/model_engine_server/common/datadog_utils.py index 26152f03..99d74d63 100644 --- a/model-engine/model_engine_server/common/datadog_utils.py +++ b/model-engine/model_engine_server/common/datadog_utils.py @@ -10,15 +10,6 @@ def add_trace_resource_name(tag: str): current_span.set_tag("launch.resource_name", tag) -def add_trace_request_id(request_id: str): - """Adds a custom tag to a given dd trace corresponding to the request id - so that we can filter in Datadog easier - """ - current_span = tracer.current_span() - if current_span: - current_span.set_tag("launch.request_id", request_id) - - def get_request_id(): """Gets the request id for an api call (in our case, dd trace id) so that we can filter in Datadog easier""" current_span = tracer.current_span() diff --git a/model-engine/model_engine_server/domain/exceptions.py b/model-engine/model_engine_server/domain/exceptions.py index a52f0be9..934a5e21 100644 --- a/model-engine/model_engine_server/domain/exceptions.py +++ b/model-engine/model_engine_server/domain/exceptions.py @@ -170,13 +170,3 @@ class TriggerNameAlreadyExistsException(DomainException): """ Thrown if the requested name already exists in the trigger repository """ - - -class InternalError(DomainException): - """ - Thrown as a catch-all for unhandled errors. 
- """ - - def __init__(self, request_id, error): - self.request_id = request_id - self.error = error From ab3171fae45abf8a2b000912c09112c1d185d79f Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Mon, 18 Sep 2023 22:54:23 +0000 Subject: [PATCH 03/12] moving more error handling, still haven't gotten middleware working --- model-engine/model_engine_server/api/app.py | 23 ++++++++++- .../model_engine_server/api/llms_v1.py | 3 +- .../model_engine_server/api/tasks_v1.py | 2 +- .../common/datadog_utils.py | 9 +++++ .../domain/use_cases/file_use_cases.py | 2 +- .../use_cases/llm_fine_tuning_use_cases.py | 2 +- .../use_cases/llm_model_endpoint_use_cases.py | 38 +++++++++---------- .../infra/repositories/db_repository_mixin.py | 2 +- ...s3_file_llm_fine_tune_events_repository.py | 2 +- .../live_batch_job_orchestration_service.py | 2 +- .../services/live_endpoint_builder_service.py | 2 +- model-engine/tests/unit/conftest.py | 2 +- .../test_db_batch_job_record_repository.py | 2 +- ...ocker_image_batch_job_bundle_repository.py | 2 +- .../test_db_model_bundle_repository.py | 2 +- ...est_db_model_endpoint_record_repository.py | 2 +- ...st_live_batch_job_orchestration_service.py | 2 +- .../test_live_endpoint_builder_service.py | 2 +- 18 files changed, 64 insertions(+), 37 deletions(-) diff --git a/model-engine/model_engine_server/api/app.py b/model-engine/model_engine_server/api/app.py index 786f097a..ba698e0f 100644 --- a/model-engine/model_engine_server/api/app.py +++ b/model-engine/model_engine_server/api/app.py @@ -1,8 +1,10 @@ import os from pathlib import Path +import traceback -from fastapi import FastAPI, Response +from fastapi import FastAPI, Response, Request, HTTPException from fastapi.staticfiles import StaticFiles +from model_engine_server.common.datadog_utils import get_request_id from model_engine_server.api.batch_jobs_v1 import batch_job_router_v1 from model_engine_server.api.dependencies import get_or_create_aioredis_pool from 
model_engine_server.api.docker_image_batch_job_bundles_v1 import ( @@ -16,6 +18,7 @@ from model_engine_server.api.model_endpoints_v1 import model_endpoint_router_v1 from model_engine_server.api.tasks_v1 import inference_task_router_v1 from model_engine_server.api.triggers_v1 import trigger_router_v1 +from model_engine_server.core.loggers import filename_wo_ext, make_logger app = FastAPI(title="launch", version="1.0.0", redoc_url="/api") @@ -30,6 +33,24 @@ app.include_router(file_router_v1) app.include_router(trigger_router_v1) +logger = make_logger(filename_wo_ext(__name__)) + +@app.middleware("http") +async def exception_handler(request: Request, call_next): + try: + return await call_next(request) + except Exception as e: + logger.error("An unexpected error occured during request: {0}".format(str(e))) + logger.exception( + "Traceback is: {0}".format(str(traceback.format_tb(e.__traceback__, 10))) + ) + request_id = get_request_id() + raise HTTPException( + status_code=500, + detail=f"Internal error for request_id {request_id}.", + ) + + # TODO: Remove this once we have a better way to serve internal docs INTERNAL_DOCS_PATH = str(Path(__file__).parents[3] / "launch_internal/site") if os.path.exists(INTERNAL_DOCS_PATH): diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index 83ef3277..b14507c3 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -2,7 +2,8 @@ """ from typing import Optional -from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi import APIRouter, Depends, HTTPException, Query, Request +import traceback from model_engine_server.api.dependencies import ( ExternalInterfaces, get_external_interfaces, diff --git a/model-engine/model_engine_server/api/tasks_v1.py b/model-engine/model_engine_server/api/tasks_v1.py index 443b7fb7..e9d50d09 100644 --- a/model-engine/model_engine_server/api/tasks_v1.py +++ 
b/model-engine/model_engine_server/api/tasks_v1.py @@ -16,7 +16,7 @@ TaskStatus, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ( +from model_engine_server.domain.exceptions import ( ObjectNotAuthorizedException, ObjectNotFoundException, ) diff --git a/model-engine/model_engine_server/common/datadog_utils.py b/model-engine/model_engine_server/common/datadog_utils.py index 99d74d63..26152f03 100644 --- a/model-engine/model_engine_server/common/datadog_utils.py +++ b/model-engine/model_engine_server/common/datadog_utils.py @@ -10,6 +10,15 @@ def add_trace_resource_name(tag: str): current_span.set_tag("launch.resource_name", tag) +def add_trace_request_id(request_id: str): + """Adds a custom tag to a given dd trace corresponding to the request id + so that we can filter in Datadog easier + """ + current_span = tracer.current_span() + if current_span: + current_span.set_tag("launch.request_id", request_id) + + def get_request_id(): """Gets the request id for an api call (in our case, dd trace id) so that we can filter in Datadog easier""" current_span = tracer.current_span() diff --git a/model-engine/model_engine_server/domain/use_cases/file_use_cases.py b/model-engine/model_engine_server/domain/use_cases/file_use_cases.py index e646e8a0..325a7650 100644 --- a/model-engine/model_engine_server/domain/use_cases/file_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/file_use_cases.py @@ -6,7 +6,7 @@ UploadFileResponse, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ObjectNotFoundException +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.gateways import FileStorageGateway diff --git 
a/model-engine/model_engine_server/domain/use_cases/llm_fine_tuning_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_fine_tuning_use_cases.py index a66fc3ff..a744802a 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_fine_tuning_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_fine_tuning_use_cases.py @@ -13,7 +13,7 @@ ListFineTunesResponse, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.core.domain_exceptions import ObjectNotFoundException +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.entities import BatchJobStatus from model_engine_server.domain.exceptions import InvalidRequestException, LLMFineTuningQuotaReached diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 641def1c..bb803b3c 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -49,7 +49,6 @@ from model_engine_server.domain.exceptions import ( EndpointLabelsException, EndpointUnsupportedInferenceTypeException, - InternalError, InvalidRequestException, ObjectHasInvalidValueException, ObjectNotAuthorizedException, @@ -1410,25 +1409,22 @@ def __init__( self.llm_artifact_gateway = llm_artifact_gateway async def execute(self, user: User, request: ModelDownloadRequest) -> ModelDownloadResponse: - try: - model_endpoints = await self.model_endpoint_service.list_model_endpoints( - owner=user.team_id, name=request.model_name, order_by=None - ) - if len(model_endpoints) == 0: - raise ObjectNotFoundException + model_endpoints = await self.model_endpoint_service.list_model_endpoints( + owner=user.team_id, 
name=request.model_name, order_by=None + ) + if len(model_endpoints) == 0: + raise ObjectNotFoundException - if len(model_endpoints) > 1: - raise ObjectHasInvalidValueException( - f"Expected 1 LLM model endpoint for model name {request.model_name}, got {len(model_endpoints)}" - ) - model_files = self.llm_artifact_gateway.get_model_weights_urls( - user.team_id, request.model_name + if len(model_endpoints) > 1: + raise ObjectHasInvalidValueException( + f"Expected 1 LLM model endpoint for model name {request.model_name}, got {len(model_endpoints)}" ) - urls = {} - for model_file in model_files: - # don't want to make s3 bucket full keys public, so trim to just keep file name - public_file_name = model_file.rsplit("/", 1)[-1] - urls[public_file_name] = self.filesystem_gateway.generate_signed_url(model_file) - return ModelDownloadResponse(urls=urls) - except Exception as exc: - raise InternalError(request_id=get_request_id(), error=exc) + model_files = self.llm_artifact_gateway.get_model_weights_urls( + user.team_id, request.model_name + ) + urls = {} + for model_file in model_files: + # don't want to make s3 bucket full keys public, so trim to just keep file name + public_file_name = model_file.rsplit("/", 1)[-1] + urls[public_file_name] = self.filesystem_gateway.generate_signed_url(model_file) + return ModelDownloadResponse(urls=urls) diff --git a/model-engine/model_engine_server/infra/repositories/db_repository_mixin.py b/model-engine/model_engine_server/infra/repositories/db_repository_mixin.py index cd8bc402..e0e9f242 100644 --- a/model-engine/model_engine_server/infra/repositories/db_repository_mixin.py +++ b/model-engine/model_engine_server/infra/repositories/db_repository_mixin.py @@ -2,7 +2,7 @@ from functools import wraps from typing import Callable -from model_engine_server.core.domain_exceptions import ReadOnlyDatabaseException +from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from sqlalchemy.ext.asyncio import AsyncSession diff 
--git a/model-engine/model_engine_server/infra/repositories/s3_file_llm_fine_tune_events_repository.py b/model-engine/model_engine_server/infra/repositories/s3_file_llm_fine_tune_events_repository.py index 90f179c9..d05fd0ed 100644 --- a/model-engine/model_engine_server/infra/repositories/s3_file_llm_fine_tune_events_repository.py +++ b/model-engine/model_engine_server/infra/repositories/s3_file_llm_fine_tune_events_repository.py @@ -6,7 +6,7 @@ import boto3 import smart_open from model_engine_server.core.config import infra_config -from model_engine_server.core.domain_exceptions import ObjectNotFoundException +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.domain.entities.llm_fine_tune_entity import LLMFineTuneEvent from model_engine_server.domain.repositories.llm_fine_tune_events_repository import ( LLMFineTuneEventsRepository, diff --git a/model-engine/model_engine_server/infra/services/live_batch_job_orchestration_service.py b/model-engine/model_engine_server/infra/services/live_batch_job_orchestration_service.py index 258a4429..fadf62c0 100644 --- a/model-engine/model_engine_server/infra/services/live_batch_job_orchestration_service.py +++ b/model-engine/model_engine_server/infra/services/live_batch_job_orchestration_service.py @@ -16,7 +16,7 @@ TaskStatus, ) from model_engine_server.core.config import infra_config -from model_engine_server.core.domain_exceptions import ObjectNotFoundException +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.entities import ( BatchJobProgress, diff --git a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py index eabbf034..29cec20f 100644 --- a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py +++ 
b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py @@ -19,7 +19,7 @@ from model_engine_server.common.io import open_wrapper from model_engine_server.common.serialization_utils import bool_to_str from model_engine_server.core.config import infra_config -from model_engine_server.core.domain_exceptions import DockerBuildFailedException +from model_engine_server.domain.exceptions import DockerBuildFailedException from model_engine_server.core.loggers import make_logger from model_engine_server.core.notification_gateway import NotificationApp, NotificationGateway from model_engine_server.core.utils.env import environment diff --git a/model-engine/tests/unit/conftest.py b/model-engine/tests/unit/conftest.py index 9714fd9d..03856f5b 100644 --- a/model-engine/tests/unit/conftest.py +++ b/model-engine/tests/unit/conftest.py @@ -41,7 +41,7 @@ TaskStatus, ) from model_engine_server.common.settings import generate_destination -from model_engine_server.core.domain_exceptions import ObjectNotFoundException +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.core.fake_notification_gateway import FakeNotificationGateway from model_engine_server.db.endpoint_row_lock import get_lock_key from model_engine_server.db.models import BatchJob as OrmBatchJob diff --git a/model-engine/tests/unit/infra/repositories/test_db_batch_job_record_repository.py b/model-engine/tests/unit/infra/repositories/test_db_batch_job_record_repository.py index d52d327b..6cdb435c 100644 --- a/model-engine/tests/unit/infra/repositories/test_db_batch_job_record_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_db_batch_job_record_repository.py @@ -3,7 +3,7 @@ from unittest.mock import AsyncMock import pytest -from model_engine_server.core.domain_exceptions import ReadOnlyDatabaseException +from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.db.models import BatchJob, 
Bundle from model_engine_server.domain.entities import BatchJobRecord from model_engine_server.infra.repositories.db_batch_job_record_repository import ( diff --git a/model-engine/tests/unit/infra/repositories/test_db_docker_image_batch_job_bundle_repository.py b/model-engine/tests/unit/infra/repositories/test_db_docker_image_batch_job_bundle_repository.py index b28bf81f..fb05cc30 100644 --- a/model-engine/tests/unit/infra/repositories/test_db_docker_image_batch_job_bundle_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_db_docker_image_batch_job_bundle_repository.py @@ -4,7 +4,7 @@ import pytest from model_engine_server.common.dtos.model_bundles import ModelBundleOrderBy -from model_engine_server.core.domain_exceptions import ReadOnlyDatabaseException +from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.db.models import DockerImageBatchJobBundle as OrmDockerImageBatchJobBundle from model_engine_server.domain.entities import GpuType from model_engine_server.domain.entities.docker_image_batch_job_bundle_entity import ( diff --git a/model-engine/tests/unit/infra/repositories/test_db_model_bundle_repository.py b/model-engine/tests/unit/infra/repositories/test_db_model_bundle_repository.py index dd73b221..90b3b1de 100644 --- a/model-engine/tests/unit/infra/repositories/test_db_model_bundle_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_db_model_bundle_repository.py @@ -4,7 +4,7 @@ import pytest from model_engine_server.common.dtos.model_bundles import ModelBundleOrderBy -from model_engine_server.core.domain_exceptions import ReadOnlyDatabaseException +from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.db.models import Bundle from model_engine_server.domain.entities import ( CloudpickleArtifactFlavor, diff --git a/model-engine/tests/unit/infra/repositories/test_db_model_endpoint_record_repository.py 
b/model-engine/tests/unit/infra/repositories/test_db_model_endpoint_record_repository.py index 8d751272..1a14129b 100644 --- a/model-engine/tests/unit/infra/repositories/test_db_model_endpoint_record_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_db_model_endpoint_record_repository.py @@ -4,7 +4,7 @@ import pytest from model_engine_server.common.dtos.model_endpoints import ModelEndpointOrderBy -from model_engine_server.core.domain_exceptions import ReadOnlyDatabaseException +from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.db.models import Bundle, Endpoint from model_engine_server.domain.entities import ModelEndpointRecord from model_engine_server.infra.gateways import FakeMonitoringMetricsGateway diff --git a/model-engine/tests/unit/infra/services/test_live_batch_job_orchestration_service.py b/model-engine/tests/unit/infra/services/test_live_batch_job_orchestration_service.py index 8d894622..1587d125 100644 --- a/model-engine/tests/unit/infra/services/test_live_batch_job_orchestration_service.py +++ b/model-engine/tests/unit/infra/services/test_live_batch_job_orchestration_service.py @@ -6,7 +6,7 @@ import pytest from model_engine_server.common.constants import DEFAULT_CELERY_TASK_NAME, LIRA_CELERY_TASK_NAME from model_engine_server.common.dtos.tasks import GetAsyncTaskV1Response, ResponseSchema, TaskStatus -from model_engine_server.core.domain_exceptions import ObjectNotFoundException +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.domain.entities import ( BatchJob, BatchJobSerializationFormat, diff --git a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py index 6d5724fb..d3333e3c 100644 --- a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py +++ 
b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py @@ -8,7 +8,7 @@ BuildEndpointResponse, BuildEndpointStatus, ) -from model_engine_server.core.domain_exceptions import DockerBuildFailedException +from model_engine_server.domain.exceptions import DockerBuildFailedException from model_engine_server.core.fake_notification_gateway import FakeNotificationGateway from model_engine_server.core.notification_gateway import NotificationApp from model_engine_server.domain.entities.model_bundle_entity import ( From cd0f7334fd1b875d6c93d0fd60a0df220c1b4f58 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Mon, 18 Sep 2023 22:54:34 +0000 Subject: [PATCH 04/12] precommit --- model-engine/model_engine_server/api/app.py | 11 +++++------ model-engine/model_engine_server/api/llms_v1.py | 2 +- model-engine/model_engine_server/api/tasks_v1.py | 6 ++---- .../domain/use_cases/file_use_cases.py | 2 +- .../domain/use_cases/llm_fine_tuning_use_cases.py | 7 +++++-- .../s3_file_llm_fine_tune_events_repository.py | 2 +- .../services/live_batch_job_orchestration_service.py | 2 +- .../infra/services/live_endpoint_builder_service.py | 6 ++++-- model-engine/tests/unit/conftest.py | 6 ++++-- .../test_db_batch_job_record_repository.py | 2 +- ...est_db_docker_image_batch_job_bundle_repository.py | 6 ++++-- .../repositories/test_db_model_bundle_repository.py | 2 +- .../test_db_model_endpoint_record_repository.py | 2 +- .../test_live_batch_job_orchestration_service.py | 2 +- .../services/test_live_endpoint_builder_service.py | 6 ++++-- 15 files changed, 36 insertions(+), 28 deletions(-) diff --git a/model-engine/model_engine_server/api/app.py b/model-engine/model_engine_server/api/app.py index ba698e0f..aebea251 100644 --- a/model-engine/model_engine_server/api/app.py +++ b/model-engine/model_engine_server/api/app.py @@ -1,10 +1,9 @@ import os -from pathlib import Path import traceback +from pathlib import Path -from fastapi import FastAPI, Response, Request, HTTPException 
+from fastapi import FastAPI, HTTPException, Request, Response from fastapi.staticfiles import StaticFiles -from model_engine_server.common.datadog_utils import get_request_id from model_engine_server.api.batch_jobs_v1 import batch_job_router_v1 from model_engine_server.api.dependencies import get_or_create_aioredis_pool from model_engine_server.api.docker_image_batch_job_bundles_v1 import ( @@ -18,6 +17,7 @@ from model_engine_server.api.model_endpoints_v1 import model_endpoint_router_v1 from model_engine_server.api.tasks_v1 import inference_task_router_v1 from model_engine_server.api.triggers_v1 import trigger_router_v1 +from model_engine_server.common.datadog_utils import get_request_id from model_engine_server.core.loggers import filename_wo_ext, make_logger app = FastAPI(title="launch", version="1.0.0", redoc_url="/api") @@ -35,15 +35,14 @@ logger = make_logger(filename_wo_ext(__name__)) + @app.middleware("http") async def exception_handler(request: Request, call_next): try: return await call_next(request) except Exception as e: logger.error("An unexpected error occured during request: {0}".format(str(e))) - logger.exception( - "Traceback is: {0}".format(str(traceback.format_tb(e.__traceback__, 10))) - ) + logger.exception("Traceback is: {0}".format(str(traceback.format_tb(e.__traceback__, 10)))) request_id = get_request_id() raise HTTPException( status_code=500, diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index b14507c3..909802a6 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -1,9 +1,9 @@ """LLM Model Endpoint routes for the hosted model inference service. 
""" +import traceback from typing import Optional from fastapi import APIRouter, Depends, HTTPException, Query, Request -import traceback from model_engine_server.api.dependencies import ( ExternalInterfaces, get_external_interfaces, diff --git a/model-engine/model_engine_server/api/tasks_v1.py b/model-engine/model_engine_server/api/tasks_v1.py index e9d50d09..05fdb270 100644 --- a/model-engine/model_engine_server/api/tasks_v1.py +++ b/model-engine/model_engine_server/api/tasks_v1.py @@ -16,13 +16,11 @@ TaskStatus, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.domain.exceptions import ( - ObjectNotAuthorizedException, - ObjectNotFoundException, -) from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.exceptions import ( EndpointUnsupportedInferenceTypeException, + ObjectNotAuthorizedException, + ObjectNotFoundException, UpstreamServiceError, ) from model_engine_server.domain.use_cases.async_inference_use_cases import ( diff --git a/model-engine/model_engine_server/domain/use_cases/file_use_cases.py b/model-engine/model_engine_server/domain/use_cases/file_use_cases.py index 325a7650..a3ede743 100644 --- a/model-engine/model_engine_server/domain/use_cases/file_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/file_use_cases.py @@ -6,8 +6,8 @@ UploadFileResponse, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.core.loggers import filename_wo_ext, make_logger +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.domain.gateways import FileStorageGateway logger = make_logger(filename_wo_ext(__file__)) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_fine_tuning_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_fine_tuning_use_cases.py index 
a744802a..039b15ad 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_fine_tuning_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_fine_tuning_use_cases.py @@ -13,10 +13,13 @@ ListFineTunesResponse, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.entities import BatchJobStatus -from model_engine_server.domain.exceptions import InvalidRequestException, LLMFineTuningQuotaReached +from model_engine_server.domain.exceptions import ( + InvalidRequestException, + LLMFineTuningQuotaReached, + ObjectNotFoundException, +) from model_engine_server.domain.gateways import FileStorageGateway from model_engine_server.domain.repositories import LLMFineTuneEventsRepository from model_engine_server.domain.services import LLMFineTuningService, ModelEndpointService diff --git a/model-engine/model_engine_server/infra/repositories/s3_file_llm_fine_tune_events_repository.py b/model-engine/model_engine_server/infra/repositories/s3_file_llm_fine_tune_events_repository.py index d05fd0ed..6993d1d0 100644 --- a/model-engine/model_engine_server/infra/repositories/s3_file_llm_fine_tune_events_repository.py +++ b/model-engine/model_engine_server/infra/repositories/s3_file_llm_fine_tune_events_repository.py @@ -6,8 +6,8 @@ import boto3 import smart_open from model_engine_server.core.config import infra_config -from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.domain.entities.llm_fine_tune_entity import LLMFineTuneEvent +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.domain.repositories.llm_fine_tune_events_repository import ( LLMFineTuneEventsRepository, ) diff --git 
a/model-engine/model_engine_server/infra/services/live_batch_job_orchestration_service.py b/model-engine/model_engine_server/infra/services/live_batch_job_orchestration_service.py index fadf62c0..7a096c2a 100644 --- a/model-engine/model_engine_server/infra/services/live_batch_job_orchestration_service.py +++ b/model-engine/model_engine_server/infra/services/live_batch_job_orchestration_service.py @@ -16,7 +16,6 @@ TaskStatus, ) from model_engine_server.core.config import infra_config -from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.core.loggers import filename_wo_ext, make_logger from model_engine_server.domain.entities import ( BatchJobProgress, @@ -25,6 +24,7 @@ BatchJobStatus, ModelEndpointStatus, ) +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.domain.gateways import AsyncModelEndpointInferenceGateway from model_engine_server.domain.services import ModelEndpointService from model_engine_server.domain.use_cases.async_inference_use_cases import ( diff --git a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py index 29cec20f..1a8c0c7d 100644 --- a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py +++ b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py @@ -19,7 +19,6 @@ from model_engine_server.common.io import open_wrapper from model_engine_server.common.serialization_utils import bool_to_str from model_engine_server.core.config import infra_config -from model_engine_server.domain.exceptions import DockerBuildFailedException from model_engine_server.core.loggers import make_logger from model_engine_server.core.notification_gateway import NotificationApp, NotificationGateway from model_engine_server.core.utils.env import environment @@ -40,7 +39,10 @@ TensorflowFramework, ZipArtifactFlavor, 
) -from model_engine_server.domain.exceptions import EndpointResourceInfraException +from model_engine_server.domain.exceptions import ( + DockerBuildFailedException, + EndpointResourceInfraException, +) from model_engine_server.domain.gateways import MonitoringMetricsGateway from model_engine_server.domain.repositories import DockerRepository from model_engine_server.domain.services import EndpointBuilderService diff --git a/model-engine/tests/unit/conftest.py b/model-engine/tests/unit/conftest.py index 03856f5b..8ea517fd 100644 --- a/model-engine/tests/unit/conftest.py +++ b/model-engine/tests/unit/conftest.py @@ -41,7 +41,6 @@ TaskStatus, ) from model_engine_server.common.settings import generate_destination -from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.core.fake_notification_gateway import FakeNotificationGateway from model_engine_server.db.endpoint_row_lock import get_lock_key from model_engine_server.db.models import BatchJob as OrmBatchJob @@ -87,7 +86,10 @@ DockerImageBatchJobBundle, ) from model_engine_server.domain.entities.llm_fine_tune_entity import LLMFineTuneTemplate -from model_engine_server.domain.exceptions import EndpointResourceInfraException +from model_engine_server.domain.exceptions import ( + EndpointResourceInfraException, + ObjectNotFoundException, +) from model_engine_server.domain.gateways import ( AsyncModelEndpointInferenceGateway, CronJobGateway, diff --git a/model-engine/tests/unit/infra/repositories/test_db_batch_job_record_repository.py b/model-engine/tests/unit/infra/repositories/test_db_batch_job_record_repository.py index 6cdb435c..214a1ebc 100644 --- a/model-engine/tests/unit/infra/repositories/test_db_batch_job_record_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_db_batch_job_record_repository.py @@ -3,9 +3,9 @@ from unittest.mock import AsyncMock import pytest -from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from 
model_engine_server.db.models import BatchJob, Bundle from model_engine_server.domain.entities import BatchJobRecord +from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.infra.repositories.db_batch_job_record_repository import ( DbBatchJobRecordRepository, OrmBatchJob, diff --git a/model-engine/tests/unit/infra/repositories/test_db_docker_image_batch_job_bundle_repository.py b/model-engine/tests/unit/infra/repositories/test_db_docker_image_batch_job_bundle_repository.py index fb05cc30..2bfaab3b 100644 --- a/model-engine/tests/unit/infra/repositories/test_db_docker_image_batch_job_bundle_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_db_docker_image_batch_job_bundle_repository.py @@ -4,13 +4,15 @@ import pytest from model_engine_server.common.dtos.model_bundles import ModelBundleOrderBy -from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.db.models import DockerImageBatchJobBundle as OrmDockerImageBatchJobBundle from model_engine_server.domain.entities import GpuType from model_engine_server.domain.entities.docker_image_batch_job_bundle_entity import ( DockerImageBatchJobBundle, ) -from model_engine_server.domain.exceptions import CorruptRecordInfraStateException +from model_engine_server.domain.exceptions import ( + CorruptRecordInfraStateException, + ReadOnlyDatabaseException, +) from model_engine_server.infra.repositories import DbDockerImageBatchJobBundleRepository from model_engine_server.infra.repositories.db_docker_image_batch_job_bundle_repository import ( translate_docker_image_batch_job_bundle_orm_to_entity, diff --git a/model-engine/tests/unit/infra/repositories/test_db_model_bundle_repository.py b/model-engine/tests/unit/infra/repositories/test_db_model_bundle_repository.py index 90b3b1de..4eb94d20 100644 --- a/model-engine/tests/unit/infra/repositories/test_db_model_bundle_repository.py +++ 
b/model-engine/tests/unit/infra/repositories/test_db_model_bundle_repository.py @@ -4,7 +4,6 @@ import pytest from model_engine_server.common.dtos.model_bundles import ModelBundleOrderBy -from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.db.models import Bundle from model_engine_server.domain.entities import ( CloudpickleArtifactFlavor, @@ -12,6 +11,7 @@ ModelBundlePackagingType, PytorchFramework, ) +from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.infra.repositories.db_model_bundle_repository import ( DbModelBundleRepository, OrmModelBundle, diff --git a/model-engine/tests/unit/infra/repositories/test_db_model_endpoint_record_repository.py b/model-engine/tests/unit/infra/repositories/test_db_model_endpoint_record_repository.py index 1a14129b..3ad72127 100644 --- a/model-engine/tests/unit/infra/repositories/test_db_model_endpoint_record_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_db_model_endpoint_record_repository.py @@ -4,9 +4,9 @@ import pytest from model_engine_server.common.dtos.model_endpoints import ModelEndpointOrderBy -from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.db.models import Bundle, Endpoint from model_engine_server.domain.entities import ModelEndpointRecord +from model_engine_server.domain.exceptions import ReadOnlyDatabaseException from model_engine_server.infra.gateways import FakeMonitoringMetricsGateway from model_engine_server.infra.repositories import db_model_endpoint_record_repository from model_engine_server.infra.repositories.db_model_endpoint_record_repository import ( diff --git a/model-engine/tests/unit/infra/services/test_live_batch_job_orchestration_service.py b/model-engine/tests/unit/infra/services/test_live_batch_job_orchestration_service.py index 1587d125..11b2abe5 100644 --- 
a/model-engine/tests/unit/infra/services/test_live_batch_job_orchestration_service.py +++ b/model-engine/tests/unit/infra/services/test_live_batch_job_orchestration_service.py @@ -6,7 +6,6 @@ import pytest from model_engine_server.common.constants import DEFAULT_CELERY_TASK_NAME, LIRA_CELERY_TASK_NAME from model_engine_server.common.dtos.tasks import GetAsyncTaskV1Response, ResponseSchema, TaskStatus -from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.domain.entities import ( BatchJob, BatchJobSerializationFormat, @@ -15,6 +14,7 @@ ModelEndpoint, ModelEndpointStatus, ) +from model_engine_server.domain.exceptions import ObjectNotFoundException from model_engine_server.infra.gateways import LiveBatchJobProgressGateway from model_engine_server.infra.services import ( LiveBatchJobOrchestrationService, diff --git a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py index d3333e3c..31d44b2d 100644 --- a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py +++ b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py @@ -8,14 +8,16 @@ BuildEndpointResponse, BuildEndpointStatus, ) -from model_engine_server.domain.exceptions import DockerBuildFailedException from model_engine_server.core.fake_notification_gateway import FakeNotificationGateway from model_engine_server.core.notification_gateway import NotificationApp from model_engine_server.domain.entities.model_bundle_entity import ( ArtifactLike, RunnableImageFlavor, ) -from model_engine_server.domain.exceptions import EndpointResourceInfraException +from model_engine_server.domain.exceptions import ( + DockerBuildFailedException, + EndpointResourceInfraException, +) from model_engine_server.infra.gateways.fake_monitoring_metrics_gateway import ( FakeMonitoringMetricsGateway, ) From 51c63c1054949a9a4b7d4fee0796620a8fc34af2 Mon 
Sep 17 00:00:00 2001 From: Ian Macleod Date: Tue, 19 Sep 2023 20:41:10 +0000 Subject: [PATCH 05/12] add new error handler middleware --- model-engine/model_engine_server/api/app.py | 35 ++++++++++++------- .../model_engine_server/api/llms_v1.py | 8 +++-- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/model-engine/model_engine_server/api/app.py b/model-engine/model_engine_server/api/app.py index aebea251..c9770313 100644 --- a/model-engine/model_engine_server/api/app.py +++ b/model-engine/model_engine_server/api/app.py @@ -2,7 +2,8 @@ import traceback from pathlib import Path -from fastapi import FastAPI, HTTPException, Request, Response +from fastapi import FastAPI, Request, Response +from fastapi.responses import JSONResponse from fastapi.staticfiles import StaticFiles from model_engine_server.api.batch_jobs_v1 import batch_job_router_v1 from model_engine_server.api.dependencies import get_or_create_aioredis_pool @@ -19,6 +20,7 @@ from model_engine_server.api.triggers_v1 import trigger_router_v1 from model_engine_server.common.datadog_utils import get_request_id from model_engine_server.core.loggers import filename_wo_ext, make_logger +from starlette.middleware.base import BaseHTTPMiddleware app = FastAPI(title="launch", version="1.0.0", redoc_url="/api") @@ -36,20 +38,27 @@ logger = make_logger(filename_wo_ext(__name__)) -@app.middleware("http") -async def exception_handler(request: Request, call_next): - try: - return await call_next(request) - except Exception as e: - logger.error("An unexpected error occured during request: {0}".format(str(e))) - logger.exception("Traceback is: {0}".format(str(traceback.format_tb(e.__traceback__, 10)))) - request_id = get_request_id() - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) +class ExceptionLoggingMiddleware(BaseHTTPMiddleware): + async def dispatch(self, request: Request, call_next): + try: + return await call_next(request) + except Exception as e: 
+ # logger.error("An unexpected error occured during request: {0}".format(str(e))) + # logger.exception("Traceback is: {0}".format(str(traceback.format_tb(e.__traceback__)))) + tb_str = traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__) + structured_log = {"error": str(e), "traceback": "".join(tb_str)} + logger.error("Unhandled exception: %s", structured_log) + request_id = get_request_id() + return JSONResponse( + { + "status_code": 500, + "content": {"error": f"Internal error for request_id {request_id}."}, + } + ) +app.add_middleware(ExceptionLoggingMiddleware) + # TODO: Remove this once we have a better way to serve internal docs INTERNAL_DOCS_PATH = str(Path(__file__).parents[3] / "launch_internal/site") if os.path.exists(INTERNAL_DOCS_PATH): diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index 909802a6..7d89537e 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -1,9 +1,8 @@ """LLM Model Endpoint routes for the hosted model inference service. 
""" -import traceback from typing import Optional -from fastapi import APIRouter, Depends, HTTPException, Query, Request +from fastapi import APIRouter, Depends, HTTPException, Query from model_engine_server.api.dependencies import ( ExternalInterfaces, get_external_interfaces, @@ -511,3 +510,8 @@ async def delete_llm_model_endpoint( status_code=500, detail=f"Internal error for request_id {request_id}.", ) + + +@llm_router_v1.get("/test_error") +def test_error(): + raise Exception From 9d2fb6bf9307b9213d17ba20f4727c1018dbc2de Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Tue, 19 Sep 2023 21:42:39 +0000 Subject: [PATCH 06/12] iterating --- model-engine/model_engine_server/api/app.py | 2 -- .../model_engine_server/api/llms_v1.py | 21 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/model-engine/model_engine_server/api/app.py b/model-engine/model_engine_server/api/app.py index c9770313..cb9357db 100644 --- a/model-engine/model_engine_server/api/app.py +++ b/model-engine/model_engine_server/api/app.py @@ -43,8 +43,6 @@ async def dispatch(self, request: Request, call_next): try: return await call_next(request) except Exception as e: - # logger.error("An unexpected error occured during request: {0}".format(str(e))) - # logger.exception("Traceback is: {0}".format(str(traceback.format_tb(e.__traceback__)))) tb_str = traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__) structured_log = {"error": str(e), "traceback": "".join(tb_str)} logger.error("Unhandled exception: %s", structured_log) diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index 7d89537e..0bbc8693 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -2,6 +2,7 @@ """ from typing import Optional +from ddtrace import tracer from fastapi import APIRouter, Depends, HTTPException, Query from model_engine_server.api.dependencies import ( 
ExternalInterfaces, @@ -515,3 +516,23 @@ async def delete_llm_model_endpoint( @llm_router_v1.get("/test_error") def test_error(): raise Exception + + +@llm_router_v1.get("/test_dd_trace") +def test_dd_trace(): + if get_request_id() is None: + print("no trace id") + else: + print("trace id found!") + + +@llm_router_v1.get("/test_create_dd_trace") +def test_create_dd_trace(): + if get_request_id() is None: + print("no trace id") + with tracer.trace("web.request", service="my-fastapi-service") as span: + span.set_tag("http.method", "GET") + print("how about now?") + print(get_request_id()) + else: + print("trace id found!") From 2675ebf8e4feea5c44d283be1b14d3752ebf193d Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Tue, 19 Sep 2023 21:51:35 +0000 Subject: [PATCH 07/12] iterating --- model-engine/model_engine_server/api/llms_v1.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index 0bbc8693..c00517da 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -521,9 +521,9 @@ def test_error(): @llm_router_v1.get("/test_dd_trace") def test_dd_trace(): if get_request_id() is None: - print("no trace id") + logger.info("no trace id") else: - print("trace id found!") + logger.info("trace id found!") @llm_router_v1.get("/test_create_dd_trace") @@ -532,7 +532,8 @@ def test_create_dd_trace(): print("no trace id") with tracer.trace("web.request", service="my-fastapi-service") as span: span.set_tag("http.method", "GET") - print("how about now?") - print(get_request_id()) + logger.info("how about now?") + logger.info(get_request_id()) else: - print("trace id found!") + logger.info("trace id found!") + logger.info(get_request_id()) From 18a3f141ee84997b3fc06c4d173495a0eed0708d Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Tue, 19 Sep 2023 22:18:22 +0000 Subject: [PATCH 08/12] iterating --- 
model-engine/model_engine_server/api/llms_v1.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index c00517da..aa988878 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -515,7 +515,9 @@ async def delete_llm_model_endpoint( @llm_router_v1.get("/test_error") def test_error(): - raise Exception + with tracer.trace("web.request", service="my-fastapi-service") as span: + span.set_tag("http.method", "GET") + raise Exception @llm_router_v1.get("/test_dd_trace") From db470d6ecd5f254e0b95e2078cc257ed9f2b54e0 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Tue, 19 Sep 2023 23:47:43 +0000 Subject: [PATCH 09/12] setting DD_REMOTE_CONFIGURATION_ENABLED as false so that we don't get unnecessary logging errors --- charts/model-engine/templates/_helpers.tpl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/charts/model-engine/templates/_helpers.tpl b/charts/model-engine/templates/_helpers.tpl index 8bcebe2c..c9d84d3c 100644 --- a/charts/model-engine/templates/_helpers.tpl +++ b/charts/model-engine/templates/_helpers.tpl @@ -124,6 +124,8 @@ podAffinity: env: - name: DATADOG_TRACE_ENABLED value: "${DATADOG_TRACE_ENABLED}" + - name: DD_REMOTE_CONFIGURATION_ENABLED + value: "false" - name: DD_SERVICE value: "${ENDPOINT_NAME}" - name: DD_ENV @@ -184,6 +186,8 @@ env: env: - name: DATADOG_TRACE_ENABLED value: "${DATADOG_TRACE_ENABLED}" + - name: DD_REMOTE_CONFIGURATION_ENABLED + value: "false" - name: DD_SERVICE value: "${ENDPOINT_NAME}" - name: DD_ENV @@ -230,6 +234,8 @@ env: env: - name: DATADOG_TRACE_ENABLED value: "{{ .Values.datadog_trace_enabled }}" + - name: DD_REMOTE_CONFIGURATION_ENABLED + value: "false" - name: DD_ENV value: {{ .Values.context }} - name: DD_AGENT_HOST From ffafc2ada08a62664f396d4006416ef9f433aae9 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Wed, 20 Sep 2023
00:07:45 +0000 Subject: [PATCH 10/12] updated tracer working? --- model-engine/model_engine_server/core/loggers.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/model-engine/model_engine_server/core/loggers.py b/model-engine/model_engine_server/core/loggers.py index e8245199..593302c0 100644 --- a/model-engine/model_engine_server/core/loggers.py +++ b/model-engine/model_engine_server/core/loggers.py @@ -10,7 +10,7 @@ import ddtrace import json_log_formatter import tqdm -from ddtrace import tracer +from ddtrace.tracer import Tracer # DO NOT CHANGE LOGGING FORMAT LOG_FORMAT: str = "%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] - %(message)s" @@ -82,12 +82,13 @@ def json_record(self, message: str, extra: dict, record: logging.LogRecord) -> d if request_id: extra["request_id"] = request_id - context = tracer.current_trace_context() - trace_id, span_id = (context.trace_id, context.span_id) if context else (0, 0) + context = Tracer().get_log_correlation_context() + trace_id = context.get("trace_id") + span_id = context.get("span_id") # add ids to event dictionary - extra["dd.trace_id"] = trace_id - extra["dd.span_id"] = span_id + extra["dd.trace_id"] = trace_id or 0 + extra["dd.span_id"] = span_id or 0 # add the env, service, and version configured for the tracer. # If tracing is not set up, then this should pull values from DD_ENV, DD_SERVICE, and DD_VERSION. 
From b82acb32b3ad802fea969b8c27c1d884225eddd8 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Wed, 20 Sep 2023 18:27:47 +0000 Subject: [PATCH 11/12] cleanup --- model-engine/model_engine_server/api/app.py | 4 +- .../model_engine_server/api/llms_v1.py | 131 ++---------------- 2 files changed, 11 insertions(+), 124 deletions(-) diff --git a/model-engine/model_engine_server/api/app.py b/model-engine/model_engine_server/api/app.py index cb9357db..a13d62e6 100644 --- a/model-engine/model_engine_server/api/app.py +++ b/model-engine/model_engine_server/api/app.py @@ -50,7 +50,9 @@ async def dispatch(self, request: Request, call_next): return JSONResponse( { "status_code": 500, - "content": {"error": f"Internal error for request_id {request_id}."}, + "content": { + "error": f"Internal error for request_id {request_id}. Our team has been notified." + }, } ) diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index aa988878..78117376 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -2,7 +2,6 @@ """ from typing import Optional -from ddtrace import tracer from fastapi import APIRouter, Depends, HTTPException, Query from model_engine_server.api.dependencies import ( ExternalInterfaces, @@ -122,13 +121,6 @@ async def create_model_endpoint( status_code=404, detail="The specified model bundle could not be found.", ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) @llm_router_v1.get("/model-endpoints", response_model=ListLLMModelEndpointsV1Response) @@ -143,18 +135,10 @@ async def list_model_endpoints( """ add_trace_resource_name("llm_model_endpoints_get") logger.info(f"GET /llm/model-endpoints?name={name}&order_by={order_by} for {auth}") - try: - use_case = 
ListLLMModelEndpointsV1UseCase( - llm_model_endpoint_service=external_interfaces.llm_model_endpoint_service, - ) - return await use_case.execute(user=auth, name=name, order_by=order_by) - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) + use_case = ListLLMModelEndpointsV1UseCase( + llm_model_endpoint_service=external_interfaces.llm_model_endpoint_service, + ) + return await use_case.execute(user=auth, name=name, order_by=order_by) @llm_router_v1.get( @@ -180,13 +164,6 @@ async def get_model_endpoint( status_code=404, detail=f"Model Endpoint {model_endpoint_name} was not found.", ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) @llm_router_v1.post("/completions-sync", response_model=CompletionSyncV1Response) @@ -232,13 +209,6 @@ async def create_completion_sync_task( status_code=400, detail=f"Unsupported inference type: {str(exc)}", ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) @llm_router_v1.post("/completions-stream", response_model=CompletionStreamV1Response) @@ -291,13 +261,6 @@ async def event_generator(): status_code=400, detail=f"Unsupported inference type: {str(exc)}", ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) @llm_router_v1.post("/fine-tunes", 
response_model=CreateFineTuneResponse) @@ -327,13 +290,6 @@ async def create_fine_tune( status_code=400, detail=str(exc), ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) @llm_router_v1.get("/fine-tunes/{fine_tune_id}", response_model=GetFineTuneResponse) @@ -354,13 +310,6 @@ async def get_fine_tune( status_code=404, detail="The specified fine-tune job could not be found.", ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) @llm_router_v1.get("/fine-tunes", response_model=ListFineTunesResponse) @@ -370,18 +319,10 @@ async def list_fine_tunes( ) -> ListFineTunesResponse: add_trace_resource_name("fine_tunes_list") logger.info(f"GET /fine-tunes for {auth}") - try: - use_case = ListFineTunesV1UseCase( - llm_fine_tuning_service=external_interfaces.llm_fine_tuning_service, - ) - return await use_case.execute(user=auth) - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) + use_case = ListFineTunesV1UseCase( + llm_fine_tuning_service=external_interfaces.llm_fine_tuning_service, + ) + return await use_case.execute(user=auth) @llm_router_v1.put("/fine-tunes/{fine_tune_id}/cancel", response_model=CancelFineTuneResponse) @@ -402,13 +343,6 @@ async def cancel_fine_tune( status_code=404, detail="The specified fine-tune job could not be found.", ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise 
HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) @llm_router_v1.get("/fine-tunes/{fine_tune_id}/events", response_model=GetFineTuneEventsResponse) @@ -430,13 +364,6 @@ async def get_fine_tune_events( status_code=404, detail="The specified fine-tune job's events could not be found.", ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) @llm_router_v1.post("/model-endpoints/download", response_model=ModelDownloadResponse) @@ -459,13 +386,6 @@ async def download_model_endpoint( status_code=404, detail="The requested fine-tuned model could not be found.", ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) @llm_router_v1.delete( @@ -504,38 +424,3 @@ async def delete_llm_model_endpoint( status_code=500, detail="deletion of endpoint failed.", ) from exc - except Exception as exc: - request_id = get_request_id() - logger.exception(f"Internal service error for request {request_id}: {exc}") - raise HTTPException( - status_code=500, - detail=f"Internal error for request_id {request_id}.", - ) - - -@llm_router_v1.get("/test_error") -def test_error(): - with tracer.trace("web.request", service="my-fastapi-service") as span: - span.set_tag("http.method", "GET") - raise Exception - - -@llm_router_v1.get("/test_dd_trace") -def test_dd_trace(): - if get_request_id() is None: - logger.info("no trace id") - else: - logger.info("trace id found!") - - -@llm_router_v1.get("/test_create_dd_trace") -def test_create_dd_trace(): - if get_request_id() is None: - print("no trace id") - with tracer.trace("web.request", service="my-fastapi-service") as span: 
- span.set_tag("http.method", "GET") - logger.info("how about now?") - logger.info(get_request_id()) - else: - logger.info("trace id found!") - logger.info(get_request_id()) From 1c061a60a1ee5a2b80a1554f558285af2bf37dd7 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Wed, 20 Sep 2023 18:31:58 +0000 Subject: [PATCH 12/12] precommit hooks --- .../domain/use_cases/llm_model_endpoint_use_cases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index bb803b3c..9e4b6ba6 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -60,7 +60,7 @@ from model_engine_server.domain.services import LLMModelEndpointService, ModelEndpointService from model_engine_server.infra.gateways.filesystem_gateway import FilesystemGateway -from ...common.datadog_utils import add_trace_request_id, get_request_id +from ...common.datadog_utils import add_trace_request_id from ..authorization.live_authorization_module import LiveAuthorizationModule from .model_bundle_use_cases import CreateModelBundleV2UseCase from .model_endpoint_use_cases import (