diff --git a/charts/llm-engine/templates/_helpers.tpl b/charts/llm-engine/templates/_helpers.tpl
index 04c8168f..08af45f4 100644
--- a/charts/llm-engine/templates/_helpers.tpl
+++ b/charts/llm-engine/templates/_helpers.tpl
@@ -344,7 +344,7 @@ volumeMounts:
 {{- define "llmEngine.forwarderVolumeMounts" }}
 volumeMounts:
   - name: config-volume
-    mountPath: /root/.aws/config
+    mountPath: /home/user/.aws/config
     subPath: config
   - name: user-config
     mountPath: /workspace/user_config
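Because the forwarder's AWS config file is now mounted under a non-root home directory, a deployment generally also needs to point the AWS SDK/CLI at that location. A minimal sketch of a companion env entry, using the standard AWS_CONFIG_FILE variable; its presence in this chart is an assumption, not part of the change above:

```yaml
# Hypothetical companion env entry for the forwarder container (not in this
# chart today); illustrates how a non-root process would resolve the
# relocated config file.
env:
  - name: AWS_CONFIG_FILE
    value: /home/user/.aws/config
```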
diff --git a/charts/llm-engine/templates/service_template_config_map.yaml b/charts/llm-engine/templates/service_template_config_map.yaml
index 87b992cf..08ce1424 100644
--- a/charts/llm-engine/templates/service_template_config_map.yaml
+++ b/charts/llm-engine/templates/service_template_config_map.yaml
@@ -180,7 +180,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --http
             - production_threads
             - --port
@@ -221,9 +221,9 @@ data:
             - ddtrace-run
             - python
             - -m
-            - llm_engine.inference.forwarding.http_forwarder
+            - server.llm_engine_server.inference.forwarding.http_forwarder
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/service--http_forwarder.yaml
+            - /workspace/server/llm_engine_server/inference/configs/service--http_forwarder.yaml
             - --port
             - "${FORWARDER_PORT}"
             - --num-workers
@@ -266,7 +266,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --queue
             - "${QUEUE}"
             - --task-visibility
diff --git a/charts/llm-engine/values_sample.yaml b/charts/llm-engine/values_sample.yaml
index 7b2cbbf0..06d70362 100644
--- a/charts/llm-engine/values_sample.yaml
+++ b/charts/llm-engine/values_sample.yaml
@@ -1,7 +1,7 @@
 # This is a YAML-formatted file.
 
 # tag [required] is the LLM Engine docker image tag
-tag: 1defd4f9c5376149e27673e154731a0c7820fe5d
+tag: 41ecada1b51ce3a46bbc3190a36ed7890db370d3
 # context is a user-specified deployment tag. Can be used to
 context: production
 image:
@@ -171,6 +171,20 @@ imageCache:
         - key: "nvidia.com/gpu"
           operator: "Exists"
           effect: "NoSchedule"
+    - name: a100
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-ampere-a100
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+    - name: t4
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-tesla-t4
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
 
 # celeryBrokerType specifies the celery broker type for async endpoints (coming soon)
 celeryBrokerType: sqs
diff --git a/server/llm_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/server/llm_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 482a4519..8de7ef72 100644
--- a/server/llm_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/server/llm_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -227,7 +227,7 @@ async def create_text_generation_inference_bundle(
                 schema_location="TBA",
                 flavor=StreamingEnhancedRunnableImageFlavor(
                     flavor=ModelBundleFlavorType.STREAMING_ENHANCED_RUNNABLE_IMAGE,
-                    repository="text-generation-inference",  # TODO: let user choose repo
+                    repository="ghcr.io/huggingface/text-generation-inference",  # TODO: let user choose repo
                     tag=framework_image_tag,
                     command=command,
                     streaming_command=command,
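With the repository now fully qualified, the streaming bundle's image should resolve against GHCR instead of whatever default registry the cluster assumes. A sketch of the resulting container image reference, where the container name and the tag stand in for values supplied at runtime (framework_image_tag):

```yaml
# Hypothetical rendered fragment of the streaming endpoint's pod spec; the
# container name and the "0.9.3" tag are placeholder assumptions.
containers:
  - name: main
    image: ghcr.io/huggingface/text-generation-inference:0.9.3
```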
diff --git a/server/llm_engine_server/infra/gateways/resources/templates/service_template_config_map_circleci.yaml b/server/llm_engine_server/infra/gateways/resources/templates/service_template_config_map_circleci.yaml
index 41ffe75b..3f2e519f 100644
--- a/server/llm_engine_server/infra/gateways/resources/templates/service_template_config_map_circleci.yaml
+++ b/server/llm_engine_server/infra/gateways/resources/templates/service_template_config_map_circleci.yaml
@@ -114,7 +114,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --queue
             - "${QUEUE}"
             - --task-visibility
@@ -383,7 +383,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --queue
             - "${QUEUE}"
             - --task-visibility
@@ -805,7 +805,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --http
             - production_threads
             - --port
@@ -1071,7 +1071,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --http
             - production_threads
             - --port
@@ -1473,9 +1473,9 @@ data:
             - ddtrace-run
             - python
             - -m
-            - llm_engine.inference.forwarding.http_forwarder
+            - server.llm_engine_server.inference.forwarding.http_forwarder
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/service--http_forwarder.yaml
+            - /workspace/server/llm_engine_server/inference/configs/service--http_forwarder.yaml
             - --port
             - "${FORWARDER_PORT}"
             - --num-workers
@@ -1712,7 +1712,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --queue
             - "${QUEUE}"
             - --task-visibility
@@ -1987,7 +1987,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --queue
             - "${QUEUE}"
             - --task-visibility
@@ -2421,7 +2421,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --http
             - production_threads
             - --port
@@ -2693,7 +2693,7 @@ data:
             - ddtrace-run
             - run-service
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
+            - /workspace/server/llm_engine_server/inference/configs/${FORWARDER_CONFIG_FILE_NAME}
             - --http
             - production_threads
             - --port
@@ -3107,9 +3107,9 @@ data:
             - ddtrace-run
             - python
             - -m
-            - llm_engine.inference.forwarding.http_forwarder
+            - server.llm_engine_server.inference.forwarding.http_forwarder
             - --config
-            - /workspace/llm_engine/llm_engine/inference/configs/service--http_forwarder.yaml
+            - /workspace/server/llm_engine_server/inference/configs/service--http_forwarder.yaml
             - --port
             - "${FORWARDER_PORT}"
             - --num-workers
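After the template variables are substituted, an async forwarder container under the relocated layout would be invoked roughly as below; the config file name, queue, and task-visibility values shown are assumptions for illustration, not values taken from this patch:

```yaml
# Hypothetical, fully substituted async-forwarder invocation; the config file
# name, queue name, and visibility value are placeholder assumptions.
command:
  - ddtrace-run
  - run-service
  - --config
  - /workspace/server/llm_engine_server/inference/configs/service--forwarder.yaml
  - --queue
  - "llm-engine-endpoint-id-end-abc123"
  - --task-visibility
  - "VISIBILITY_24H"
```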