From 1c42de32ad8c6a832c6e7117ac0e99fb725a5e69 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Thu, 16 Apr 2026 13:22:55 -0400 Subject: [PATCH 01/22] feat: harden model-engine runtime on chainguard --- model-engine/Dockerfile | 91 ++++++++++++----------------------- model-engine/requirements.in | 11 +++-- model-engine/requirements.txt | 27 ++++++----- 3 files changed, 52 insertions(+), 77 deletions(-) diff --git a/model-engine/Dockerfile b/model-engine/Dockerfile index a6a3bb49..8a42935c 100644 --- a/model-engine/Dockerfile +++ b/model-engine/Dockerfile @@ -1,79 +1,52 @@ -# syntax = docker/dockerfile:experimental +# syntax = docker/dockerfile:1 -# --- Builder: compile C extensions (pycurl, etc.) and install Python packages --- -FROM python:3.13-slim AS builder +FROM cgr.dev/chainguard/python:latest-dev AS builder +USER root WORKDIR /workspace -RUN apt-get update && apt-get install -y --no-install-recommends \ +RUN apk add --no-cache \ + bash \ + build-base \ + cmake \ + curl \ + curl-dev \ + dumb-init \ git \ - gcc \ - build-essential \ - libssl-dev \ - libcurl4-openssl-dev \ - && rm -rf /var/lib/apt/lists/* + openssl-dev -RUN pip install pip==24.2 setuptools -RUN pip install awscli==1.34.28 --no-cache-dir +RUN python -m venv /workspace/venv +ENV PATH="/workspace/venv/bin:/usr/sbin:/usr/bin:/sbin:/bin" -WORKDIR /workspace/model-engine/ -COPY model-engine/requirements-test.txt requirements-test.txt +WORKDIR /workspace/model-engine COPY model-engine/requirements.txt requirements.txt COPY model-engine/requirements_override.txt requirements_override.txt -RUN pip install -r requirements-test.txt --no-cache-dir -RUN pip install -r requirements.txt --no-cache-dir -# NOTE: aioboto3==10.4.0 -> aiobotocore==2.4.2 -> urllib3<1.27, which downgrades urllib3 -# from 2.x back to 1.26.x. CVE-2023-43804, CVE-2023-45803, CVE-2024-37891 remain. -# Fix: upgrade aioboto3 to >=15.x (separate PR — breaking API changes). +RUN pip install --upgrade pip==24.2 setuptools cmake setuptools-rust +RUN pip install -r requirements.txt --no-cache-dir --no-build-isolation RUN pip install -r requirements_override.txt --no-cache-dir COPY model-engine/setup.py setup.py COPY model-engine/model_engine_server model_engine_server +COPY model-engine/service_configs service_configs RUN pip install -e . -# --- Runtime: no build tools (eliminates linux-libc-dev and python3.13 CVEs) --- -FROM python:3.13-slim AS model-engine +FROM cgr.dev/chainguard/python:latest AS model-engine +USER root WORKDIR /workspace -# Runtime-only system deps (vim omitted: multiple unpatched HIGH CVEs in Debian 13.4) -RUN apt-get update && apt-get install -y --no-install-recommends \ - dumb-init \ - git \ - openssh-client \ - curl \ - procps \ - htop \ - libcurl4 \ - && rm -rf /var/lib/apt/lists/* - -# Install aws-iam-authenticator (architecture-aware) -RUN ARCH=$(uname -m) && \ - if [ "$ARCH" = "aarch64" ]; then \ - curl -fLo /bin/aws-iam-authenticator https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.7.11/aws-iam-authenticator_0.7.11_linux_arm64; \ - else \ - curl -fLo /bin/aws-iam-authenticator https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.7.11/aws-iam-authenticator_0.7.11_linux_amd64; \ - fi && \ - chmod +x /bin/aws-iam-authenticator - -# Install kubectl (architecture-aware) -RUN ARCH=$(uname -m | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/') && \ - curl -fLO "https://dl.k8s.io/release/v1.35.3/bin/linux/${ARCH}/kubectl" && \ - chmod +x kubectl && \ - mv kubectl /usr/local/bin/kubectl - -# Copy Python packages, entry-point scripts, and source tree from builder -COPY --from=builder /usr/local/lib/python3.13/site-packages /usr/local/lib/python3.13/site-packages -COPY --from=builder /usr/local/bin /usr/local/bin -COPY --from=builder /workspace/model-engine /workspace/model-engine - -RUN useradd --create-home --shell /bin/bash nonroot && \ - chown -R nonroot:nonroot /workspace - -COPY integration_tests /workspace/integration_tests - -WORKDIR /workspace -ENV PYTHONPATH /workspace -ENV WORKSPACE /workspace +COPY --from=builder --chown=nonroot:nonroot /workspace/venv /workspace/venv +COPY --from=builder --chown=nonroot:nonroot /workspace/model-engine /workspace/model-engine +COPY --from=builder /usr/bin/dumb-init /usr/bin/dumb-init +COPY --from=builder /usr/bin/git /usr/bin/git +COPY --from=builder /usr/libexec/git-core /usr/libexec/git-core +COPY --from=builder /usr/lib/libpcre2-8.so.0 /usr/lib/libpcre2-8.so.0 +COPY --from=builder /usr/lib/libpcre2-8.so.0.15.0 /usr/lib/libpcre2-8.so.0.15.0 +COPY --from=builder /usr/lib/libz.so.1 /usr/lib/libz.so.1 +COPY --from=builder /usr/lib/libz.so.1.3.2 /usr/lib/libz.so.1.3.2 + +ENV PATH="/workspace/venv/bin:/usr/local/bin:/usr/libexec/git-core:/usr/bin" +ENV PYTHONPATH=/workspace +ENV WORKSPACE=/workspace USER nonroot EXPOSE 5000 diff --git a/model-engine/requirements.in b/model-engine/requirements.in index 3dba1c96..5ae32024 100644 --- a/model-engine/requirements.in +++ b/model-engine/requirements.in @@ -12,7 +12,7 @@ azure-storage-blob~=12.19.0 # GCP dependencies gcloud-aio-storage~=9.6 google-auth~=2.25.0 -google-cloud-artifact-registry~=1.13.0 +google-cloud-artifact-registry~=1.21.0 google-cloud-secret-manager>=2.20 google-cloud-storage~=2.14.0 aioboto3==15.5.0 # 10.4.0 forced urllib3<1.27 (CVE-2023-43804/45803/2024-37891); 15.x uses aiobotocore 2.25.x (urllib3<3, 2.x compatible) @@ -28,8 +28,8 @@ cryptography>=44.0.0 # not used directly, but needs to be pinned for Microsoft s dataclasses-json>=0.5.7 datadog-api-client==2.11.0 datadog~=0.47.0 -ddtrace>=2.0,<3.0 # 1.8.3 is incompatible with Python 3.12; 2.x has full Py3.12 support -numpy>=1.26.4,<2.3 # Python 3.12/3.13 wheels start at 1.26.0; 2.3+ requires Python 3.11+ +ddtrace>=4.7.1,<5.0 # 4.7.1 publishes CPython 3.14 wheels; needed for public Chainguard latest +numpy>=2.4.4,<2.5 # 2.4.4 publishes CPython 3.14 wheels deprecation~=2.1 docker~=5.0 fastapi>=0.115.8 # bumped to allow starlette>=0.49.1 (CVE-2025-62727 fix) @@ -42,10 +42,11 @@ kubernetes-asyncio==25.11.0 kubernetes~=25.3.0 orjson>=3.10.15 protobuf>=4.25.0 -psycopg2-binary==2.9.10 +psycopg2-binary==2.9.11 py-xid==0.3.0 pycurl~=7.44 # For celery[sqs] -pydantic==2.8.2 +pytz>=2024.1 +pydantic==2.12.5 python-multipart>=0.0.18 quart~=0.19.9 werkzeug>=3.0.6 # CVE-2024-34069, CVE-2024-49766, CVE-2024-49767 diff --git a/model-engine/requirements.txt b/model-engine/requirements.txt index b3f8fb43..5f4d93ee 100644 --- a/model-engine/requirements.txt +++ b/model-engine/requirements.txt @@ -146,7 +146,7 @@ datadog==0.47.0 # via -r requirements.in datadog-api-client==2.11.0 # via -r requirements.in -ddtrace==2.21.12 +ddtrace==4.7.1 # via -r requirements.in deprecation==2.1.0 # via -r requirements.in @@ -154,7 +154,7 @@ docker==5.0.3 # via -r requirements.in docutils==0.20.1 # via readme-renderer -envier==0.5.2 +envier==0.6.1 # via ddtrace fastapi==0.135.1 # via -r requirements.in @@ -196,7 +196,7 @@ google-auth==2.25.2 # google-cloud-secret-manager # google-cloud-storage # kubernetes -google-cloud-artifact-registry==1.13.1 +google-cloud-artifact-registry==1.21.0 # via -r requirements.in google-cloud-core==2.5.0 # via @@ -223,16 +223,15 @@ greenlet==3.3.2 # -r requirements.in # sqlalchemy grpc-google-iam-v1==0.14.3 - # via - # google-cloud-artifact-registry - # google-cloud-secret-manager -grpcio==1.74.0 + # via google-cloud-artifact-registry +grpcio==1.75.1 # via # google-api-core # googleapis-common-protos # grpc-google-iam-v1 # grpcio-status -grpcio-status==1.71.2 + # google-cloud-secret-manager +grpcio-status==1.75.1 # via # google-api-core # google-cloud-secret-manager @@ -357,7 +356,7 @@ mypy-boto3-sqs==1.40.61 # via boto3-stubs mypy-extensions==1.0.0 # via typing-inspect -numpy==2.2.6 +numpy==2.4.4 # via # -r requirements.in # transformers @@ -398,7 +397,7 @@ proto-plus==1.27.1 # google-api-core # google-cloud-artifact-registry # google-cloud-secret-manager -protobuf==5.29.6 +protobuf==6.33.5 # via # -r requirements.in # ddtrace @@ -409,7 +408,7 @@ protobuf==5.29.6 # grpc-google-iam-v1 # grpcio-status # proto-plus -psycopg2-binary==2.9.10 +psycopg2-binary==2.9.11 # via -r requirements.in py-xid==0.3.0 # via -r requirements.in @@ -426,11 +425,13 @@ pycparser==2.21 # via cffi pycurl==7.45.2 # via -r requirements.in -pydantic==2.8.2 +pytz==2025.2 + # via -r requirements.in +pydantic==2.12.5 # via # -r requirements.in # fastapi -pydantic-core==2.20.1 +pydantic-core==2.41.5 # via pydantic pygments==2.15.1 # via From e2c5fb370b74f17289cce6083a6a8e53d412d854 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Thu, 16 Apr 2026 13:54:43 -0400 Subject: [PATCH 02/22] fix: restore runtime kubectl assets --- model-engine/Dockerfile | 25 ++++++++++++++++++------- model-engine/requirements.in | 2 +- model-engine/requirements.txt | 2 +- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/model-engine/Dockerfile b/model-engine/Dockerfile index 8a42935c..601d0477 100644 --- a/model-engine/Dockerfile +++ b/model-engine/Dockerfile @@ -4,6 +4,7 @@ FROM cgr.dev/chainguard/python:latest-dev AS builder USER root WORKDIR /workspace +ARG TARGETARCH RUN apk add --no-cache \ bash \ @@ -29,6 +30,17 @@ COPY model-engine/model_engine_server model_engine_server COPY model-engine/service_configs service_configs RUN pip install -e . +RUN mkdir -p /tmp/runtime-bin /tmp/runtime-libs && \ + cp /usr/bin/dumb-init /tmp/runtime-bin/dumb-init && \ + cp /usr/bin/git /tmp/runtime-bin/git && \ + cp -R /usr/libexec/git-core /tmp/runtime-bin/git-core && \ + cp /usr/lib/libpcre2-8.so.0* /tmp/runtime-libs/ && \ + cp /usr/lib/libz.so.1* /tmp/runtime-libs/ && \ + curl -fsSL -o /tmp/runtime-bin/kubectl "https://dl.k8s.io/release/v1.35.3/bin/linux/${TARGETARCH}/kubectl" && \ + chmod +x /tmp/runtime-bin/kubectl && \ + curl -fsSL -o /tmp/runtime-bin/aws-iam-authenticator "https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.7.11/aws-iam-authenticator_0.7.11_linux_${TARGETARCH}" && \ + chmod +x /tmp/runtime-bin/aws-iam-authenticator + FROM cgr.dev/chainguard/python:latest AS model-engine USER root @@ -36,13 +48,12 @@ WORKDIR /workspace COPY --from=builder --chown=nonroot:nonroot /workspace/venv /workspace/venv COPY --from=builder --chown=nonroot:nonroot /workspace/model-engine /workspace/model-engine -COPY --from=builder /usr/bin/dumb-init /usr/bin/dumb-init -COPY --from=builder /usr/bin/git /usr/bin/git -COPY --from=builder /usr/libexec/git-core /usr/libexec/git-core -COPY --from=builder /usr/lib/libpcre2-8.so.0 /usr/lib/libpcre2-8.so.0 -COPY --from=builder /usr/lib/libpcre2-8.so.0.15.0 /usr/lib/libpcre2-8.so.0.15.0 -COPY --from=builder /usr/lib/libz.so.1 /usr/lib/libz.so.1 -COPY --from=builder /usr/lib/libz.so.1.3.2 /usr/lib/libz.so.1.3.2 +COPY --from=builder /tmp/runtime-bin/dumb-init /usr/bin/dumb-init +COPY --from=builder /tmp/runtime-bin/git /usr/bin/git +COPY --from=builder /tmp/runtime-bin/git-core /usr/libexec/git-core +COPY --from=builder /tmp/runtime-bin/kubectl /usr/local/bin/kubectl +COPY --from=builder /tmp/runtime-bin/aws-iam-authenticator /usr/local/bin/aws-iam-authenticator +COPY --from=builder /tmp/runtime-libs/ /usr/lib/ ENV PATH="/workspace/venv/bin:/usr/local/bin:/usr/libexec/git-core:/usr/bin" ENV PYTHONPATH=/workspace diff --git a/model-engine/requirements.in b/model-engine/requirements.in index 5ae32024..3dec6909 100644 --- a/model-engine/requirements.in +++ b/model-engine/requirements.in @@ -13,7 +13,7 @@ azure-storage-blob~=12.19.0 gcloud-aio-storage~=9.6 google-auth~=2.25.0 google-cloud-artifact-registry~=1.21.0 -google-cloud-secret-manager>=2.20 +google-cloud-secret-manager>=2.24.0 google-cloud-storage~=2.14.0 aioboto3==15.5.0 # 10.4.0 forced urllib3<1.27 (CVE-2023-43804/45803/2024-37891); 15.x uses aiobotocore 2.25.x (urllib3<3, 2.x compatible) boto3-stubs[essential]>=1.40.46,<1.40.62 diff --git a/model-engine/requirements.txt b/model-engine/requirements.txt index 5f4d93ee..370fe778 100644 --- a/model-engine/requirements.txt +++ b/model-engine/requirements.txt @@ -202,7 +202,7 @@ google-cloud-core==2.5.0 # via # google-cloud-secret-manager # google-cloud-storage -google-cloud-secret-manager==2.21.0 +google-cloud-secret-manager==2.24.0 # via -r requirements.in google-cloud-storage==2.14.0 # via -r requirements.in From 54f41f03550194cdb6db19d31f870b6a8d6a078a Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Thu, 16 Apr 2026 14:06:01 -0400 Subject: [PATCH 03/22] fix: tighten runtime binary handling --- model-engine/Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/model-engine/Dockerfile b/model-engine/Dockerfile index 601d0477..2426c058 100644 --- a/model-engine/Dockerfile +++ b/model-engine/Dockerfile @@ -9,7 +9,6 @@ ARG TARGETARCH RUN apk add --no-cache \ bash \ build-base \ - cmake \ curl \ curl-dev \ dumb-init \ @@ -35,10 +34,15 @@ RUN mkdir -p /tmp/runtime-bin /tmp/runtime-libs && \ cp /usr/bin/git /tmp/runtime-bin/git && \ cp -R /usr/libexec/git-core /tmp/runtime-bin/git-core && \ cp /usr/lib/libpcre2-8.so.0* /tmp/runtime-libs/ && \ + cp /usr/lib/libcurl.so.4* /tmp/runtime-libs/ && \ cp /usr/lib/libz.so.1* /tmp/runtime-libs/ && \ curl -fsSL -o /tmp/runtime-bin/kubectl "https://dl.k8s.io/release/v1.35.3/bin/linux/${TARGETARCH}/kubectl" && \ + curl -fsSL -o /tmp/runtime-bin/kubectl.sha256 "https://dl.k8s.io/release/v1.35.3/bin/linux/${TARGETARCH}/kubectl.sha256" && \ + echo "$(cat /tmp/runtime-bin/kubectl.sha256) /tmp/runtime-bin/kubectl" | sha256sum -c - && \ chmod +x /tmp/runtime-bin/kubectl && \ curl -fsSL -o /tmp/runtime-bin/aws-iam-authenticator "https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.7.11/aws-iam-authenticator_0.7.11_linux_${TARGETARCH}" && \ + curl -fsSL -o /tmp/runtime-bin/aws-iam-authenticator.checksums "https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.7.11/authenticator_0.7.11_checksums.txt" && \ + grep "aws-iam-authenticator_0.7.11_linux_${TARGETARCH}\$" /tmp/runtime-bin/aws-iam-authenticator.checksums | sed "s| aws-iam-authenticator_0.7.11_linux_${TARGETARCH}| /tmp/runtime-bin/aws-iam-authenticator|" | sha256sum -c - && \ chmod +x /tmp/runtime-bin/aws-iam-authenticator FROM cgr.dev/chainguard/python:latest AS model-engine From a79a067331c0ad2bdf7ea9ff5a0d0b0df302cf6c Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Thu, 16 Apr 2026 15:52:39 -0400 Subject: [PATCH 04/22] fix: eliminate remaining runtime binary highs --- model-engine/Dockerfile | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/model-engine/Dockerfile b/model-engine/Dockerfile index 2426c058..2dc58207 100644 --- a/model-engine/Dockerfile +++ b/model-engine/Dockerfile @@ -13,7 +13,9 @@ RUN apk add --no-cache \ curl-dev \ dumb-init \ git \ - openssl-dev + go \ + openssl-dev \ + rsync RUN python -m venv /workspace/venv ENV PATH="/workspace/venv/bin:/usr/sbin:/usr/bin:/sbin:/bin" @@ -36,14 +38,11 @@ RUN mkdir -p /tmp/runtime-bin /tmp/runtime-libs && \ cp /usr/lib/libpcre2-8.so.0* /tmp/runtime-libs/ && \ cp /usr/lib/libcurl.so.4* /tmp/runtime-libs/ && \ cp /usr/lib/libz.so.1* /tmp/runtime-libs/ && \ - curl -fsSL -o /tmp/runtime-bin/kubectl "https://dl.k8s.io/release/v1.35.3/bin/linux/${TARGETARCH}/kubectl" && \ - curl -fsSL -o /tmp/runtime-bin/kubectl.sha256 "https://dl.k8s.io/release/v1.35.3/bin/linux/${TARGETARCH}/kubectl.sha256" && \ - echo "$(cat /tmp/runtime-bin/kubectl.sha256) /tmp/runtime-bin/kubectl" | sha256sum -c - && \ - chmod +x /tmp/runtime-bin/kubectl && \ - curl -fsSL -o /tmp/runtime-bin/aws-iam-authenticator "https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.7.11/aws-iam-authenticator_0.7.11_linux_${TARGETARCH}" && \ - curl -fsSL -o /tmp/runtime-bin/aws-iam-authenticator.checksums "https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.7.11/authenticator_0.7.11_checksums.txt" && \ - grep "aws-iam-authenticator_0.7.11_linux_${TARGETARCH}\$" /tmp/runtime-bin/aws-iam-authenticator.checksums | sed "s| aws-iam-authenticator_0.7.11_linux_${TARGETARCH}| /tmp/runtime-bin/aws-iam-authenticator|" | sha256sum -c - && \ - chmod +x /tmp/runtime-bin/aws-iam-authenticator + git clone --depth 1 --branch v1.35.3 https://github.com/kubernetes/kubernetes.git /tmp/k8s && \ + cd /tmp/k8s && \ + GOTOOLCHAIN=local make WHAT=cmd/kubectl && \ + cp _output/bin/kubectl /tmp/runtime-bin/kubectl && \ + GOBIN=/tmp/runtime-bin go install sigs.k8s.io/aws-iam-authenticator/cmd/aws-iam-authenticator@v0.7.11 FROM cgr.dev/chainguard/python:latest AS model-engine From b479214c9a838896b0d9bfe81ac5b80f8bfd7389 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Thu, 16 Apr 2026 21:27:06 -0400 Subject: [PATCH 05/22] fix: honor target architecture for runtime binaries --- model-engine/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/model-engine/Dockerfile b/model-engine/Dockerfile index 2dc58207..41745865 100644 --- a/model-engine/Dockerfile +++ b/model-engine/Dockerfile @@ -40,9 +40,9 @@ RUN mkdir -p /tmp/runtime-bin /tmp/runtime-libs && \ cp /usr/lib/libz.so.1* /tmp/runtime-libs/ && \ git clone --depth 1 --branch v1.35.3 https://github.com/kubernetes/kubernetes.git /tmp/k8s && \ cd /tmp/k8s && \ - GOTOOLCHAIN=local make WHAT=cmd/kubectl && \ - cp _output/bin/kubectl /tmp/runtime-bin/kubectl && \ - GOBIN=/tmp/runtime-bin go install sigs.k8s.io/aws-iam-authenticator/cmd/aws-iam-authenticator@v0.7.11 + GOTOOLCHAIN=local KUBE_BUILD_PLATFORMS=linux/${TARGETARCH} make WHAT=cmd/kubectl && \ + cp _output/local/bin/linux/${TARGETARCH}/kubectl /tmp/runtime-bin/kubectl && \ + GOBIN=/tmp/runtime-bin GOOS=linux GOARCH=${TARGETARCH} go install sigs.k8s.io/aws-iam-authenticator/cmd/aws-iam-authenticator@v0.7.11 FROM cgr.dev/chainguard/python:latest AS model-engine From 5f5840f70be01165ecdae0f01aabc8f0516ff1d9 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 08:28:39 -0400 Subject: [PATCH 06/22] fix: restore CI test compatibility --- model-engine/Dockerfile | 5 ++++- model-engine/model_engine_server/common/dtos/llms/vllm.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/model-engine/Dockerfile b/model-engine/Dockerfile index 41745865..08eefacd 100644 --- a/model-engine/Dockerfile +++ b/model-engine/Dockerfile @@ -32,11 +32,13 @@ COPY model-engine/service_configs service_configs RUN pip install -e . RUN mkdir -p /tmp/runtime-bin /tmp/runtime-libs && \ + cp /bin/bash /tmp/runtime-bin/bash && \ cp /usr/bin/dumb-init /tmp/runtime-bin/dumb-init && \ cp /usr/bin/git /tmp/runtime-bin/git && \ cp -R /usr/libexec/git-core /tmp/runtime-bin/git-core && \ cp /usr/lib/libpcre2-8.so.0* /tmp/runtime-libs/ && \ cp /usr/lib/libcurl.so.4* /tmp/runtime-libs/ && \ + cp /usr/lib/libtinfo.so.6* /tmp/runtime-libs/ && \ cp /usr/lib/libz.so.1* /tmp/runtime-libs/ && \ git clone --depth 1 --branch v1.35.3 https://github.com/kubernetes/kubernetes.git /tmp/k8s && \ cd /tmp/k8s && \ @@ -51,6 +53,7 @@ WORKDIR /workspace COPY --from=builder --chown=nonroot:nonroot /workspace/venv /workspace/venv COPY --from=builder --chown=nonroot:nonroot /workspace/model-engine /workspace/model-engine +COPY --from=builder /tmp/runtime-bin/bash /bin/bash COPY --from=builder /tmp/runtime-bin/dumb-init /usr/bin/dumb-init COPY --from=builder /tmp/runtime-bin/git /usr/bin/git COPY --from=builder /tmp/runtime-bin/git-core /usr/libexec/git-core @@ -58,7 +61,7 @@ COPY --from=builder /tmp/runtime-bin/kubectl /usr/local/bin/kubectl COPY --from=builder /tmp/runtime-bin/aws-iam-authenticator /usr/local/bin/aws-iam-authenticator COPY --from=builder /tmp/runtime-libs/ /usr/lib/ -ENV PATH="/workspace/venv/bin:/usr/local/bin:/usr/libexec/git-core:/usr/bin" +ENV PATH="/workspace/venv/bin:/usr/local/bin:/usr/libexec/git-core:/usr/bin:/bin" ENV PYTHONPATH=/workspace ENV WORKSPACE=/workspace diff --git a/model-engine/model_engine_server/common/dtos/llms/vllm.py b/model-engine/model_engine_server/common/dtos/llms/vllm.py index 473af057..5a85dc30 100644 --- a/model-engine/model_engine_server/common/dtos/llms/vllm.py +++ b/model-engine/model_engine_server/common/dtos/llms/vllm.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, cast from model_engine_server.common.pydantic_types import BaseModel, Field from model_engine_server.common.types.gen.openai import ( @@ -275,7 +275,7 @@ class VLLMSamplingParams(BaseModel): (canonical beam search algorithm).""", ) stop_token_ids: Optional[List[int]] = Field( - default_factory=list, + default_factory=lambda: cast(List[int], []), description="""List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens.""", From 8ea29f9df02c0a547d034a78067a86e565bcd57e Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 08:43:12 -0400 Subject: [PATCH 07/22] fix: remove coreutils dependency from migration script --- .../db/migrations/run_database_migration.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/model-engine/model_engine_server/db/migrations/run_database_migration.sh b/model-engine/model_engine_server/db/migrations/run_database_migration.sh index 8b25f20e..2f4a5e2b 100755 --- a/model-engine/model_engine_server/db/migrations/run_database_migration.sh +++ b/model-engine/model_engine_server/db/migrations/run_database_migration.sh @@ -1,10 +1,11 @@ #!/bin/bash -# Get the directory of this script -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +# Get the directory of this script without relying on external coreutils. +SCRIPT_PATH="${BASH_SOURCE[0]}" +DIR="$(cd -- "${SCRIPT_PATH%/*}" >/dev/null 2>&1 && pwd)" # Change directory to the directory of this script -cd $DIR +cd "$DIR" # Runs database migration -alembic upgrade head \ No newline at end of file +alembic upgrade head From 8eabec031935465cde9fe58a7f7aa827c3278a80 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 09:16:45 -0400 Subject: [PATCH 08/22] fix: use shell-based readiness probes --- charts/model-engine/templates/cacher_deployment.yaml | 5 +++-- .../model-engine/templates/endpoint_builder_deployment.yaml | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/charts/model-engine/templates/cacher_deployment.yaml b/charts/model-engine/templates/cacher_deployment.yaml index 0f2db059..7f427594 100644 --- a/charts/model-engine/templates/cacher_deployment.yaml +++ b/charts/model-engine/templates/cacher_deployment.yaml @@ -48,8 +48,9 @@ spec: readinessProbe: exec: command: - - cat - - /tmp/readyz + - bash + - -lc + - test -f /tmp/readyz command: - dumb-init - -- diff --git a/charts/model-engine/templates/endpoint_builder_deployment.yaml b/charts/model-engine/templates/endpoint_builder_deployment.yaml index 7791d405..9a348d8e 100644 --- a/charts/model-engine/templates/endpoint_builder_deployment.yaml +++ b/charts/model-engine/templates/endpoint_builder_deployment.yaml @@ -49,8 +49,9 @@ spec: readinessProbe: exec: command: - - cat - - /tmp/readyz + - bash + - -lc + - test -f /tmp/readyz command: - dumb-init - -- From 84a104503c32a5bb286ae133ce665ba495a40bbd Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 10:14:23 -0400 Subject: [PATCH 09/22] fix: remove endpoint builder shell dependencies --- .../core/docker/remote_build.py | 123 ++++++++++++------ 1 file changed, 80 insertions(+), 43 deletions(-) diff --git a/model-engine/model_engine_server/core/docker/remote_build.py b/model-engine/model_engine_server/core/docker/remote_build.py index 5b192064..215d241d 100644 --- a/model-engine/model_engine_server/core/docker/remote_build.py +++ b/model-engine/model_engine_server/core/docker/remote_build.py @@ -2,16 +2,18 @@ import os import shutil import subprocess +import tarfile import tempfile import uuid from base64 import b64encode from contextlib import ExitStack from dataclasses import dataclass +from fnmatch import fnmatch from pathlib import Path from string import Template -from subprocess import PIPE from typing import Dict, Iterable, List, Optional, Union +import boto3 import click import tenacity import yaml @@ -74,49 +76,83 @@ def zip_context( s3_uri = f"s3://{S3_BUCKET}/{s3_file_name}" print(f"Uploading to s3 at: {s3_uri}") try: - # Need to gimme_okta_aws_creds (you can export AWS_PROFILE='ml-admin' right after) - tar_command = _build_tar_cmd(context, ignore_file, folders_to_include) - print(f"Creating archive: {' '.join(tar_command)}") - - with subprocess.Popen( - tar_command, - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL, - ) as proc: - assert proc.stdout is not None - with storage_client.open( + context_path = Path(context).resolve() + ignore_patterns = _read_ignore_patterns(context_path, ignore_file) + with tempfile.NamedTemporaryFile(suffix=".tar.gz") as archive: + print(f"Creating archive: {archive.name}") + with tarfile.open(archive.name, mode="w:gz") as tar: + for folder in folders_to_include: + resolved_path, archive_root = _normalize_path_for_archive(context_path, folder) + tar.add( + resolved_path, + arcname=archive_root, + filter=lambda tar_info: _filter_archive_member( + tar_info, ignore_patterns + ), + ) + + with open(archive.name, "rb") as archive_in, storage_client.open( s3_uri, "wb", ) as out_file: - shutil.copyfileobj(proc.stdout, out_file) + shutil.copyfileobj(archive_in, out_file) print("Done uploading!") except (ClientError, ProfileNotFound): print("Did you gimme_okta_aws_creds and then export AWS_PROFILE='ml-admin'? Try doing both") raise -def _build_tar_cmd( - context: str, ignore_file: Optional[str], folders_to_include: List[str] -) -> List[str]: - assert len(folders_to_include) > 0, "Need at least one folder to create a tar archive from!" +def _read_ignore_patterns(context_path: Path, ignore_file: Optional[str]) -> List[str]: + if ignore_file is None: + return [] - tar_command = ["tar", "-C", context] - - if ignore_file is not None: - ignore_file = os.path.join(context, ignore_file) - if not os.path.isfile(ignore_file): - print( - f"WARNING: File {ignore_file} does not exist in calling context, not using any file as a .dockerignore" - ) - else: - tar_command.append("--exclude-from") - tar_command.append(ignore_file) - - tar_command.append("-cf") - tar_command.append("-") - tar_command.extend(folders_to_include) - - return tar_command + ignore_path = context_path / ignore_file + if not ignore_path.is_file(): + print( + f"WARNING: File {ignore_path} does not exist in calling context, not using any file as a .dockerignore" + ) + return [] + + patterns: List[str] = [] + for raw_line in ignore_path.read_text().splitlines(): + line = raw_line.strip() + if not line or line.startswith("#"): + continue + patterns.append(line.lstrip("./")) + return patterns + + +def _normalize_path_for_archive(context_path: Path, folder_to_include: str) -> tuple[Path, str]: + include_path = Path(folder_to_include) + resolved_path = ( + include_path.resolve() + if include_path.is_absolute() + else (context_path / include_path).resolve() + ) + try: + archive_root = str(resolved_path.relative_to(context_path)) + except ValueError as exc: + raise ValueError( + f"{folder_to_include=} is not contained within context {context_path}" + ) from exc + return resolved_path, archive_root + + +def _filter_archive_member( + tar_info: tarfile.TarInfo, ignore_patterns: List[str] +) -> Optional[tarfile.TarInfo]: + normalized_name = tar_info.name.lstrip("./") + basename = os.path.basename(normalized_name) + + for pattern in ignore_patterns: + normalized_pattern = pattern.rstrip("/") + if ( + fnmatch(normalized_name, normalized_pattern) + or fnmatch(basename, normalized_pattern) + or normalized_name.startswith(f"{normalized_pattern}/") + ): + return None + return tar_info def start_build_job( @@ -154,18 +190,18 @@ def start_build_job( f = stack.enter_context(tempfile.NamedTemporaryFile("wt", suffix=".yaml")) template_f = stack.enter_context(open(TEMPLATE_FILE, "rt")) - # In Circle CI we need to retrieve the AWS access key to attach to kaniko + # Keep these values available for any template using explicit env creds, but do not + # shell out to the AWS CLI from the endpoint-builder image. aws_access_key_id = "" aws_secret_access_key = "" + aws_session_token = "" if os.getenv("CIRCLECI"): - aws_access_key_id_result = subprocess.run( - ["aws", "configure", "get", "aws_access_key_id"], check=False, stdout=PIPE - ) - aws_access_key_id = aws_access_key_id_result.stdout.decode().strip() - aws_secret_access_key_result = subprocess.run( - ["aws", "configure", "get", "aws_secret_access_key"], check=False, stdout=PIPE - ) - aws_secret_access_key = aws_secret_access_key_result.stdout.decode().strip() + credentials = boto3.Session().get_credentials() + if credentials is not None: + frozen_credentials = credentials.get_frozen_credentials() + aws_access_key_id = frozen_credentials.access_key or "" + aws_secret_access_key = frozen_credentials.secret_key or "" + aws_session_token = frozen_credentials.token or "" job = Template(template_f.read()).substitute( NAME=job_name, CUSTOM_TAGS=json.dumps(custom_tags_serialized), @@ -176,6 +212,7 @@ def start_build_job( CACHE_REPO=f"{infra_config().docker_repo_prefix}/{cache_name}", AWS_ACCESS_KEY_ID=aws_access_key_id, AWS_SECRET_ACCESS_KEY=aws_secret_access_key, + AWS_SESSION_TOKEN=aws_session_token, NAMESPACE=NAMESPACE, ) yml = yaml.safe_load(job) From f871d5eba6223446bf0589e5e39fe6ab707b3dca Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 11:12:19 -0400 Subject: [PATCH 10/22] style: format remote build helper --- .../core/docker/remote_build.py | 74 ++++++++++++++----- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/model-engine/model_engine_server/core/docker/remote_build.py b/model-engine/model_engine_server/core/docker/remote_build.py index 215d241d..f024d672 100644 --- a/model-engine/model_engine_server/core/docker/remote_build.py +++ b/model-engine/model_engine_server/core/docker/remote_build.py @@ -82,7 +82,9 @@ def zip_context( print(f"Creating archive: {archive.name}") with tarfile.open(archive.name, mode="w:gz") as tar: for folder in folders_to_include: - resolved_path, archive_root = _normalize_path_for_archive(context_path, folder) + resolved_path, archive_root = _normalize_path_for_archive( + context_path, folder + ) tar.add( resolved_path, arcname=archive_root, @@ -98,7 +100,9 @@ def zip_context( shutil.copyfileobj(archive_in, out_file) print("Done uploading!") except (ClientError, ProfileNotFound): - print("Did you gimme_okta_aws_creds and then export AWS_PROFILE='ml-admin'? Try doing both") + print( + "Did you gimme_okta_aws_creds and then export AWS_PROFILE='ml-admin'? Try doing both" + ) raise @@ -122,7 +126,9 @@ def _read_ignore_patterns(context_path: Path, ignore_file: Optional[str]) -> Lis return patterns -def _normalize_path_for_archive(context_path: Path, folder_to_include: str) -> tuple[Path, str]: +def _normalize_path_for_archive( + context_path: Path, folder_to_include: str +) -> tuple[Path, str]: include_path = Path(folder_to_include) resolved_path = ( include_path.resolve() @@ -216,7 +222,9 @@ def start_build_job( NAMESPACE=NAMESPACE, ) yml = yaml.safe_load(job) - destinations = [destination_template.substitute(REPO_AND_TAG=rt) for rt in repotags] + destinations = [ + destination_template.substitute(REPO_AND_TAG=rt) for rt in repotags + ] yml["spec"]["template"]["spec"]["containers"][0]["args"].extend(destinations) if build_args: @@ -227,7 +235,9 @@ def start_build_job( yaml.dump(yml, stream=f, default_flow_style=False) f.seek(0) - container_spec: str = yaml.dump(yml["spec"]["template"]["spec"]["containers"][0]).strip() + container_spec: str = yaml.dump( + yml["spec"]["template"]["spec"]["containers"][0] + ).strip() print("Maybe update CodeArtifact token secret") if not os.path.exists("/tmp"): @@ -246,18 +256,28 @@ def start_build_job( with open(pip_conf_file) as f_conf: pip_conf_data = f_conf.read() except (subprocess.CalledProcessError, FileNotFoundError): - print("WARNING: Failed to refresh CodeArtifact token secret, using empty secret") + print( + "WARNING: Failed to refresh CodeArtifact token secret, using empty secret" + ) pip_conf_data = "" pip_conf_base64 = b64encode(pip_conf_data.encode("utf-8")).decode("utf-8") data = {"data": {"codeartifact_pip_conf": pip_conf_base64}} subprocess.check_output( - ["kubectl", "patch", "secret", "codeartifact-pip-conf", f"-p={json.dumps(data)}"] + [ + "kubectl", + "patch", + "secret", + "codeartifact-pip-conf", + f"-p={json.dumps(data)}", + ] ).decode("utf-8") print(f"Executing Kaniko build command:\n{container_spec}") print("-" * 80) - print(subprocess.check_output(["kubectl", "apply", "-f", f.name]).decode("utf-8")) + print( + subprocess.check_output(["kubectl", "apply", "-f", f.name]).decode("utf-8") + ) return job_name @@ -330,7 +350,13 @@ def build_remote( ignore_file=ignore_file, ) return start_build_job( - s3_file_name, dockerfile, repotags, use_cache, cache_name, build_args, custom_tags + s3_file_name, + dockerfile, + repotags, + use_cache, + cache_name, + build_args, + custom_tags, ) @@ -363,16 +389,18 @@ def verify_and_reformat_as_relative_to(context: str, dockerfile: str) -> str: try: dockerfile_relative_to_context = str(dockerfile_p.relative_to(context_p)) except ValueError: - logger.exception(f"Dockerfile ({dockerfile}) is not contained within context ({context})") + logger.exception( + f"Dockerfile ({dockerfile}) is not contained within context ({context})" + ) raise else: return f"./{dockerfile_relative_to_context}" def _read_pod_logs(pod_name): - return subprocess.check_output(["kubectl", "logs", pod_name, "-n", NAMESPACE, "kaniko"]).decode( - "utf-8" - ) + return subprocess.check_output( + ["kubectl", "logs", pod_name, "-n", NAMESPACE, "kaniko"] + ).decode("utf-8") def get_pod_status_and_log(job_name: str) -> BuildResult: @@ -430,7 +458,9 @@ def cleanup_logs_process(): logs_process.kill() else: # If we don't ever see a "Running" event print out the logs anyways - subprocess.run(["kubectl", "logs", pod_name, "-n", NAMESPACE, "kaniko"], check=True) + subprocess.run( + ["kubectl", "logs", pod_name, "-n", NAMESPACE, "kaniko"], check=True + ) for event in watcher.stream( core_api_instance.list_namespaced_pod, @@ -446,10 +476,14 @@ def cleanup_logs_process(): ) elif event["object"].status.phase == "Succeeded": cleanup_logs_process() - return BuildResult(status=True, logs=_read_pod_logs(pod_name), job_name=job_name) + return BuildResult( + status=True, logs=_read_pod_logs(pod_name), job_name=job_name + ) elif event["object"].status.phase == "Failed": cleanup_logs_process() - return BuildResult(status=False, logs=_read_pod_logs(pod_name), job_name=job_name) + return BuildResult( + status=False, logs=_read_pod_logs(pod_name), job_name=job_name + ) if logs_process is not None: logs_process.kill() return BuildResult(status=False, logs=_read_pod_logs(pod_name), job_name=job_name) @@ -569,14 +603,18 @@ def build_remote_wrapper( See README for further explanation """ custom_tags = json.loads(custom_tags) - folders_to_include: Optional[List[str]] = folders.split(",") if folders is not None else None + folders_to_include: Optional[List[str]] = ( + folders.split(",") if folders is not None else None + ) cache_name = "kaniko-cache" build_args = None if build_arg: build_arg_kvs = [arg.split("=") for arg in build_arg] - build_args = {k: v for k, v in build_arg_kvs} # pylint:disable=unnecessary-comprehension + build_args = { + k: v for k, v in build_arg_kvs + } # pylint:disable=unnecessary-comprehension if no_block: build_remote( From fea67a9f6667ccad57644d61391e4565375de576 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 15:19:40 -0400 Subject: [PATCH 11/22] fix: normalize endpoint build context paths --- .../repositories/ecr_docker_repository.py | 40 +++++++++++++++++-- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py b/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py index f20ee6ed..4d979014 100644 --- a/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py +++ b/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py @@ -1,7 +1,11 @@ -from typing import Optional +import os +from typing import Dict, Optional from model_engine_server.common.config import hmi_config -from model_engine_server.common.dtos.docker_repository import BuildImageRequest, BuildImageResponse +from model_engine_server.common.dtos.docker_repository import ( + BuildImageRequest, + BuildImageResponse, +) from model_engine_server.core.config import infra_config from model_engine_server.core.docker.ecr import get_latest_image_tag from model_engine_server.core.docker.ecr import image_exists as ecr_image_exists @@ -13,6 +17,28 @@ class ECRDockerRepository(DockerRepository): + @staticmethod + def _normalize_build_args( + base_path: str, build_args: Dict[str, str] + ) -> Dict[str, str]: + normalized = dict(build_args) + base_path_abs = os.path.abspath(base_path) + + for key, value in normalized.items(): + if not isinstance(value, str) or not os.path.isabs(value): + continue + + value_abs = os.path.abspath(value) + try: + if os.path.commonpath([base_path_abs, value_abs]) != base_path_abs: + continue + except ValueError: + continue + + normalized[key] = os.path.relpath(value_abs, base_path_abs) + + return normalized + def image_exists( self, image_tag: str, repository_name: str, aws_profile: Optional[str] = None ) -> bool: @@ -43,7 +69,11 @@ def build_image(self, image_params: BuildImageRequest) -> BuildImageResponse: } if image_params.substitution_args: - build_args.update(image_params.substitution_args) + build_args.update( + self._normalize_build_args( + image_params.base_path, image_params.substitution_args + ) + ) build_result = build_remote_block( context=image_params.base_path, @@ -54,7 +84,9 @@ def build_image(self, image_params: BuildImageRequest) -> BuildImageResponse: cache_name=hmi_config.docker_image_layer_cache_repository, ) return BuildImageResponse( - status=build_result.status, logs=build_result.logs, job_name=build_result.job_name + status=build_result.status, + logs=build_result.logs, + job_name=build_result.job_name, ) def get_latest_image_tag(self, repository_name: str) -> str: From b1d2c94fa5eef8fc789f9bb97b818abcf2b8cf36 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 18:55:22 -0400 Subject: [PATCH 12/22] fix: use writable build context temp dirs --- .../core/docker/remote_build.py | 11 +- .../services/live_endpoint_builder_service.py | 166 +++++++++++++----- .../test_live_endpoint_builder_service.py | 80 +++++++-- 3 files changed, 189 insertions(+), 68 deletions(-) diff --git a/model-engine/model_engine_server/core/docker/remote_build.py b/model-engine/model_engine_server/core/docker/remote_build.py index f024d672..10e5b528 100644 --- a/model-engine/model_engine_server/core/docker/remote_build.py +++ b/model-engine/model_engine_server/core/docker/remote_build.py @@ -93,10 +93,13 @@ def zip_context( ), ) - with open(archive.name, "rb") as archive_in, storage_client.open( - s3_uri, - "wb", - ) as out_file: + with ( + open(archive.name, "rb") as archive_in, + storage_client.open( + s3_uri, + "wb", + ) as out_file, + ): shutil.copyfileobj(archive_in, out_file) print("Done uploading!") except (ClientError, ProfileNotFound): diff --git a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py index 275ba89c..4259c9fa 100644 --- a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py +++ b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py @@ -9,19 +9,27 @@ from datadog import statsd from model_engine_server.common.config import hmi_config -from model_engine_server.common.dtos.docker_repository import BuildImageRequest, BuildImageResponse +from model_engine_server.common.dtos.docker_repository import ( + BuildImageRequest, + BuildImageResponse, +) from model_engine_server.common.dtos.endpoint_builder import ( BuildEndpointRequest, BuildEndpointResponse, BuildEndpointStatus, ) -from model_engine_server.common.dtos.resource_manager import CreateOrUpdateResourcesRequest +from model_engine_server.common.dtos.resource_manager import ( + CreateOrUpdateResourcesRequest, +) from model_engine_server.common.env_vars import LOCAL from model_engine_server.common.io import open_wrapper from model_engine_server.common.serialization_utils import bool_to_str from model_engine_server.core.config import infra_config from model_engine_server.core.loggers import logger_name, make_logger -from model_engine_server.core.notification_gateway import NotificationApp, NotificationGateway +from model_engine_server.core.notification_gateway import ( + NotificationApp, + NotificationGateway, +) from model_engine_server.core.utils.env import environment from model_engine_server.domain.entities import ( ArtifactLike, @@ -79,6 +87,7 @@ GIT_TAG: str = os.getenv("GIT_TAG") # type: ignore ENV: str = os.getenv("DD_ENV") # type: ignore WORKSPACE_PATH = os.getenv("WORKSPACE", ".") +BUILD_CONTEXT_TEMP_ROOT = os.path.join(WORKSPACE_PATH, "model-engine", ".build-context") INITIAL_K8S_CACHE_TTL_SECONDS: int = 180 MAX_IMAGE_TAG_LEN = 128 @@ -151,7 +160,9 @@ async def build_endpoint( self._validate_build_endpoint_request(build_endpoint_request) async with AsyncExitStack() as stack: - lock_ctx = self.model_endpoint_record_repository.get_lock_context(model_endpoint_record) + lock_ctx = self.model_endpoint_record_repository.get_lock_context( + model_endpoint_record + ) lock = await stack.enter_async_context(lock_ctx) # If this can't acquire the lock by the timeout it'll happily keep on going and create # the requisite resources. Not sure this makes complete sense? @@ -204,11 +215,13 @@ async def build_endpoint( ) and build_endpoint_request.high_priority ): - inject_bundle_image_params = self._get_inject_bundle_image_params( - image, - user_image_params, - build_endpoint_request, - logger_adapter, + inject_bundle_image_params = ( + self._get_inject_bundle_image_params( + image, + user_image_params, + build_endpoint_request, + logger_adapter, + ) ) image_repo = inject_bundle_image_params.repo @@ -245,7 +258,9 @@ async def build_endpoint( # CONVERTED_FROM_ARTIFACT_LIKE_KEY will be checked by `get_endpoint_resource_arguments_from_request()` in k8s_resource_types.py if not model_endpoint_record.metadata: model_endpoint_record.metadata = {} - model_endpoint_record.metadata.update({CONVERTED_FROM_ARTIFACT_LIKE_KEY: True}) + model_endpoint_record.metadata.update( + {CONVERTED_FROM_ARTIFACT_LIKE_KEY: True} + ) else: flavor = model_bundle.flavor @@ -278,9 +293,12 @@ async def build_endpoint( # Clean up CONVERTED_FROM_ARTIFACT_LIKE_KEY as it is for internal use only if ( model_endpoint_record.metadata is not None - and CONVERTED_FROM_ARTIFACT_LIKE_KEY in model_endpoint_record.metadata + and CONVERTED_FROM_ARTIFACT_LIKE_KEY + in model_endpoint_record.metadata ): - del model_endpoint_record.metadata[CONVERTED_FROM_ARTIFACT_LIKE_KEY] + del model_endpoint_record.metadata[ + CONVERTED_FROM_ARTIFACT_LIKE_KEY + ] endpoint_info = ModelEndpointInfraState( deployment_name=build_endpoint_request.deployment_name, @@ -322,7 +340,9 @@ async def build_endpoint( endpoint_config = endpoint_info.user_config_state.endpoint_config updated_endpoint_name: Optional[str] = ( - endpoint_config.endpoint_name if endpoint_config is not None else None + endpoint_config.endpoint_name + if endpoint_config is not None + else None ) logger_adapter.info( f"Created {endpoint_id=}: " @@ -365,14 +385,18 @@ async def build_endpoint( try: self.monitoring_metrics_gateway.emit_successful_build_metric() except Exception: # noqa - log_error(f"[Continuing] Failed to emit successful build metric for {endpoint_id=}") + log_error( + f"[Continuing] Failed to emit successful build metric for {endpoint_id=}" + ) try: self.monitoring_metrics_gateway.emit_build_time_metric( time.time() - time_build_endpoint_start ) except Exception: # noqa - log_error(f"[Continuing] Failed to emit endpoint build time metric for {endpoint_id=}") + log_error( + f"[Continuing] Failed to emit endpoint build time metric for {endpoint_id=}" + ) return BuildEndpointResponse(status=BuildEndpointStatus.OK) @@ -446,7 +470,9 @@ def convert_artifact_like_bundle_to_runnable_image( new_model_bundle.flavor = new_flavor new_model_bundle.model_artifact_ids = [] - build_endpoint_request.model_endpoint_record.current_model_bundle = new_model_bundle + build_endpoint_request.model_endpoint_record.current_model_bundle = ( + new_model_bundle + ) def get_base_image_params( self, @@ -463,7 +489,9 @@ def get_base_image_params( if isinstance(env_params, PytorchFramework): image_tag = env_params.pytorch_image_tag if image_tag is None: # pragma: no cover - raise ValueError("Pytorch image tag must be specified if the framework is Pytorch.") + raise ValueError( + "Pytorch image tag must be specified if the framework is Pytorch." + ) logger_adapter.info(f"Using pytorch image tag: {image_tag}") dockerfile = "pytorch_or_tf.base.Dockerfile" base_image = f"pytorch/pytorch:{image_tag}" @@ -476,20 +504,28 @@ def get_base_image_params( # We may change this for Tensorflow GPU mages. tensorflow_version = env_params.tensorflow_version if tensorflow_version is None: # pragma: no cover - raise ValueError("Tensorflow version must be specified if the framework is TF.") + raise ValueError( + "Tensorflow version must be specified if the framework is TF." + ) logger_adapter.info(f"Using tensorflow version: {tensorflow_version}") dockerfile = "pytorch_or_tf.base.Dockerfile" base_image = "continuumio/miniconda3:4.9.2" resulting_image_tag = f"tensorflow-{GIT_TAG}" elif isinstance(env_params, CustomFramework): if env_params.image_tag is None or env_params.image_repository is None: - raise ValueError("Base image tag and ECR repo must be specified for custom images.") + raise ValueError( + "Base image tag and ECR repo must be specified for custom images." + ) base_image_tag = env_params.image_tag ecr_repo = env_params.image_repository - logger_adapter.info(f"Using ECR base image tag: {base_image_tag} in repo: {ecr_repo}") + logger_adapter.info( + f"Using ECR base image tag: {base_image_tag} in repo: {ecr_repo}" + ) dockerfile = "base.Dockerfile" base_image = self.docker_repository.get_image_url(base_image_tag, ecr_repo) - resulting_image_tag = "-".join([ecr_repo, base_image_tag, GIT_TAG]).replace("/", "-") + resulting_image_tag = "-".join([ecr_repo, base_image_tag, GIT_TAG]).replace( + "/", "-" + ) else: # pragma: no cover raise ValueError(f"Unsupported framework_type: {env_params.framework_type}") @@ -526,10 +562,14 @@ def _get_user_image_params( if isinstance(env_params, PytorchFramework): base_image_tag = env_params.pytorch_image_tag if base_image_tag is None: # pragma: no cover - raise ValueError("Pytorch image tag must be specified if the framework is Pytorch.") + raise ValueError( + "Pytorch image tag must be specified if the framework is Pytorch." + ) dockerfile = "pytorch_or_tf.user.Dockerfile" - service_image_tag = self._get_image_tag(base_image_tag, GIT_TAG, requirements_hash) + service_image_tag = self._get_image_tag( + base_image_tag, GIT_TAG, requirements_hash + ) ecr_repo = hmi_config.user_inference_pytorch_repository elif isinstance(env_params, TensorflowFramework): if build_endpoint_request.gpus > 0: @@ -539,34 +579,42 @@ def _get_user_image_params( # We may change this for Tensorflow GPU mages. tensorflow_version = env_params.tensorflow_version if tensorflow_version is None: # pragma: no cover - raise ValueError("Tensorflow version must be specified if the framework is TF.") + raise ValueError( + "Tensorflow version must be specified if the framework is TF." + ) dockerfile = "pytorch_or_tf.user.Dockerfile" - service_image_tag = self._get_image_tag(tensorflow_version, GIT_TAG, requirements_hash) + service_image_tag = self._get_image_tag( + tensorflow_version, GIT_TAG, requirements_hash + ) ecr_repo = hmi_config.user_inference_tensorflow_repository elif isinstance(env_params, CustomFramework): if ( env_params.image_tag is None or env_params.image_repository is None ): # pragma: no cover - raise ValueError("Base image tag and ECR repo must be specified for custom images.") + raise ValueError( + "Base image tag and ECR repo must be specified for custom images." + ) base_image_tag = env_params.image_tag dockerfile = "user.Dockerfile" - service_image_tag = self._get_image_tag(base_image_tag, GIT_TAG, requirements_hash) + service_image_tag = self._get_image_tag( + base_image_tag, GIT_TAG, requirements_hash + ) ecr_repo = env_params.image_repository else: # pragma: no cover raise ValueError(f"Unsupported framework_type: {env_params.framework_type}") # The context should be whatever WORKDIR is in the container running the build app itself. inference_folder = "model-engine/model_engine_server/inference" - requirements_folder = os.path.join(WORKSPACE_PATH, f"requirements_{requirements_hash}") - try: - os.mkdir(requirements_folder) - except FileExistsError: - pass + requirements_folder = self._create_build_context_dir( + prefix=f"requirements_{requirements_hash}_" + ) requirements_file = os.path.join(requirements_folder, "requirements.txt") with open(requirements_file, "w") as f: requirements_contents = "\n".join(model_bundle.requirements or []) - logger_adapter.info(f"Will pip install these requirements: {requirements_contents}") + logger_adapter.info( + f"Will pip install these requirements: {requirements_contents}" + ) f.write(requirements_contents) substitution_args = {"REQUIREMENTS_FILE": requirements_file} @@ -610,11 +658,9 @@ def _get_inject_bundle_image_params( # The context should be whatever WORKDIR is in the container running the build app itself. dockerfile = "inject_bundle.Dockerfile" inference_folder = "model-engine/model_engine_server/inference" - bundle_folder = os.path.join(WORKSPACE_PATH, f"bundle_{service_image_hash}") - try: - os.mkdir(bundle_folder) - except FileExistsError: - pass + bundle_folder = self._create_build_context_dir( + prefix=f"bundle_{service_image_hash}_" + ) _, model_bundle_path = tempfile.mkstemp(dir=bundle_folder, suffix=".zip") bundle_url = model_bundle.location logger_adapter.info( @@ -669,15 +715,19 @@ async def _build_image( image_tag=image_params.image_tag, aws_profile=ECR_AWS_PROFILE, ): - self.monitoring_metrics_gateway.emit_image_build_cache_miss_metric(image_type) + self.monitoring_metrics_gateway.emit_image_build_cache_miss_metric( + image_type + ) tags = [ f"kube_deployment:{build_endpoint_request.deployment_name}", f"user_id:{user_id}", ] with statsd.timed(f"kaniko.{image_type}_build_time", tags=tags): try: - build_result: BuildImageResponse = self.docker_repository.build_image( - image_params, + build_result: BuildImageResponse = ( + self.docker_repository.build_image( + image_params, + ) ) build_result_status = build_result.status build_result_logs: str = build_result.logs @@ -741,7 +791,9 @@ async def _build_image( user_id = build_endpoint_request.model_endpoint_record.created_by endpoint_name = build_endpoint_request.model_endpoint_record.name - bundle_id = build_endpoint_request.model_endpoint_record.current_model_bundle.id + bundle_id = ( + build_endpoint_request.model_endpoint_record.current_model_bundle.id + ) message = ( f"Your endpoint '{endpoint_name}' failed to build! " f"Endpoint ID: {endpoint_id}. Bundle ID: {bundle_id}." @@ -758,15 +810,21 @@ async def _build_image( users=[user_id], ) - raise DockerBuildFailedException(f"Image build failed ({endpoint_id=})") + raise DockerBuildFailedException( + f"Image build failed ({endpoint_id=})" + ) else: - self.monitoring_metrics_gateway.emit_image_build_cache_hit_metric(image_type) + self.monitoring_metrics_gateway.emit_image_build_cache_hit_metric( + image_type + ) logger_adapter.info( f"Image already exists, skipping build. Image={image_params.repo}:{image_params.image_tag}, {endpoint_id=}" ) - return self.docker_repository.get_image_url(image_params.image_tag, image_params.repo) + return self.docker_repository.get_image_url( + image_params.image_tag, image_params.repo + ) @staticmethod def _validate_build_endpoint_request( @@ -787,7 +845,10 @@ def _validate_build_endpoint_request( model_bundle: ModelBundle = ( build_endpoint_request.model_endpoint_record.current_model_bundle ) - if isinstance(model_bundle.flavor, RunnableImageLike) and model_bundle.flavor.env: + if ( + isinstance(model_bundle.flavor, RunnableImageLike) + and model_bundle.flavor.env + ): restriced_env_vars = LiveEndpointBuilderService._get_restricted_env_vars( model_bundle.flavor.env ) @@ -805,7 +866,9 @@ def _validate_build_endpoint_request( @staticmethod def _get_restricted_env_vars(env_vars: Dict[str, str]) -> Set[str]: - restricted_env_vars = set(key for keys in RESTRICTED_ENV_VARS_KEYS.values() for key in keys) + restricted_env_vars = set( + key for keys in RESTRICTED_ENV_VARS_KEYS.values() for key in keys + ) return set(env_vars.keys()) & restricted_env_vars @staticmethod @@ -820,7 +883,14 @@ def _get_requirements_hash(requirements: List[str]) -> str: ).hexdigest()[:6] @staticmethod - def _get_image_tag(base_image_tag: str, git_tag: str, requirements_hash: str) -> str: + def _create_build_context_dir(prefix: str) -> str: + os.makedirs(BUILD_CONTEXT_TEMP_ROOT, exist_ok=True) + return tempfile.mkdtemp(prefix=prefix, dir=BUILD_CONTEXT_TEMP_ROOT) + + @staticmethod + def _get_image_tag( + base_image_tag: str, git_tag: str, requirements_hash: str + ) -> str: """An identifier from an endpoint's base Docker image & git tag, plus the identify of its pip-installable requirements. """ diff --git a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py index a0e876eb..eedd6200 100644 --- a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py +++ b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py @@ -103,9 +103,15 @@ def set_env_vars(): live_endpoint_builder_service.GIT_TAG = "test_tag" live_endpoint_builder_service.ENV = "test_env" live_endpoint_builder_service.WORKSPACE_PATH = ".." + live_endpoint_builder_service.BUILD_CONTEXT_TEMP_ROOT = ( + "../model-engine/.build-context" + ) live_endpoint_builder_service.open = mock_open() - live_endpoint_builder_service.os.mkdir = Mock() + live_endpoint_builder_service.os.makedirs = Mock() live_endpoint_builder_service.open_wrapper = mock_open() + live_endpoint_builder_service.tempfile.mkdtemp = Mock( + return_value="../model-engine/.build-context/tmpdir" + ) live_endpoint_builder_service.tempfile.mkstemp = Mock(return_value=["", ""]) @@ -150,13 +156,35 @@ async def test_build_endpoint( assert fake_monitoring_metrics_gateway.docker_failed_build == 0 assert fake_monitoring_metrics_gateway.successful_build == 1 assert fake_monitoring_metrics_gateway.build_time_seconds > 0 - if isinstance(request.model_endpoint_record.current_model_bundle.flavor, ArtifactLike): + if isinstance( + request.model_endpoint_record.current_model_bundle.flavor, ArtifactLike + ): if service == endpoint_builder_service_empty_docker_built: - assert sum(fake_monitoring_metrics_gateway.image_build_cache_hit.values()) > 0 - assert sum(fake_monitoring_metrics_gateway.image_build_cache_miss.values()) == 0 + assert ( + sum( + fake_monitoring_metrics_gateway.image_build_cache_hit.values() + ) + > 0 + ) + assert ( + sum( + fake_monitoring_metrics_gateway.image_build_cache_miss.values() + ) + == 0 + ) else: - assert sum(fake_monitoring_metrics_gateway.image_build_cache_hit.values()) == 0 - assert sum(fake_monitoring_metrics_gateway.image_build_cache_miss.values()) > 0 + assert ( + sum( + fake_monitoring_metrics_gateway.image_build_cache_hit.values() + ) + == 0 + ) + assert ( + sum( + fake_monitoring_metrics_gateway.image_build_cache_miss.values() + ) + > 0 + ) @pytest.mark.asyncio @@ -165,8 +193,12 @@ async def test_build_endpoint_update_failed_raises_resource_manager_exception( endpoint_builder_service_empty_docker_built: LiveEndpointBuilderService, fake_monitoring_metrics_gateway: FakeMonitoringMetricsGateway, ): - repo: Any = endpoint_builder_service_empty_docker_built.model_endpoint_record_repository - repo.add_model_endpoint_record(build_endpoint_request_sync_pytorch.model_endpoint_record) + repo: Any = ( + endpoint_builder_service_empty_docker_built.model_endpoint_record_repository + ) + repo.add_model_endpoint_record( + build_endpoint_request_sync_pytorch.model_endpoint_record + ) endpoint_builder_service_empty_docker_built.resource_gateway.__setattr__( "create_or_update_resources", Mock(side_effect=EndpointResourceInfraException) ) @@ -184,8 +216,12 @@ async def test_build_endpoint_tensorflow_with_nonzero_gpu_raises_not_implemented build_endpoint_request_async_tensorflow: BuildEndpointRequest, endpoint_builder_service_empty_docker_not_built: LiveEndpointBuilderService, ): - repo: Any = endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository - repo.add_model_endpoint_record(build_endpoint_request_async_tensorflow.model_endpoint_record) + repo: Any = ( + endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository + ) + repo.add_model_endpoint_record( + build_endpoint_request_async_tensorflow.model_endpoint_record + ) build_endpoint_request_async_tensorflow.gpus = 1 with pytest.raises(NotImplementedError): await endpoint_builder_service_empty_docker_not_built.build_endpoint( @@ -198,8 +234,12 @@ async def test_build_endpoint_tensorflow_with_invalid_aws_role_raises_value_erro build_endpoint_request_async_tensorflow: BuildEndpointRequest, endpoint_builder_service_empty_docker_not_built: LiveEndpointBuilderService, ): - repo: Any = endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository - repo.add_model_endpoint_record(build_endpoint_request_async_tensorflow.model_endpoint_record) + repo: Any = ( + endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository + ) + repo.add_model_endpoint_record( + build_endpoint_request_async_tensorflow.model_endpoint_record + ) build_endpoint_request_async_tensorflow.aws_role = "invalid_aws_role" with pytest.raises(ValueError): await endpoint_builder_service_empty_docker_not_built.build_endpoint( @@ -214,8 +254,12 @@ async def test_build_endpoint_build_result_failed_yields_docker_build_failed_exc fake_monitoring_metrics_gateway: FakeMonitoringMetricsGateway, fake_notification_gateway: FakeNotificationGateway, ): - repo: Any = endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository - repo.add_model_endpoint_record(build_endpoint_request_sync_pytorch.model_endpoint_record) + repo: Any = ( + endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository + ) + repo.add_model_endpoint_record( + build_endpoint_request_sync_pytorch.model_endpoint_record + ) endpoint_builder_service_empty_docker_not_built.docker_repository.__setattr__( "build_image", Mock(return_value=BuildImageResponse(status=False, logs="", job_name="")), @@ -241,7 +285,9 @@ async def test_build_endpoint_build_result_throws_error_yields_docker_build_fail repo: Any = ( endpoint_builder_service_empty_docker_builds_dont_work.model_endpoint_record_repository ) - repo.add_model_endpoint_record(build_endpoint_request_sync_pytorch.model_endpoint_record) + repo.add_model_endpoint_record( + build_endpoint_request_sync_pytorch.model_endpoint_record + ) with pytest.raises(DockerBuildFailedException): await endpoint_builder_service_empty_docker_builds_dont_work.build_endpoint( build_endpoint_request_sync_pytorch @@ -265,7 +311,9 @@ def test_convert_artifact_like_bundle_to_runnable_image( build_endpoint_request_sync_custom, "test_repo", "test_tag" ) - new_bundle = build_endpoint_request_sync_custom.model_endpoint_record.current_model_bundle + new_bundle = ( + build_endpoint_request_sync_custom.model_endpoint_record.current_model_bundle + ) assert isinstance(new_bundle.flavor, RunnableImageFlavor) assert new_bundle.flavor.repository == "test_repo" From 13f827f7a333347400d38115ed8c546d1ec3c26b Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 19:32:14 -0400 Subject: [PATCH 13/22] fix: unblock simple bundle endpoint builds --- .../core/docker/remote_build.py | 62 ++------ .../repositories/ecr_docker_repository.py | 13 +- .../services/live_endpoint_builder_service.py | 148 +++++------------- .../test_live_endpoint_builder_service.py | 76 ++------- 4 files changed, 75 insertions(+), 224 deletions(-) diff --git a/model-engine/model_engine_server/core/docker/remote_build.py b/model-engine/model_engine_server/core/docker/remote_build.py index 10e5b528..c36f8260 100644 --- a/model-engine/model_engine_server/core/docker/remote_build.py +++ b/model-engine/model_engine_server/core/docker/remote_build.py @@ -82,15 +82,11 @@ def zip_context( print(f"Creating archive: {archive.name}") with tarfile.open(archive.name, mode="w:gz") as tar: for folder in folders_to_include: - resolved_path, archive_root = _normalize_path_for_archive( - context_path, folder - ) + resolved_path, archive_root = _normalize_path_for_archive(context_path, folder) tar.add( resolved_path, arcname=archive_root, - filter=lambda tar_info: _filter_archive_member( - tar_info, ignore_patterns - ), + filter=lambda tar_info: _filter_archive_member(tar_info, ignore_patterns), ) with ( @@ -103,9 +99,7 @@ def zip_context( shutil.copyfileobj(archive_in, out_file) print("Done uploading!") except (ClientError, ProfileNotFound): - print( - "Did you gimme_okta_aws_creds and then export AWS_PROFILE='ml-admin'? Try doing both" - ) + print("Did you gimme_okta_aws_creds and then export AWS_PROFILE='ml-admin'? Try doing both") raise @@ -129,9 +123,7 @@ def _read_ignore_patterns(context_path: Path, ignore_file: Optional[str]) -> Lis return patterns -def _normalize_path_for_archive( - context_path: Path, folder_to_include: str -) -> tuple[Path, str]: +def _normalize_path_for_archive(context_path: Path, folder_to_include: str) -> tuple[Path, str]: include_path = Path(folder_to_include) resolved_path = ( include_path.resolve() @@ -225,9 +217,7 @@ def start_build_job( NAMESPACE=NAMESPACE, ) yml = yaml.safe_load(job) - destinations = [ - destination_template.substitute(REPO_AND_TAG=rt) for rt in repotags - ] + destinations = [destination_template.substitute(REPO_AND_TAG=rt) for rt in repotags] yml["spec"]["template"]["spec"]["containers"][0]["args"].extend(destinations) if build_args: @@ -238,9 +228,7 @@ def start_build_job( yaml.dump(yml, stream=f, default_flow_style=False) f.seek(0) - container_spec: str = yaml.dump( - yml["spec"]["template"]["spec"]["containers"][0] - ).strip() + container_spec: str = yaml.dump(yml["spec"]["template"]["spec"]["containers"][0]).strip() print("Maybe update CodeArtifact token secret") if not os.path.exists("/tmp"): @@ -259,9 +247,7 @@ def start_build_job( with open(pip_conf_file) as f_conf: pip_conf_data = f_conf.read() except (subprocess.CalledProcessError, FileNotFoundError): - print( - "WARNING: Failed to refresh CodeArtifact token secret, using empty secret" - ) + print("WARNING: Failed to refresh CodeArtifact token secret, using empty secret") pip_conf_data = "" pip_conf_base64 = b64encode(pip_conf_data.encode("utf-8")).decode("utf-8") data = {"data": {"codeartifact_pip_conf": pip_conf_base64}} @@ -278,9 +264,7 @@ def start_build_job( print(f"Executing Kaniko build command:\n{container_spec}") print("-" * 80) - print( - subprocess.check_output(["kubectl", "apply", "-f", f.name]).decode("utf-8") - ) + print(subprocess.check_output(["kubectl", "apply", "-f", f.name]).decode("utf-8")) return job_name @@ -392,18 +376,16 @@ def verify_and_reformat_as_relative_to(context: str, dockerfile: str) -> str: try: dockerfile_relative_to_context = str(dockerfile_p.relative_to(context_p)) except ValueError: - logger.exception( - f"Dockerfile ({dockerfile}) is not contained within context ({context})" - ) + logger.exception(f"Dockerfile ({dockerfile}) is not contained within context ({context})") raise else: return f"./{dockerfile_relative_to_context}" def _read_pod_logs(pod_name): - return subprocess.check_output( - ["kubectl", "logs", pod_name, "-n", NAMESPACE, "kaniko"] - ).decode("utf-8") + return subprocess.check_output(["kubectl", "logs", pod_name, "-n", NAMESPACE, "kaniko"]).decode( + "utf-8" + ) def get_pod_status_and_log(job_name: str) -> BuildResult: @@ -461,9 +443,7 @@ def cleanup_logs_process(): logs_process.kill() else: # If we don't ever see a "Running" event print out the logs anyways - subprocess.run( - ["kubectl", "logs", pod_name, "-n", NAMESPACE, "kaniko"], check=True - ) + subprocess.run(["kubectl", "logs", pod_name, "-n", NAMESPACE, "kaniko"], check=True) for event in watcher.stream( core_api_instance.list_namespaced_pod, @@ -479,14 +459,10 @@ def cleanup_logs_process(): ) elif event["object"].status.phase == "Succeeded": cleanup_logs_process() - return BuildResult( - status=True, logs=_read_pod_logs(pod_name), job_name=job_name - ) + return BuildResult(status=True, logs=_read_pod_logs(pod_name), job_name=job_name) elif event["object"].status.phase == "Failed": cleanup_logs_process() - return BuildResult( - status=False, logs=_read_pod_logs(pod_name), job_name=job_name - ) + return BuildResult(status=False, logs=_read_pod_logs(pod_name), job_name=job_name) if logs_process is not None: logs_process.kill() return BuildResult(status=False, logs=_read_pod_logs(pod_name), job_name=job_name) @@ -606,18 +582,14 @@ def build_remote_wrapper( See README for further explanation """ custom_tags = json.loads(custom_tags) - folders_to_include: Optional[List[str]] = ( - folders.split(",") if folders is not None else None - ) + folders_to_include: Optional[List[str]] = folders.split(",") if folders is not None else None cache_name = "kaniko-cache" build_args = None if build_arg: build_arg_kvs = [arg.split("=") for arg in build_arg] - build_args = { - k: v for k, v in build_arg_kvs - } # pylint:disable=unnecessary-comprehension + build_args = {k: v for k, v in build_arg_kvs} # pylint:disable=unnecessary-comprehension if no_block: build_remote( diff --git a/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py b/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py index 4d979014..59b9aa7c 100644 --- a/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py +++ b/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py @@ -2,10 +2,7 @@ from typing import Dict, Optional from model_engine_server.common.config import hmi_config -from model_engine_server.common.dtos.docker_repository import ( - BuildImageRequest, - BuildImageResponse, -) +from model_engine_server.common.dtos.docker_repository import BuildImageRequest, BuildImageResponse from model_engine_server.core.config import infra_config from model_engine_server.core.docker.ecr import get_latest_image_tag from model_engine_server.core.docker.ecr import image_exists as ecr_image_exists @@ -18,9 +15,7 @@ class ECRDockerRepository(DockerRepository): @staticmethod - def _normalize_build_args( - base_path: str, build_args: Dict[str, str] - ) -> Dict[str, str]: + def _normalize_build_args(base_path: str, build_args: Dict[str, str]) -> Dict[str, str]: normalized = dict(build_args) base_path_abs = os.path.abspath(base_path) @@ -70,9 +65,7 @@ def build_image(self, image_params: BuildImageRequest) -> BuildImageResponse: if image_params.substitution_args: build_args.update( - self._normalize_build_args( - image_params.base_path, image_params.substitution_args - ) + self._normalize_build_args(image_params.base_path, image_params.substitution_args) ) build_result = build_remote_block( diff --git a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py index 4259c9fa..1a8b7e6f 100644 --- a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py +++ b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py @@ -9,27 +9,19 @@ from datadog import statsd from model_engine_server.common.config import hmi_config -from model_engine_server.common.dtos.docker_repository import ( - BuildImageRequest, - BuildImageResponse, -) +from model_engine_server.common.dtos.docker_repository import BuildImageRequest, BuildImageResponse from model_engine_server.common.dtos.endpoint_builder import ( BuildEndpointRequest, BuildEndpointResponse, BuildEndpointStatus, ) -from model_engine_server.common.dtos.resource_manager import ( - CreateOrUpdateResourcesRequest, -) +from model_engine_server.common.dtos.resource_manager import CreateOrUpdateResourcesRequest from model_engine_server.common.env_vars import LOCAL from model_engine_server.common.io import open_wrapper from model_engine_server.common.serialization_utils import bool_to_str from model_engine_server.core.config import infra_config from model_engine_server.core.loggers import logger_name, make_logger -from model_engine_server.core.notification_gateway import ( - NotificationApp, - NotificationGateway, -) +from model_engine_server.core.notification_gateway import NotificationApp, NotificationGateway from model_engine_server.core.utils.env import environment from model_engine_server.domain.entities import ( ArtifactLike, @@ -160,9 +152,7 @@ async def build_endpoint( self._validate_build_endpoint_request(build_endpoint_request) async with AsyncExitStack() as stack: - lock_ctx = self.model_endpoint_record_repository.get_lock_context( - model_endpoint_record - ) + lock_ctx = self.model_endpoint_record_repository.get_lock_context(model_endpoint_record) lock = await stack.enter_async_context(lock_ctx) # If this can't acquire the lock by the timeout it'll happily keep on going and create # the requisite resources. Not sure this makes complete sense? @@ -215,13 +205,11 @@ async def build_endpoint( ) and build_endpoint_request.high_priority ): - inject_bundle_image_params = ( - self._get_inject_bundle_image_params( - image, - user_image_params, - build_endpoint_request, - logger_adapter, - ) + inject_bundle_image_params = self._get_inject_bundle_image_params( + image, + user_image_params, + build_endpoint_request, + logger_adapter, ) image_repo = inject_bundle_image_params.repo @@ -258,9 +246,7 @@ async def build_endpoint( # CONVERTED_FROM_ARTIFACT_LIKE_KEY will be checked by `get_endpoint_resource_arguments_from_request()` in k8s_resource_types.py if not model_endpoint_record.metadata: model_endpoint_record.metadata = {} - model_endpoint_record.metadata.update( - {CONVERTED_FROM_ARTIFACT_LIKE_KEY: True} - ) + model_endpoint_record.metadata.update({CONVERTED_FROM_ARTIFACT_LIKE_KEY: True}) else: flavor = model_bundle.flavor @@ -293,12 +279,9 @@ async def build_endpoint( # Clean up CONVERTED_FROM_ARTIFACT_LIKE_KEY as it is for internal use only if ( model_endpoint_record.metadata is not None - and CONVERTED_FROM_ARTIFACT_LIKE_KEY - in model_endpoint_record.metadata + and CONVERTED_FROM_ARTIFACT_LIKE_KEY in model_endpoint_record.metadata ): - del model_endpoint_record.metadata[ - CONVERTED_FROM_ARTIFACT_LIKE_KEY - ] + del model_endpoint_record.metadata[CONVERTED_FROM_ARTIFACT_LIKE_KEY] endpoint_info = ModelEndpointInfraState( deployment_name=build_endpoint_request.deployment_name, @@ -340,9 +323,7 @@ async def build_endpoint( endpoint_config = endpoint_info.user_config_state.endpoint_config updated_endpoint_name: Optional[str] = ( - endpoint_config.endpoint_name - if endpoint_config is not None - else None + endpoint_config.endpoint_name if endpoint_config is not None else None ) logger_adapter.info( f"Created {endpoint_id=}: " @@ -385,18 +366,14 @@ async def build_endpoint( try: self.monitoring_metrics_gateway.emit_successful_build_metric() except Exception: # noqa - log_error( - f"[Continuing] Failed to emit successful build metric for {endpoint_id=}" - ) + log_error(f"[Continuing] Failed to emit successful build metric for {endpoint_id=}") try: self.monitoring_metrics_gateway.emit_build_time_metric( time.time() - time_build_endpoint_start ) except Exception: # noqa - log_error( - f"[Continuing] Failed to emit endpoint build time metric for {endpoint_id=}" - ) + log_error(f"[Continuing] Failed to emit endpoint build time metric for {endpoint_id=}") return BuildEndpointResponse(status=BuildEndpointStatus.OK) @@ -470,9 +447,7 @@ def convert_artifact_like_bundle_to_runnable_image( new_model_bundle.flavor = new_flavor new_model_bundle.model_artifact_ids = [] - build_endpoint_request.model_endpoint_record.current_model_bundle = ( - new_model_bundle - ) + build_endpoint_request.model_endpoint_record.current_model_bundle = new_model_bundle def get_base_image_params( self, @@ -489,9 +464,7 @@ def get_base_image_params( if isinstance(env_params, PytorchFramework): image_tag = env_params.pytorch_image_tag if image_tag is None: # pragma: no cover - raise ValueError( - "Pytorch image tag must be specified if the framework is Pytorch." - ) + raise ValueError("Pytorch image tag must be specified if the framework is Pytorch.") logger_adapter.info(f"Using pytorch image tag: {image_tag}") dockerfile = "pytorch_or_tf.base.Dockerfile" base_image = f"pytorch/pytorch:{image_tag}" @@ -504,28 +477,20 @@ def get_base_image_params( # We may change this for Tensorflow GPU mages. tensorflow_version = env_params.tensorflow_version if tensorflow_version is None: # pragma: no cover - raise ValueError( - "Tensorflow version must be specified if the framework is TF." - ) + raise ValueError("Tensorflow version must be specified if the framework is TF.") logger_adapter.info(f"Using tensorflow version: {tensorflow_version}") dockerfile = "pytorch_or_tf.base.Dockerfile" base_image = "continuumio/miniconda3:4.9.2" resulting_image_tag = f"tensorflow-{GIT_TAG}" elif isinstance(env_params, CustomFramework): if env_params.image_tag is None or env_params.image_repository is None: - raise ValueError( - "Base image tag and ECR repo must be specified for custom images." - ) + raise ValueError("Base image tag and ECR repo must be specified for custom images.") base_image_tag = env_params.image_tag ecr_repo = env_params.image_repository - logger_adapter.info( - f"Using ECR base image tag: {base_image_tag} in repo: {ecr_repo}" - ) + logger_adapter.info(f"Using ECR base image tag: {base_image_tag} in repo: {ecr_repo}") dockerfile = "base.Dockerfile" base_image = self.docker_repository.get_image_url(base_image_tag, ecr_repo) - resulting_image_tag = "-".join([ecr_repo, base_image_tag, GIT_TAG]).replace( - "/", "-" - ) + resulting_image_tag = "-".join([ecr_repo, base_image_tag, GIT_TAG]).replace("/", "-") else: # pragma: no cover raise ValueError(f"Unsupported framework_type: {env_params.framework_type}") @@ -562,14 +527,10 @@ def _get_user_image_params( if isinstance(env_params, PytorchFramework): base_image_tag = env_params.pytorch_image_tag if base_image_tag is None: # pragma: no cover - raise ValueError( - "Pytorch image tag must be specified if the framework is Pytorch." - ) + raise ValueError("Pytorch image tag must be specified if the framework is Pytorch.") dockerfile = "pytorch_or_tf.user.Dockerfile" - service_image_tag = self._get_image_tag( - base_image_tag, GIT_TAG, requirements_hash - ) + service_image_tag = self._get_image_tag(base_image_tag, GIT_TAG, requirements_hash) ecr_repo = hmi_config.user_inference_pytorch_repository elif isinstance(env_params, TensorflowFramework): if build_endpoint_request.gpus > 0: @@ -579,26 +540,18 @@ def _get_user_image_params( # We may change this for Tensorflow GPU mages. tensorflow_version = env_params.tensorflow_version if tensorflow_version is None: # pragma: no cover - raise ValueError( - "Tensorflow version must be specified if the framework is TF." - ) + raise ValueError("Tensorflow version must be specified if the framework is TF.") dockerfile = "pytorch_or_tf.user.Dockerfile" - service_image_tag = self._get_image_tag( - tensorflow_version, GIT_TAG, requirements_hash - ) + service_image_tag = self._get_image_tag(tensorflow_version, GIT_TAG, requirements_hash) ecr_repo = hmi_config.user_inference_tensorflow_repository elif isinstance(env_params, CustomFramework): if ( env_params.image_tag is None or env_params.image_repository is None ): # pragma: no cover - raise ValueError( - "Base image tag and ECR repo must be specified for custom images." - ) + raise ValueError("Base image tag and ECR repo must be specified for custom images.") base_image_tag = env_params.image_tag dockerfile = "user.Dockerfile" - service_image_tag = self._get_image_tag( - base_image_tag, GIT_TAG, requirements_hash - ) + service_image_tag = self._get_image_tag(base_image_tag, GIT_TAG, requirements_hash) ecr_repo = env_params.image_repository else: # pragma: no cover raise ValueError(f"Unsupported framework_type: {env_params.framework_type}") @@ -612,9 +565,7 @@ def _get_user_image_params( requirements_file = os.path.join(requirements_folder, "requirements.txt") with open(requirements_file, "w") as f: requirements_contents = "\n".join(model_bundle.requirements or []) - logger_adapter.info( - f"Will pip install these requirements: {requirements_contents}" - ) + logger_adapter.info(f"Will pip install these requirements: {requirements_contents}") f.write(requirements_contents) substitution_args = {"REQUIREMENTS_FILE": requirements_file} @@ -658,9 +609,7 @@ def _get_inject_bundle_image_params( # The context should be whatever WORKDIR is in the container running the build app itself. dockerfile = "inject_bundle.Dockerfile" inference_folder = "model-engine/model_engine_server/inference" - bundle_folder = self._create_build_context_dir( - prefix=f"bundle_{service_image_hash}_" - ) + bundle_folder = self._create_build_context_dir(prefix=f"bundle_{service_image_hash}_") _, model_bundle_path = tempfile.mkstemp(dir=bundle_folder, suffix=".zip") bundle_url = model_bundle.location logger_adapter.info( @@ -715,19 +664,15 @@ async def _build_image( image_tag=image_params.image_tag, aws_profile=ECR_AWS_PROFILE, ): - self.monitoring_metrics_gateway.emit_image_build_cache_miss_metric( - image_type - ) + self.monitoring_metrics_gateway.emit_image_build_cache_miss_metric(image_type) tags = [ f"kube_deployment:{build_endpoint_request.deployment_name}", f"user_id:{user_id}", ] with statsd.timed(f"kaniko.{image_type}_build_time", tags=tags): try: - build_result: BuildImageResponse = ( - self.docker_repository.build_image( - image_params, - ) + build_result: BuildImageResponse = self.docker_repository.build_image( + image_params, ) build_result_status = build_result.status build_result_logs: str = build_result.logs @@ -791,9 +736,7 @@ async def _build_image( user_id = build_endpoint_request.model_endpoint_record.created_by endpoint_name = build_endpoint_request.model_endpoint_record.name - bundle_id = ( - build_endpoint_request.model_endpoint_record.current_model_bundle.id - ) + bundle_id = build_endpoint_request.model_endpoint_record.current_model_bundle.id message = ( f"Your endpoint '{endpoint_name}' failed to build! " f"Endpoint ID: {endpoint_id}. Bundle ID: {bundle_id}." @@ -810,21 +753,15 @@ async def _build_image( users=[user_id], ) - raise DockerBuildFailedException( - f"Image build failed ({endpoint_id=})" - ) + raise DockerBuildFailedException(f"Image build failed ({endpoint_id=})") else: - self.monitoring_metrics_gateway.emit_image_build_cache_hit_metric( - image_type - ) + self.monitoring_metrics_gateway.emit_image_build_cache_hit_metric(image_type) logger_adapter.info( f"Image already exists, skipping build. Image={image_params.repo}:{image_params.image_tag}, {endpoint_id=}" ) - return self.docker_repository.get_image_url( - image_params.image_tag, image_params.repo - ) + return self.docker_repository.get_image_url(image_params.image_tag, image_params.repo) @staticmethod def _validate_build_endpoint_request( @@ -845,10 +782,7 @@ def _validate_build_endpoint_request( model_bundle: ModelBundle = ( build_endpoint_request.model_endpoint_record.current_model_bundle ) - if ( - isinstance(model_bundle.flavor, RunnableImageLike) - and model_bundle.flavor.env - ): + if isinstance(model_bundle.flavor, RunnableImageLike) and model_bundle.flavor.env: restriced_env_vars = LiveEndpointBuilderService._get_restricted_env_vars( model_bundle.flavor.env ) @@ -866,9 +800,7 @@ def _validate_build_endpoint_request( @staticmethod def _get_restricted_env_vars(env_vars: Dict[str, str]) -> Set[str]: - restricted_env_vars = set( - key for keys in RESTRICTED_ENV_VARS_KEYS.values() for key in keys - ) + restricted_env_vars = set(key for keys in RESTRICTED_ENV_VARS_KEYS.values() for key in keys) return set(env_vars.keys()) & restricted_env_vars @staticmethod @@ -888,9 +820,7 @@ def _create_build_context_dir(prefix: str) -> str: return tempfile.mkdtemp(prefix=prefix, dir=BUILD_CONTEXT_TEMP_ROOT) @staticmethod - def _get_image_tag( - base_image_tag: str, git_tag: str, requirements_hash: str - ) -> str: + def _get_image_tag(base_image_tag: str, git_tag: str, requirements_hash: str) -> str: """An identifier from an endpoint's base Docker image & git tag, plus the identify of its pip-installable requirements. """ diff --git a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py index eedd6200..5b22acce 100644 --- a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py +++ b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py @@ -103,9 +103,7 @@ def set_env_vars(): live_endpoint_builder_service.GIT_TAG = "test_tag" live_endpoint_builder_service.ENV = "test_env" live_endpoint_builder_service.WORKSPACE_PATH = ".." - live_endpoint_builder_service.BUILD_CONTEXT_TEMP_ROOT = ( - "../model-engine/.build-context" - ) + live_endpoint_builder_service.BUILD_CONTEXT_TEMP_ROOT = "../model-engine/.build-context" live_endpoint_builder_service.open = mock_open() live_endpoint_builder_service.os.makedirs = Mock() live_endpoint_builder_service.open_wrapper = mock_open() @@ -156,35 +154,13 @@ async def test_build_endpoint( assert fake_monitoring_metrics_gateway.docker_failed_build == 0 assert fake_monitoring_metrics_gateway.successful_build == 1 assert fake_monitoring_metrics_gateway.build_time_seconds > 0 - if isinstance( - request.model_endpoint_record.current_model_bundle.flavor, ArtifactLike - ): + if isinstance(request.model_endpoint_record.current_model_bundle.flavor, ArtifactLike): if service == endpoint_builder_service_empty_docker_built: - assert ( - sum( - fake_monitoring_metrics_gateway.image_build_cache_hit.values() - ) - > 0 - ) - assert ( - sum( - fake_monitoring_metrics_gateway.image_build_cache_miss.values() - ) - == 0 - ) + assert sum(fake_monitoring_metrics_gateway.image_build_cache_hit.values()) > 0 + assert sum(fake_monitoring_metrics_gateway.image_build_cache_miss.values()) == 0 else: - assert ( - sum( - fake_monitoring_metrics_gateway.image_build_cache_hit.values() - ) - == 0 - ) - assert ( - sum( - fake_monitoring_metrics_gateway.image_build_cache_miss.values() - ) - > 0 - ) + assert sum(fake_monitoring_metrics_gateway.image_build_cache_hit.values()) == 0 + assert sum(fake_monitoring_metrics_gateway.image_build_cache_miss.values()) > 0 @pytest.mark.asyncio @@ -193,12 +169,8 @@ async def test_build_endpoint_update_failed_raises_resource_manager_exception( endpoint_builder_service_empty_docker_built: LiveEndpointBuilderService, fake_monitoring_metrics_gateway: FakeMonitoringMetricsGateway, ): - repo: Any = ( - endpoint_builder_service_empty_docker_built.model_endpoint_record_repository - ) - repo.add_model_endpoint_record( - build_endpoint_request_sync_pytorch.model_endpoint_record - ) + repo: Any = endpoint_builder_service_empty_docker_built.model_endpoint_record_repository + repo.add_model_endpoint_record(build_endpoint_request_sync_pytorch.model_endpoint_record) endpoint_builder_service_empty_docker_built.resource_gateway.__setattr__( "create_or_update_resources", Mock(side_effect=EndpointResourceInfraException) ) @@ -216,12 +188,8 @@ async def test_build_endpoint_tensorflow_with_nonzero_gpu_raises_not_implemented build_endpoint_request_async_tensorflow: BuildEndpointRequest, endpoint_builder_service_empty_docker_not_built: LiveEndpointBuilderService, ): - repo: Any = ( - endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository - ) - repo.add_model_endpoint_record( - build_endpoint_request_async_tensorflow.model_endpoint_record - ) + repo: Any = endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository + repo.add_model_endpoint_record(build_endpoint_request_async_tensorflow.model_endpoint_record) build_endpoint_request_async_tensorflow.gpus = 1 with pytest.raises(NotImplementedError): await endpoint_builder_service_empty_docker_not_built.build_endpoint( @@ -234,12 +202,8 @@ async def test_build_endpoint_tensorflow_with_invalid_aws_role_raises_value_erro build_endpoint_request_async_tensorflow: BuildEndpointRequest, endpoint_builder_service_empty_docker_not_built: LiveEndpointBuilderService, ): - repo: Any = ( - endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository - ) - repo.add_model_endpoint_record( - build_endpoint_request_async_tensorflow.model_endpoint_record - ) + repo: Any = endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository + repo.add_model_endpoint_record(build_endpoint_request_async_tensorflow.model_endpoint_record) build_endpoint_request_async_tensorflow.aws_role = "invalid_aws_role" with pytest.raises(ValueError): await endpoint_builder_service_empty_docker_not_built.build_endpoint( @@ -254,12 +218,8 @@ async def test_build_endpoint_build_result_failed_yields_docker_build_failed_exc fake_monitoring_metrics_gateway: FakeMonitoringMetricsGateway, fake_notification_gateway: FakeNotificationGateway, ): - repo: Any = ( - endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository - ) - repo.add_model_endpoint_record( - build_endpoint_request_sync_pytorch.model_endpoint_record - ) + repo: Any = endpoint_builder_service_empty_docker_not_built.model_endpoint_record_repository + repo.add_model_endpoint_record(build_endpoint_request_sync_pytorch.model_endpoint_record) endpoint_builder_service_empty_docker_not_built.docker_repository.__setattr__( "build_image", Mock(return_value=BuildImageResponse(status=False, logs="", job_name="")), @@ -285,9 +245,7 @@ async def test_build_endpoint_build_result_throws_error_yields_docker_build_fail repo: Any = ( endpoint_builder_service_empty_docker_builds_dont_work.model_endpoint_record_repository ) - repo.add_model_endpoint_record( - build_endpoint_request_sync_pytorch.model_endpoint_record - ) + repo.add_model_endpoint_record(build_endpoint_request_sync_pytorch.model_endpoint_record) with pytest.raises(DockerBuildFailedException): await endpoint_builder_service_empty_docker_builds_dont_work.build_endpoint( build_endpoint_request_sync_pytorch @@ -311,9 +269,7 @@ def test_convert_artifact_like_bundle_to_runnable_image( build_endpoint_request_sync_custom, "test_repo", "test_tag" ) - new_bundle = ( - build_endpoint_request_sync_custom.model_endpoint_record.current_model_bundle - ) + new_bundle = build_endpoint_request_sync_custom.model_endpoint_record.current_model_bundle assert isinstance(new_bundle.flavor, RunnableImageFlavor) assert new_bundle.flavor.repository == "test_repo" From 56e57a07ef46554752bd0e6cf0717d0343a7658d Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 20:04:57 -0400 Subject: [PATCH 14/22] test: cover remote build diff paths --- .../unit/core/docker/test_remote_build.py | 219 ++++++++++++++++++ .../test_ecr_docker_repository.py | 101 ++++++++ 2 files changed, 320 insertions(+) create mode 100644 model-engine/tests/unit/core/docker/test_remote_build.py create mode 100644 model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py diff --git a/model-engine/tests/unit/core/docker/test_remote_build.py b/model-engine/tests/unit/core/docker/test_remote_build.py new file mode 100644 index 00000000..9f384e27 --- /dev/null +++ b/model-engine/tests/unit/core/docker/test_remote_build.py @@ -0,0 +1,219 @@ +from io import BytesIO +from pathlib import Path +from types import SimpleNamespace +from unittest import mock + +import pytest +from botocore.exceptions import ClientError +from model_engine_server.core.docker import remote_build + + +def test_read_ignore_patterns_handles_missing_file(tmp_path, capsys): + patterns = remote_build._read_ignore_patterns(tmp_path, ".dockerignore") + + assert patterns == [] + assert "does not exist" in capsys.readouterr().out + + +def test_read_ignore_patterns_skips_comments_and_blank_lines(tmp_path): + ignore_file = tmp_path / ".dockerignore" + ignore_file.write_text("\n# comment\n./foo\nbar/\n") + + patterns = remote_build._read_ignore_patterns(tmp_path, ".dockerignore") + + assert patterns == ["foo", "bar/"] + + +def test_normalize_path_for_archive_relative_path(tmp_path): + folder = tmp_path / "subdir" + folder.mkdir() + + resolved_path, archive_root = remote_build._normalize_path_for_archive(tmp_path, "subdir") + + assert resolved_path == folder.resolve() + assert archive_root == "subdir" + + +def test_normalize_path_for_archive_rejects_path_outside_context(tmp_path): + outside = tmp_path.parent / "outside" + outside.mkdir(exist_ok=True) + + with pytest.raises(ValueError, match="is not contained within context"): + remote_build._normalize_path_for_archive(tmp_path, str(outside)) + + +@pytest.mark.parametrize( + ("member_name", "patterns", "should_keep"), + [ + ("pkg/file.py", ["pkg"], False), + ("pkg/file.py", ["*.py"], False), + ("pkg/file.py", ["other"], True), + ], +) +def test_filter_archive_member(member_name, patterns, should_keep): + tar_info = mock.Mock() + tar_info.name = member_name + + result = remote_build._filter_archive_member(tar_info, patterns) + + assert (result is tar_info) is should_keep + + +def test_zip_context_uploads_filtered_archive(tmp_path): + context = tmp_path / "context" + include_dir = context / "pkg" + include_dir.mkdir(parents=True) + (include_dir / "keep.txt").write_text("keep") + (include_dir / "drop.log").write_text("drop") + (context / ".dockerignore").write_text("*.log\n") + + uploaded = BytesIO() + + class UploadSink: + def __enter__(self): + return uploaded + + def __exit__(self, exc_type, exc, tb): + uploaded.seek(0) + return False + + with mock.patch.object(remote_build.storage_client, "open", return_value=UploadSink()): + remote_build.zip_context( + s3_file_name="bundle.tar.gz", + context=str(context), + folders_to_include=["pkg"], + ignore_file=".dockerignore", + ) + + archive_path = tmp_path / "uploaded.tar.gz" + archive_path.write_bytes(uploaded.getvalue()) + import tarfile + + with tarfile.open(archive_path, mode="r:gz") as tar: + names = tar.getnames() + + assert "pkg/keep.txt" in names + assert "pkg/drop.log" not in names + + +def test_zip_context_reraises_storage_errors(tmp_path): + context = tmp_path / "context" + folder = context / "pkg" + folder.mkdir(parents=True) + (folder / "keep.txt").write_text("keep") + error_response = {"Error": {"Code": "AccessDenied", "Message": "denied"}} + + with mock.patch.object( + remote_build.storage_client, + "open", + side_effect=ClientError(error_response, "PutObject"), + ): + with pytest.raises(ClientError): + remote_build.zip_context( + s3_file_name="bundle.tar.gz", + context=str(context), + folders_to_include=["pkg"], + ) + + +def test_start_build_job_uses_boto_credentials_for_circleci(tmp_path): + template_file = tmp_path / "kaniko_template.yaml" + template_file.write_text( + """ +apiVersion: batch/v1 +kind: Job +metadata: + name: $NAME +spec: + template: + spec: + containers: + - name: kaniko + args: [] +""" + ) + captured = {} + + def fake_check_output(args, cwd=None, shell=False): + if shell: + return b"" + if args[:3] == ["kubectl", "patch", "secret"]: + captured["patch_args"] = args + return b"patched" + if args[:3] == ["kubectl", "apply", "-f"]: + captured["apply_args"] = args + captured["apply_yaml"] = Path(args[3]).read_text() + return b"applied" + raise AssertionError(f"unexpected subprocess call: {args}") + + frozen_credentials = SimpleNamespace( + access_key="access", + secret_key="secret", + token="token", + ) + credentials = SimpleNamespace(get_frozen_credentials=lambda: frozen_credentials) + + with ( + mock.patch.object(remote_build, "TEMPLATE_FILE", str(template_file)), + mock.patch.object( + remote_build, + "infra_config", + return_value=SimpleNamespace( + docker_repo_prefix="repo-prefix", + profile_ml_worker="default", + ), + ), + mock.patch.dict(remote_build.os.environ, {"CIRCLECI": "true"}, clear=False), + mock.patch.object( + remote_build.boto3, + "Session", + return_value=mock.Mock(get_credentials=mock.Mock(return_value=credentials)), + ), + mock.patch.object(remote_build.subprocess, "check_output", side_effect=fake_check_output), + ): + job_name = remote_build.start_build_job( + s3_file_name="tmp/context.tar.gz", + path_to_dockerfile="./Dockerfile", + repotags=["repo/image:tag"], + use_cache=True, + cache_name="cache-repo", + build_args={"ARG1": "VALUE1"}, + custom_tags={"team": "ml"}, + ) + + assert job_name.startswith("kaniko-") + assert captured["patch_args"][:4] == ["kubectl", "patch", "secret", "codeartifact-pip-conf"] + assert "--destination=repo-prefix/repo/image:tag" in captured["apply_yaml"] + assert "--build-arg=ARG1=VALUE1" in captured["apply_yaml"] + assert "AWS_ACCESS_KEY_ID: access" in captured["apply_yaml"] + assert "AWS_SECRET_ACCESS_KEY: secret" in captured["apply_yaml"] + assert "AWS_SESSION_TOKEN: token" in captured["apply_yaml"] + + +def test_build_remote_with_explicit_folders_calls_zip_and_start(tmp_path): + dockerfile = tmp_path / "Dockerfile" + dockerfile.write_text("FROM scratch\n") + + with ( + mock.patch.object(remote_build, "zip_context") as mock_zip_context, + mock.patch.object( + remote_build, "start_build_job", return_value="kaniko-job" + ) as mock_start_build_job, + ): + result = remote_build.build_remote( + context=str(tmp_path), + dockerfile=str(dockerfile), + repotags="repo/image:tag", + folders_to_include=["model-engine"], + build_args={"ARG1": "VALUE1"}, + ) + + assert result == "kaniko-job" + mock_zip_context.assert_called_once() + zip_kwargs = mock_zip_context.call_args.kwargs + assert zip_kwargs["context"] == str(tmp_path) + assert zip_kwargs["folders_to_include"] == ["model-engine"] + mock_start_build_job.assert_called_once() + start_args = mock_start_build_job.call_args.args + assert start_args[1] == "./Dockerfile" + assert start_args[2] == ["repo/image:tag"] diff --git a/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py b/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py new file mode 100644 index 00000000..3fa75eca --- /dev/null +++ b/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py @@ -0,0 +1,101 @@ +from unittest import mock + +from model_engine_server.common.dtos.docker_repository import BuildImageRequest +from model_engine_server.infra.repositories.ecr_docker_repository import ECRDockerRepository + + +def test_normalize_build_args_rewrites_only_paths_inside_base(tmp_path): + base = tmp_path / "repo" + base.mkdir() + inside = base / "nested" / "requirements.txt" + inside.parent.mkdir() + inside.write_text("x") + outside = tmp_path / "outside.txt" + outside.write_text("y") + + normalized = ECRDockerRepository._normalize_build_args( + str(base), + { + "INSIDE": str(inside), + "OUTSIDE": str(outside), + "RELATIVE": "already/relative.txt", + "NON_STRING": 1, + }, + ) + + assert normalized["INSIDE"] == "nested/requirements.txt" + assert normalized["OUTSIDE"] == str(outside) + assert normalized["RELATIVE"] == "already/relative.txt" + assert normalized["NON_STRING"] == 1 + + +def test_build_image_includes_requirements_and_dockerfile_root(tmp_path): + repo = ECRDockerRepository() + base = tmp_path / "repo" + base.mkdir() + requirements = base / "model-engine" / ".build-context" / "reqs" + requirements.mkdir(parents=True) + abs_build_arg = base / "model-engine" / ".build-context" / "reqs" / "requirements.txt" + abs_build_arg.write_text("x") + + image_request = BuildImageRequest( + repo="hosted-model-inference/test", + image_tag="tag", + aws_profile="default", + base_path=str(base), + dockerfile="model-engine/model_engine_server/inference/pytorch_or_tf.user.Dockerfile", + base_image="python:3.8-slim", + requirements_folder="model-engine/.build-context/reqs", + substitution_args={"REQUIREMENTS_FILE": str(abs_build_arg)}, + ) + + build_result = mock.Mock(status=True, logs="ok", job_name="job-1") + + with mock.patch( + "model_engine_server.infra.repositories.ecr_docker_repository.build_remote_block", + return_value=build_result, + ) as mock_build_remote_block: + response = repo.build_image(image_request) + + assert response.status is True + assert response.logs == "ok" + assert response.job_name == "job-1" + + mock_build_remote_block.assert_called_once() + _, kwargs = mock_build_remote_block.call_args + assert kwargs["folders_to_include"] == [ + "model-engine", + "model-engine/.build-context/reqs", + ] + assert kwargs["build_args"] == { + "BASE_IMAGE": "python:3.8-slim", + "REQUIREMENTS_FILE": "model-engine/.build-context/reqs/requirements.txt", + } + + +def test_build_image_without_substitution_args_keeps_base_image_only(tmp_path): + repo = ECRDockerRepository() + base = tmp_path / "repo" + base.mkdir() + + image_request = BuildImageRequest( + repo="hosted-model-inference/test", + image_tag="tag", + aws_profile="default", + base_path=str(base), + dockerfile="model-engine/Dockerfile", + base_image="python:3.13-slim", + ) + + build_result = mock.Mock(status=True, logs="ok", job_name="job-2") + + with mock.patch( + "model_engine_server.infra.repositories.ecr_docker_repository.build_remote_block", + return_value=build_result, + ) as mock_build_remote_block: + response = repo.build_image(image_request) + + assert response.status is True + _, kwargs = mock_build_remote_block.call_args + assert kwargs["folders_to_include"] == ["model-engine"] + assert kwargs["build_args"] == {"BASE_IMAGE": "python:3.13-slim"} From 6637e6e596c9a9938c6d8cd7aa48b18bfd09b173 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 20:21:50 -0400 Subject: [PATCH 15/22] test: fix remote build credential assertion --- .../tests/unit/core/docker/test_remote_build.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/model-engine/tests/unit/core/docker/test_remote_build.py b/model-engine/tests/unit/core/docker/test_remote_build.py index 9f384e27..8731fadb 100644 --- a/model-engine/tests/unit/core/docker/test_remote_build.py +++ b/model-engine/tests/unit/core/docker/test_remote_build.py @@ -130,6 +130,13 @@ def test_start_build_job_uses_boto_credentials_for_circleci(tmp_path): containers: - name: kaniko args: [] + env: + - name: AWS_ACCESS_KEY_ID + value: "$AWS_ACCESS_KEY_ID" + - name: AWS_SECRET_ACCESS_KEY + value: "$AWS_SECRET_ACCESS_KEY" + - name: AWS_SESSION_TOKEN + value: "$AWS_SESSION_TOKEN" """ ) captured = {} @@ -185,9 +192,12 @@ def fake_check_output(args, cwd=None, shell=False): assert captured["patch_args"][:4] == ["kubectl", "patch", "secret", "codeartifact-pip-conf"] assert "--destination=repo-prefix/repo/image:tag" in captured["apply_yaml"] assert "--build-arg=ARG1=VALUE1" in captured["apply_yaml"] - assert "AWS_ACCESS_KEY_ID: access" in captured["apply_yaml"] - assert "AWS_SECRET_ACCESS_KEY: secret" in captured["apply_yaml"] - assert "AWS_SESSION_TOKEN: token" in captured["apply_yaml"] + assert "name: AWS_ACCESS_KEY_ID" in captured["apply_yaml"] + assert "value: access" in captured["apply_yaml"] + assert "name: AWS_SECRET_ACCESS_KEY" in captured["apply_yaml"] + assert "value: secret" in captured["apply_yaml"] + assert "name: AWS_SESSION_TOKEN" in captured["apply_yaml"] + assert "value: token" in captured["apply_yaml"] def test_build_remote_with_explicit_folders_calls_zip_and_start(tmp_path): From 1da90cbc0a27b613358616d807fff6a0e97fbe05 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 20:32:35 -0400 Subject: [PATCH 16/22] fix: address review feedback on build context handling --- charts/model-engine/templates/cacher_deployment.yaml | 2 +- .../model-engine/templates/endpoint_builder_deployment.yaml | 2 +- model-engine/model_engine_server/core/docker/remote_build.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/model-engine/templates/cacher_deployment.yaml b/charts/model-engine/templates/cacher_deployment.yaml index 7f427594..62b5f1d4 100644 --- a/charts/model-engine/templates/cacher_deployment.yaml +++ b/charts/model-engine/templates/cacher_deployment.yaml @@ -49,7 +49,7 @@ spec: exec: command: - bash - - -lc + - -c - test -f /tmp/readyz command: - dumb-init diff --git a/charts/model-engine/templates/endpoint_builder_deployment.yaml b/charts/model-engine/templates/endpoint_builder_deployment.yaml index 9a348d8e..bf684c41 100644 --- a/charts/model-engine/templates/endpoint_builder_deployment.yaml +++ b/charts/model-engine/templates/endpoint_builder_deployment.yaml @@ -50,7 +50,7 @@ spec: exec: command: - bash - - -lc + - -c - test -f /tmp/readyz command: - dumb-init diff --git a/model-engine/model_engine_server/core/docker/remote_build.py b/model-engine/model_engine_server/core/docker/remote_build.py index c36f8260..408906ab 100644 --- a/model-engine/model_engine_server/core/docker/remote_build.py +++ b/model-engine/model_engine_server/core/docker/remote_build.py @@ -119,7 +119,7 @@ def _read_ignore_patterns(context_path: Path, ignore_file: Optional[str]) -> Lis line = raw_line.strip() if not line or line.startswith("#"): continue - patterns.append(line.lstrip("./")) + patterns.append(line.removeprefix("./")) return patterns @@ -142,7 +142,7 @@ def _normalize_path_for_archive(context_path: Path, folder_to_include: str) -> t def _filter_archive_member( tar_info: tarfile.TarInfo, ignore_patterns: List[str] ) -> Optional[tarfile.TarInfo]: - normalized_name = tar_info.name.lstrip("./") + normalized_name = tar_info.name.removeprefix("./") basename = os.path.basename(normalized_name) for pattern in ignore_patterns: From f4261c8807e50eb5fb478c9bf6eabbd0cd3a1e87 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 21:15:43 -0400 Subject: [PATCH 17/22] fix: keep temp build contexts out of archives --- .../infra/repositories/ecr_docker_repository.py | 4 +++- .../infra/services/live_endpoint_builder_service.py | 2 +- .../infra/repositories/test_ecr_docker_repository.py | 10 +++++----- .../services/test_live_endpoint_builder_service.py | 6 ++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py b/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py index 59b9aa7c..10beed10 100644 --- a/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py +++ b/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py @@ -18,6 +18,7 @@ class ECRDockerRepository(DockerRepository): def _normalize_build_args(base_path: str, build_args: Dict[str, str]) -> Dict[str, str]: normalized = dict(build_args) base_path_abs = os.path.abspath(base_path) + updates: Dict[str, str] = {} for key, value in normalized.items(): if not isinstance(value, str) or not os.path.isabs(value): @@ -30,8 +31,9 @@ def _normalize_build_args(base_path: str, build_args: Dict[str, str]) -> Dict[st except ValueError: continue - normalized[key] = os.path.relpath(value_abs, base_path_abs) + updates[key] = os.path.relpath(value_abs, base_path_abs) + normalized.update(updates) return normalized def image_exists( diff --git a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py index 1a8b7e6f..06bd7a36 100644 --- a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py +++ b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py @@ -79,7 +79,7 @@ GIT_TAG: str = os.getenv("GIT_TAG") # type: ignore ENV: str = os.getenv("DD_ENV") # type: ignore WORKSPACE_PATH = os.getenv("WORKSPACE", ".") -BUILD_CONTEXT_TEMP_ROOT = os.path.join(WORKSPACE_PATH, "model-engine", ".build-context") +BUILD_CONTEXT_TEMP_ROOT = os.path.join(WORKSPACE_PATH, ".build-context") INITIAL_K8S_CACHE_TTL_SECONDS: int = 180 MAX_IMAGE_TAG_LEN = 128 diff --git a/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py b/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py index 3fa75eca..e97472a0 100644 --- a/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py @@ -33,9 +33,9 @@ def test_build_image_includes_requirements_and_dockerfile_root(tmp_path): repo = ECRDockerRepository() base = tmp_path / "repo" base.mkdir() - requirements = base / "model-engine" / ".build-context" / "reqs" + requirements = base / ".build-context" / "reqs" requirements.mkdir(parents=True) - abs_build_arg = base / "model-engine" / ".build-context" / "reqs" / "requirements.txt" + abs_build_arg = base / ".build-context" / "reqs" / "requirements.txt" abs_build_arg.write_text("x") image_request = BuildImageRequest( @@ -45,7 +45,7 @@ def test_build_image_includes_requirements_and_dockerfile_root(tmp_path): base_path=str(base), dockerfile="model-engine/model_engine_server/inference/pytorch_or_tf.user.Dockerfile", base_image="python:3.8-slim", - requirements_folder="model-engine/.build-context/reqs", + requirements_folder=".build-context/reqs", substitution_args={"REQUIREMENTS_FILE": str(abs_build_arg)}, ) @@ -65,11 +65,11 @@ def test_build_image_includes_requirements_and_dockerfile_root(tmp_path): _, kwargs = mock_build_remote_block.call_args assert kwargs["folders_to_include"] == [ "model-engine", - "model-engine/.build-context/reqs", + ".build-context/reqs", ] assert kwargs["build_args"] == { "BASE_IMAGE": "python:3.8-slim", - "REQUIREMENTS_FILE": "model-engine/.build-context/reqs/requirements.txt", + "REQUIREMENTS_FILE": ".build-context/reqs/requirements.txt", } diff --git a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py index 5b22acce..d2ffc708 100644 --- a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py +++ b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py @@ -103,13 +103,11 @@ def set_env_vars(): live_endpoint_builder_service.GIT_TAG = "test_tag" live_endpoint_builder_service.ENV = "test_env" live_endpoint_builder_service.WORKSPACE_PATH = ".." - live_endpoint_builder_service.BUILD_CONTEXT_TEMP_ROOT = "../model-engine/.build-context" + live_endpoint_builder_service.BUILD_CONTEXT_TEMP_ROOT = "../.build-context" live_endpoint_builder_service.open = mock_open() live_endpoint_builder_service.os.makedirs = Mock() live_endpoint_builder_service.open_wrapper = mock_open() - live_endpoint_builder_service.tempfile.mkdtemp = Mock( - return_value="../model-engine/.build-context/tmpdir" - ) + live_endpoint_builder_service.tempfile.mkdtemp = Mock(return_value="../.build-context/tmpdir") live_endpoint_builder_service.tempfile.mkstemp = Mock(return_value=["", ""]) From 42699f1825a462fee981cdcca3f1f3234af4e418 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Fri, 17 Apr 2026 22:39:56 -0400 Subject: [PATCH 18/22] fix: avoid archiving temp build contexts --- .../core/docker/remote_build.py | 25 +++++++-- .../services/live_endpoint_builder_service.py | 2 +- .../unit/core/docker/test_remote_build.py | 54 ++++++++++++++++--- .../test_ecr_docker_repository.py | 10 ++-- .../test_live_endpoint_builder_service.py | 6 ++- 5 files changed, 79 insertions(+), 18 deletions(-) diff --git a/model-engine/model_engine_server/core/docker/remote_build.py b/model-engine/model_engine_server/core/docker/remote_build.py index 408906ab..1d7b5956 100644 --- a/model-engine/model_engine_server/core/docker/remote_build.py +++ b/model-engine/model_engine_server/core/docker/remote_build.py @@ -78,15 +78,25 @@ def zip_context( try: context_path = Path(context).resolve() ignore_patterns = _read_ignore_patterns(context_path, ignore_file) + archive_roots = [ + _normalize_path_for_archive(context_path, folder)[1] for folder in folders_to_include + ] with tempfile.NamedTemporaryFile(suffix=".tar.gz") as archive: print(f"Creating archive: {archive.name}") with tarfile.open(archive.name, mode="w:gz") as tar: - for folder in folders_to_include: - resolved_path, archive_root = _normalize_path_for_archive(context_path, folder) + for folder, archive_root in zip(folders_to_include, archive_roots): + resolved_path, _ = _normalize_path_for_archive(context_path, folder) + nested_archive_roots = [ + root + for root in archive_roots + if root != archive_root and root.startswith(f"{archive_root}/") + ] tar.add( resolved_path, arcname=archive_root, - filter=lambda tar_info: _filter_archive_member(tar_info, ignore_patterns), + filter=lambda tar_info, nested_archive_roots=nested_archive_roots: _filter_archive_member( + tar_info, ignore_patterns, nested_archive_roots + ), ) with ( @@ -140,10 +150,17 @@ def _normalize_path_for_archive(context_path: Path, folder_to_include: str) -> t def _filter_archive_member( - tar_info: tarfile.TarInfo, ignore_patterns: List[str] + tar_info: tarfile.TarInfo, + ignore_patterns: List[str], + nested_archive_roots: Optional[List[str]] = None, ) -> Optional[tarfile.TarInfo]: normalized_name = tar_info.name.removeprefix("./") basename = os.path.basename(normalized_name) + nested_archive_roots = nested_archive_roots or [] + + for nested_root in nested_archive_roots: + if normalized_name == nested_root or normalized_name.startswith(f"{nested_root}/"): + return None for pattern in ignore_patterns: normalized_pattern = pattern.rstrip("/") diff --git a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py index 06bd7a36..1a8b7e6f 100644 --- a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py +++ b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py @@ -79,7 +79,7 @@ GIT_TAG: str = os.getenv("GIT_TAG") # type: ignore ENV: str = os.getenv("DD_ENV") # type: ignore WORKSPACE_PATH = os.getenv("WORKSPACE", ".") -BUILD_CONTEXT_TEMP_ROOT = os.path.join(WORKSPACE_PATH, ".build-context") +BUILD_CONTEXT_TEMP_ROOT = os.path.join(WORKSPACE_PATH, "model-engine", ".build-context") INITIAL_K8S_CACHE_TTL_SECONDS: int = 180 MAX_IMAGE_TAG_LEN = 128 diff --git a/model-engine/tests/unit/core/docker/test_remote_build.py b/model-engine/tests/unit/core/docker/test_remote_build.py index 8731fadb..3ff7e647 100644 --- a/model-engine/tests/unit/core/docker/test_remote_build.py +++ b/model-engine/tests/unit/core/docker/test_remote_build.py @@ -43,18 +43,24 @@ def test_normalize_path_for_archive_rejects_path_outside_context(tmp_path): @pytest.mark.parametrize( - ("member_name", "patterns", "should_keep"), + ("member_name", "patterns", "nested_archive_roots", "should_keep"), [ - ("pkg/file.py", ["pkg"], False), - ("pkg/file.py", ["*.py"], False), - ("pkg/file.py", ["other"], True), + ("pkg/file.py", ["pkg"], [], False), + ("pkg/file.py", ["*.py"], [], False), + ("pkg/file.py", ["other"], [], True), + ( + "model-engine/.build-context/reqs/file.txt", + [], + ["model-engine/.build-context/reqs"], + False, + ), ], ) -def test_filter_archive_member(member_name, patterns, should_keep): +def test_filter_archive_member(member_name, patterns, nested_archive_roots, should_keep): tar_info = mock.Mock() tar_info.name = member_name - result = remote_build._filter_archive_member(tar_info, patterns) + result = remote_build._filter_archive_member(tar_info, patterns, nested_archive_roots) assert (result is tar_info) is should_keep @@ -116,6 +122,42 @@ def test_zip_context_reraises_storage_errors(tmp_path): ) +def test_zip_context_excludes_nested_explicit_roots_from_parent_archive(tmp_path): + context = tmp_path / "context" + nested_dir = context / "model-engine" / ".build-context" / "reqs" + nested_dir.mkdir(parents=True) + (nested_dir / "requirements.txt").write_text("pkg==1.0") + (context / "model-engine" / "app.py").write_text("print('ok')") + + uploaded = BytesIO() + + class UploadSink: + def __enter__(self): + return uploaded + + def __exit__(self, exc_type, exc, tb): + uploaded.seek(0) + return False + + with mock.patch.object(remote_build.storage_client, "open", return_value=UploadSink()): + remote_build.zip_context( + s3_file_name="bundle.tar.gz", + context=str(context), + folders_to_include=["model-engine", "model-engine/.build-context/reqs"], + ) + + archive_path = tmp_path / "uploaded_nested.tar.gz" + archive_path.write_bytes(uploaded.getvalue()) + import tarfile + + with tarfile.open(archive_path, mode="r:gz") as tar: + names = tar.getnames() + + assert "model-engine/app.py" in names + assert "model-engine/.build-context/reqs/requirements.txt" in names + assert names.count("model-engine/.build-context/reqs/requirements.txt") == 1 + + def test_start_build_job_uses_boto_credentials_for_circleci(tmp_path): template_file = tmp_path / "kaniko_template.yaml" template_file.write_text( diff --git a/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py b/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py index e97472a0..3fa75eca 100644 --- a/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py @@ -33,9 +33,9 @@ def test_build_image_includes_requirements_and_dockerfile_root(tmp_path): repo = ECRDockerRepository() base = tmp_path / "repo" base.mkdir() - requirements = base / ".build-context" / "reqs" + requirements = base / "model-engine" / ".build-context" / "reqs" requirements.mkdir(parents=True) - abs_build_arg = base / ".build-context" / "reqs" / "requirements.txt" + abs_build_arg = base / "model-engine" / ".build-context" / "reqs" / "requirements.txt" abs_build_arg.write_text("x") image_request = BuildImageRequest( @@ -45,7 +45,7 @@ def test_build_image_includes_requirements_and_dockerfile_root(tmp_path): base_path=str(base), dockerfile="model-engine/model_engine_server/inference/pytorch_or_tf.user.Dockerfile", base_image="python:3.8-slim", - requirements_folder=".build-context/reqs", + requirements_folder="model-engine/.build-context/reqs", substitution_args={"REQUIREMENTS_FILE": str(abs_build_arg)}, ) @@ -65,11 +65,11 @@ def test_build_image_includes_requirements_and_dockerfile_root(tmp_path): _, kwargs = mock_build_remote_block.call_args assert kwargs["folders_to_include"] == [ "model-engine", - ".build-context/reqs", + "model-engine/.build-context/reqs", ] assert kwargs["build_args"] == { "BASE_IMAGE": "python:3.8-slim", - "REQUIREMENTS_FILE": ".build-context/reqs/requirements.txt", + "REQUIREMENTS_FILE": "model-engine/.build-context/reqs/requirements.txt", } diff --git a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py index d2ffc708..5b22acce 100644 --- a/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py +++ b/model-engine/tests/unit/infra/services/test_live_endpoint_builder_service.py @@ -103,11 +103,13 @@ def set_env_vars(): live_endpoint_builder_service.GIT_TAG = "test_tag" live_endpoint_builder_service.ENV = "test_env" live_endpoint_builder_service.WORKSPACE_PATH = ".." - live_endpoint_builder_service.BUILD_CONTEXT_TEMP_ROOT = "../.build-context" + live_endpoint_builder_service.BUILD_CONTEXT_TEMP_ROOT = "../model-engine/.build-context" live_endpoint_builder_service.open = mock_open() live_endpoint_builder_service.os.makedirs = Mock() live_endpoint_builder_service.open_wrapper = mock_open() - live_endpoint_builder_service.tempfile.mkdtemp = Mock(return_value="../.build-context/tmpdir") + live_endpoint_builder_service.tempfile.mkdtemp = Mock( + return_value="../model-engine/.build-context/tmpdir" + ) live_endpoint_builder_service.tempfile.mkstemp = Mock(return_value=["", ""]) From 9b4686ee4a5f5719ec8ceb974a0a47e0bef4ec42 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Sat, 18 Apr 2026 07:23:11 -0400 Subject: [PATCH 19/22] fix: address runtime library and ignore matching reviews --- model-engine/Dockerfile | 1 + .../model_engine_server/core/docker/remote_build.py | 10 +++------- .../tests/unit/core/docker/test_remote_build.py | 3 ++- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/model-engine/Dockerfile b/model-engine/Dockerfile index 08eefacd..b1a58799 100644 --- a/model-engine/Dockerfile +++ b/model-engine/Dockerfile @@ -38,6 +38,7 @@ RUN mkdir -p /tmp/runtime-bin /tmp/runtime-libs && \ cp -R /usr/libexec/git-core /tmp/runtime-bin/git-core && \ cp /usr/lib/libpcre2-8.so.0* /tmp/runtime-libs/ && \ cp /usr/lib/libcurl.so.4* /tmp/runtime-libs/ && \ + cp /usr/lib/libreadline.so.8* /tmp/runtime-libs/ && \ cp /usr/lib/libtinfo.so.6* /tmp/runtime-libs/ && \ cp /usr/lib/libz.so.1* /tmp/runtime-libs/ && \ git clone --depth 1 --branch v1.35.3 https://github.com/kubernetes/kubernetes.git /tmp/k8s && \ diff --git a/model-engine/model_engine_server/core/docker/remote_build.py b/model-engine/model_engine_server/core/docker/remote_build.py index 1d7b5956..a277007a 100644 --- a/model-engine/model_engine_server/core/docker/remote_build.py +++ b/model-engine/model_engine_server/core/docker/remote_build.py @@ -8,8 +8,7 @@ from base64 import b64encode from contextlib import ExitStack from dataclasses import dataclass -from fnmatch import fnmatch -from pathlib import Path +from pathlib import Path, PurePosixPath from string import Template from typing import Dict, Iterable, List, Optional, Union @@ -155,7 +154,6 @@ def _filter_archive_member( nested_archive_roots: Optional[List[str]] = None, ) -> Optional[tarfile.TarInfo]: normalized_name = tar_info.name.removeprefix("./") - basename = os.path.basename(normalized_name) nested_archive_roots = nested_archive_roots or [] for nested_root in nested_archive_roots: @@ -164,10 +162,8 @@ def _filter_archive_member( for pattern in ignore_patterns: normalized_pattern = pattern.rstrip("/") - if ( - fnmatch(normalized_name, normalized_pattern) - or fnmatch(basename, normalized_pattern) - or normalized_name.startswith(f"{normalized_pattern}/") + if PurePosixPath(normalized_name).match(normalized_pattern) or normalized_name.startswith( + f"{normalized_pattern}/" ): return None return tar_info diff --git a/model-engine/tests/unit/core/docker/test_remote_build.py b/model-engine/tests/unit/core/docker/test_remote_build.py index 3ff7e647..25778db9 100644 --- a/model-engine/tests/unit/core/docker/test_remote_build.py +++ b/model-engine/tests/unit/core/docker/test_remote_build.py @@ -46,7 +46,8 @@ def test_normalize_path_for_archive_rejects_path_outside_context(tmp_path): ("member_name", "patterns", "nested_archive_roots", "should_keep"), [ ("pkg/file.py", ["pkg"], [], False), - ("pkg/file.py", ["*.py"], [], False), + ("pkg/file.py", ["*.py"], [], True), + ("file.py", ["*.py"], [], False), ("pkg/file.py", ["other"], [], True), ( "model-engine/.build-context/reqs/file.txt", From 86f894a388466b93d9369fa73983360c614a8446 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Mon, 20 Apr 2026 11:33:09 -0400 Subject: [PATCH 20/22] fix: restore root-only ignore glob behavior --- .../model_engine_server/core/docker/remote_build.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/model-engine/model_engine_server/core/docker/remote_build.py b/model-engine/model_engine_server/core/docker/remote_build.py index a277007a..04d85b63 100644 --- a/model-engine/model_engine_server/core/docker/remote_build.py +++ b/model-engine/model_engine_server/core/docker/remote_build.py @@ -8,7 +8,8 @@ from base64 import b64encode from contextlib import ExitStack from dataclasses import dataclass -from pathlib import Path, PurePosixPath +from fnmatch import fnmatchcase +from pathlib import Path from string import Template from typing import Dict, Iterable, List, Optional, Union @@ -162,9 +163,13 @@ def _filter_archive_member( for pattern in ignore_patterns: normalized_pattern = pattern.rstrip("/") - if PurePosixPath(normalized_name).match(normalized_pattern) or normalized_name.startswith( - f"{normalized_pattern}/" - ): + if "/" in normalized_pattern: + pattern_matches = fnmatchcase(normalized_name, normalized_pattern) + else: + pattern_matches = "/" not in normalized_name and fnmatchcase( + normalized_name, normalized_pattern + ) + if pattern_matches or normalized_name.startswith(f"{normalized_pattern}/"): return None return tar_info From b2cb597e3895aad6ea4ef001d3b769439fd07701 Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Mon, 20 Apr 2026 11:51:22 -0400 Subject: [PATCH 21/22] test: align archive ignore coverage with matcher semantics --- model-engine/tests/unit/core/docker/test_remote_build.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/model-engine/tests/unit/core/docker/test_remote_build.py b/model-engine/tests/unit/core/docker/test_remote_build.py index 25778db9..7b25c171 100644 --- a/model-engine/tests/unit/core/docker/test_remote_build.py +++ b/model-engine/tests/unit/core/docker/test_remote_build.py @@ -70,9 +70,10 @@ def test_zip_context_uploads_filtered_archive(tmp_path): context = tmp_path / "context" include_dir = context / "pkg" include_dir.mkdir(parents=True) + (context / "root.log").write_text("root") (include_dir / "keep.txt").write_text("keep") (include_dir / "drop.log").write_text("drop") - (context / ".dockerignore").write_text("*.log\n") + (context / ".dockerignore").write_text("*.log\npkg/*.log\n") uploaded = BytesIO() @@ -100,6 +101,7 @@ def __exit__(self, exc_type, exc, tb): names = tar.getnames() assert "pkg/keep.txt" in names + assert "root.log" not in names assert "pkg/drop.log" not in names From 823f55667ee31a1054d4530fd7f5ce2d14e30c1f Mon Sep 17 00:00:00 2001 From: Brandon Allen Date: Mon, 20 Apr 2026 12:07:04 -0400 Subject: [PATCH 22/22] fix: skip rewriting build context root args --- .../infra/repositories/ecr_docker_repository.py | 9 +++++---- .../repositories/test_ecr_docker_repository.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py b/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py index 10beed10..7ce835f6 100644 --- a/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py +++ b/model-engine/model_engine_server/infra/repositories/ecr_docker_repository.py @@ -1,4 +1,5 @@ import os +from pathlib import Path from typing import Dict, Optional from model_engine_server.common.config import hmi_config @@ -17,21 +18,21 @@ class ECRDockerRepository(DockerRepository): @staticmethod def _normalize_build_args(base_path: str, build_args: Dict[str, str]) -> Dict[str, str]: normalized = dict(build_args) - base_path_abs = os.path.abspath(base_path) + base_path_abs = Path(base_path).resolve() updates: Dict[str, str] = {} for key, value in normalized.items(): if not isinstance(value, str) or not os.path.isabs(value): continue - value_abs = os.path.abspath(value) + value_abs = Path(value).resolve() try: - if os.path.commonpath([base_path_abs, value_abs]) != base_path_abs: + if value_abs == base_path_abs or not value_abs.is_relative_to(base_path_abs): continue except ValueError: continue - updates[key] = os.path.relpath(value_abs, base_path_abs) + updates[key] = os.path.relpath(str(value_abs), str(base_path_abs)) normalized.update(updates) return normalized diff --git a/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py b/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py index 3fa75eca..ce35836b 100644 --- a/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py +++ b/model-engine/tests/unit/infra/repositories/test_ecr_docker_repository.py @@ -29,6 +29,20 @@ def test_normalize_build_args_rewrites_only_paths_inside_base(tmp_path): assert normalized["NON_STRING"] == 1 +def test_normalize_build_args_does_not_rewrite_base_path_itself(tmp_path): + base = tmp_path / "repo" + base.mkdir() + + normalized = ECRDockerRepository._normalize_build_args( + str(base), + { + "CONTEXT_ROOT": str(base), + }, + ) + + assert normalized["CONTEXT_ROOT"] == str(base) + + def test_build_image_includes_requirements_and_dockerfile_root(tmp_path): repo = ECRDockerRepository() base = tmp_path / "repo"