Add optional vLLM model serving to the http GitOps templates.

When values.vllmSelected is truthy:
  * deployment.yaml omits the model-file init container, passes MODEL_NAME to
    the app-inference container, and runs the vLLM image as app-model-service
    with one nvidia.com/gpu, a RAM-backed /dev/shm and a PVC-backed
    /models-cache, plus a toleration for the nvidia.com/gpu NoSchedule taint;
  * kustomization.yaml additionally lists pvc.yaml;
  * pvc.yaml (new file) declares the 100Gi vllm-models-cache claim.
When it is unset or falsy, the rendered manifests are semantically unchanged
(the model-file volumeMounts only move earlier within the same container).

Note: the post-image blob ids on the "index" lines predate the review edits
made to this patch; regenerate them with git diff if they matter.

diff --git a/templates/http/base/deployment.yaml b/templates/http/base/deployment.yaml
index 87fa735..5c94a43 100644
--- a/templates/http/base/deployment.yaml
+++ b/templates/http/base/deployment.yaml
@@ -22,6 +22,7 @@ spec:
       labels:
         app.kubernetes.io/instance: {{values.name}}
     spec:
+      {%- if not values.vllmSelected %}
       initContainers:
       - name: model-file
         image: {{values.initContainer}}
@@ -29,16 +30,40 @@ spec:
         volumeMounts:
         - name: model-file
           mountPath: /shared
+      {%- endif %}
       containers:
       - env:
         - name: MODEL_ENDPOINT
           value: http://0.0.0.0:{{values.modelServicePort}}
+        {%- if values.vllmSelected %}
+        - name: MODEL_NAME
+          value: "{{values.vllmModelName}}"
+        {%- endif %}
         image: {{values.appContainer}}
         name: app-inference
         ports:
         - containerPort: {{values.appPort}}
         securityContext:
           runAsNonRoot: true
+      {%- if values.vllmSelected %}
+      # vLLM model service; --download-dir targets the PVC-backed /models-cache.
+      - image: {{values.vllmModelServiceContainer}}
+        args:
+        - "--model"
+        - "{{values.vllmModelName}}"
+        - "--port"
+        - "{{values.modelServicePort}}"
+        - "--download-dir"
+        - "/models-cache"
+        resources:
+          limits:
+            nvidia.com/gpu: '1'
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+        - name: models-cache
+          mountPath: /models-cache
+      {%- else %}
       - env:
         - name: HOST
           value: "0.0.0.0"
@@ -47,14 +72,33 @@ spec:
         - name: MODEL_PATH
           value: /model/model.file
         image: {{values.modelServiceContainer}}
+        volumeMounts:
+        - name: model-file
+          mountPath: /model
+      {%- endif %}
         name: app-model-service
         ports:
         - containerPort: {{values.modelServicePort}}
         securityContext:
           runAsNonRoot: true
-        volumeMounts:
-        - name: model-file
-          mountPath: /model
+      {%- if values.vllmSelected %}
+      volumes:
+      # RAM-backed scratch space mounted at /dev/shm in the vLLM container.
+      - name: dshm
+        emptyDir:
+          medium: Memory
+          sizeLimit: "2Gi"
+      - name: models-cache
+        persistentVolumeClaim:
+          claimName: vllm-models-cache
+
+      # Allow scheduling onto nodes tainted nvidia.com/gpu:NoSchedule.
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      {%- else %}
       volumes:
       - name: model-file
         emptyDir: {}
+      {%- endif %}
diff --git a/templates/http/base/kustomization.yaml b/templates/http/base/kustomization.yaml
index 407ab41..d226825 100644
--- a/templates/http/base/kustomization.yaml
+++ b/templates/http/base/kustomization.yaml
@@ -8,6 +8,9 @@ commonLabels:
   app.kubernetes.io/part-of: {{values.name}}
 resources:
 - initialize-namespace.yaml
+{%- if values.vllmSelected %}
+- pvc.yaml
+{%- endif %}
 - deployment.yaml
 - route.yaml
 - service.yaml
diff --git a/templates/http/base/pvc.yaml b/templates/http/base/pvc.yaml
new file mode 100644
index 0000000..f06aac0
--- /dev/null
+++ b/templates/http/base/pvc.yaml
@@ -0,0 +1,14 @@
+---
+# Model cache for the vLLM model service; deployment.yaml mounts this claim
+# at /models-cache.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models-cache
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  resources:
+    requests:
+      storage: 100Gi