Add Helm charts to deploy models #27

Merged · 7 commits · Nov 16, 2023
Changes from 2 commits
3 changes: 3 additions & 0 deletions helm/.helmignore
@@ -0,0 +1,3 @@
docs/
integration-tests/
load_tests/
4 changes: 4 additions & 0 deletions helm/Chart.yaml
@@ -0,0 +1,4 @@
apiVersion: v2
name: lorax  # Helm chart names should be lowercase
description: A Helm chart for LoRAX
version: 0.1.0
Empty file added helm/Chart.yaml~
3 changes: 3 additions & 0 deletions helm/templates/_helpers.tpl
@@ -0,0 +1,3 @@
{{- define "llmDeployment.name" -}}
{{ .Values.llmDeployment.name }}
{{- end -}}
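
With the shipped values.yaml, this helper resolves to the configured deployment name, so every manifest that calls it gets the same string. A minimal sketch of the expansion, assuming the default values below:

{{ template "llmDeployment.name" . }}   # renders to: llm-deployment-example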
112 changes: 112 additions & 0 deletions helm/templates/deployment.yaml
@@ -0,0 +1,112 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: {{ template "llmDeployment.name" . }}
    {{- if .Values.llmDeployment.k8sDeploymentObject.metadata.additionalLabels }}
    {{- toYaml .Values.llmDeployment.k8sDeploymentObject.metadata.additionalLabels | nindent 4 }}
    {{- end }}
  name: {{ template "llmDeployment.name" . }}
  namespace: {{ .Release.Namespace }}
spec:
  progressDeadlineSeconds: 600
  replicas: {{ .Values.llmDeployment.replicas }}
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: {{ template "llmDeployment.name" . }}
  strategy:
    type: {{ .Values.llmDeployment.updateStrategy }}
  template:
    metadata:
      labels:
        app: {{ template "llmDeployment.name" . }}
        {{- if .Values.llmDeployment.k8sDeploymentObject.spec.template.metadata.additionalLabels }}
        {{- toYaml .Values.llmDeployment.k8sDeploymentObject.spec.template.metadata.additionalLabels | nindent 8 }}
        {{- end }}
      {{- if .Values.llmDeployment.k8sDeploymentObject.spec.template.metadata.additionalAnnotations }}
      annotations: {{- toYaml .Values.llmDeployment.k8sDeploymentObject.spec.template.metadata.additionalAnnotations | nindent 8 }}
      {{- end }}
    spec:
      {{- if .Values.llmDeployment.k8sDeploymentObject.spec.template.spec.affinity }}
      affinity:
        {{- toYaml .Values.llmDeployment.k8sDeploymentObject.spec.template.spec.affinity | nindent 8 }}
      {{- end }}
      containers:
      - args:
        - --model-id
        - {{ .Values.llmDeployment.args.modelId }}
        - --max-input-length
        - {{ .Values.llmDeployment.args.maxInputLength | quote }}
        - --max-total-tokens
        - {{ .Values.llmDeployment.args.maxTotalTokens | quote }}
        - --max-batch-total-tokens
        - {{ .Values.llmDeployment.args.maxBatchTotalTokens | quote }}
        - --max-batch-prefill-tokens
        - {{ .Values.llmDeployment.args.maxBatchPrefillTokens | quote }}
        - --sharded
        - {{ .Values.llmDeployment.args.sharded | quote }}
        env:
        - name: PORT
          value: "8000"
        - name: HUGGING_FACE_HUB_TOKEN
          value: {{ .Values.llmDeployment.env.huggingFaceHubToken | quote }}
        - name: LORAX_ENABLED_MODEL_TYPES
          value: {{ .Values.llmDeployment.env.loraxEnabledModelTypes | quote }}
        image: {{ .Values.llmDeployment.image.repository }}:{{ .Values.llmDeployment.image.tag }}
        imagePullPolicy: IfNotPresent
        livenessProbe:
          failureThreshold: 240  # with periodSeconds: 5, tolerates up to ~20 minutes before the probe gives up
          httpGet:
            path: /health
            port: http
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        name: lorax
        ports:
        - containerPort: 8000
          name: http
          protocol: TCP
        readinessProbe:
          failureThreshold: 240
          httpGet:
            path: /health
            port: http
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        resources: {{- toYaml .Values.resources | nindent 10 }}
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /data
          name: data
        - mountPath: /dev/shm
          name: shm
      dnsPolicy: ClusterFirst
      enableServiceLinks: false
      nodeSelector: {{- toYaml .Values.llmDeployment.nodeSelector | nindent 8 }}
      {{- if .Values.llmDeployment.k8sDeploymentObject.spec.template.spec.tolerations }}
      tolerations:
        {{- toYaml .Values.llmDeployment.k8sDeploymentObject.spec.template.spec.tolerations | nindent 8 }}
      {{- end }}
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 30
      volumes:
      - emptyDir:
          medium: Memory  # in-memory emptyDir backing /dev/shm
        name: shm
      - emptyDir: {}
        name: data
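
The rendered output can be checked before anything touches a cluster. A hedged sketch, assuming the chart directory is ./helm and using a hypothetical release name lorax (helm template and kubectl's client-side dry run are stock tooling, not part of this chart):

helm template lorax ./helm > rendered.yaml         # render the manifests locally
kubectl apply --dry-run=client -f rendered.yaml    # schema-check without creating anything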
26 changes: 26 additions & 0 deletions helm/templates/service.yaml
@@ -0,0 +1,26 @@
apiVersion: v1
kind: Service
metadata:
  labels:
    app: {{ template "llmDeployment.name" . }}
    app.kubernetes.io/name: {{ template "llmDeployment.name" . }}
    {{- if .Values.llmService.additionalLabels }}
    {{- toYaml .Values.llmService.additionalLabels | nindent 4 }}
    {{- end }}
  name: {{ template "llmDeployment.name" . }}
  namespace: {{ .Release.Namespace }}
spec:
  internalTrafficPolicy: Cluster
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - name: http
    port: 80
    protocol: TCP
    targetPort: http
  selector:
    app: {{ template "llmDeployment.name" . }}
  sessionAffinity: None
  type: {{ .Values.llmService.serviceType }}
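
Because the Service maps port 80 to the container's named http port (8000), the /health endpoint already used by the probes doubles as a quick smoke test. A sketch assuming the default deployment name from values.yaml:

kubectl port-forward svc/llm-deployment-example 8080:80   # tunnel the Service to localhost
curl http://localhost:8080/health                         # same endpoint the probes hit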
43 changes: 43 additions & 0 deletions helm/values.yaml
@@ -0,0 +1,43 @@
llmDeployment:
  name: "llm-deployment-example"
  namespace: "default"
  replicas: 1
  updateStrategy: RollingUpdate

  # Optional passthrough sections read by templates/deployment.yaml. The empty
  # maps keep the template conditionals nil-safe; fill them in as needed, e.g.:
  #   metadata:
  #     additionalLabels:
  #       some-label: your-deployment-label
  k8sDeploymentObject:
    metadata: {}
    spec:
      template:
        metadata: {}
        spec: {}

  image:
    repository: "ghcr.io/predibase/lorax"
    tag: "f76119a"
  args:
    modelId: "model-id-from-huggingface"
    maxInputLength: 512
    maxTotalTokens: 1024
    maxBatchTotalTokens: 4096
    maxBatchPrefillTokens: 2048
    sharded: true
  env:
    huggingFaceHubToken: "your-hub-token"
    loraxEnabledModelTypes: "llama,mistral"
  nodeSelector:
    node.kubernetes.io/instance-type: g5.2xlarge  # AWS-specific instance type; adjust for your cluster

resources:
  limits:
    cpu: "8"
    ephemeral-storage: 100Gi
    memory: 27041Mi
    nvidia.com/gpu: "1"
  requests:
    cpu: "8"
    ephemeral-storage: 100Gi
    memory: 27041Mi
    nvidia.com/gpu: "1"

llmService:
  serviceType: LoadBalancer
  # additionalLabels:
  #   some-label: your-service-label
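
A hedged install sketch, assuming the chart lives in ./helm; the release name my-lorax and the Mistral model id are placeholders for whatever you deploy, and the --set flags override the values defined above:

helm install my-lorax ./helm \
  --set llmDeployment.args.modelId=mistralai/Mistral-7B-Instruct-v0.1 \
  --set llmDeployment.env.huggingFaceHubToken=$HUGGING_FACE_HUB_TOKEN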