start porting latest tgi (#480)
flozi00 committed May 24, 2024
1 parent da90421 commit a2ca687
Showing 151 changed files with 5,317 additions and 10,825 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
{
"build": {
"dockerfile": "../Dockerfile"
"dockerfile": "../Dockerfile.dev"
},
"runArgs": [
"--gpus",
39 changes: 21 additions & 18 deletions Dockerfile
@@ -39,8 +39,9 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as pytorch-install

ARG PYTORCH_VERSION=2.3.0
ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml
ARG CUDA_VERSION=12.1
ARG MAMBA_VERSION=23.3.1-1
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
@@ -52,7 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
build-essential \
ca-certificates \
ccache \
sudo \
curl \
git && \
rm -rf /var/lib/apt/lists/*
@@ -73,7 +73,7 @@ RUN chmod +x ~/mambaforge.sh && \
RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -c anaconda -c conda-forge -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
esac && \
/opt/conda/bin/conda clean -ya

@@ -90,29 +90,32 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
FROM kernel-builder as flash-att-builder
WORKDIR /usr/src
COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder as flash-att-v2-builder
WORKDIR /usr/src
COPY server/Makefile-flash-att-v2 Makefile
# Build specific version of flash attention v2
RUN make build-flash-attention-v2
RUN make build-flash-attention-v2-cuda

# Build Transformers exllamav2 kernels
# Build Transformers exllama kernels
FROM kernel-builder as exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers exllama kernels
FROM kernel-builder as exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .
# Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build awq kernels
# Build Transformers awq kernels
FROM kernel-builder as awq-kernels-builder
WORKDIR /usr/src
COPY server/awq_kernels/ .
# Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
COPY server/Makefile-awq Makefile
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq


# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder
@@ -123,12 +126,11 @@ RUN python setup.py build

# Build vllm CUDA kernels
FROM kernel-builder as vllm-builder
RUN /opt/conda/bin/conda install packaging
WORKDIR /usr/src
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
COPY server/Makefile-vllm Makefile
# Build specific version of vllm
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
RUN make build-vllm
RUN make build-vllm-cuda

# Build megablocks kernels
FROM kernel-builder as megablocks-kernels-builder
@@ -188,9 +190,10 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy builds artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

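As a usage sketch (not part of the diff): because the Dockerfile keeps one named stage per kernel, an individual builder stage can be compiled on its own for debugging, assuming the build is run from the repository root; the image tags below are arbitrary placeholders.

    docker build --target flash-att-v2-builder -t lorax-flash-att-v2-builder .
    docker build --target vllm-builder -t lorax-vllm-builder .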
19 changes: 19 additions & 0 deletions Dockerfile.dev
@@ -0,0 +1,19 @@
# LoRAX base image
FROM ghcr.io/predibase/lorax:latest as base

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile

# Final image
FROM base

COPY container-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh
COPY sync.sh sync.sh
RUN chmod +x sync.sh

# ENTRYPOINT ["./entrypoint.sh"]
ENTRYPOINT ["lorax-launcher"]
CMD ["--json-output"]
2 changes: 1 addition & 1 deletion clients/python/lorax/__init__.py
@@ -14,4 +14,4 @@

__version__ = "0.6.0"

from lorax.client import Client, AsyncClient, MergedAdapters
from lorax.client import Client, AsyncClient, MergedAdapters # noqa
2 changes: 1 addition & 1 deletion integration-tests/conftest.py
@@ -164,7 +164,7 @@ async def health(self, timeout: int = 60):
try:
await self.client.generate("test")
return
except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e:
except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
time.sleep(1)
raise RuntimeError("Health check failed")

2 changes: 0 additions & 2 deletions integration-tests/scripts/dynamic_adapter_loading.py
@@ -36,11 +36,9 @@
import collections
import concurrent.futures
import json
import random
import time
from urllib.request import Request, urlopen

import numpy as np


def query_lorax(args):
4 changes: 4 additions & 0 deletions launcher/src/main.rs
@@ -30,6 +30,7 @@ enum Quantization {
Hqq_4bit,
Hqq_3bit,
Hqq_2bit,
Fp8,
}

impl std::fmt::Display for Quantization {
@@ -63,6 +64,9 @@ impl std::fmt::Display for Quantization {
Quantization::Hqq_2bit => {
write!(f, "hqq-2bit")
}
Quantization::Fp8 => {
write!(f, "fp8")
}
}
}
}
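A hedged usage sketch: assuming the launcher exposes this Quantization enum through its --quantize flag (as in upstream text-generation-inference), the new variant would be selected with something like

    lorax-launcher --model-id <model> --quantize fp8

where <model> is a placeholder for the model to serve.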
1 change: 1 addition & 0 deletions server/Makefile
@@ -3,6 +3,7 @@ include Makefile-flash-att-v2
include Makefile-vllm
include Makefile-megablocks
include Makefile-eetq
include Makefile-awq

unit-tests:
pytest -s -vv -m "not private" tests
15 changes: 15 additions & 0 deletions server/Makefile-awq
@@ -0,0 +1,15 @@
# Fork that adds only the correct stream to this kernel in order
# to make cuda graphs work.
awq_commit := bd1dc2d5254345cc76ab71894651fb821275bdd4

awq:
rm -rf llm-awq
git clone https://github.com/huggingface/llm-awq

build-awq: awq
cd llm-awq/ && git fetch && git checkout $(awq_commit)
cd llm-awq/awq/kernels && python setup.py build

install-awq: build-awq
pip uninstall awq_inference_engine -y || true
cd llm-awq/awq/kernels && python setup.py install
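Because server/Makefile now includes Makefile-awq (see above), these targets are reachable from the server directory; for example (a sketch, not part of the commit):

    cd server && make build-awq      # clone llm-awq, pin to awq_commit, build the kernels
    cd server && make install-awq    # uninstall any prior awq_inference_engine, then install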
2 changes: 1 addition & 1 deletion server/Makefile-eetq
@@ -1,4 +1,4 @@
eetq_commit := cc2fdb4637e03652ac264eaef44dd8492472de01 # 323827dd471458a84e9c840f614e4592b157a4b1
eetq_commit := 1657b1504faa359e2ce0ac02999439d7ac8c74c0

eetq:
# Clone eetq
4 changes: 2 additions & 2 deletions server/Makefile-flash-att
@@ -2,7 +2,7 @@ flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec

flash-attention:
# Clone flash attention
pip install packaging
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/HazyResearch/flash-attention.git

build-flash-attention: flash-attention
@@ -13,4 +13,4 @@ build-flash-attention: flash-attention

install-flash-attention: build-flash-attention
pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
38 changes: 24 additions & 14 deletions server/Makefile-flash-att-v2
@@ -1,19 +1,29 @@
flash_att_v2_commit := 2c3baba4a63c4007c8a132c5380edc9430f88a22
flash_att_v2_commit_cuda := v2.5.8
flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6

flash-attention-v2:
# Clone flash attention
pip install packaging
git clone https://github.com/HazyResearch/flash-attention.git flash-attention-v2

build-flash-attention-v2: flash-attention-v2
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit)
flash-attention-v2-cuda:
# Clone flash attention
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2

build-flash-attention-v2-cuda: flash-attention-v2-cuda
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_cuda)
cd flash-attention-v2 && git submodule update --init --recursive
cd flash-attention-v2 && python setup.py build

# install-flash-attention-v2: build-flash-attention-v2
# cd flash-attention-v2 && python setup.py install
install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install

flash-attention-v2-rocm:
# Clone flash attention
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/ROCm/flash-attention.git flash-attention-v2

build-flash-attention-v2-rocm: flash-attention-v2-rocm
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm)
cd flash-attention-v2 && git submodule update --init --recursive
cd flash-attention-v2 && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build

# Install from pip because the target commit is actually a release commit
# and the pip wheels do not require nvcc to be installed.
# Reference: https://github.com/Dao-AILab/flash-attention/issues/509
install-flash-attention-v2:
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn==2.3.0 --no-build-isolation
install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install
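With the targets now split per backend, the install step would typically be invoked from the server directory as one of the following (a sketch, not part of the commit):

    cd server && make install-flash-attention-v2-cuda   # NVIDIA / CUDA builds
    cd server && make install-flash-attention-v2-rocm   # AMD / ROCm builds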
28 changes: 20 additions & 8 deletions server/Makefile-vllm
@@ -1,13 +1,25 @@
vllm_commit := 6d592eb430a37a7f8f5f9beb2dbc014bf3aa76bc

vllm:
vllm-cuda:
# Clone vllm
git clone https://github.com/vllm-project/vllm.git
pip install -U ninja packaging --no-cache-dir
git clone https://github.com/Narsil/vllm.git vllm

build-vllm: vllm
cd vllm && git fetch && git checkout $(vllm_commit)
build-vllm-cuda: vllm-cuda
cd vllm && git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
cd vllm && python setup.py build

install-vllm: build-vllm
install-vllm-cuda: build-vllm-cuda
pip uninstall vllm -y || true
cd vllm && python setup.py install

vllm-rocm:
# Clone vllm
pip install -U ninja packaging --no-cache-dir
git clone https://github.com/fxmarty/rocm-vllm.git vllm

build-vllm-rocm: vllm-rocm
cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install

install-vllm-rocm: build-vllm-rocm
pip uninstall vllm -y || true
cd vllm && python setup.py install
cd vllm && python setup.py install
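As with the flash-attention targets, the split here is per backend; a usage sketch, assuming the server/Makefile include is in place:

    cd server && make install-vllm-cuda   # CUDA: Narsil/vllm fork at the pinned commit
    cd server && make install-vllm-rocm   # ROCm: fxmarty/rocm-vllm fork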