start porting latest tgi (#480)
flozi00 committed May 24, 2024
1 parent da90421 commit a2ca687
Showing 151 changed files with 5,317 additions and 10,825 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
{
"build": {
"dockerfile": "../Dockerfile"
"dockerfile": "../Dockerfile.dev"
},
"runArgs": [
"--gpus",
39 changes: 21 additions & 18 deletions Dockerfile
@@ -39,8 +39,9 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as pytorch-install

ARG PYTORCH_VERSION=2.3.0
ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml
ARG CUDA_VERSION=12.1
ARG MAMBA_VERSION=23.3.1-1
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
@@ -52,7 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
build-essential \
ca-certificates \
ccache \
sudo \
curl \
git && \
rm -rf /var/lib/apt/lists/*
@@ -73,7 +73,7 @@ RUN chmod +x ~/mambaforge.sh && \
RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -c anaconda -c conda-forge -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
esac && \
/opt/conda/bin/conda clean -ya

@@ -90,29 +90,32 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
FROM kernel-builder as flash-att-builder
WORKDIR /usr/src
COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder as flash-att-v2-builder
WORKDIR /usr/src
COPY server/Makefile-flash-att-v2 Makefile
# Build specific version of flash attention v2
RUN make build-flash-attention-v2
RUN make build-flash-attention-v2-cuda

# Build Transformers exllamav2 kernels
# Build Transformers exllama kernels
FROM kernel-builder as exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers exllama kernels
FROM kernel-builder as exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .
# Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build awq kernels
# Build Transformers awq kernels
FROM kernel-builder as awq-kernels-builder
WORKDIR /usr/src
COPY server/awq_kernels/ .
# Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
COPY server/Makefile-awq Makefile
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq


# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder
@@ -123,12 +126,11 @@ RUN python setup.py build

# Build vllm CUDA kernels
FROM kernel-builder as vllm-builder
RUN /opt/conda/bin/conda install packaging
WORKDIR /usr/src
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
COPY server/Makefile-vllm Makefile
# Build specific version of vllm
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
RUN make build-vllm
RUN make build-vllm-cuda

# Build megablocks kernels
FROM kernel-builder as megablocks-kernels-builder
@@ -188,9 +190,10 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy builds artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

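As a usage sketch (not part of the diff): because the Dockerfile keeps one named stage per kernel, an individual builder stage can be compiled on its own for debugging, assuming the build is run from the repository root; the image tags below are arbitrary placeholders.

    docker build --target flash-att-v2-builder -t lorax-flash-att-v2-builder .
    docker build --target vllm-builder -t lorax-vllm-builder .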
19 changes: 19 additions & 0 deletions Dockerfile.dev
@@ -0,0 +1,19 @@
# LoRAX base image
FROM ghcr.io/predibase/lorax:latest as base

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile

# Final image
FROM base

COPY container-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh
COPY sync.sh sync.sh
RUN chmod +x sync.sh

# ENTRYPOINT ["./entrypoint.sh"]
ENTRYPOINT ["lorax-launcher"]
CMD ["--json-output"]
2 changes: 1 addition & 1 deletion clients/python/lorax/__init__.py
@@ -14,4 +14,4 @@

__version__ = "0.6.0"

from lorax.client import Client, AsyncClient, MergedAdapters
from lorax.client import Client, AsyncClient, MergedAdapters # noqa
2 changes: 1 addition & 1 deletion integration-tests/conftest.py
@@ -164,7 +164,7 @@ async def health(self, timeout: int = 60):
try:
await self.client.generate("test")
return
except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e:
except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
time.sleep(1)
raise RuntimeError("Health check failed")

2 changes: 0 additions & 2 deletions integration-tests/scripts/dynamic_adapter_loading.py
@@ -36,11 +36,9 @@
import collections
import concurrent.futures
import json
import random
import time
from urllib.request import Request, urlopen

import numpy as np


def query_lorax(args):
4 changes: 4 additions & 0 deletions launcher/src/main.rs
@@ -30,6 +30,7 @@ enum Quantization {
Hqq_4bit,
Hqq_3bit,
Hqq_2bit,
Fp8,
}

impl std::fmt::Display for Quantization {
@@ -63,6 +64,9 @@ impl std::fmt::Display for Quantization {
Quantization::Hqq_2bit => {
write!(f, "hqq-2bit")
}
Quantization::Fp8 => {
write!(f, "fp8")
}
}
}
}
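A hedged usage sketch: assuming the launcher exposes this Quantization enum through its --quantize flag (as in upstream text-generation-inference), the new variant would be selected with something like

    lorax-launcher --model-id <model> --quantize fp8

where <model> is a placeholder for the model to serve.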
1 change: 1 addition & 0 deletions server/Makefile
@@ -3,6 +3,7 @@ include Makefile-flash-att-v2
include Makefile-vllm
include Makefile-megablocks
include Makefile-eetq
include Makefile-awq

unit-tests:
pytest -s -vv -m "not private" tests
15 changes: 15 additions & 0 deletions server/Makefile-awq
@@ -0,0 +1,15 @@
# Fork that adds only the correct stream to this kernel in order
# to make cuda graphs work.
awq_commit := bd1dc2d5254345cc76ab71894651fb821275bdd4

awq:
rm -rf llm-awq
git clone https://github.com/huggingface/llm-awq

build-awq: awq
cd llm-awq/ && git fetch && git checkout $(awq_commit)
cd llm-awq/awq/kernels && python setup.py build

install-awq: build-awq
pip uninstall awq_inference_engine -y || true
cd llm-awq/awq/kernels && python setup.py install
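Because server/Makefile now includes Makefile-awq (see above), these targets are reachable from the server directory; for example (a sketch, not part of the commit):

    cd server && make build-awq      # clone llm-awq, pin to awq_commit, build the kernels
    cd server && make install-awq    # uninstall any prior awq_inference_engine, then install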
2 changes: 1 addition & 1 deletion server/Makefile-eetq
@@ -1,4 +1,4 @@
eetq_commit := cc2fdb4637e03652ac264eaef44dd8492472de01 # 323827dd471458a84e9c840f614e4592b157a4b1
eetq_commit := 1657b1504faa359e2ce0ac02999439d7ac8c74c0

eetq:
# Clone eetq
4 changes: 2 additions & 2 deletions server/Makefile-flash-att
@@ -2,7 +2,7 @@ flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec

flash-attention:
# Clone flash attention
pip install packaging
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/HazyResearch/flash-attention.git

build-flash-attention: flash-attention
@@ -13,4 +13,4 @@ build-flash-attention: flash-attention

install-flash-attention: build-flash-attention
pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
38 changes: 24 additions & 14 deletions server/Makefile-flash-att-v2
@@ -1,19 +1,29 @@
flash_att_v2_commit := 2c3baba4a63c4007c8a132c5380edc9430f88a22
flash_att_v2_commit_cuda := v2.5.8
flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6

flash-attention-v2:
# Clone flash attention
pip install packaging
git clone https://github.com/HazyResearch/flash-attention.git flash-attention-v2

build-flash-attention-v2: flash-attention-v2
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit)
flash-attention-v2-cuda:
# Clone flash attention
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2

build-flash-attention-v2-cuda: flash-attention-v2-cuda
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_cuda)
cd flash-attention-v2 && git submodule update --init --recursive
cd flash-attention-v2 && python setup.py build

# install-flash-attention-v2: build-flash-attention-v2
# cd flash-attention-v2 && python setup.py install
install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install

flash-attention-v2-rocm:
# Clone flash attention
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/ROCm/flash-attention.git flash-attention-v2

build-flash-attention-v2-rocm: flash-attention-v2-rocm
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm)
cd flash-attention-v2 && git submodule update --init --recursive
cd flash-attention-v2 && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build

# Install from pip because the target commit is actually a release commit
# and the pip wheels do not require nvcc to be installed.
# Reference: https://github.com/Dao-AILab/flash-attention/issues/509
install-flash-attention-v2:
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn==2.3.0 --no-build-isolation
install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install
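With the targets now split per backend, the install step would typically be invoked from the server directory as one of the following (a sketch, not part of the commit):

    cd server && make install-flash-attention-v2-cuda   # NVIDIA / CUDA builds
    cd server && make install-flash-attention-v2-rocm   # AMD / ROCm builds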
28 changes: 20 additions & 8 deletions server/Makefile-vllm
@@ -1,13 +1,25 @@
vllm_commit := 6d592eb430a37a7f8f5f9beb2dbc014bf3aa76bc

vllm:
vllm-cuda:
# Clone vllm
git clone https://github.com/vllm-project/vllm.git
pip install -U ninja packaging --no-cache-dir
git clone https://github.com/Narsil/vllm.git vllm

build-vllm: vllm
cd vllm && git fetch && git checkout $(vllm_commit)
build-vllm-cuda: vllm-cuda
cd vllm && git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
cd vllm && python setup.py build

install-vllm: build-vllm
install-vllm-cuda: build-vllm-cuda
pip uninstall vllm -y || true
cd vllm && python setup.py install

vllm-rocm:
# Clone vllm
pip install -U ninja packaging --no-cache-dir
git clone https://github.com/fxmarty/rocm-vllm.git vllm

build-vllm-rocm: vllm-rocm
cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install

install-vllm-rocm: build-vllm-rocm
pip uninstall vllm -y || true
cd vllm && python setup.py install
cd vllm && python setup.py install
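As with the flash-attention targets, the split here is per backend; a usage sketch, assuming the server/Makefile include is in place:

    cd server && make install-vllm-cuda   # CUDA: Narsil/vllm fork at the pinned commit
    cd server && make install-vllm-rocm   # ROCm: fxmarty/rocm-vllm fork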