start porting latest tgi #480

Merged (32 commits) on May 24, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
{
"build": {
"dockerfile": "../Dockerfile"
"dockerfile": "../Dockerfile.dev"
},
"runArgs": [
"--gpus",
39 changes: 21 additions & 18 deletions Dockerfile
@@ -39,8 +39,9 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as pytorch-install

ARG PYTORCH_VERSION=2.3.0
ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.1
ARG MAMBA_VERSION=23.3.1-1
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
@@ -52,7 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
build-essential \
ca-certificates \
ccache \
sudo \
curl \
git && \
rm -rf /var/lib/apt/lists/*
@@ -73,7 +73,7 @@ RUN chmod +x ~/mambaforge.sh && \
RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -c anaconda -c conda-forge -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
esac && \
/opt/conda/bin/conda clean -ya

@@ -90,29 +90,32 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
FROM kernel-builder as flash-att-builder
WORKDIR /usr/src
COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder as flash-att-v2-builder
WORKDIR /usr/src
COPY server/Makefile-flash-att-v2 Makefile
# Build specific version of flash attention v2
RUN make build-flash-attention-v2
RUN make build-flash-attention-v2-cuda

# Build Transformers exllamav2 kernels
# Build Transformers exllama kernels
FROM kernel-builder as exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers exllama kernels
FROM kernel-builder as exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .
# Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build awq kernels
# Build Transformers awq kernels
FROM kernel-builder as awq-kernels-builder
WORKDIR /usr/src
COPY server/awq_kernels/ .
# Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
COPY server/Makefile-awq Makefile
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq


# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder
@@ -123,12 +126,11 @@ RUN python setup.py build

# Build vllm CUDA kernels
FROM kernel-builder as vllm-builder
RUN /opt/conda/bin/conda install packaging
WORKDIR /usr/src
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
COPY server/Makefile-vllm Makefile
# Build specific version of vllm
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
RUN make build-vllm
RUN make build-vllm-cuda

# Build megablocks kernels
FROM kernel-builder as megablocks-kernels-builder
@@ -188,9 +190,10 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy builds artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
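As a sanity check while porting, the individual kernel-builder stages above can be built on their own; a minimal sketch, assuming Docker BuildKit and the stage names shown in this diff:

    # Build just the flash-attention v2 CUDA kernel stage (hypothetical local check)
    DOCKER_BUILDKIT=1 docker build --target flash-att-v2-builder -t lorax-flash-v2-kernels .

    # Build the full runtime image once every stage compiles
    DOCKER_BUILDKIT=1 docker build -t lorax:local .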

19 changes: 19 additions & 0 deletions Dockerfile.dev
@@ -0,0 +1,19 @@
# LoRAX base image
FROM ghcr.io/predibase/lorax:latest as base

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile

# Final image
FROM base

COPY container-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh
COPY sync.sh sync.sh
RUN chmod +x sync.sh

# ENTRYPOINT ["./entrypoint.sh"]
ENTRYPOINT ["lorax-launcher"]
CMD ["--json-output"]
2 changes: 1 addition & 1 deletion clients/python/lorax/__init__.py
@@ -14,4 +14,4 @@

__version__ = "0.5.1"

from lorax.client import Client, AsyncClient, MergedAdapters
from lorax.client import Client, AsyncClient, MergedAdapters # noqa
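The # noqa keeps this re-export lint-clean, since callers normally import the client from the package root; a short usage sketch, assuming a LoRAX server is already listening on localhost:8080:

    from lorax import Client  # re-exported from lorax.client

    client = Client("http://127.0.0.1:8080")
    response = client.generate("What is deep learning?", max_new_tokens=32)
    print(response.generated_text)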
2 changes: 1 addition & 1 deletion integration-tests/conftest.py
@@ -164,7 +164,7 @@ async def health(self, timeout: int = 60):
try:
await self.client.generate("test")
return
except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e:
except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
time.sleep(1)
raise RuntimeError("Health check failed")

2 changes: 0 additions & 2 deletions integration-tests/scripts/dynamic_adapter_loading.py
@@ -36,11 +36,9 @@
import collections
import concurrent.futures
import json
import random
import time
from urllib.request import Request, urlopen

import numpy as np


def query_lorax(args):
4 changes: 4 additions & 0 deletions launcher/src/main.rs
@@ -30,6 +30,7 @@ enum Quantization {
Hqq_4bit,
Hqq_3bit,
Hqq_2bit,
Fp8,
}

impl std::fmt::Display for Quantization {
@@ -63,6 +64,9 @@ impl std::fmt::Display for Quantization {
Quantization::Hqq_2bit => {
write!(f, "hqq-2bit")
}
Quantization::Fp8 => {
write!(f, "fp8")
}
}
}
}
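The new Fp8 variant rounds out the launcher-side plumbing; the string written by the Display impl above is what gets forwarded to the server shards. A hedged invocation sketch, assuming the launcher's existing --quantize flag accepts the new value:

    # Hypothetical launch with fp8 quantization enabled
    lorax-launcher --model-id mistralai/Mistral-7B-Instruct-v0.1 --quantize fp8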
1 change: 1 addition & 0 deletions server/Makefile
@@ -3,6 +3,7 @@ include Makefile-flash-att-v2
include Makefile-vllm
include Makefile-megablocks
include Makefile-eetq
include Makefile-awq

unit-tests:
pytest -s -vv -m "not private" tests
15 changes: 15 additions & 0 deletions server/Makefile-awq
@@ -0,0 +1,15 @@
# Fork that adds only the correct stream to this kernel in order
# to make cuda graphs work.
awq_commit := bd1dc2d5254345cc76ab71894651fb821275bdd4

awq:
rm -rf llm-awq
git clone https://github.com/huggingface/llm-awq

build-awq: awq
cd llm-awq/ && git fetch && git checkout $(awq_commit)
cd llm-awq/awq/kernels && python setup.py build

install-awq: build-awq
pip uninstall awq_inference_engine -y || true
cd llm-awq/awq/kernels && python setup.py install
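Because server/Makefile now includes Makefile-awq, these targets can be invoked from the server directory; a sketch of a manual install, assuming a local CUDA toolchain and PyTorch are already set up:

    # Clone the pinned llm-awq fork, build the kernels, and install them
    cd server && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make install-awq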
2 changes: 1 addition & 1 deletion server/Makefile-eetq
@@ -1,4 +1,4 @@
eetq_commit := cc2fdb4637e03652ac264eaef44dd8492472de01 # 323827dd471458a84e9c840f614e4592b157a4b1
eetq_commit := 1657b1504faa359e2ce0ac02999439d7ac8c74c0

eetq:
# Clone eetq
4 changes: 2 additions & 2 deletions server/Makefile-flash-att
@@ -2,7 +2,7 @@ flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec

flash-attention:
# Clone flash attention
pip install packaging
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/HazyResearch/flash-attention.git

build-flash-attention: flash-attention
@@ -13,4 +13,4 @@ build-flash-attention: flash-attention

install-flash-attention: build-flash-attention
pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
38 changes: 24 additions & 14 deletions server/Makefile-flash-att-v2
@@ -1,19 +1,29 @@
flash_att_v2_commit := 2c3baba4a63c4007c8a132c5380edc9430f88a22
flash_att_v2_commit_cuda := v2.5.8
flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6

flash-attention-v2:
# Clone flash attention
pip install packaging
git clone https://github.com/HazyResearch/flash-attention.git flash-attention-v2

build-flash-attention-v2: flash-attention-v2
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit)
flash-attention-v2-cuda:
# Clone flash attention
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2

build-flash-attention-v2-cuda: flash-attention-v2-cuda
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_cuda)
cd flash-attention-v2 && git submodule update --init --recursive
cd flash-attention-v2 && python setup.py build

# install-flash-attention-v2: build-flash-attention-v2
# cd flash-attention-v2 && python setup.py install
install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install

flash-attention-v2-rocm:
# Clone flash attention
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/ROCm/flash-attention.git flash-attention-v2

build-flash-attention-v2-rocm: flash-attention-v2-rocm
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm)
cd flash-attention-v2 && git submodule update --init --recursive
cd flash-attention-v2 && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build

# Install from pip because the target commit is actually a release commit
# and the pip wheels do not require nvcc to be installed.
# Reference: https://github.com/Dao-AILab/flash-attention/issues/509
install-flash-attention-v2:
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn==2.3.0 --no-build-isolation
install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install
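With the CUDA/ROCm split, the backend is now chosen by target name rather than a variable; a sketch of the two install paths, assuming the matching toolchain (nvcc or ROCm) is available:

    # NVIDIA GPUs: builds the pinned v2.5.8 release
    make -C server install-flash-attention-v2-cuda

    # AMD GPUs: builds the pinned ROCm fork
    make -C server install-flash-attention-v2-rocm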
28 changes: 20 additions & 8 deletions server/Makefile-vllm
@@ -1,13 +1,25 @@
vllm_commit := 6d592eb430a37a7f8f5f9beb2dbc014bf3aa76bc

vllm:
vllm-cuda:
# Clone vllm
git clone https://github.com/vllm-project/vllm.git
pip install -U ninja packaging --no-cache-dir
git clone https://github.com/Narsil/vllm.git vllm

build-vllm: vllm
cd vllm && git fetch && git checkout $(vllm_commit)
build-vllm-cuda: vllm-cuda
cd vllm && git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
cd vllm && python setup.py build

install-vllm: build-vllm
install-vllm-cuda: build-vllm-cuda
pip uninstall vllm -y || true
cd vllm && python setup.py install

vllm-rocm:
# Clone vllm
pip install -U ninja packaging --no-cache-dir
git clone https://github.com/fxmarty/rocm-vllm.git vllm

build-vllm-rocm: vllm-rocm
cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install

install-vllm-rocm: build-vllm-rocm
pip uninstall vllm -y || true
cd vllm && python setup.py install
cd vllm && python setup.py install
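The vLLM kernels follow the same CUDA/ROCm split; a minimal sketch, again assuming the appropriate toolchain and that the targets are reached through the including server/Makefile:

    # CUDA build from the pinned Narsil/vllm fork
    make -C server install-vllm-cuda

    # ROCm build from the pinned fxmarty/rocm-vllm fork
    make -C server install-vllm-rocm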