Skip to content

Commit

Permalink
v0.3.0: OpenAI Compatibility, Dynamic Stream Batching, Refactor, Error Catching
Browse files Browse the repository at this point in the history
  • Loading branch information
alpayariyak committed Feb 24, 2024
2 parents 235d0d3 + 819102c commit 91167b8
Show file tree
Hide file tree
Showing 13 changed files with 669 additions and 313 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
runpod.toml
*.pyc
.env
test/*
test/*
vllm-base/vllm-*
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG WORKER_CUDA_VERSION=11.8.0
FROM runpod/worker-vllm:base-0.2.2-cuda${WORKER_CUDA_VERSION} AS vllm-base
FROM runpod/worker-vllm:base-0.3.0-cuda${WORKER_CUDA_VERSION} AS vllm-base

RUN apt-get update -y \
&& apt-get install -y python3-pip
Expand Down
543 changes: 416 additions & 127 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion builder/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ hf_transfer
ray
pandas
pyarrow
runpod==1.5.3
runpod==1.6.2
huggingface-hub
packaging
typing-extensions==4.7.1
Expand Down
Empty file added src/__init__.py
Empty file.
51 changes: 51 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os
from dotenv import load_dotenv
from utils import count_physical_cores
from torch.cuda import device_count

class EngineConfig:
    """Builds the vLLM engine configuration for the worker.

    Values come from environment variables (loaded via ``.env`` when present);
    a model/tokenizer path baked into the image at build time takes precedence
    over the corresponding env vars.
    """

    def __init__(self):
        load_dotenv()
        # Baked-in local paths (written at image build time) win over env vars.
        self.model_name_or_path, self.hf_home, self.model_revision = self._get_local_or_env(
            "/local_model_path.txt", "MODEL_NAME"
        )
        self.tokenizer_name_or_path, _, self.tokenizer_revision = self._get_local_or_env(
            "/local_tokenizer_path.txt", "TOKENIZER_NAME"
        )
        # Fall back to the model path when no separate tokenizer is configured.
        self.tokenizer_name_or_path = self.tokenizer_name_or_path or self.model_name_or_path
        self.quantization = self._get_quantization()
        self.config = self._initialize_config()

    def _get_local_or_env(self, local_path, env_var):
        """Return ``(name_or_path, hf_home, revision)``.

        If *local_path* exists, its stripped contents are used and the
        HF_HOME / revision slots are ``None`` (the artifact is already local).
        Otherwise all three values are read from the environment.
        """
        if os.path.exists(local_path):
            with open(local_path, "r") as file:
                return file.read().strip(), None, None
        return os.getenv(env_var), os.getenv("HF_HOME"), os.getenv(f"{env_var}_REVISION")

    def _get_quantization(self):
        """Return the QUANTIZATION env value if it names a supported method, else None."""
        quantization = os.getenv("QUANTIZATION", "").lower()
        return quantization if quantization in ("awq", "squeezellm", "gptq") else None

    @staticmethod
    def _env_int(name):
        """Read env var *name* as an int; None when it is unset or empty."""
        value = os.getenv(name)
        return int(value) if value else None

    @staticmethod
    def _env_flag(name, default):
        """Read env var *name* as a 0/1 flag and return it as a bool."""
        return bool(int(os.getenv(name, default)))

    def _initialize_config(self):
        """Assemble the engine kwargs, dropping entries that are unset (None)."""
        num_gpus = device_count()  # hoisted: used for both TP size and the worker guard
        max_loading_workers = os.getenv("MAX_PARALLEL_LOADING_WORKERS")
        args = {
            "model": self.model_name_or_path,
            "revision": self.model_revision,
            "download_dir": self.hf_home,
            "quantization": self.quantization,
            "load_format": os.getenv("LOAD_FORMAT", "auto"),
            # Quantized checkpoints are stored in half precision by default.
            "dtype": os.getenv("DTYPE", "half" if self.quantization else "auto"),
            "tokenizer": self.tokenizer_name_or_path,
            "tokenizer_revision": self.tokenizer_revision,
            "disable_log_stats": self._env_flag("DISABLE_LOG_STATS", 1),
            "disable_log_requests": self._env_flag("DISABLE_LOG_REQUESTS", 1),
            "trust_remote_code": self._env_flag("TRUST_REMOTE_CODE", 0),
            "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.95)),
            # Parallel loading workers are only honored on single-GPU setups.
            "max_parallel_loading_workers": None
            if num_gpus > 1 or not max_loading_workers
            else int(max_loading_workers),
            "max_model_len": self._env_int("MAX_MODEL_LENGTH"),
            "tensor_parallel_size": num_gpus,
            "seed": self._env_int("SEED"),
            "kv_cache_dtype": os.getenv("KV_CACHE_DTYPE"),
            "block_size": self._env_int("BLOCK_SIZE"),
            "swap_space": self._env_int("SWAP_SPACE"),
            "max_context_len_to_capture": self._env_int("MAX_CONTEXT_LEN_TO_CAPTURE"),
            "disable_custom_all_reduce": self._env_flag("DISABLE_CUSTOM_ALL_REDUCE", 0),
            "enforce_eager": self._env_flag("ENFORCE_EAGER", 0),
        }
        # vLLM treats a missing kwarg and an explicit None differently in some
        # versions, so strip unset values entirely.
        return {k: v for k, v in args.items() if v is not None}
6 changes: 4 additions & 2 deletions src/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from typing import Union

DEFAULT_BATCH_SIZE = 30
DEFAULT_BATCH_SIZE = 50
DEFAULT_MAX_CONCURRENCY = 300
DEFAULT_BATCH_SIZE_GROWTH_FACTOR = 3
DEFAULT_MIN_BATCH_SIZE = 1

SAMPLING_PARAM_TYPES = {
"n": int,
Expand All @@ -25,4 +27,4 @@
"skip_special_tokens": bool,
"spaces_between_special_tokens": bool,
"include_stop_str_in_output": bool
}
}
Loading

0 comments on commit 91167b8

Please sign in to comment.