In [1]:
import ray
import os
from starlette.requests import Request
from typing import List, Optional, Any
import torch
import shutil
import logging
import sys
import json
import time
from huggingface_hub.hf_api import HfFolder
import json
from typing import AsyncGenerator
from fastapi import BackgroundTasks
from starlette.requests import Request
from starlette.responses import StreamingResponse, Response
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.logger import init_logger
from vllm.utils import random_uuid
from ray import serve
import vllm
# Create a logger through vLLM's logging helper, named after this module, and
# lower its threshold to DEBUG so verbose messages are emitted by this logger.
vllm_logger = init_logger(__name__)
vllm_logger.setLevel(logging.DEBUG)

In [5]:
# Model identifier served by this notebook; the deployment cell passes it to the
# vLLM engine as the `model` engine argument.
MODEL = "lmsys/vicuna-13b-v1.5-16k"

In [6]:
# Fetch the logger registered under the name "ray.serve" and set its level to
# DEBUG so debug-level records from that logger are not filtered out.
logger = logging.getLogger("ray.serve")
logger.setLevel(logging.DEBUG)

@serve.deployment(num_replicas=1, ray_actor_options={"resources": {"custom_llm_serving_label": 1}}, route_prefix="/llmapi")
class SnowflakeVLLMDeployment:
    """Ray Serve deployment that serves text generation from a vLLM engine.

    The deployment runs as a single replica under the ``/llmapi`` route prefix
    and requests one unit of the ``custom_llm_serving_label`` custom resource.

    Request body (JSON object):
        prompt: the prompt handed to the engine (required).
        stream: optional flag; when truthy the reply is streamed as
            newline-delimited JSON chunks ``{"text": "<new text>"}``.
        Every remaining key is forwarded as a keyword argument to
        ``SamplingParams``.

    Non-streaming replies are a JSON object ``{"text": [<completion>, ...]}``.
    Malformed requests are answered with HTTP 400 and ``{"error": "<reason>"}``.
    """

    def __init__(self, **kwargs):
        """Build the async vLLM engine.

        Args:
            **kwargs: forwarded unchanged to ``AsyncEngineArgs``.
        """
        args = AsyncEngineArgs(**kwargs)
        self.engine = AsyncLLMEngine.from_engine_args(args)

    @staticmethod
    def _error_response(status_code: int, message: str) -> Response:
        """Return a JSON ``{"error": message}`` response with ``status_code``."""
        return Response(
            content=json.dumps({"error": message}),
            status_code=status_code,
            media_type="application/json",
        )

    async def stream_results(self, results_generator) -> AsyncGenerator[bytes, None]:
        """Yield incremental completions as newline-delimited JSON.

        Each item read from ``results_generator`` exposes the text produced so
        far in ``outputs[0].text``; only the part that has not been sent yet is
        emitted, encoded as UTF-8 ``{"text": "<delta>"}`` followed by ``\\n``.

        Raises:
            ValueError: if an item does not carry exactly one output, since the
                chunk format holds a single text field.
        """
        num_returned = 0
        async for request_output in results_generator:
            outputs = request_output.outputs
            # Explicit check instead of `assert`, which is stripped under `python -O`.
            if len(outputs) != 1:
                raise ValueError(
                    f"Streaming supports exactly one output sequence, got {len(outputs)}."
                )
            # Drop the prefix already delivered in earlier chunks.
            text_output = outputs[0].text[num_returned:]
            ret = {"text": text_output}
            yield (json.dumps(ret) + "\n").encode("utf-8")
            num_returned += len(text_output)

    async def may_abort_request(self, request_id) -> None:
        """Ask the engine to abort the request identified by ``request_id``."""
        await self.engine.abort(request_id)

    async def __call__(self, request: Request) -> Response:
        """Handle one generation request.

        Args:
            request: incoming HTTP request whose body is the JSON object
                described in the class docstring.

        Returns:
            * a ``StreamingResponse`` when ``stream`` is truthy,
            * a JSON response ``{"text": [...]}`` otherwise,
            * status 400 for malformed input,
            * status 499 when the client disconnects during generation,
            * status 500 when the engine yields no output.
        """
        # Starlette raises a ValueError subclass when the body is not valid JSON.
        try:
            request_dict = await request.json()
        except ValueError:
            return self._error_response(400, "Request body must be valid JSON.")

        if not isinstance(request_dict, dict) or "prompt" not in request_dict:
            return self._error_response(
                400, 'Request body must be a JSON object with a "prompt" field.'
            )

        prompt = request_dict.pop("prompt")
        stream = request_dict.pop("stream", False)

        # Unknown keys raise TypeError; out-of-range values raise ValueError.
        try:
            sampling_params = SamplingParams(**request_dict)
        except (TypeError, ValueError) as exc:
            return self._error_response(400, f"Invalid sampling parameters: {exc}")

        # Reject unsupported combinations before any generation work is scheduled.
        if stream and sampling_params.n != 1:
            return self._error_response(400, "Streaming requires n == 1.")

        request_id = random_uuid()
        results_generator = self.engine.generate(prompt, sampling_params, request_id)

        if stream:
            # The background task runs once the response has been handled and
            # tells the engine to abort the request.
            background_tasks = BackgroundTasks()
            background_tasks.add_task(self.may_abort_request, request_id)
            return StreamingResponse(
                self.stream_results(results_generator), background=background_tasks
            )

        # Non-streaming case: drain the generator and keep only the last item.
        final_output = None
        async for request_output in results_generator:
            if await request.is_disconnected():
                # Abort the request if the client disconnects.
                await self.engine.abort(request_id)
                return Response(status_code=499)
            final_output = request_output

        if final_output is None:
            return self._error_response(
                500, "The engine produced no output for this request."
            )

        text_outputs = [output.text for output in final_output.outputs]
        ret = {"text": text_outputs}
        return Response(content=json.dumps(ret), media_type="application/json")



In [7]:
# Bind the engine arguments to the deployment; they reach
# SnowflakeVLLMDeployment.__init__ as **kwargs and are forwarded to AsyncEngineArgs.
deployment = SnowflakeVLLMDeployment.bind(model=MODEL, tensor_parallel_size=8, seed=123)

# Attach to the already-running Ray cluster. ignore_reinit_error=True keeps this
# cell re-runnable: without it, a second ray.init() call in the same kernel
# raises RuntimeError.
ray.init(
    address="auto",
    runtime_env={"pip": ["flash-attn==2.4.2"]},
    ignore_reinit_error=True,
)

# Deploy the bound application under the name "llm". Kept as the last expression
# so the notebook displays the returned handle.
serve.run(target=deployment, name="llm")

2024-04-18 05:20:10,917	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 10.244.31.9:6379...
2024-04-18 05:20:12,147	INFO worker.py:1715 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://10.244.31.9:8265 [39m[22m
[2024-04-18 05:20:12,150 I 877 877] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1
[33m(raylet)[0m [2024-04-18 05:20:13,924 I 958 958] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1
[36m(ProxyActor pid=997)[0m INFO 2024-04-18 05:20:19,945 proxy 10.244.31.9 proxy.py:1143 - Proxy actor 28061d6e60351e0feb9de5a802000000 starting on node b3d8591b0a074a32350190f00903d159aa0ccf0c5ff8c3eb9c01662d.
[33m(raylet)[0m [2024-04-18 05:20:17,892 I 997 997] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1
[36m(ServeController pid=958)[0m INFO 2024-04-18 05:20:20,322 controller 958 deployment_state.py:1547 - Deploying new versio

[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m INFO 04-18 05:20:27 llm_engine.py:75] Initializing an LLM engine (v0.4.0) with config: model='lmsys/vicuna-13b-v1.5-16k', tokenizer='lmsys/vicuna-13b-v1.5-16k', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=8, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=123)


[36m(ProxyActor pid=593, ip=10.244.29.9)[0m INFO 2024-04-18 05:20:27,686 proxy 10.244.29.9 proxy.py:1143 - Proxy actor 7e4d92bb5714be711ef0042102000000 starting on node e219648a2f442d57852464195933cd440000e47546ef89e83b32621a.
[36m(ProxyActor pid=593, ip=10.244.29.9)[0m INFO 2024-04-18 05:20:27,691 proxy 10.244.29.9 proxy.py:1357 - Starting HTTP server on node: e219648a2f442d57852464195933cd440000e47546ef89e83b32621a listening on port 8000
[36m(ProxyActor pid=593, ip=10.244.29.9)[0m INFO:     Started server process [593]
[33m(raylet, ip=10.244.30.9)[0m [2024-04-18 05:20:33,187 I 629 629] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1[32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[33m(raylet, ip=10.244.30.9)[0m [2024-04-18 05:20:42,339 I 766 766] loggin

[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m INFO 04-18 05:21:05 selector.py:16] Using FlashAttention backend.
[36m(RayWorkerVllm pid=797, ip=10.244.29.9)[0m INFO 04-18 05:21:06 pynccl_utils.py:45] vLLM is using nccl==2.18.1
[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m statefulset-1:519:571 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m statefulset-1:519:571 [0] NCCL INFO Bootstrap : Using eth0:10.244.29.9<0>
[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m statefulset-1:519:571 [0] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m statefulset-1:519:571 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
[36m(ServeReplica:ll



[36m(RayWorkerVllm pid=698, ip=10.244.30.9)[0m INFO 04-18 05:22:17 model_runner.py:104] Loading model weights took 3.0467 GB
[36m(RayWorkerVllm pid=766, ip=10.244.30.9)[0m INFO 04-18 05:21:05 selector.py:16] Using FlashAttention backend.[32m [repeated 7x across cluster][0m
[36m(RayWorkerVllm pid=766, ip=10.244.30.9)[0m INFO 04-18 05:21:06 pynccl_utils.py:45] vLLM is using nccl==2.18.1[32m [repeated 7x across cluster][0m
[36m(RayWorkerVllm pid=766, ip=10.244.30.9)[0m statefulset-0:766:850 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0[32m [repeated 15x across cluster][0m
[36m(RayWorkerVllm pid=766, ip=10.244.30.9)[0m statefulset-0:766:766 [3] NCCL INFO Bootstrap : Using eth0:10.244.30.9<0>[32m [repeated 7x across cluster][0m
[36m(RayWorkerVllm pid=766, ip=10.244.30.9)[0m statefulset-0:766:766 [3] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory[32m [repeated 7x acr



[36m(RayWorkerVllm pid=766, ip=10.244.30.9)[0m INFO 04-18 05:23:25 model_runner.py:104] Loading model weights took 3.0467 GB
[36m(RayWorkerVllm pid=629, ip=10.244.30.9)[0m INFO 04-18 05:23:25 model_runner.py:104] Loading model weights took 3.0467 GB
[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m INFO 04-18 05:23:38 model_runner.py:104] Loading model weights took 3.0467 GB
[36m(RayWorkerVllm pid=865, ip=10.244.29.9)[0m INFO 04-18 05:23:40 model_runner.py:104] Loading model weights took 3.0467 GB




[36m(RayWorkerVllm pid=797, ip=10.244.29.9)[0m INFO 04-18 05:24:50 model_runner.py:104] Loading model weights took 3.0467 GB[32m [repeated 2x across cluster][0m




[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m statefulset-1:519:1085 [0] NCCL INFO Using network Socket
[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m statefulset-1:519:1085 [0] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m statefulset-1:519:1085 [0] NCCL INFO P2P is disabled between connected GPUs 2 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m statefulset-1:519:1085 [0] NCCL INFO P2P is disabled between connected GPUs 3 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
[36m(ServeReplica:llm:SnowflakeVLLMDeployment pid=519, ip=10.244.29.9)[0m statefulset-1:519:1085 [0] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NC

2024-04-18 05:25:17,623	INFO router.py:959 -- Using router <class 'ray.serve._private.router.PowerOfTwoChoicesReplicaScheduler'>.
2024-04-18 05:25:17,633	DEBUG long_poll.py:156 -- LongPollClient <ray.serve._private.long_poll.LongPollClient object at 0x7f4184254a00> received updates for keys: [(LongPollNamespace.RUNNING_REPLICAS, DeploymentID(name='SnowflakeVLLMDeployment', app='llm'))].
2024-04-18 05:25:17,634	INFO router.py:496 -- Got updated replicas for deployment 'SnowflakeVLLMDeployment' in application 'llm': {'llm#SnowflakeVLLMDeployment#JMXtMC'}.


DeploymentHandle(deployment='SnowflakeVLLMDeployment')

2024-04-18 05:25:55,421	DEBUG long_poll.py:152 -- LongPollClient polling timed out. Retrying.
2024-04-18 05:26:53,873	DEBUG long_poll.py:152 -- LongPollClient polling timed out. Retrying.
2024-04-18 05:27:25,823	DEBUG long_poll.py:152 -- LongPollClient polling timed out. Retrying.
2024-04-18 05:28:09,373	DEBUG long_poll.py:152 -- LongPollClient polling timed out. Retrying.
2024-04-18 05:28:59,316	DEBUG long_poll.py:152 -- LongPollClient polling timed out. Retrying.
