In [8]:
!pip install "ray==2.3.0"
!pip install torch --no-cache-dir
!pip install "ray[serve]" requests diffusers transformers fastapi

Collecting torch
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m119.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m161.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m273.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101 (from torch)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/

In [1]:
import ray

# Packages that the Ray runtime environment installs with pip for this job.
worker_pip_packages = [
    "IPython",
    "boto3==1.26",
    "botocore==1.29",
    "datasets",
    "diffusers",
    "fastapi",
    "accelerate>=0.16.0",
    "transformers>=4.26.0",
    "numpy<1.24",  # remove when mlflow updates beyond 2.2
    "torch",
]

# Connect to the remote cluster through its `ray://` address; the call is kept
# as the cell's last expression so the connection summary is still displayed.
ray.init(
    address="ray://example-cluster-kuberay-head-svc:10001",
    runtime_env={"pip": worker_pip_packages},
)

0,1
Python version:,3.10.9
Ray version:,2.3.0
Dashboard:,http://10.0.3.10:8265


In [9]:
from io import BytesIO

import torch
from fastapi import FastAPI
from fastapi import HTTPException
from fastapi.responses import Response
from ray import serve


app = FastAPI()


@serve.deployment(num_replicas=1, route_prefix="/")
@serve.ingress(app)
class APIIngress:
    """HTTP ingress that forwards image requests to the diffusion deployment.

    Exposes ``GET /imagine`` on the FastAPI ``app`` and returns the generated
    image encoded as PNG.
    """

    def __init__(self, diffusion_model_handle) -> None:
        """Store the handle used to call the model deployment.

        Args:
            diffusion_model_handle: Handle whose ``generate`` method is invoked
                remotely for every request.
        """
        self.handle = diffusion_model_handle

    @app.get(
        "/imagine",
        responses={200: {"content": {"image/png": {}}}},
        response_class=Response,
    )
    async def generate(self, prompt: str, img_size: int = 512):
        """Generate an image for ``prompt`` and return it as a PNG response.

        Args:
            prompt: Text description of the image; must not be empty.
            img_size: Height and width of the square image in pixels; must be
                a positive multiple of 8.

        Returns:
            A ``Response`` with media type ``image/png`` holding the image bytes.

        Raises:
            HTTPException: Status 400 when ``prompt`` is empty or ``img_size``
                is not a positive multiple of 8.
        """
        # Reject bad input with an explicit 400 instead of `assert`, which is
        # stripped under `python -O` and otherwise surfaces as a 500 error.
        if not prompt:
            raise HTTPException(
                status_code=400, detail="prompt parameter cannot be empty"
            )
        # The diffusion pipeline rejects sizes not divisible by 8; fail fast
        # here rather than after dispatching work to the GPU replica.
        if img_size <= 0 or img_size % 8 != 0:
            raise HTTPException(
                status_code=400,
                detail="img_size parameter must be a positive multiple of 8",
            )

        # The first await yields a reference to the pending result; the second
        # await resolves that reference to the generated image.
        image_ref = await self.handle.generate.remote(prompt, img_size=img_size)
        image = await image_ref

        # Serialise in memory; the context manager releases the buffer afterwards.
        # NOTE(review): assumes the returned image offers a PIL-style
        # `save(stream, format)` method — confirm against the model deployment.
        with BytesIO() as png_buffer:
            image.save(png_buffer, "PNG")
            png_bytes = png_buffer.getvalue()
        return Response(content=png_bytes, media_type="image/png")


@serve.deployment(
    ray_actor_options={"num_gpus": 1},
    autoscaling_config={"min_replicas": 0, "max_replicas": 2},
)
class StableDiffusionV2:
    """Serve deployment wrapping a Stable Diffusion 2 text-to-image pipeline.

    Each replica requests one GPU and the deployment autoscales between
    zero and two replicas.
    """

    def __init__(self):
        """Load the fp16 pipeline with an Euler scheduler and move it to CUDA."""
        # Imported inside the constructor, so diffusers is only needed where
        # a replica is actually constructed.
        from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline

        repo = "stabilityai/stable-diffusion-2"

        euler = EulerDiscreteScheduler.from_pretrained(repo, subfolder="scheduler")
        pipeline = StableDiffusionPipeline.from_pretrained(
            repo,
            scheduler=euler,
            revision="fp16",
            torch_dtype=torch.float16,
        )
        self.pipe = pipeline.to("cuda")

    def generate(self, prompt: str, img_size: int = 512):
        """Run the pipeline for ``prompt`` and return the first generated image.

        Args:
            prompt: Non-empty text prompt.
            img_size: Used for both the height and the width of the image.

        Returns:
            The first entry of the pipeline output's ``images``.
        """
        assert len(prompt), "prompt parameter cannot be empty"

        result = self.pipe(prompt, height=img_size, width=img_size)
        return result.images[0]

In [10]:
# Build the deployment graph step by step: the model node is passed to the
# ingress constructor as its `diffusion_model_handle` argument.
diffusion_node = StableDiffusionV2.bind()
deployment = APIIngress.bind(diffusion_node)

# Start serving on all interfaces; left as the last expression so the
# returned handle is displayed as the cell output.
serve.run(deployment, host="0.0.0.0")

[2m[36m(ServeController pid=1186)[0m INFO 2023-05-12 14:07:21,393 controller 1186 http_state.py:129 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:SERVE_PROXY_ACTOR-b69bac8700d9e7100149ddcb3da1c17804fdbe5d26caa6b6cb112e2e' on node 'b69bac8700d9e7100149ddcb3da1c17804fdbe5d26caa6b6cb112e2e' listening on '0.0.0.0:8000'
[2m[36m(HTTPProxyActor pid=1255)[0m INFO:     Started server process [1255]
[2m[36m(ServeController pid=1186)[0m INFO 2023-05-12 14:07:23,132 controller 1186 deployment_state.py:1333 - Adding 1 replica to deployment 'APIIngress'.


RayServeSyncHandle(deployment='APIIngress')

In [11]:
import requests

prompt = "a cute cat is dancing on the grass."

# Let requests build and percent-encode the query string; replacing only
# spaces by hand would break on characters such as '&', '#', '+' or '?'.
resp = requests.get(
    "http://example-cluster-kuberay-head-svc:8000/imagine",
    params={"prompt": prompt},
)
# Fail loudly on an HTTP error instead of writing the error body to output.png.
resp.raise_for_status()

with open("output.png", "wb") as f:
    f.write(resp.content)

[2m[36m(ServeController pid=1186)[0m INFO 2023-05-12 14:07:50,304 controller 1186 deployment_state.py:1333 - Adding 1 replica to deployment 'StableDiffusionV2'.
Downloading (…)cheduler_config.json: 100%|██████████| 345/345 [00:00<00:00, 2.28MB/s]
Downloading (…)p16/model_index.json: 100%|██████████| 511/511 [00:00<00:00, 3.11MB/s]
Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]
[2m[36m(ServeReplica:StableDiffusionV2 pid=1538)[0m 
Downloading (…)5f1/unet/config.json: 100%|██████████| 900/900 [00:00<00:00, 4.77MB/s]
[2m[36m(ServeReplica:StableDiffusionV2 pid=1538)[0m 
Downloading (…)cial_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s][A
Downloading (…)cial_tokens_map.json: 100%|██████████| 460/460 [00:00<00:00, 4.25MB/s]
[2m[36m(ServeReplica:StableDiffusionV2 pid=1538)[0m 
Downloading (…)_encoder/config.json: 100%|██████████| 624/624 [00:00<00:00, 5.80MB/s]
Fetching 11 files:  18%|█▊        | 2/11 [00:00<00:00, 13.28it/s]
Downloading (…)okenizer_config.j

In [None]:
from ray import serve

# Shut down Ray Serve once finished so the deployments started by this notebook stop running.
serve.shutdown()