# Running LLaMa from a notebook.

To get started, start up your Ray cluster if it has not already been created:
```
ray up -y cluster/dev.yaml
```

You can monitor the status of the Ray cluster as follows:
```
ray monitor cluster/dev.yaml
```

Once the cluster is up, you'll need to connect to the Ray cluster. To find the IP addresses of the head node, you can run:
```
./scripts/get_cluster_ip.sh
```

which will print something like this:
```
$ ./get_cluster_ip.sh 
External IP address of the head node: 35.186.77.141
Internal IP address of the head node: 10.130.0.107
```

If developing from a notebook, connect to the Ray cluster using the internal IP address of the head node, as follows:
```
import ray

ray.init("ray://10.130.0.107:10001")
```


In [24]:
import ray

# Best-effort disconnect from any session left over from an earlier run of
# this cell, so the ray.init() call below starts from a clean state.
try:
    ray.shutdown()
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate; report the error
    # instead of hiding it, but keep going.
    print("Ignoring error from ray.shutdown(): ", exc)

# Runtime environment handed to ray.init(); "working_dir" is the local
# ../serve directory (relative to where the notebook is running).
runtime_env = {
    "working_dir": "../serve",
}

# Internal IP address of the Ray head node.
IP = "10.130.0.41"  # Fill this out
ray.init(f"ray://{IP}:10001", runtime_env=runtime_env)

SIGTERM handler is not set because current thread is not the main thread.


0,1
Python version:,3.8.18
Ray version:,3.0.0.dev0
Dashboard:,http://10.130.0.41:8265


In [25]:
from typing import Iterable, List
# Base location of the model checkpoints; LlamaTpuActor appends the model
# name to it to form the checkpoint directory.
_CHECKPOINT_PATH = "gs://ray-llama-demo"

# Default value for the `max_batch_size` constructor arguments below.
_MAX_BATCH_SIZE = 1

# Model names accepted by LlamaServer.initialize().
_VALID_MODELS = ["llama-2-7b", "llama-2-13b", "llama-2-70b"]

@ray.remote(resources={"TPU": 4})
class LlamaTpuActor:
    """A LLaMA 2 worker actor that runs on a single TPU VM host.

    The actor requests 4 units of the ``TPU`` resource. Construction only
    records configuration; the model itself is built by `initialize()`.
    """

    def __init__(
        self,
        model_name: str,
        worker_id: int,
        tokenizer_path: str = "/tokenizer.model",
        max_batch_size: int = _MAX_BATCH_SIZE,
        max_seq_len: int = 2048,
        max_gen_len: int = 20,
        temperature: float = 0.6,
        top_p: float = 1.0,
        dynamo: bool = True,
    ):
        """Stores the generation settings; does not load the model.

        Args:
            model_name: Name of the model; also the checkpoint
                sub-directory under `_CHECKPOINT_PATH`.
            worker_id: Index of this actor among its sibling shards.
            tokenizer_path: Path to the tokenizer model file.
            max_batch_size: Passed to `Llama.build`.
            max_seq_len: Passed to `Llama.build`.
            max_gen_len: Passed to `text_completion` on every call.
            temperature: Passed to `text_completion` on every call.
            top_p: Passed to `text_completion` on every call.
            dynamo: Passed to `Llama.build`.
        """
        # Note - we intentionally separate the ML framework
        # initialization to another function that we can
        # `ray.get()`.

        # This is a best practice that will help us catch and
        # raise errors very quickly.
        import posixpath
        import socket

        print(f"Initializing model: {model_name}.")
        self._host_name = socket.gethostname()
        self._model_name = model_name
        self._tokenizer_path = tokenizer_path
        self._max_batch_size = max_batch_size
        self._max_seq_len = max_seq_len
        self._max_gen_len = max_gen_len
        self._temperature = temperature
        self._top_p = top_p
        self._dynamo = dynamo
        # The checkpoint location is a gs:// URI, not a local path, so join
        # with "/" regardless of the platform's path separator.
        self._ckpt_dir = posixpath.join(_CHECKPOINT_PATH, model_name)
        self._worker_id = worker_id

    def __repr__(self) -> str:
        """Returns the actor logger prefix."""
        return f"LLaMAActor{self._model_name}::{self._host_name}"

    def initialize(self):
        """Initializes the LLaMA generator.

        Builds the generator with `Llama.build` from the settings stored by
        the constructor and keeps it on `self.generator`.
        """
        # NOTE(review): torch / torch_xla / xr are not referenced below;
        # presumably imported for their side effects - confirm before
        # removing.
        import torch
        import torch_xla
        import torch_xla.runtime as xr
        from llama import Llama

        self.generator = Llama.build(
            ckpt_dir=self._ckpt_dir,
            tokenizer_path=self._tokenizer_path,
            max_seq_len=self._max_seq_len,
            max_batch_size=self._max_batch_size,
            dynamo=self._dynamo)

    def generate(self, inputs: Iterable[str]) -> List[str]:
        """Runs text completion on `inputs` and returns the results.

        `initialize()` must have been called first, since this method uses
        `self.generator`.

        Args:
            inputs: The prompts to complete.

        Returns:
            Whatever `self.generator.text_completion` returns for `inputs`.
        """
        print("Generating results for inputs: ", inputs)
        import torch

        # Inference only: run without gradient tracking.
        with torch.no_grad():
            results = self.generator.text_completion(
                inputs,
                max_gen_len=self._max_gen_len,
                temperature=self._temperature,
                top_p=self._top_p,
            )
            return results


In [26]:
@ray.remote
class LlamaServer:
    """Coordinator actor that owns one `LlamaTpuActor` shard per TPU pod host.

    Construction only records configuration; `initialize()` creates and
    initializes the shards, and `generate_batch()` sends prompts to all of
    them.
    """

    def __init__(
        self,
        model_name: str,
        tokenizer_path: str = "/tokenizer.model",
        max_batch_size: int = _MAX_BATCH_SIZE,
        max_seq_len: int = 2048,
        max_gen_len: int = 20,
        temperature: float = 0.6,
        top_p: float = 1.0,
        dynamo: bool = True,
    ):
        """Stores the settings that are later forwarded to every shard.

        Args:
            model_name: Model to serve; checked against `_VALID_MODELS` in
                `initialize()`.
            tokenizer_path: Forwarded to each `LlamaTpuActor`.
            max_batch_size: Forwarded to each `LlamaTpuActor`.
            max_seq_len: Forwarded to each `LlamaTpuActor`.
            max_gen_len: Forwarded to each `LlamaTpuActor`.
            temperature: Forwarded to each `LlamaTpuActor`.
            top_p: Forwarded to each `LlamaTpuActor`.
            dynamo: Forwarded to each `LlamaTpuActor`.
        """
        self._model_name = model_name
        self._tokenizer_path = tokenizer_path
        self._max_batch_size = max_batch_size
        self._max_seq_len = max_seq_len
        self._max_gen_len = max_gen_len
        self._temperature = temperature
        self._top_p = top_p
        self._dynamo = dynamo

    def initialize(self):
        """Creates the shard actors, initializes them and runs a warmup.

        Raises:
            AssertionError: If the model name is not in `_VALID_MODELS`, or
                if the available amount of the pod resource differs from
                the pod's worker count.
            Exception: Any error raised while creating, initializing or
                warming up the shards is printed and re-raised.
        """
        assert self._model_name in _VALID_MODELS, (
            f"Unknown model {self._model_name!r}; "
            f"expected one of {_VALID_MODELS}")
        tpu_pod_name = ray.util.accelerators.tpu.get_current_pod_name()
        num_tpu_pod_hosts = ray.util.accelerators.tpu.get_current_pod_worker_count()
        # Every host of the pod must be free before the shards are placed.
        available_hosts = ray.available_resources()[tpu_pod_name]
        assert available_hosts == num_tpu_pod_hosts, (
            f"Expected {num_tpu_pod_hosts} available {tpu_pod_name!r} "
            f"resources, found {available_hosts}")
        # Each shard takes one unit of the pod-specific resource plus 4 TPU.
        actor_def = LlamaTpuActor.options(resources={tpu_pod_name: 1, "TPU": 4})
        print("Creating TPU VM shards.")
        try:
            self._shards = [actor_def.remote(
                model_name=self._model_name,
                worker_id=i,
                tokenizer_path=self._tokenizer_path,
                max_batch_size=self._max_batch_size,
                max_seq_len=self._max_seq_len,
                max_gen_len=self._max_gen_len,
                temperature=self._temperature,
                top_p=self._top_p,
                dynamo=self._dynamo,
            ) for i in range(num_tpu_pod_hosts)]
            print("Created shards")
            print("Initializing shards")
            # Block until every shard has built its generator so that any
            # failure surfaces here rather than on the first request.
            ray.get([s.initialize.remote() for s in self._shards])
            # warmup
            self.generate_batch(["I believe the meaning of life is ..."])
        except Exception as e:
            print("Caught error ", e)
            # Bare raise keeps the original traceback intact.
            raise

    def generate_batch(self, prompts: Iterable[str]) -> List[str]:
        """Sends `prompts` to every shard and waits for all results.

        Args:
            prompts: The prompts to complete; the same prompts are sent to
                each shard.

        Returns:
            A list with one entry per shard, each being what that shard's
            `generate` call returned.

        Raises:
            Exception: Any failure from the shards is printed and re-raised
                instead of being swallowed, so callers (including the
                warmup in `initialize()`) never silently receive `None`.
        """
        print("Preprocessing prompts: ", prompts)
        try:
            return ray.get([
                s.generate.remote(prompts) for s in self._shards])
        except Exception as e:
            print("Failed with ", e)
            raise

    def __repr__(self):
        """Returns the actor logger prefix."""
        return f"[{self._model_name}-shard]: "


In [27]:
# Create the coordinating LlamaServer actor, requesting one unit of the
# "TPU-v4-8-head" resource for it.
server_cls = LlamaServer.options(resources={"TPU-v4-8-head": 1})
server = server_cls.remote(model_name="llama-2-7b")

# Start initialization and block until it has finished (or failed).
init_ref = server.initialize.remote()
ray.get(init_ref)

[36m([llama-2-7b-shard]:  pid=1241, ip=10.130.0.38)[0m Creating TPU VM shards.


RayTaskError(AttributeError): [36mray::LlamaServer.initialize()[39m (pid=1241, ip=10.130.0.38, actor_id=bde9b522e08b1abb3b7c52c30a000000, repr=[llama-2-7b-shard]: )
  File "/tmp/ipykernel_262472/2958565436.py", line 29, in initialize
  File "/tmp/ipykernel_262472/2958565436.py", line 29, in <listcomp>
  File "/usr/local/lib/python3.8/site-packages/ray/actor.py", line 687, in remote
    return actor_cls._remote(args=args, kwargs=kwargs, **updated_options)
  File "/usr/local/lib/python3.8/site-packages/ray/actor.py", line 1044, in _remote
    meta.method_meta.generator_backpressure_num_objects,
AttributeError: '_ActorClassMethodMetadata' object has no attribute 'generator_backpressure_num_objects'

[36m(LlamaTpuActor pid=1355, ip=10.130.0.38)[0m Using model: llama-2-7b.


In [15]:
# Disconnect this notebook from the Ray cluster.
ray.shutdown()