In [1]:
import torch
from text_generation_server.models.flash_llama import FlashLlama

model_id = "meta-llama/Llama-2-7b-hf"
model = FlashLlama(model_id=model_id, dtype=torch.bfloat16,)



You are using a model of type llama to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


In [2]:
!nvidia-smi

Mon Nov 20 20:40:54 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A10G                     Off| 00000000:00:1E.0 Off |                    0 |
|  0%   26C    P0               62W / 300W|  14026MiB / 23028MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
from text_generation_server.models.flash_causal_lm import FlashCausalLMBatch
from text_generation_server.pb import generate_pb2

max_input_length = 1024
max_batch_size = 10
max_prefill_tokens = max_input_length * max_batch_size - 32

warmup_requests = []
n_tokens = 0
while n_tokens < max_prefill_tokens:
    warmup_requests.append(
        generate_pb2.Request(
            id=0,
            inputs="_text" * max_input_length,
            truncate=min(max_input_length, max_prefill_tokens - n_tokens),
            parameters=generate_pb2.NextTokenChooserParameters(
                do_sample=False
            ),
            stopping_parameters=generate_pb2.StoppingCriteriaParameters(
                max_new_tokens=2
            )
        ),
    )
    
    n_tokens += max_input_length

warmup_batch = generate_pb2.Batch(id=0, requests=warmup_requests, size=len(warmup_requests))

fclm_warmup_batch = FlashCausalLMBatch.from_pb(
    pb=warmup_batch,
    tokenizer=model.tokenizer,
    dtype=model.dtype,
    device=model.device,
)

max_supported_total_tokens = model.warmup(batch=fclm_warmup_batch)

In [4]:
max_input_length = 256

BATCH_SIZE = 1
MAX_NEW_TOKENS = 100

def make_clm_batch(batch_size=1, max_new_tokens=100):
    parameters = generate_pb2.NextTokenChooserParameters(
        watermark=False,
        temperature=1.0,
        repetition_penalty=1.0,
        top_k=0,
        top_p=1.0,
        typical_p=1.0,
        do_sample=False
    )

    stopping_parameters = generate_pb2.StoppingCriteriaParameters(
        max_new_tokens=MAX_NEW_TOKENS,
        ignore_eos_token=True
    )

    input_lst = [
        "In a galaxy far, far away"
    ]

    requests = [
        generate_pb2.Request(
            id=idx,
            inputs=inputs,
            truncate=max_input_length,
            parameters=parameters,    
            stopping_parameters=stopping_parameters
        )
        for idx, inputs in enumerate(input_lst * batch_size)
    ]

    return FlashCausalLMBatch.from_pb(
        pb=generate_pb2.Batch(id=0, requests=requests),
        tokenizer=model.tokenizer,
        dtype=model.dtype,
        device=model.device,
    )

fclm_batch = make_clm_batch()

texts = {
    idx: request.inputs
    for idx, request in enumerate(fclm_batch.requests)
}

for _ in range(MAX_NEW_TOKENS):
    generations, fclm_batch = model.generate_token(fclm_batch)
    for idx, gen in enumerate(generations):
        texts[idx] += gen.token_text

print(texts[0])

In a galaxy far, far away, a long time ago, a young boy named Luke Skywalker was born. Luke was a dreamer, and he dreamed of becoming a Jedi Knight. But Luke’s dreams were put on hold when his home planet of Tatooine was attacked by the evil Empire. Luke’s family was killed, and he was forced to flee to the planet of Dagobah, where he would be trained by the wise Jedi Master Yoda.
Luke


In [6]:
def make_prefill_clm_batch(batch_size=1, prefill_tokens=100):
    parameters = generate_pb2.NextTokenChooserParameters(
        watermark=False,
        temperature=1.0,
        repetition_penalty=1.0,
        top_k=0,
        top_p=1.0,
        typical_p=1.0,
        do_sample=False
    )

    stopping_parameters = generate_pb2.StoppingCriteriaParameters(
        max_new_tokens=1,
        ignore_eos_token=False,
    )

    input_lst = [
        "Hello my name is " * prefill_tokens
    ]

    requests = [
        generate_pb2.Request(
            id=idx,
            inputs=inputs,
            truncate=prefill_tokens,
            parameters=parameters,    
            stopping_parameters=stopping_parameters
        )
        for idx, inputs in enumerate(input_lst * batch_size)
    ]

    return FlashCausalLMBatch.from_pb(
        pb=generate_pb2.Batch(id=0, requests=requests),
        tokenizer=model.tokenizer,
        dtype=model.dtype,
        device=model.device,
    )

In [19]:
fclm_batch = make_prefill_clm_batch(batch_size=16, prefill_tokens=512)

texts = {
    idx: request.inputs
    for idx, request in enumerate(fclm_batch.requests)
}
texts[0]

'Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name

In [20]:
fclm_batch.input_lengths_tensor

tensor([512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
        512, 512], device='cuda:0', dtype=torch.int32)

In [23]:
for _ in range(1):
    generations, fclm_batch = model.generate_token(fclm_batch)
    for idx, gen in enumerate(generations):
        texts[idx] += gen.token_text

AttributeError: 'NoneType' object has no attribute 'cu_seqlen_prefill'

In [22]:
texts

{0: 'Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my name is Hello my 