# HF Accelerate

In [28]:
# Number of copies of the prompt that are batched together for generation.
BATCH_SIZE = 4

In [29]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

import time, gc, torch
from tqdm import tqdm

In [30]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "facebook/opt-6.7b"

# Options forwarded to `from_pretrained` when the model is loaded:
# request half-precision (fp16) weights.
kwargs = {"torch_dtype": torch.float16}

# Tokenizer configured to pad on the left side of each sequence.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

In [31]:
# Load the checkpoint with the prepared load-time options and place the
# resulting model on the first CUDA device in a single expression.
model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs).to("cuda:0")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
# Replicate one prompt BATCH_SIZE times so every row of the batch is identical.
inputs = ["In a galaxy far, far away"] * BATCH_SIZE

# Calling the tokenizer directly is the documented replacement for the
# deprecated `batch_encode_plus`. `BatchEncoding.to` then moves every tensor
# it holds onto the GPU at once, replacing the manual per-key loop.
input_tokens = tokenizer(inputs, return_tensors="pt", padding=True).to("cuda:0")

In [33]:
# Generation settings: sampling disabled, cache enabled, at most 100 new tokens.
generate_kwargs = {
    "max_new_tokens": 100,
    "use_cache": True,
    "do_sample": False,
}

# Reference run through the library's own `generate` entry point.
output_tokens = model.generate(**input_tokens, **generate_kwargs)

In [34]:
# Decode the generated ids back to text. No special-token filtering is
# requested, so markers such as '</s>' appear in the printed strings.
print(tokenizer.batch_decode(output_tokens))

['</s>In a galaxy far, far away, the Empire is still a thing.\n\nThe latest Star Wars: The Rise of Skywalker trailer is here, and it’s packed with new footage from the upcoming film.\n\nThe trailer opens with Rey (Daisy Ridley) and Finn (John Boyega) on a desert planet, where they’re being chased by a group of Stormtroopers.\n\n“We’re not going to make it,” Finn says.\n\n“We', '</s>In a galaxy far, far away, the Empire is still a thing.\n\nThe latest Star Wars: The Rise of Skywalker trailer is here, and it’s packed with new footage from the upcoming film.\n\nThe trailer opens with Rey (Daisy Ridley) and Finn (John Boyega) on a desert planet, where they’re being chased by a group of Stormtroopers.\n\n“We’re not going to make it,” Finn says.\n\n“We', '</s>In a galaxy far, far away, the Empire is still a thing.\n\nThe latest Star Wars: The Rise of Skywalker trailer is here, and it’s packed with new footage from the upcoming film.\n\nThe trailer opens with Rey (Daisy Ridley) and Finn (John

### Recreating the greedy decoding loop by hand

In [35]:
from tqdm import tqdm

In [36]:
# Sequence lengths at which the manual decoding loop snapshots its state.
# Full sweep, currently disabled: [10, 128, 256, 512, 1024]
shapes = [512]

# Snapshots keyed by sequence length; filled in by the manual decoding loop.
inputs_saver = dict()

In [37]:
# Hand-written greedy decoding loop that drives the model one token at a time
# and snapshots its state whenever the sequence length is listed in `shapes`.
#
# NOTE(review): per the transformers docs, `GenerationConfig.update` modifies
# the config in place and returns the kwargs it did not consume. This
# therefore changes `model.generation_config` itself -- confirm that is intended.
model_kwargs = model.generation_config.update(**input_tokens, **generate_kwargs)
# Private helper; `None` is passed as the explicit inputs, so it presumably
# pulls the input ids out of `model_kwargs` -- verify against the installed
# transformers version.
inputs_tensor, model_input_name, model_kwargs = model._prepare_model_inputs(
    None, model.generation_config.bos_token_id, model_kwargs
)
# Overwrite the attention mask with the one computed by the private helper
# from the input ids and the pad/eos token ids.
model_kwargs["attention_mask"] = model._prepare_attention_mask_for_generation(
    inputs_tensor, model.generation_config.pad_token_id, model.generation_config.eos_token_id
)
# Copy the output/cache switches from the generation config into the kwargs.
model_kwargs["output_attentions"] = model.generation_config.output_attentions
model_kwargs["output_hidden_states"] = model.generation_config.output_hidden_states
model_kwargs["use_cache"] = model.generation_config.use_cache

# Running sequence; grows by one column per loop iteration.
input_ids = inputs_tensor

model = model.eval()
with torch.no_grad():
    # Always runs 1024 steps: there is no end-of-sequence check or early exit.
    for i in tqdm(range(1024)):
        # Snapshot the state *before* this step's forward pass when the
        # current sequence length is one of the lengths to profile.
        if input_ids.shape[1] in shapes:
            inputs_saver[input_ids.shape[1]] = {
                # Shallow copy: the dict is new, the values it holds are shared.
                "model_kwargs": model_kwargs.copy(),
                "input_ids": input_ids,
            }
        
        model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
        
        outputs = model(
            **model_inputs, 
            return_dict=True,
            output_attentions=model.generation_config.output_attentions,
            output_hidden_states=model.generation_config.output_hidden_states)

        # Logits at the last sequence position only.
        next_token_logits = outputs.logits[:, -1, :]

        # Greedy choice: highest-scoring token id per batch row.
        next_tokens = torch.argmax(next_token_logits, dim=-1)
        # Add a trailing axis so the new ids can be appended as one more column.
        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
        # Private helper; presumably carries the new cache and an extended
        # attention mask into the next step -- verify against the installed
        # transformers version.
        model_kwargs = model._update_model_kwargs_for_generation(
            outputs, model_kwargs, is_encoder_decoder=False
        )

100%|█████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:28<00:00, 35.48it/s]


In [38]:
# Shape of element [0][0] of the cached `past_key_values` stored in the
# length-512 snapshot (presumably the first layer's key tensor -- confirm
# against the transformers version in use).
inputs_saver[512]["model_kwargs"]["past_key_values"][0][0].shape

torch.Size([4, 32, 511, 128])

## PROFILING

In [39]:
import time

### DECODE

In [40]:
# Time repeated single-step decode forward passes for each saved sequence
# length and report wall-clock time, throughput and per-call latency.
iterations = 100
for shape in shapes:
    input_ids = inputs_saver[shape]["input_ids"]
    model_kwargs = inputs_saver[shape]["model_kwargs"]
    model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)

    # Report the cache shape before the clock starts so console I/O is not
    # billed to the measurement.
    print(f'KV Cache Shape: {model_inputs["past_key_values"][0][0].shape}')

    with torch.no_grad():
        # CUDA kernels launch asynchronously: drain any work queued earlier so
        # the timed region contains only the forward passes below.
        torch.cuda.synchronize()
        start = time.perf_counter()

        # The same inputs are fed every iteration, so each pass is a decode
        # step against the same saved cache.
        for it in tqdm(range(iterations)):
            outputs = model(
                **model_inputs,
                return_dict=True,
                output_attentions=model.generation_config.output_attentions,
                output_hidden_states=model.generation_config.output_hidden_states)

        # Wait for the last kernels to finish before stopping the clock.
        torch.cuda.synchronize()
        end = time.perf_counter()

    elapsed = end - start
    print(f"Decode with input_ids.shape = {input_ids.shape}")
    batch = input_ids.shape[0]
    print(f"Time: {elapsed: .2f}")
    print(f"Iterations: {iterations}")
    # One new token per batch row per forward pass.
    print(f"Throughput (tokens/sec): {(iterations * batch) / elapsed : .2f}")
    print(f"Latency (sec/inference): {elapsed / iterations : .3f}")
    print("\n")

KV Cache Shape: torch.Size([4, 32, 511, 128])


100%|███████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 37.15it/s]

Decode with input_ids.shape = torch.Size([4, 512])
Time:  2.69
Iterations: 100
Throughput (tokens/sec):  148.48
Latency (sec/inference):  0.027





