# HF Accelerate

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

import time, gc, torch
from tqdm import tqdm

In [2]:
# Checkpoint benchmarked throughout this notebook.
model_name = "facebook/opt-13b"

# Tokenizer for that checkpoint, configured to pad on the left of each sequence.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

# Keyword arguments forwarded to the model loader:
# a device placement strategy and half-precision weights.
kwargs = {
    "device_map": "balanced_low_0",
    "torch_dtype": torch.float16,
}

In [3]:
# Load the checkpoint using the loader options prepared in `kwargs`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    **kwargs,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def _synchronize_all_cuda_devices():
    """Block until queued work on every visible CUDA device has finished."""
    for device_index in range(torch.cuda.device_count()):
        torch.cuda.synchronize(device_index)


def test_performance(model, iterations=3, batch_size=1, max_new_tokens=100, use_cache=True):
    """Time greedy generation with ``model`` and print a throughput summary.

    One fixed prompt is repeated ``batch_size`` times, encoded with the
    notebook-level ``tokenizer``, moved to ``cuda:0`` and passed to
    ``model.generate`` ``iterations`` times under ``torch.no_grad()``.

    Args:
        model: Object providing ``eval()`` and ``generate(**inputs, **kwargs)``.
        iterations: Number of timed ``generate`` calls.
        batch_size: Number of copies of the prompt per batch; must be >= 1.
        max_new_tokens: Forwarded to ``generate``.
        use_cache: Forwarded to ``generate``.

    Returns:
        None. The results are printed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        "Total Tokens" is the element count of the tensors returned by
        ``generate`` (rows * columns, summed over iterations). The recorded
        runs in this notebook (109 tokens for ``max_new_tokens=100`` at batch
        size 1) show that this count includes the prompt tokens, so
        "Tokens / Sec" is based on prompt + generated tokens.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model = model.eval()

    print("Setting up inputs:")

    # setup inputs
    inputs = ["In the far far distance from our galaxy"] * batch_size
    # Calling the tokenizer directly replaces the deprecated batch_encode_plus();
    # BatchEncoding.to() moves the tensors it holds to the target device.
    input_tokens = tokenizer(inputs, return_tensors="pt", padding=True).to("cuda:0")

    # run loop
    print("Running loop:")

    tokens_generated = 0
    generate_kwargs = dict(max_new_tokens=max_new_tokens, use_cache=use_cache, do_sample=False)

    # Drain pending work on every GPU so the timed region starts from idle devices.
    _synchronize_all_cuda_devices()
    t0 = time.perf_counter()

    with torch.no_grad():
        for _ in tqdm(range(iterations)):
            output_tokens = model.generate(**input_tokens, **generate_kwargs)
            tokens_generated += output_tokens.shape[0] * output_tokens.shape[1]

    # A bare torch.cuda.synchronize() waits on the current device only; the model
    # may be spread over several GPUs, so wait on all of them before stopping the clock.
    _synchronize_all_cuda_devices()
    t1 = time.perf_counter()

    elapsed = t1 - t0
    # Guard the division in the degenerate case of a zero-length measurement.
    tokens_per_second = tokens_generated / elapsed if elapsed > 0 else 0.0

    print(f"Results with use_cache = {use_cache}")
    print(f"With max_new_tokens = {max_new_tokens}")
    print(f"With batch = {batch_size}")
    print(f"Total time = {round(elapsed,2)}")
    print(f"Total Tokens = {tokens_generated}")
    print(f"Tokens / Sec = {round(tokens_per_second,2)}")

### `use_cache=True`

#### `b=1`

In [6]:
# Collect unreachable Python objects first so any CUDA blocks they held become
# unused in PyTorch's caching allocator; empty_cache() can then release them.
collected = gc.collect()
torch.cuda.empty_cache()
collected  # number of unreachable objects found, shown as the cell output

21

In [7]:
# Benchmark settings for this run: a single prompt, one timed iteration, KV cache on.
BATCH_SIZE, ITERATIONS = 1, 1
MAX_NEW_TOKENS, USE_CACHE = 100, True

test_performance(
    model,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    use_cache=USE_CACHE,
)

Setting up inputs:
Running loop:


100%|███████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.44s/it]

Results with use_cache = True
With max_new_tokens = 100
With batch = 1
Total time = 20.45
Total Tokens = 109
Tokens / Sec = 5.33





#### `b=4`

In [7]:
# Collect unreachable Python objects first so any CUDA blocks they held become
# unused in PyTorch's caching allocator; empty_cache() can then release them.
collected = gc.collect()
torch.cuda.empty_cache()
collected  # number of unreachable objects found, shown as the cell output

0

In [8]:
# Benchmark settings for this run: batch of 4 prompts, one timed iteration, KV cache on.
BATCH_SIZE, ITERATIONS = 4, 1
MAX_NEW_TOKENS, USE_CACHE = 100, True

test_performance(
    model,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    use_cache=USE_CACHE,
)

Setting up inputs:
Running loop:


100%|███████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:25<00:00, 25.24s/it]

Results with use_cache = True
With max_new_tokens = 100
With batch = 4
Total time = 25.24
Total Tokens = 436
Tokens / Sec = 17.28





#### `b=8`

In [5]:
# Collect unreachable Python objects first so any CUDA blocks they held become
# unused in PyTorch's caching allocator; empty_cache() can then release them.
collected = gc.collect()
torch.cuda.empty_cache()
collected  # number of unreachable objects found, shown as the cell output

21

In [6]:
# Benchmark settings for this run: batch of 8 prompts, one timed iteration, KV cache on.
BATCH_SIZE, ITERATIONS = 8, 1
MAX_NEW_TOKENS, USE_CACHE = 100, True

test_performance(
    model,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    use_cache=USE_CACHE,
)

Setting up inputs:
Running loop:


100%|███████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:35<00:00, 35.27s/it]

Results with use_cache = True
With max_new_tokens = 100
With batch = 8
Total time = 35.27
Total Tokens = 872
Tokens / Sec = 24.72





#### `b=16`

In [8]:
# Collect unreachable Python objects first so any CUDA blocks they held become
# unused in PyTorch's caching allocator; empty_cache() can then release them.
collected = gc.collect()
torch.cuda.empty_cache()
collected  # number of unreachable objects found, shown as the cell output

0

In [9]:
# Benchmark settings for this run: batch of 16 prompts, one timed iteration, KV cache on.
BATCH_SIZE, ITERATIONS = 16, 1
MAX_NEW_TOKENS, USE_CACHE = 100, True

test_performance(
    model,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    use_cache=USE_CACHE,
)

Setting up inputs:
Running loop:


100%|███████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:44<00:00, 44.99s/it]

Results with use_cache = True
With max_new_tokens = 100
With batch = 16
Total time = 44.99
Total Tokens = 1744
Tokens / Sec = 38.76





#### `b=32`

In [10]:
# Collect unreachable Python objects first so any CUDA blocks they held become
# unused in PyTorch's caching allocator; empty_cache() can then release them.
collected = gc.collect()
torch.cuda.empty_cache()
collected  # number of unreachable objects found, shown as the cell output

0

In [11]:
# Benchmark settings for this run: batch of 32 prompts, one timed iteration, KV cache on.
BATCH_SIZE, ITERATIONS = 32, 1
MAX_NEW_TOKENS, USE_CACHE = 100, True

test_performance(
    model,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    use_cache=USE_CACHE,
)

Setting up inputs:
Running loop:


100%|███████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:16<00:00, 76.82s/it]

Results with use_cache = True
With max_new_tokens = 100
With batch = 32
Total time = 76.82
Total Tokens = 3488
Tokens / Sec = 45.4





#### `b=64`

In [12]:
# Collect unreachable Python objects first so any CUDA blocks they held become
# unused in PyTorch's caching allocator; empty_cache() can then release them.
collected = gc.collect()
torch.cuda.empty_cache()
collected  # number of unreachable objects found, shown as the cell output

0

In [13]:
# Benchmark settings for this run: batch of 64 prompts, one timed iteration, KV cache on.
# NOTE: the recorded run of this cell ended in a CUDA OutOfMemoryError on GPU 1.
BATCH_SIZE, ITERATIONS = 64, 1
MAX_NEW_TOKENS, USE_CACHE = 100, True

test_performance(
    model,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    use_cache=USE_CACHE,
)

Setting up inputs:
Running loop:


  0%|                                                                                               | 0/1 [01:23<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 50.00 MiB (GPU 1; 14.61 GiB total capacity; 13.71 GiB already allocated; 17.12 MiB free; 13.91 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [14]:
# Collect unreachable Python objects first so any CUDA blocks they held become
# unused in PyTorch's caching allocator; empty_cache() can then release them.
collected = gc.collect()
torch.cuda.empty_cache()
collected  # number of unreachable objects found, shown as the cell output

880

#### `b=128`

In [None]:
# Collect unreachable Python objects first so any CUDA blocks they held become
# unused in PyTorch's caching allocator; empty_cache() can then release them.
collected = gc.collect()
torch.cuda.empty_cache()
collected  # number of unreachable objects found, shown as the cell output

In [None]:
# Benchmark settings for this run: batch of 128 prompts, one timed iteration, KV cache on.
BATCH_SIZE, ITERATIONS = 128, 1
MAX_NEW_TOKENS, USE_CACHE = 100, True

test_performance(
    model,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    use_cache=USE_CACHE,
)

### `use_cache=False`

In [None]:
# Benchmark settings for this run: batch of 16 prompts, one timed iteration, KV cache off.
BATCH_SIZE, ITERATIONS = 16, 1
MAX_NEW_TOKENS, USE_CACHE = 100, False

test_performance(
    model,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    use_cache=USE_CACHE,
)

In [None]:
# Benchmark settings for this run: a single prompt, three timed iterations,
# 256 new tokens per call, KV cache off.
BATCH_SIZE, ITERATIONS = 1, 3
MAX_NEW_TOKENS, USE_CACHE = 256, False

test_performance(
    model,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    use_cache=USE_CACHE,
)