## Accelerating 2:4 sparse models with Hugging Face, torch.compile, and semi-structured sparsity



In [1]:
import os
import torch
from torch.sparse import to_sparse_semi_structured

from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

def timed(fn):
    """Run ``fn()`` once and time it with a pair of CUDA events.

    Args:
        fn: Zero-argument callable to execute.

    Returns:
        A ``(result, seconds)`` tuple: whatever ``fn`` returned, and the time
        measured between the two recorded events, converted from
        milliseconds to seconds.
    """
    tic, toc = (torch.cuda.Event(enable_timing=True) for _ in range(2))

    tic.record()
    value = fn()
    toc.record()

    # Wait for outstanding GPU work so both events have completed before
    # the elapsed time is read.
    torch.cuda.synchronize()

    elapsed_ms = tic.elapsed_time(toc)
    return value, elapsed_ms / 1000


def benchmark(model, WARMUP=5, N=25, inputs=None):
    """Measure ``model.generate`` throughput and print a one-line summary.

    Args:
        model: Object exposing ``generate(**inputs)``.
        WARMUP: Number of calls issued first whose timings are discarded.
        N: Number of timed calls that make up the reported total.
        inputs: Mapping of keyword arguments forwarded to ``model.generate``;
            must contain ``"input_ids"``. When omitted, the notebook-level
            ``inputs`` variable is used, so ``benchmark(model)`` keeps working.

    Raises:
        NameError: If ``inputs`` is not passed and no notebook-level
            ``inputs`` exists.

    Prints the total time of the ``N`` timed calls and the tokens/second,
    where tokens are counted from what ``generate`` actually returned rather
    than from a hard-coded per-call constant.
    """
    if inputs is None:
        # Backward-compatible fallback: the original implementation read the
        # global `inputs` implicitly.
        try:
            inputs = globals()["inputs"]
        except KeyError:
            raise NameError(
                "benchmark() needs `inputs`: pass it explicitly or define a "
                "notebook-level `inputs` first"
            ) from None

    prompt_tokens = inputs["input_ids"].shape[-1]

    def generate_once():
        return model.generate(**inputs)

    time_per_batch = []
    generated_tokens = 0
    # A single no_grad() scope covers both warmup and the timed loop.
    with torch.no_grad():
        # warmup steps
        for _ in range(WARMUP):
            timed(generate_once)

        # benchmark
        for _ in tqdm(range(N)):
            generated, time_sec = timed(generate_once)
            time_per_batch.append(time_sec)

            # `generate` may return a plain tensor or an output object with
            # a `.sequences` field; handle both.
            sequences = getattr(generated, "sequences", generated)
            # assumes a decoder-only model whose output repeats the prompt
            # tokens before the newly generated ones — TODO confirm for
            # other architectures
            generated_tokens += sequences.shape[0] * (sequences.shape[-1] - prompt_tokens)

    total_time = sum(time_per_batch)
    # Guard against N == 0 (no timed calls), which would divide by zero.
    tokens_per_second = generated_tokens / total_time if total_time > 0 else float("nan")
    print(f"Total time: {total_time:.3f}s | Tokens/second: {tokens_per_second:.3f}")

In [3]:
torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False
torch.set_float32_matmul_precision('high')

os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    "nm-testing/SparseLlama-3-8B-pruned_50.2of4",
    torch_dtype=torch.float16,
).cuda()
tokenizer = AutoTokenizer.from_pretrained("nm-testing/SparseLlama-3-8B-pruned_50.2of4")

# Convert selected linear weights to the semi-structured sparse format.
for name, mod in model.named_modules():
    # Only nn.Linear modules are inspected or converted.
    if not isinstance(mod, torch.nn.Linear):
        continue

    # Print out the linear layers whose name contains "10", just FYI.
    if "10" in name:
        print(name, mod.weight.shape)

    # These two MLP projections will show speedups.
    if any(tag in name for tag in ("mlp.gate", "mlp.up")):
        mod.weight = torch.nn.Parameter(to_sparse_semi_structured(mod.weight))

# Specify the max length (including both the prompt and the response).
# When calling `generate` with `cache_implementation="static"` later, this is
# also used to create a `StaticCache` object with sequence length = `max_length`.
# The longer it is, the more you will re-use it.
model.generation_config.max_length = 128
model.generation_config.pad_token_id = tokenizer.eos_token_id
model.generation_config.cache_implementation = "static"


prompt = "Why dogs are so cute?"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



model.layers.10.self_attn.q_proj torch.Size([4096, 4096])
model.layers.10.self_attn.k_proj torch.Size([1024, 4096])
model.layers.10.self_attn.v_proj torch.Size([1024, 4096])
model.layers.10.self_attn.o_proj torch.Size([4096, 4096])
model.layers.10.mlp.gate_proj torch.Size([14336, 4096])
model.layers.10.mlp.up_proj torch.Size([14336, 4096])
model.layers.10.mlp.down_proj torch.Size([4096, 14336])


In [5]:

# Eager reference run, without `torch.compile`: each call takes ~ 5.0 seconds
# (on A100 80G + torch 2.3).
# Total time: 168.715s | Tokens/second: 17.930
# Generate once, decode the first sequence in the batch, and print it.
outputs = model.generate(**inputs)
response = tokenizer.batch_decode(outputs)[0]
print(response)

# `torch.compile(model, ...)` is not recommended, as you would compile the
# callbacks and the full `generate`. We recommend compiling only the forward for now.
# "reduce-overhead" will use cudagraphs.
# NOTE(review): setting the warn limit to None presumably silences the
# cudagraph dynamic-shape warning — confirm against torch._inductor.config.
torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit = None

# Replace `model.forward` with its compiled version; nothing else is compiled here.
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

<|begin_of_text|>Why dogs are so cute? Why do we love them so much? Why do we want to hug them and kiss them and play with them? Why do we want to take care of them and make sure they are happy and healthy? The answer is simple: because they are cute! But what makes a dog cute? Is it their big eyes, their floppy ears, their wagging tail, or something else? In this article, we will explore the science behind why dogs are so cute and why we love them so much.

First, let's talk about the big eyes. Dogs have large eyes that are set far apart on their


In [6]:
# Time generation for the current `model`: semi-structured sparse
# gate/up projection weights plus a compiled forward.
benchmark(model)

100%|██████████████████████████| 25/25 [00:34<00:00,  1.39s/it]

Total time: 34.736s | Tokens/second: 87.086





In [7]:
# Sanity check: we should get the same output as the non-compiled model.
# This is a visual comparison with the text printed earlier; nothing is asserted.
outputs = model.generate(**inputs)
response = tokenizer.batch_decode(outputs)[0]
print(response)

<|begin_of_text|>Why dogs are so cute? Why do we love them so much? Why do we want to hug them and kiss them and play with them? Why do we want to take care of them and make sure they are happy and healthy? The answer is simple: because they are cute! But what makes a dog cute? Is it their big eyes, their floppy ears, their wagging tail, or something else? In this article, we will explore the science behind why dogs are so cute and why we love them so much.

First, let's talk about the big eyes. Dogs have large eyes that are set far apart on their


## Run torch.compile baseline

In [8]:
# Drop the notebook's reference to the previous model before loading a fresh copy.
del model
# Reload the same checkpoint. No to_sparse_semi_structured conversion is applied
# in this cell, so the Linear weights stay as loaded.
model = AutoModelForCausalLM.from_pretrained("nm-testing/SparseLlama-3-8B-pruned_50.2of4", torch_dtype=torch.float16).cuda()

# Set the generation configs again: they live on the model object, which was just
# re-created. `tokenizer` comes from the earlier loading cell.
model.generation_config.max_length = 128
model.generation_config.pad_token_id = tokenizer.eos_token_id
model.generation_config.cache_implementation = "static"

# Compile only the forward, with the same settings as the sparse run, then benchmark.
# `benchmark` reads the notebook-level `inputs` created in the earlier loading cell.
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
benchmark(model)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████████████████████| 25/25 [00:37<00:00,  1.52s/it]

Total time: 37.925s | Tokens/second: 79.764





In [9]:
# Generate with the compiled baseline model and print the decoded text so it can
# be compared by eye with the earlier outputs.
outputs = model.generate(**inputs)
response = tokenizer.batch_decode(outputs)[0]
print(response)

<|begin_of_text|>Why dogs are so cute? Why do we love them so much? Why do we want to hug them and kiss them and play with them? Why do we want to take care of them and make sure they are happy and healthy? The answer is simple: because they are cute! But what makes a dog cute? Is it their big eyes, their floppy ears, their wagging tail, or something else? In this article, we will explore the science behind why dogs are so cute and why we love them so much.

First, let's talk about the big eyes. Dogs have large eyes that are set far apart on their


In [None]:
# End-to-end evaluation runs (hellaswag via scripts/hf_eval.py)

In [12]:
# Evaluate on hellaswag with --compile. No --repo_id or --sparsity is given,
# so the script's defaults apply.
!python scripts/hf_eval.py --tasks hellaswag --compile

2024-07-30:14:46:14,569 INFO     [__init__.py:29] Skipping import of cpp extensions
Namespace(repo_id='meta-llama/Meta-Llama-3-8B', tasks=['hellaswag'], limit=None, precision=torch.bfloat16, device='cuda', quantization='None', sparsity='None', compile=True, batch_size=1, max_length=None)
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:03<00:00,  1.20it/s]
2024-07-30:14:46:31,567 INFO     [__init__.py:491] `group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. `tag` will be used to allow to call a collection of tasks just like `group`. `group` will be removed in order to not cause confusion with the new ConfigurableGroup which will be the offical way to create groups with addition of group-wide configuations.
2024-07-30:14:46:45,332 INFO     [task.py:423] Building contexts for hellaswag on rank 0...
100%|███████████████████████████████████| 10042/10042 [00:03<00:00, 2510.71it/s]
2024-07-30:14:46:50,668 INFO     [evaluator.py:4

In [11]:
# Evaluate the 2:4-pruned checkpoint on hellaswag with --sparsity semi_sparse_mlp_only.
# NOTE(review): --compile is not passed here, unlike the other hellaswag run —
# confirm this is intended before comparing the two results.
!python scripts/hf_eval.py --sparsity semi_sparse_mlp_only --repo_id "nm-testing/SparseLlama-3-8B-pruned_50.2of4" --tasks hellaswag

2024-07-30:14:03:14,449 INFO     [__init__.py:29] Skipping import of cpp extensions
Namespace(repo_id='nm-testing/SparseLlama-3-8B-pruned_50.2of4', tasks=['hellaswag'], limit=None, precision=torch.bfloat16, device='cuda', quantization='None', sparsity='semi_sparse_mlp_only', compile=False, batch_size=1, max_length=None)
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:03<00:00,  1.26it/s]
2024-07-30:14:03:30,396 INFO     [__init__.py:491] `group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. `tag` will be used to allow to call a collection of tasks just like `group`. `group` will be removed in order to not cause confusion with the new ConfigurableGroup which will be the offical way to create groups with addition of group-wide configuations.
2024-07-30:14:03:44,357 INFO     [task.py:423] Building contexts for hellaswag on rank 0...
100%|███████████████████████████████████| 10042/10042 [00:04<00:00, 2336.94it/s]
2024-07-30:14:0