In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

### Load Model

In [2]:
# Directory of the dense checkpoint; used below both to load the model and
# tokenizer and as the source of the support files copied next to the pruned model.
model_path = "./mpt-gsm8k-dense/training"

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the dense model and its tokenizer from the local checkpoint directory.
# NOTE(review): trust_remote_code=True runs the Python modelling files shipped
# with the checkpoint — only use it with checkpoints you trust.
mpt = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path)

----- HERE: INITIALIZE MODEL -----
You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Display the device the freshly loaded model is on.
mpt.device

device(type='cpu')

In [5]:
# Sanity check of the dense model: generate a completion for one GSM8K-style question.
prompt = "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?"
inputs = tokenizer(prompt, return_tensors="pt")

# Generation is capped at 50 new tokens and stops early at the tokenizer's EOS token.
generated_ids = mpt.generate(**inputs, max_new_tokens=50, eos_token_id=tokenizer.eos_token_id)
# batch_decode returns one string per sequence; only one prompt was passed in.
print(tokenizer.batch_decode(generated_ids)[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?Weng earns $12/hour * 50 minutes = $<<12*50=600>>600 for babysitting.
#### 600<|endoftext|>


In [6]:
# List the working directory (shell escape) to locate the recipe file and project checkouts.
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
experiments.ipynb	 quant_linearW8A8MatMul8Embeds8LMhead8.yaml
experiments-quant.ipynb  run-0-50sparse
llm-foundry		 run-0-50sparse-results
lm-evaluation-harness	 run-0-50sparse.tar.gz
mpt-gsm8k-dense		 sparseml
neuralmagicml


In [15]:
from neuralmagicml.research.sparsegpt.modelutils import (
    apply_recipe, 
    initialize_scales_from_batches, 
    find_quant_layers, 
    find_layers, 
    freeze_bn_stats
)

class SparseGPTConfig:
    """Plain settings container for a SparseGPT pruning / quantization run.

    Every constructor argument is stored unchanged on the instance under the
    same name. As consumed by ``mpt_sequential`` in this notebook:

    * ``nsamples`` - number of calibration samples to capture.
    * ``sparsity``, ``prune_n``, ``prune_m``, ``perc_damp``, ``blocksize`` -
      forwarded to ``SparseGPT.fasterprune``.
    * ``minlayer`` / ``maxlayer`` - half-open range of block indices that are
      eligible for pruning.
    * ``prune_only`` - substring a sub-layer name must contain to be pruned
      (the empty string matches every name).
    * ``invert`` - flips the block-range / name selection.
    * ``ptq`` - whether the quantization recipe is applied before pruning.
    * ``recipe`` - recipe path handed to ``apply_recipe``.
    * ``observer_batches`` - handed to ``initialize_scales_from_batches``.
    * ``quantize_attn_matmuls`` / ``quantize_embeddings_and_lmhead`` - stored
      only; not read by any code visible in this notebook.
    """

    def __init__(
        self,
        nsamples=128,
        sparsity=0.5,
        minlayer=-1,
        maxlayer=100,
        prune_only="",
        invert=False,
        prune_n=0,
        prune_m=0,
        perc_damp=0.01,
        blocksize=128,
        ptq=False,
        quantize_attn_matmuls=False,
        quantize_embeddings_and_lmhead=False,
        recipe="",
        observer_batches=100,
    ):
        # Listed in signature order so the instance attributes are created in
        # that same order.
        settings = {
            "nsamples": nsamples,
            "sparsity": sparsity,
            "minlayer": minlayer,
            "maxlayer": maxlayer,
            "prune_only": prune_only,
            "invert": invert,
            "prune_n": prune_n,
            "prune_m": prune_m,
            "perc_damp": perc_damp,
            "blocksize": blocksize,
            "ptq": ptq,
            "quantize_attn_matmuls": quantize_attn_matmuls,
            "quantize_embeddings_and_lmhead": quantize_embeddings_and_lmhead,
            "recipe": recipe,
            "observer_batches": observer_batches,
        }
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)

# Configuration used for this run: post-training quantization is switched on
# and the quantization recipe file is named; all other settings keep the
# SparseGPTConfig defaults (128 calibration samples, 50% sparsity).
sparsegpt_config = SparseGPTConfig(
    ptq=True,
    quantize_attn_matmuls=True,
    quantize_embeddings_and_lmhead=True,
    recipe="quant_linearW8A8MatMul8Embeds8LMhead8.yaml",
)

In [17]:
from datasets import load_dataset

# Load the "main" configuration of GSM8K and keep the train split as calibration data.
dataset = load_dataset("gsm8k", "main")
train_dataset = dataset["train"]
# Bare expression: displays the split's features and row count.
train_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})

In [18]:
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset

# Batch size of the calibration dataloader.
# NOTE(review): mpt_sequential stores each captured batch in a single row of its
# input buffer, so it appears to assume one sequence per batch — confirm before raising this.
BATCH_SIZE = 1
# Every example is padded/truncated to this many tokens; the same value is
# passed to mpt_sequential as data_seq_len.
MAX_SEQ_LEN = 512

def tokenize_fn(element):
    """Tokenize one dataset record into fixed-length model inputs.

    The record's ``question`` and ``answer`` fields are joined with a single
    space, then tokenized with truncation and padding to ``MAX_SEQ_LEN``.

    Args:
        element: mapping with ``"question"`` and ``"answer"`` entries.

    Returns:
        dict with the tokenizer's ``"input_ids"`` and ``"attention_mask"``.
    """
    encoded = tokenizer(
        "{} {}".format(element["question"], element["answer"]),
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        truncation=True,
        return_length=False,
        return_overflowing_tokens=False,
    )
    # Keep only the two fields the language-modeling collator needs.
    return {key: encoded[key] for key in ("input_ids", "attention_mask")}

# Tokenize record by record (batched=False) and drop the raw text columns so
# only "input_ids" and "attention_mask" remain. The original call also passed
# batch_size=BATCH_SIZE, which Dataset.map only uses when batched=True; it was
# removed because it wrongly suggested batched tokenization.
tokenized_dataset_train = train_dataset.map(
    tokenize_fn,
    batched=False,
    remove_columns=train_dataset.column_names,
)

# mlm=False selects the causal-LM collator (no masked-token corruption).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# NOTE(review): shuffle=True is unseeded, so the calibration samples drawn from
# this loader differ between runs — seed torch if reproducibility matters.
train_dataloader = torch.utils.data.DataLoader(
    tokenized_dataset_train,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
)

### Run SparseGPT

In [19]:
# Apply the quantization recipe named in the config to the dense model.
# NOTE(review): mpt_sequential also calls apply_recipe when cfg.ptq is true, so
# running this cell and then mpt_sequential with ptq=True applies the recipe
# twice to the same model — confirm that is intended.
model, manager = apply_recipe(mpt, sparsegpt_config.recipe)

2023-10-24 13:35:59 sparseml.pytorch.utils.logger INFO     Logging all SparseML modifier-level logs to sparse_logs/24-10-2023_13.35.59.log


In [28]:
# Collect quantization statistics from the calibration dataloader; the last
# argument is the configured number of observer batches.
initialize_scales_from_batches(model, train_dataloader, sparsegpt_config.observer_batches)

Collecting data statistics for quantization scales...


In [13]:
from neuralmagicml.research.sparsegpt.sparsegpt import SparseGPT
from neuralmagicml.research.sparsegpt.quant import WeightFakeQuantizer

def _capture_first_block_inputs(model, dataloader, data_seq_len, dev, nsamples):
    """Record the hidden states and ``attn_bias`` that reach the first block.

    The first transformer block is temporarily replaced by a wrapper that
    stores its input and aborts the forward pass. The wrapper is always
    removed again, and the embedding and first block are always moved back to
    CPU, even when a batch fails.

    Args:
        model: model exposing ``transformer.wte``, ``transformer.blocks`` and
            ``config.d_model``.
        dataloader: iterable of dict batches of tensors accepted by ``model``.
        data_seq_len: sequence length of every batch.
        dev: device used for the capture.
        nsamples: maximum number of batches to capture.

    Returns:
        ``(inps, attn_biases)``: a tensor with one row per captured batch and
        the matching list of ``attn_bias`` values.

    Raises:
        RuntimeError: if the dataloader yields no batch at all.
    """
    layers = model.transformer.blocks

    model.transformer.wte = model.transformer.wte.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (nsamples, data_seq_len, model.config.d_model), dtype=dtype, device=dev
    )
    attn_biases = []

    class _StopForward(Exception):
        """Private signal that aborts the forward pass after the capture."""

    class Catcher(torch.nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[len(attn_biases)] = inp
            attn_biases.append(kwargs["attn_bias"])
            raise _StopForward

    layers[0] = Catcher(layers[0])
    try:
        for batch in dataloader:
            try:
                model(**{k: v.to(dev) for k, v in batch.items()})
            except _StopForward:
                # Expected: only the dedicated signal is swallowed, so a
                # genuine ValueError raised by the model is no longer hidden.
                pass
            if len(attn_biases) >= nsamples:
                break
    finally:
        # Undo the temporary wrapping and device moves on every exit path.
        layers[0] = layers[0].module.cpu()
        model.transformer.wte = model.transformer.wte.cpu()
        torch.cuda.empty_cache()

    captured = len(attn_biases)
    if captured == 0:
        raise RuntimeError("The dataloader produced no calibration batches.")
    if captured < nsamples:
        # Previously this case ended in an IndexError further down.
        print(
            f"Warning: only {captured} of the requested {nsamples} "
            "calibration samples were available."
        )
        inps = inps[:captured]
    return inps, attn_biases


def _prune_block(layer, block_idx, inps, outs, attn_biases, cfg):
    """Calibrate and prune the selected sub-layers of one transformer block.

    Runs the block once with forward hooks feeding every selected sub-layer's
    inputs/outputs to its ``SparseGPT`` instance, prunes those sub-layers, then
    runs the block again so ``outs`` holds the post-pruning outputs.

    Args:
        layer: the block, already on the target device.
        block_idx: index of the block (used for selection and logging).
        inps: captured inputs, one row per sample.
        outs: buffer of the same shape, overwritten with the block outputs.
        attn_biases: per-sample ``attn_bias`` passed to the block.
        cfg: configuration object (see ``SparseGPTConfig``).
    """
    if cfg.ptq:
        # The layer names are changed due to quantization modifiers, therefore
        # we need a slightly different func to retrieve layers
        subset = find_quant_layers(layer)
    else:
        subset = find_layers(layer)

    gpts = {}
    for name in subset:
        # With invert=False this skips sub-layers outside [minlayer, maxlayer)
        # or whose name lacks prune_only; invert=True flips the selection.
        in_scope = cfg.minlayer <= block_idx < cfg.maxlayer and cfg.prune_only in name
        if (not in_scope) == (not cfg.invert):
            continue
        gpts[name] = SparseGPT(subset[name])
        if cfg.ptq:
            gpts[name].quantizer = WeightFakeQuantizer(subset[name])

    def add_batch(name):
        def hook(_, inp, out):
            gpts[name].add_batch(inp[0].data, out.data)

        return hook

    def run_block():
        # NOTE(review): only attn_bias is forwarded to the block; the padding
        # attention mask from the batch is not — confirm that padded positions
        # are meant to contribute to the calibration statistics.
        for j in range(len(attn_biases)):
            outs[j] = layer(inps[j].unsqueeze(0), attn_bias=attn_biases[j])[0]

    handles = [
        subset[name].register_forward_hook(add_batch(name)) for name in gpts
    ]
    try:
        run_block()
    finally:
        # Never leave statistics hooks attached to the model.
        for handle in handles:
            handle.remove()

    for name in gpts:
        print(block_idx, name)
        print("Pruning ...")
        gpts[name].fasterprune(
            cfg.sparsity,
            prunen=cfg.prune_n,
            prunem=cfg.prune_m,
            percdamp=cfg.perc_damp,
            blocksize=cfg.blocksize,
        )
        gpts[name].free()

    # Recompute the outputs with the pruned weights; they feed the next block.
    run_block()


@torch.no_grad()
def mpt_sequential(model, dataloader, data_seq_len, dev, cfg):
    """Prune an MPT model block by block with SparseGPT, optionally after PTQ.

    When ``cfg.ptq`` is true the recipe ``cfg.recipe`` is applied and the
    quantization scales are initialized from ``dataloader`` first. Calibration
    inputs for the first block are then captured, and every block in
    ``model.transformer.blocks`` is moved to ``dev``, calibrated, pruned and
    moved back to CPU in turn, with each block's pruned outputs serving as the
    next block's inputs. Finally quantization observers are disabled and
    ``freeze_bn_stats`` is applied. ``model.config.use_cache`` is disabled for
    the duration and restored afterwards, including on failure.

    Args:
        model: the MPT model; modified in place.
        dataloader: calibration dataloader yielding dict batches of tensors.
        data_seq_len: sequence length of every calibration batch.
        dev: device used for computation, e.g. ``"cuda"``.
        cfg: configuration object (see ``SparseGPTConfig``).

    Returns:
        None.

    Raises:
        AssertionError: if ``cfg.nsamples`` is not positive.
        RuntimeError: if the dataloader yields no batches.
    """
    assert cfg.nsamples > 0, "When using sparsegpt, nsamples must be > 0"

    print(f"PTQ = {cfg.ptq}")
    if cfg.ptq:
        # NOTE(review): if apply_recipe returns a new object rather than
        # modifying `model` in place, the caller never sees it — confirm.
        model, _ = apply_recipe(model, cfg.recipe)
        initialize_scales_from_batches(model, dataloader, cfg.observer_batches)

    print("Starting ...")

    use_cache = model.config.use_cache
    model.config.use_cache = False
    try:
        layers = model.transformer.blocks

        inps, attn_biases = _capture_first_block_inputs(
            model, dataloader, data_seq_len, dev, cfg.nsamples
        )
        outs = torch.zeros_like(inps)

        print("Ready.")

        for i in range(len(layers)):
            layer = layers[i].to(dev)
            _prune_block(layer, i, inps, outs, attn_biases, cfg)

            layers[i] = layer.cpu()
            del layer
            torch.cuda.empty_cache()

            # This block's outputs become the next block's inputs.
            inps, outs = outs, inps

        print(model)
        model.apply(torch.quantization.disable_observer)
        model.apply(freeze_bn_stats)
    finally:
        model.config.use_cache = use_cache

In [14]:
# Prune the dense model in place on the GPU; MAX_SEQ_LEN must match the padded
# length produced by tokenize_fn.
# NOTE(review): the captured output below reports "PTQ = False", which does not
# match the ptq=True config defined in this notebook — the cells were run out of order.
mpt_sequential(mpt, train_dataloader, MAX_SEQ_LEN, "cuda", sparsegpt_config)

PTQ = False
Starting ...
Ready.
0 attn.Wqkv
Pruning ...
time 2.91
error 25.434906005859375
0 attn.out_proj
Pruning ...
time 0.61
error 18.28067398071289
0 ffn.up_proj
Pruning ...
time 0.64
error 4068.03173828125
0 ffn.down_proj
Pruning ...
time 3.98
error 12.001224517822266
1 attn.Wqkv
Pruning ...
time 2.30
error 232.1502685546875
1 attn.out_proj
Pruning ...
time 0.61
error 10.658576965332031
1 ffn.up_proj
Pruning ...
time 0.64
error 5781.634765625
1 ffn.down_proj
Pruning ...
time 4.07
error 12.84139347076416
2 attn.Wqkv
Pruning ...
time 2.35
error 394.74127197265625
2 attn.out_proj
Pruning ...
time 0.61
error 52.782859802246094
2 ffn.up_proj
Pruning ...
time 0.64
error 6154.84033203125
2 ffn.down_proj
Pruning ...
time 4.06
error 61.63454818725586
3 attn.Wqkv
Pruning ...
time 2.35
error 772.435546875
3 attn.out_proj
Pruning ...
time 0.61
error 19.284317016601562
3 ffn.up_proj
Pruning ...
time 0.65
error 9611.99609375
3 ffn.down_proj
Pruning ...
time 4.12
error 41.04884719848633
4 attn.

### Serialize Model

In [81]:
import os

# Directory the pruned checkpoint and its support files are written to.
path_to_save = "./run-0-50sparse"
# exist_ok=True keeps this cell re-runnable: without it a second execution
# fails with FileExistsError because the directory is already there.
os.makedirs(path_to_save, exist_ok=True)

In [82]:
import shutil

# Support files copied verbatim from the dense checkpoint directory into the
# output directory: the custom modelling code, the configs and the tokenizer files.
files_to_copy = [
    "adapt_tokenizer.py",
    "attention.py",
    "blocks.py",
    "config.json",
    "configuration_mpt.py",
    "custom_embedding.py",
    "flash_attn_triton.py",
    "generation_config.json",
    "hf_prefixlm_converter.py",
    "meta_init_context.py",
    "modeling_mpt.py",
    "norm.py",
    "param_init_fns.py",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json",
]

# copyfile raises if a source file is missing, so an incomplete checkpoint
# directory is detected here rather than at reload time.
for f in files_to_copy:
    print(f"Copying {f}...")
    shutil.copyfile(os.path.join(model_path, f), os.path.join(path_to_save, f))

Copying adapt_tokenizer.py...
Copying attention.py...
Copying blocks.py...
Copying config.json...
Copying configuration_mpt.py...
Copying custom_embedding.py...
Copying flash_attn_triton.py...
Copying generation_config.json...
Copying hf_prefixlm_converter.py...
Copying meta_init_context.py...
Copying modeling_mpt.py...
Copying norm.py...
Copying param_init_fns.py...
Copying special_tokens_map.json...
Copying tokenizer.json...
Copying tokenizer_config.json...


In [84]:
# Write the model (pruned in place by mpt_sequential) into the output directory.
mpt.save_pretrained(path_to_save)

### Reload Model and Confirm It's Running Nicely

In [85]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Reload model and tokenizer from the output directory into fresh objects, so the
# checks below exercise the saved files rather than the in-memory model.
mpt2 = AutoModelForCausalLM.from_pretrained(path_to_save, trust_remote_code=True)
tokenizer2 = AutoTokenizer.from_pretrained(path_to_save)

----- HERE: INITIALIZE MODEL -----
You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [87]:
# Same prompt and generation settings as the dense-model check, run on the
# reloaded pruned model so the two completions can be compared.
prompt = "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?"
inputs = tokenizer2(prompt, return_tensors="pt")

generated_ids = mpt2.generate(**inputs, max_new_tokens=50, eos_token_id=tokenizer2.eos_token_id)
print(tokenizer2.batch_decode(generated_ids)[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?Weng earned $12/hour * 50 minutes = $<<12*50=600>>600.
#### 600<|endoftext|>


In [88]:
# Report the per-parameter sparsity of the reloaded model.
# NOTE(review): print_sparsity is not defined or imported in any cell visible
# here; on a fresh kernel this call would raise NameError — add the import.
print_sparsity(mpt2)

Sparsity distribution:
transformer.wte.weight = 0.00 %
transformer.blocks.0.norm_1.weight = 0.00 %
transformer.blocks.0.attn.Wqkv.weight = 50.00 %
transformer.blocks.0.attn.out_proj.weight = 50.00 %
transformer.blocks.0.norm_2.weight = 0.00 %
transformer.blocks.0.ffn.up_proj.weight = 50.00 %
transformer.blocks.0.ffn.down_proj.weight = 50.00 %
transformer.blocks.1.norm_1.weight = 0.00 %
transformer.blocks.1.attn.Wqkv.weight = 50.00 %
transformer.blocks.1.attn.out_proj.weight = 50.00 %
transformer.blocks.1.norm_2.weight = 0.00 %
transformer.blocks.1.ffn.up_proj.weight = 50.00 %
transformer.blocks.1.ffn.down_proj.weight = 50.00 %
transformer.blocks.2.norm_1.weight = 0.00 %
transformer.blocks.2.attn.Wqkv.weight = 50.00 %
transformer.blocks.2.attn.out_proj.weight = 50.00 %
transformer.blocks.2.norm_2.weight = 0.00 %
transformer.blocks.2.ffn.up_proj.weight = 50.00 %
transformer.blocks.2.ffn.down_proj.weight = 50.00 %
transformer.blocks.3.norm_1.weight = 0.00 %
transformer.blocks.3.attn.Wqkv.

In [None]:
# acc == 0.235