In [1]:
!git clone https://github.com/siyan-zhao/prepacking.git
%cd prepacking/
!pip install ortools==9.9.3963
!pip install binpacking==1.5.2
!pip install datasets==2.18.0
!pip install -i https://pypi.org/simple/ bitsandbytes

Cloning into 'prepacking'...
remote: Enumerating objects: 51, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 51 (delta 24), reused 13 (delta 3), pack-reused 0[K
Receiving objects: 100% (51/51), 2.18 MiB | 8.27 MiB/s, done.
Resolving deltas: 100% (24/24), done.
/content/prepacking
Collecting ortools==9.9.3963
  Downloading ortools-9.9.3963-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.8/24.8 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting absl-py>=2.0.0 (from ortools==9.9.3963)
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf>=4.25.3 (from ortools==9.9.3963)
  Downloading protobuf-5.26.1-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [2]:
import torch
from processor import PrePackProcessor
from model import CustomCausalLlamaModel
from transformers import AutoTokenizer
from transformers.trainer_utils import set_seed
from dataset_utils import unpack_kv
from transformers import BitsAndBytesConfig

**Load model**

In [3]:
# Seed the random number generators through transformers' set_seed helper.
SEED = 42
set_seed(SEED)

model_path = "princeton-nlp/Sheared-LLaMA-1.3B"

tokenizer = AutoTokenizer.from_pretrained(model_path)
# A pad token is assigned here; the default-generation cell tokenizes with padding=True.
tokenizer.pad_token = "[PAD]"

# Use CUDA when torch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the checkpoint and move it to `device`, then switch to eval mode. The last
# expression is left bare so the notebook displays the module tree.
custom_model = CustomCausalLlamaModel.from_pretrained(model_path).to(device)
custom_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.38G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

CustomCausalLlamaModel(
  (model): CustomLlamaModel(
    (embed_tokens): Embedding(32000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRM

**Prepacking Generation**

In [7]:

processor = PrePackProcessor(tokenizer)

# Prompts of mixed lengths, from a full headline down to a single word.
sentences = [
    "Rescuers are searching for multiple people in the water after Baltimore bridge collapse, report says",
    "Major bridge in Maryland collapses after being hit by a ship",
    "The capital of Germany is",
    "The capital of Spain is",
    "The capital of Greece is",
    "Today I'm going to the",
    "Baltimore Police Department told NBC",
    "My",
    "It",
]

# batch_process returns the three tensors fed to the model below (token ids,
# position ids, attention mask) plus restart_dict and original_ids, which are
# handed to unpack_kv afterwards.
(
    new_tokens,
    new_positions,
    new_mask,
    restart_dict,
    original_ids,
) = processor.batch_process(sentences)

# One gradient-free forward pass over the packed batch.
with torch.no_grad():
    packed_inputs = {
        "input_ids": new_tokens.to(device),
        "attention_mask": new_mask.to(device),
        "position_ids": new_positions.to(device),
    }
    packed_outputs = custom_model(**packed_inputs, return_dict=True, output_hidden_states=True)

# Convert the packed pass's past_key_values into the three things generate() is
# given next: a cache, the input ids to continue from, and an attention mask.
cache, final_tokens, attention_mask = unpack_kv(
    packed_outputs["past_key_values"],
    restart_dict,
    original_ids,
    device,
)

# Continue from the unpacked cache, without sampling, for 20 new tokens.
prepack_generated_output = custom_model.generate(
    input_ids=final_tokens.to(device),
    attention_mask=attention_mask.to(device),
    past_key_values=cache,
    use_cache=True,
    max_new_tokens=20,
    do_sample=False,
    num_return_sequences=1,
    output_scores=True,
    return_dict_in_generate=True,
)



**Default generation**

In [8]:

# Baseline path: tokenize the same prompts as one padded batch on the model's device.
normal_tokens_id = tokenizer(
    sentences,
    padding=True,
    truncation=False,
    return_tensors="pt",
).to(device)

# Plain gradient-free forward pass over the padded batch.
with torch.no_grad():
    normal_outputs = custom_model(**normal_tokens_id, return_dict=True, output_hidden_states=True)

# Same generation settings as the prepacked run, but starting from the raw prompts.
default_generated_output = custom_model.generate(
    **normal_tokens_id,
    use_cache=True,
    max_new_tokens=20,
    do_sample=False,
    num_return_sequences=1,
    output_scores=True,
    return_dict_in_generate=True,
)

# Rebinds `attention_mask`, which earlier held the unpack_kv result, to the
# padded batch's mask.
attention_mask = normal_tokens_id["attention_mask"]
idx = 0


In [9]:
print("Asserting Same Tokens")

# Width of the padded prompt batch. It is read straight from the tokenizer output
# rather than from the global `attention_mask`, because that name is bound to two
# different tensors in earlier cells and its value depends on execution order.
prompt_width = normal_tokens_id["attention_mask"].shape[-1]

prepack_sequences = prepack_generated_output.sequences
default_sequences = default_generated_output.sequences

# zip() stops at the shorter input, so a missing row would otherwise go unnoticed.
assert len(prepack_sequences) == len(default_sequences), (
    f"row count mismatch: {len(prepack_sequences)} prepacked vs {len(default_sequences)} default"
)

# Check tokens
for i, (prepack_token, default_token) in enumerate(zip(prepack_sequences, default_sequences)):
    # Skip the first position of the prepacked row.
    # NOTE(review): assumes the input ids given to generate() were one token wide - confirm.
    prepack = tokenizer.decode(prepack_token[1:])
    # Skip the prompt columns so only the newly generated tokens are decoded.
    default = tokenizer.decode(default_token[prompt_width:])
    print("-" * 15, "comparing", "-" * 15)
    print("Prepacked", i, ":", prepack)
    print("Default", i, ":", default)

    assert prepack == default, f"sequence {i} differs: {prepack!r} != {default!r}"

Asserting Same Tokens
--------------- comparing ---------------
Prepacked 0 : 
The Baltimore Sun reports that the collapse of a bridge in Baltimore on Monday morning has left at least
Default 0 : 
The Baltimore Sun reports that the collapse of a bridge in Baltimore on Monday morning has left at least
--------------- comparing ---------------
Prepacked 1 : 
The bridge was built in 1968 and was the first of its kind in the
Default 1 : 
The bridge was built in 1968 and was the first of its kind in the
--------------- comparing ---------------
Prepacked 2 : Berlin. Berlin is a city of contrasts. Berlin is a city of contrasts. It is
Default 2 : Berlin. Berlin is a city of contrasts. Berlin is a city of contrasts. It is
--------------- comparing ---------------
Prepacked 3 : Madrid, and it is the largest city in the country. The city is located in the central part
Default 3 : Madrid, and it is the largest city in the country. The city is located in the central part
--------------- comparing 