In [1]:
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
encoded = tokenizer("The capital of France ", return_tensors="pt")
import time
st = time.perf_counter()
generate_output = model.generate(**encoded, use_cache=True, return_dict_in_generate=True, max_new_tokens=50)
print(f"Inference time: {time.perf_counter()-st:.3f}")
print(generate_output.sequences[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Inference time: 1.781
tensor([ 464, 3139,  286, 4881,  220, 1849,  271,  262, 3139,  286,  262, 4141,
        2066,   13,  383, 4141, 2066,  318,  257, 1181,  286,  262, 1242, 2422,
         290, 3034, 1080,   13,  383, 4141, 2066,  318,  257, 1181,  286,  262,
        1242, 2422,  290, 3034, 1080,   13,  383, 4141, 2066,  318,  257, 1181,
         286,  262, 1242, 2422,  290, 3034, 1080])


In [3]:
model_config={
    "use_cache":True,
    "return_dict_in_generate":True,
    "max_new_tokens":1,
}
print(encoded)
output = model.generate(**encoded, **model_config)
print(output.sequences[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': tensor([[ 464, 3139,  286, 4881,  220]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
tensor([ 464, 3139,  286, 4881,  220, 1849])


In [4]:

old_update = model._update_model_kwargs_for_generation
extracted = {}
import types
def new_func(self,*args, **kwargs):
    extracted["past_key_values"] = args[0]["past_key_values"]
    return old_update(*args, **kwargs)

model._update_model_kwargs_for_generation = types.MethodType(new_func, model)

In [5]:

output = model.generate(**encoded, **model_config)
print([len(extracted["past_key_values"]), len(extracted["past_key_values"][0])] + list(extracted["past_key_values"][0][0].size()))
print(output.sequences[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 5, 64]
tensor([ 464, 3139,  286, 4881,  220, 1849])


In [6]:
import torch
encoded = {
    "input_ids": output.sequences,
    "attention_mask": torch.concat((encoded["attention_mask"], torch.ones((1,1), dtype=torch.int64)), dim=1),
    "past_key_values": extracted["past_key_values"],
}
# print(encoded)
output = model.generate(**encoded, **model_config)
print(output.sequences[0])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([ 464, 3139,  286, 4881,  220, 1849,  271])


In [7]:
encoded = tokenizer("The capital of France ", return_tensors="pt")
st = time.perf_counter()
for _ in range(50):
    output = model.generate(**encoded, **model_config)
    encoded = {
        "input_ids": output.sequences,
        "attention_mask": torch.concat((encoded["attention_mask"], torch.ones((1,1), dtype=torch.int64)), dim=1),
        "past_key_values": extracted["past_key_values"],
    }
    print([len(extracted["past_key_values"]), len(extracted["past_key_values"][0])] + list(extracted["past_key_values"][0][0].size()))
print(f"Inference time: {time.perf_counter()-st:.3f}")
print(output.sequences[0])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 5, 64]
[12, 2, 1, 12, 6, 64]
[12, 2, 1, 12, 7, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 8, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 9, 64]
[12, 2, 1, 12, 10, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 11, 64]
[12, 2, 1, 12, 12, 64]
[12, 2, 1, 12, 13, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 14, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 15, 64]
[12, 2, 1, 12, 16, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 17, 64]
[12, 2, 1, 12, 18, 64]
[12, 2, 1, 12, 19, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 20, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 21, 64]
[12, 2, 1, 12, 22, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 23, 64]
[12, 2, 1, 12, 24, 64]
[12, 2, 1, 12, 25, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 26, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 27, 64]
[12, 2, 1, 12, 28, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 29, 64]
[12, 2, 1, 12, 30, 64]
[12, 2, 1, 12, 31, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 32, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 33, 64]
[12, 2, 1, 12, 34, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 35, 64]
[12, 2, 1, 12, 36, 64]
[12, 2, 1, 12, 37, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 38, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 39, 64]
[12, 2, 1, 12, 40, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 41, 64]
[12, 2, 1, 12, 42, 64]
[12, 2, 1, 12, 43, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 44, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 45, 64]
[12, 2, 1, 12, 46, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 47, 64]
[12, 2, 1, 12, 48, 64]
[12, 2, 1, 12, 49, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 50, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 51, 64]
[12, 2, 1, 12, 52, 64]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 1, 12, 53, 64]
[12, 2, 1, 12, 54, 64]
Inference time: 1.860
tensor([ 464, 3139,  286, 4881,  220, 1849,  271,  262, 3139,  286,  262, 4141,
        2066,   13,  383, 4141, 2066,  318,  257, 1181,  286,  262, 1242, 2422,
         290, 3034, 1080,   13,  383, 4141, 2066,  318,  257, 1181,  286,  262,
        1242, 2422,  290, 3034, 1080,   13,  383, 4141, 2066,  318,  257, 1181,
         286,  262, 1242, 2422,  290, 3034, 1080])


In [8]:
assert all(generate_output.sequences[0] == output.sequences[0])

In [9]:
import torch.functional as F

In [10]:
tokenizer.padding_side="left"
encoded = tokenizer(["The capital of France is ", "Die Hauptstadt von"], return_tensors="pt", padding="longest")
print(encoded)

{'input_ids': tensor([[  464,  3139,   286,  4881,   318,   220],
        [50256, 32423, 49696,   457, 38863, 18042]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1]])}


In [11]:
output = model.generate(**encoded, **model_config)
print([len(extracted["past_key_values"]), len(extracted["past_key_values"][0])] + list(extracted["past_key_values"][0][0].size()))
print(output.sequences[0])
print(output.sequences[1])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[12, 2, 2, 12, 6, 64]
tensor([ 464, 3139,  286, 4881,  318,  220, 1849])
tensor([50256, 32423, 49696,   457, 38863, 18042,   509])


In [12]:
import copy
padded_kv_cache = copy.deepcopy(extracted["past_key_values"])

In [13]:
encoded = tokenizer(["Die Hauptstadt von"], return_tensors="pt")
output = model.generate(**encoded, **model_config)
print(output.sequences[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([32423, 49696,   457, 38863, 18042,   509])


In [14]:
def print_kv_dims(kv):
    print([len(kv), len(kv[0])] + list(kv[0][0].size()))
print_kv_dims(extracted["past_key_values"])

[12, 2, 1, 12, 5, 64]


In [15]:

print_kv_dims(padded_kv_cache)

[12, 2, 2, 12, 6, 64]


In [16]:
print(extracted["past_key_values"][0][0][0,0,...] - padded_kv_cache[0][0][1,0,1:,:])

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0

In [17]:
print(extracted["past_key_values"][0][0][0,0,-1] - padded_kv_cache[0][0][1,0,-1,:])
print(extracted["past_key_values"][0][0][0,0,-1])
print(padded_kv_cache[0][0][1,0,-1,:])
# Could the difference be the leakiness of the attention mask in the attention block? mask is not binary but 1 and float32.min
# see https://github.com/huggingface/transformers/blob/c3ecf2d95d6a9f614d968af2f8b4e317f381e5ec/src/transformers/models/gpt2/modeling_gpt2.py#L823C82-L823C82

tensor([-4.7684e-07,  1.1921e-06,  2.3842e-07,  1.1921e-07,  5.3644e-07,
        -2.3842e-07,  1.1921e-06,  0.0000e+00, -5.9605e-07,  8.9407e-08,
        -8.9407e-08,  1.1921e-07, -6.7055e-08,  3.5763e-07,  4.7684e-07,
         2.3842e-07, -7.1526e-07,  2.9802e-07,  4.7684e-07, -4.7684e-07,
         4.7684e-07, -4.4703e-08, -5.9605e-07,  1.1921e-07,  0.0000e+00,
         1.7881e-07, -1.7881e-07, -7.7486e-07,  2.3842e-07,  1.1921e-07,
         7.1526e-07, -1.1921e-07, -7.1526e-07,  5.9605e-08,  5.3644e-07,
         2.3842e-07,  4.7684e-07, -5.9605e-07, -3.5763e-07,  5.9605e-08,
         4.1723e-07,  3.5763e-07,  1.1921e-06, -2.3842e-07,  8.9407e-07,
         9.5367e-07, -4.1723e-07,  2.3842e-07,  7.4506e-08, -4.7684e-07,
         2.0862e-07, -5.9605e-08,  1.1921e-07, -3.5763e-07, -1.7881e-07,
         9.5367e-07, -8.3074e-07, -3.3528e-08, -2.9802e-07, -3.5763e-07,
         1.1921e-07,  1.7881e-07, -2.6822e-07, -9.5367e-07])
tensor([-1.9769,  2.8057,  1.7984,  1.7875,  0.5844,  2.1871,  

In [18]:
model_config["max_new_tokens"]=5
encoded = tokenizer(["The capital of France", "The capital of France"], return_tensors="pt")
print(encoded)
output = model.generate(**encoded, **model_config)
print(output.sequences[0])
print(output.sequences[1])

encoded = tokenizer(["The capital of France"], return_tensors="pt", max_length=6, padding='max_length', truncation=True)
print(encoded)
output = model.generate(**encoded, **model_config)
print(output.sequences[0])
print(tokenizer.decode(output.sequences[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': tensor([[ 464, 3139,  286, 4881],
        [ 464, 3139,  286, 4881]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 1]])}


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([ 464, 3139,  286, 4881,   11, 6342,   11,  318, 1363])
tensor([ 464, 3139,  286, 4881,   11, 6342,   11,  318, 1363])
{'input_ids': tensor([[50256, 50256,   464,  3139,   286,  4881]]), 'attention_mask': tensor([[0, 0, 1, 1, 1, 1]])}
tensor([50256, 50256,   464,  3139,   286,  4881,    11,  6342,    11,   318,
         1363])
The capital of France, Paris, is home


In [22]:
model_config["max_new_tokens"]=1
encoded = tokenizer(["The capital of France", "The capital of France"], return_tensors="pt")
st = time.perf_counter()
for _ in range(5):
    output = model.generate(**encoded, **model_config)
    encoded = {
        "input_ids": output.sequences,
        "attention_mask": torch.concat((encoded["attention_mask"], torch.ones((2,1), dtype=torch.int64)), dim=1),
        "past_key_values": extracted["past_key_values"],
    }
    # print([len(extracted["past_key_values"]), len(extracted["past_key_values"][0])] + list(extracted["past_key_values"][0][0].size()))
print(f"Inference time: {time.perf_counter()-st:.3f}")
print(output.sequences[0])
print(output.sequences[1])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Inference time: 0.414
tensor([ 464, 3139,  286, 4881,   11, 6342,   11,  318, 1363])
tensor([ 464, 3139,  286, 4881,   11, 6342,   11,  318, 1363])


In [24]:
model_config["max_new_tokens"]=5
encoded = tokenizer(["The capital of France", "The capital of France, Paris, is home"], return_tensors="pt", padding=True)
print(encoded)
output = model.generate(**encoded, **model_config)
print(output.sequences[0])
print(output.sequences[1])

encoded = tokenizer(["The capital of France"], return_tensors="pt", max_length=6, padding='max_length', truncation=True)
print(encoded)
output = model.generate(**encoded, **model_config)
print(output.sequences[0])
print(tokenizer.decode(output.sequences[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256,   464,  3139,   286,  4881],
        [  464,  3139,   286,  4881,    11,  6342,    11,   318,  1363]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([50256, 50256, 50256, 50256, 50256,   464,  3139,   286,  4881,    11,
         6342,    11,   318,  1363])
tensor([ 464, 3139,  286, 4881,   11, 6342,   11,  318, 1363,  284,  262,  995,
         338, 4387])
{'input_ids': tensor([[50256, 50256,   464,  3139,   286,  4881]]), 'attention_mask': tensor([[0, 0, 1, 1, 1, 1]])}
tensor([50256, 50256,   464,  3139,   286,  4881,    11,  6342,    11,   318,
         1363])
The capital of France, Paris, is home


In [None]:

padded_kv_cache = copy.deepcopy(extracted["past_key_values"])

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-hf',
    device_map="balanced",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_8bit=True,
    )

encoded = tokenizer(["The capital of France"], return_tensors="pt", return_token_type_ids=False)
print(encoded)
output = model.generate(**encoded, use_cache=True, return_dict_in_generate=True)
print(output.sequences[0])
print(tokenizer.decode(output.sequences[0],skip_special_tokens=True))

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


{'input_ids': tensor([[   1,  450, 7483,  310, 3444]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
tensor([    1,   450,  7483,   310,  3444, 29892,  3681, 29892,   338,   263,
         4272,   310,  6017,   749, 29892,  1616, 29892,   322,  9257, 29889])
The capital of France, Paris, is a city of romance, art, and culture.


In [2]:
old_update = model._update_model_kwargs_for_generation
extracted = {}
import types
def new_func(self,*args, **kwargs):
    extracted["past_key_values"] = args[0]["past_key_values"]
    return old_update(*args, **kwargs)

model._update_model_kwargs_for_generation = types.MethodType(new_func, model)

In [3]:
model_config={
    "use_cache":True,
    "return_dict_in_generate":True,
    "max_new_tokens":1,
}
output = model.generate(**encoded, **model_config)
print([len(extracted["past_key_values"]), len(extracted["past_key_values"][0])] + list(extracted["past_key_values"][0][0].size()))
print(output.sequences[0])

[32, 2, 1, 32, 5, 128]
tensor([    1,   450,  7483,   310,  3444, 29892])


In [4]:
import torch
encoded = {
    "input_ids": output.sequences,
    "attention_mask": torch.concat((encoded["attention_mask"], torch.ones((1,1), dtype=torch.int64)), dim=1),
    "past_key_values": extracted["past_key_values"],
}
# print(encoded)
output = model.generate(**encoded, **model_config)
print(output.sequences[0])

tensor([    1,   450,  7483,   310,  3444, 29892,  3681])


In [6]:
import time
encoded = tokenizer("The capital of France", return_tensors="pt", return_token_type_ids=False)
st = time.perf_counter()
for _ in range(50):
    output = model.generate(**encoded, **model_config)
    if output.sequences[0][-1] == tokenizer.eos_token_id:
        break
    encoded = {
        "input_ids": output.sequences,
        "attention_mask": torch.concat((encoded["attention_mask"], torch.ones((1,1), dtype=torch.int64)), dim=1),
        "past_key_values": extracted["past_key_values"],
    }
    # print([len(extracted["past_key_values"]), len(extracted["past_key_values"][0])] + list(extracted["past_key_values"][0][0].size()))
print(f"Inference time: {time.perf_counter()-st:.3f}")
print(output.sequences[0])
print(tokenizer.decode(output.sequences[0],skip_special_tokens=True))


Inference time: 7.655
tensor([    1,   450,  7483,   310,  3444, 29892,  3681, 29892,   338,   263,
         4272,   310,  6017,   749, 29892,  1616, 29892,   322,  9257, 29889,
          739,   338,   884,   263,  4272,   310, 13460, 29892,  9687, 29892,
          322,  2090, 29889,  3681,   338,   263,  4272,   393,   756,  1554,
          363, 14332, 29889, 26460,   366,   526,  3063,   363,   263,  6017,
         7716,   679, 21694, 29892,   263])
The capital of France, Paris, is a city of romance, art, and culture. It is also a city of fashion, food, and fun. Paris is a city that has something for everyone. Whether you are looking for a romantic getaway, a
