In [2]:
import transformers
print(transformers.__version__)


  from .autonotebook import tqdm as notebook_tqdm


4.57.1


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# 创建配置并设置 output_attentions
config = AutoConfig.from_pretrained("gpt2")
config.output_attentions = True

# 使用配置加载模型
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2", config=config)
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

with torch.no_grad():

    num_new_tokens = 3

    # tokenize the original input sentence
    inputs = gpt2_tokenizer("Hope is a", return_tensors="pt", add_special_tokens=False)

    # we execute this loop until we generate num_new_tokens new tokens
    for i in range(num_new_tokens):
        print("-"*50)
        print(f"Iteration {i+1}")
        print(f"Generating the next token for sequence: '{gpt2_tokenizer.decode(inputs.input_ids[0])}'")
        
        outputs = gpt2(inputs.input_ids)

        # take the highest scoring token as the next token
        logits = outputs.logits
        next_token = torch.argmax(logits[:, -1, :])

        # concatenate the new token with the rest of the input
        inputs.input_ids = torch.cat([inputs.input_ids, next_token.reshape([1,1])], dim=-1)
        
        print("Generated token:", gpt2_tokenizer.decode(next_token))
        
        # get the attention scores computed by the first layer and first head
        first_layer_attentions = outputs.attentions[0][0]
        print("Attention Scores (Head 1):")
        print(first_layer_attentions[0])
    
    print("-"*50)
    print(f"Final generated sequence: '{gpt2_tokenizer.decode(inputs.input_ids[0])}'")


The following generation flags are not valid and may be ignored: ['output_attentions']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


--------------------------------------------------
Iteration 1
Generating the next token for sequence: 'Hope is a'
Generated token:  great
Attention Scores (Head 1):
tensor([[1.0000, 0.0000, 0.0000],
        [0.9605, 0.0395, 0.0000],
        [0.8410, 0.1055, 0.0534]])
--------------------------------------------------
Iteration 2
Generating the next token for sequence: 'Hope is a great'
Generated token:  way
Attention Scores (Head 1):
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.9605, 0.0395, 0.0000, 0.0000],
        [0.8410, 0.1055, 0.0534, 0.0000],
        [0.6815, 0.1267, 0.1025, 0.0894]])
--------------------------------------------------
Iteration 3
Generating the next token for sequence: 'Hope is a great way'
Generated token:  to
Attention Scores (Head 1):
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.9605, 0.0395, 0.0000, 0.0000, 0.0000],
        [0.8410, 0.1055, 0.0534, 0.0000, 0.0000],
        [0.6815, 0.1267, 0.1025, 0.0894, 0.0000],
        [0.7019, 0.

In [11]:
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def generate_tokens(use_kv_cache):

    gpt2 = AutoModelForCausalLM.from_pretrained("gpt2", use_cache=use_kv_cache)
    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

    with torch.no_grad():

        num_new_tokens = 500

        # tokenize the original input sentence
        inputs = gpt2_tokenizer("Hope is a", return_tensors="pt", add_special_tokens=False)

        start_time = time.time()
        gpt2.generate(**inputs, max_new_tokens=num_new_tokens, min_new_tokens=num_new_tokens)
        end_time = time.time()

        print(f"Time taken to generate {num_new_tokens} tokens: {end_time - start_time:.4f} seconds")
        print(f"Time taken per token: {(end_time - start_time)/num_new_tokens:.4f} seconds")


# measure latency with key-value caching disabled
print("Without key-value caching:")
generate_tokens(use_kv_cache=False)

# measure latency with key-value caching enabled
print("\nWith key-value caching:")
generate_tokens(use_kv_cache=True)

Without key-value caching:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Time taken to generate 500 tokens: 12.3311 seconds
Time taken per token: 0.0247 seconds

With key-value caching:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Time taken to generate 500 tokens: 12.1396 seconds
Time taken per token: 0.0243 seconds


In [5]:
import torch.nn as nn
class MultiHeadAttentionScores(nn.Module):

    def __init__(self, hidden_size, num_attention_heads, attention_head_size):
        super(MultiHeadAttentionScores, self).__init__()
        self.num_attention_heads = num_attention_heads # 8,16, 32, 64
        
        # Create a query, key, and value projection layer
        # for each attention head.  W^Q, W^K, W^V
        self.query_layers = nn.ModuleList([
            nn.Linear(hidden_size, attention_head_size) 
            for _ in range(num_attention_heads)
        ])
        
        self.key_layers = nn.ModuleList([
            nn.Linear(hidden_size, attention_head_size) 
            for _ in range(num_attention_heads)
        ])
        
        self.value_layers = nn.ModuleList([
            nn.Linear(hidden_size, attention_head_size) 
            for _ in range(num_attention_heads)
        ])

    def forward(self, hidden_states):
        # Create a list to store the outputs of each attention head
        all_attention_outputs = []

        for i in range(self.num_attention_heads): # i.e. 8
            query_vectors = self.query_layers[i](hidden_states)
            key_vectors = self.key_layers[i](hidden_states)
            value_vectors = self.value_layers[i](hidden_states)
            
            # softmax(Q&K^T)*V
            attention_scores = torch.matmul(query_vectors, key_vectors.transpose(-1, -2))
            # attention_scores combined with softmax--> normalized_attention_score
            attention_outputs = torch.matmul(attention_scores, value_vectors)
            all_attention_outputs.append(attention_outputs)

        return all_attention_outputs

In [6]:
import torch.nn as nn
class MultiQueryAttention(nn.Module):

    def __init__(self, hidden_size, num_attention_heads, attention_head_size):
        super(MultiQueryAttention, self).__init__()
        self.num_attention_heads = num_attention_heads
        
        # Create a query layer for each attention head.
        self.query_layers = nn.ModuleList([
            nn.Linear(hidden_size, attention_head_size) 
            for _ in range(num_attention_heads)
        ])
        
        # Create a single key layer and a single value layer
        # that will be shared by all attention heads.
        self.key_layer = nn.Linear(hidden_size, attention_head_size)
        self.value_layer = nn.Linear(hidden_size, attention_head_size)

    def forward(self, hidden_states):
        
        # Create a list to store the outputs of each attention head
        all_attention_outputs = []

        for i in range(self.num_attention_heads):
            query_vectors = self.query_layers[i](hidden_states)
            key_vectors = self.key_layer(hidden_states)
            value_vectors = self.value_layer(hidden_states)
            
            attention_scores = torch.matmul(query_vectors, key_vectors.transpose(-1, -2))
            attention_outputs = torch.matmul(attention_scores, value_vectors)
            all_attention_outputs.append(attention_outputs)

        return all_attention_outputs