In [1]:
pip install torch transformers numpy


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [14]:
import torch
import numpy as np
from transformers import GPT2Model, GPT2Tokenizer

# Load pre-trained model and tokenizer
model_name = 'gpt2'
model = GPT2Model.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Input sentence
sentence = "Hello, how are you?"

# Tokenize input
input_ids = tokenizer.encode(sentence, add_special_tokens=True, return_tensors='pt')

# Forward pass through the model
outputs = model(input_ids)

# Get the attention weights from the model's output
attention_weights = outputs[-1]

if attention_weights is None:
    raise ValueError("Attention weights not available. Make sure the model supports attention weights.")

# Perform SVD on each layer's attention weights
svd_results = []
for layer_idx, layer_attention in enumerate(attention_weights):
    # Extract qkv values from the attention weights
    qkv_values = [layer[0] for layer in layer_attention]

    # Concatenate qkv values along the last dimension
    qkv_matrix = torch.cat(qkv_values, dim=-1)

    # Convert the qkv_matrix tensor to numpy array
    qkv_matrix_np = qkv_matrix.detach().numpy()

    # Apply SVD to the qkv matrix
    U, s, VT = np.linalg.svd(qkv_matrix_np, full_matrices=False)

    svd_results.append((U, s, VT))

    # Calculate the explained variance ratio with the singular values
    explained_variance = np.sum(s ** 2)
    total_variance = np.sum(layer_attention[0].detach().numpy() ** 2)
    explained_variance_ratio = explained_variance / total_variance
    print(f"Explained variance ratio for layer {layer_idx+1}: {explained_variance_ratio:.4f}")
    print()

# Example printing the first singular value of the first layer
print("First singular value of the first layer:")
print(svd_results[0][1][0])


Explained variance ratio for layer 1: 1.0249

Explained variance ratio for layer 2: 1.1547

Explained variance ratio for layer 3: 1.1083

Explained variance ratio for layer 4: 1.0289

Explained variance ratio for layer 5: 1.0353

Explained variance ratio for layer 6: 1.1683

Explained variance ratio for layer 7: 1.1794

Explained variance ratio for layer 8: 1.1961

Explained variance ratio for layer 9: 1.2419

Explained variance ratio for layer 10: 1.3453

Explained variance ratio for layer 11: 1.4727

Explained variance ratio for layer 12: 2.5693

First singular value of the first layer:
[22.17502    5.817503   4.6172457  4.1615443  4.097065   2.889182 ]


In [15]:
import torch
import numpy as np
from transformers import GPT2Model, GPT2Tokenizer

# Load pre-trained model and tokenizer
model_name = 'gpt2'
model = GPT2Model.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Input sentence
sentence = "Hello, how are you?"

# Tokenize input
input_ids = tokenizer.encode(sentence, add_special_tokens=True, return_tensors='pt')

# Forward pass through the model
outputs = model(input_ids)

# Get the attention weights from the model's output
attention_weights = outputs[-1]

if attention_weights is None:
    raise ValueError("Attention weights not available. Make sure the model supports attention weights.")

# Print all QKV matrices for each layer
for layer_idx, layer_attention in enumerate(attention_weights):
    print(f"Layer {layer_idx + 1}:")
    for head_idx, head_attention in enumerate(layer_attention):
        qkv_values = [layer[0] for layer in head_attention]

        # Concatenate qkv values along the last dimension
        qkv_matrix = torch.cat(qkv_values, dim=-1)

        # Convert the qkv_matrix tensor to numpy array
        qkv_matrix_np = qkv_matrix.detach().numpy()

        print(f"Head {head_idx + 1}:")
        print(qkv_matrix_np)
        print()


Layer 1:
Head 1:
[[-1.25259519e+00  2.31998467e+00  1.72184765e-01 -2.30587125e-01
   6.75784826e-01  5.67377090e-01 -1.70025855e-01  1.72467142e-01
  -2.18835282e+00  9.26291168e-01  9.81360912e-01  1.26313776e-01
   3.07696402e-01  1.17034942e-01 -4.52167392e-01 -7.15890050e-01
   7.59924293e-01  2.33476028e-01  2.42767596e+00  1.40414983e-01
  -9.80589628e-01  3.18124890e-03 -6.59537554e-01 -9.68475044e-01
   5.27993083e-01 -7.79284000e-01 -6.24317825e-01  2.70427227e-01
  -4.56660360e-01 -9.12270188e-01  2.32452178e+00 -3.35902035e-01
  -7.13602424e-01  1.62443481e-02 -3.82341258e-02 -4.76848006e-01
   8.89721632e-01  2.84535319e-01 -1.00488007e-01  3.37675065e-02
   5.10402769e-02 -2.47760490e-02 -7.67219961e-01  3.19120735e-01
  -4.76604193e-01 -2.80151367e-01  8.10873508e-01 -1.94783121e-01
   2.80045956e-01  4.11854178e-01 -1.97854802e-01 -4.09575462e-01
   1.85274982e+00  8.02949727e-01 -2.05064505e-01  1.51908135e+00
  -2.88327128e-01  4.87044752e-02 -9.12229300e-01  6.393489

In [33]:
import torch
import numpy as np
from transformers import GPT2Model, GPT2Tokenizer

# Load pre-trained model and tokenizer
model_name = 'gpt2'
model = GPT2Model.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Input sentence
sentence = "Hello, how are you?"

# Tokenize input
input_ids = tokenizer.encode(sentence, add_special_tokens=True, return_tensors='pt')

# Forward pass through the model
outputs = model(input_ids)

# Get the attention weights from the model's output
attention_weights = outputs[-1]

# Print dimensions of each QKV matrix and number of layers
num_layers = len(attention_weights)
print(f"Number of Layers: {num_layers}\n")





Number of Layers: 12



In [36]:
import torch
import numpy as np
from transformers import GPT2Model, GPT2Tokenizer

# Load pre-trained model and tokenizer
model_name = 'gpt2'
model = GPT2Model.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Input sentence
sentence = "Hello, how are you?"

# Tokenize input
input_ids = tokenizer.encode(sentence, add_special_tokens=True, return_tensors='pt')

# Forward pass through the model
outputs = model(input_ids)

# Get the attention weights from the model's output
attention_weights = outputs[-1]

# Print number of attention heads for each layer
for layer_idx, layer_attention in enumerate(attention_weights):
    num_heads = len(layer_attention)
    print(f"Layer {layer_idx + 1}: Number of Attention Heads: {num_heads}")


Layer 1: Number of Attention Heads: 2
Layer 2: Number of Attention Heads: 2
Layer 3: Number of Attention Heads: 2
Layer 4: Number of Attention Heads: 2
Layer 5: Number of Attention Heads: 2
Layer 6: Number of Attention Heads: 2
Layer 7: Number of Attention Heads: 2
Layer 8: Number of Attention Heads: 2
Layer 9: Number of Attention Heads: 2
Layer 10: Number of Attention Heads: 2
Layer 11: Number of Attention Heads: 2
Layer 12: Number of Attention Heads: 2


In [37]:
import torch
import numpy as np
from transformers import GPT2Model, GPT2Tokenizer

# Load pre-trained model and tokenizer
model_name = 'gpt2'
model = GPT2Model.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Input sentence
sentence = "Hello, how are you?"

# Tokenize input
input_ids = tokenizer.encode(sentence, add_special_tokens=True, return_tensors='pt')

# Forward pass through the model
outputs = model(input_ids)

# Get the attention weights from the model's output
attention_weights = outputs[-1]

if attention_weights is None:
    raise ValueError("Attention weights not available. Make sure the model supports attention weights.")

# Print dimensionality reduction for each layer and head
for layer_idx, layer_attention in enumerate(attention_weights):
    print(f"Layer {layer_idx + 1}:")
    for head_idx, head_attention in enumerate(layer_attention):
        qkv_values = [layer[0] for layer in head_attention]

        # Concatenate qkv values along the last dimension
        qkv_matrix = torch.cat(qkv_values, dim=-1)

        # Convert the qkv_matrix tensor to numpy array
        qkv_matrix_np = qkv_matrix.detach().numpy()

        # Apply SVD to the qkv matrix
        U, s, VT = np.linalg.svd(qkv_matrix_np, full_matrices=False)

        original_dim = qkv_matrix_np.shape[-1]
        reduced_dim = len(s)

        print(f"Head {head_idx + 1}:")
        print(f"Original dimension: {original_dim}")
        print(f"Reduced dimension: {reduced_dim}")
        print(f"Dimensionality reduction: {original_dim - reduced_dim}")
        print()


Layer 1:
Head 1:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Head 2:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Layer 2:
Head 1:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Head 2:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Layer 3:
Head 1:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Head 2:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Layer 4:
Head 1:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Head 2:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Layer 5:
Head 1:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Head 2:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Layer 6:
Head 1:
Original dimension: 64
Reduced dimension: 6
Dimensionality reduction: 58

Head 2:
Original dimension: 64
Reduced dimen