In [12]:
import numpy as np

## Import GPT2 and BERT Models and tokenizers

In [1]:
from transformers import GPT2Model, GPT2Tokenizer
gpt2 = GPT2Model.from_pretrained('gpt2')
tokenizerG = GPT2Tokenizer.from_pretrained('gpt2')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import BertTokenizer, BertModel
bert = BertModel.from_pretrained('bert-base-uncased')
tokenizerB = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:

print(gpt2)

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)


In [5]:
print(bert)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [6]:
# how long tare the tokens
print(f'BERT has {tokenizerB.vocab_size} tokens.')
print(f'GPT2 has {tokenizerG.vocab_size} tokens.')

BERT has 30522 tokens.
GPT2 has 50257 tokens.


In [10]:
# get word token embeddings matrix
embeddingsG = gpt2.wte.weight.detach().numpy()
# detach func detaches the numbers(here weigh matrix) from gradient info that the model uses to train the weights


embeddingsB = bert.embeddings.word_embeddings.weight.detach().numpy()
print(f'BERT embedding matrix is of size {embeddingsB.shape}.')
print(f'GPT2 embedding matrix is of size {embeddingsG.shape}.')
# embeddings have 768 dimensions

# this is just the starting value in the embedding vector, the LLM modifies 
#these vectors according to attention mechanism for each tokens during training

BERT embedding matrix is of size (30522, 768).
GPT2 embedding matrix is of size (50257, 768).


In [13]:
ridx = np.random.randint(10000,20000)
# the index would mean totally diff tokens in 2 diff models
print(embeddingsB[ridx])
print(embeddingsG[ridx])

[-2.97561400e-02 -2.28181724e-02 -5.42982072e-02 -9.89462249e-03
 -2.39128582e-02 -5.23260646e-02 -1.30497769e-01  4.01490508e-03
 -3.24351937e-02 -2.27187257e-02 -4.55229208e-02 -1.20264729e-02
 -6.62170947e-02 -2.03558709e-02 -8.46444909e-03 -7.64379129e-02
 -8.96214545e-02 -9.19117332e-02  3.89797688e-02 -1.05920374e-01
 -1.22628622e-02 -5.39587028e-02 -2.32089665e-02 -1.56708583e-02
 -1.49512604e-01 -6.37007803e-02 -2.12703813e-02 -7.15265796e-02
 -2.88284570e-02 -5.32897515e-03  3.59929204e-02 -3.42434578e-05
 -2.83906050e-02  1.22915059e-02 -1.40020335e-02 -4.82206643e-02
 -3.62801924e-02 -5.62902465e-02 -4.91468981e-02 -5.75001650e-02
 -3.80266421e-02 -5.36494851e-02 -5.76366819e-02 -1.14035485e-02
  3.41956094e-02  1.36880307e-02 -6.67669810e-03 -4.84308526e-02
  4.02782969e-02 -2.13495977e-02  1.36836842e-02 -8.88508186e-02
 -4.70715873e-02  1.19349221e-02 -3.95753756e-02  1.23106111e-02
  1.42925344e-02 -3.08456104e-02 -2.33988371e-02 -7.35864090e-03
 -6.38233423e-02 -2.72625

In [22]:
# embedding for same word
token = ' the'
token_idxB = tokenizerB.encode(token)[1]
token_idxG = tokenizerG.encode(token)[0]

print(f'BERT: "{token}" is index {token_idxB}')
print(f'GPT2: "{token}" is index {token_idxG}')

BERT: " the" is index 1996
GPT2: " the" is index 262


In [25]:
tokenizerB.decode(token_idxB),tokenizerG.decode(token_idxG)

('the', ' the')

In [26]:
#their correlation
corr = np.corrcoef(embeddingsB[token_idxB,:], embeddingsG[token_idxG,:])

plt.plot(embeddingsB[token_idxB,:], embeddingsG[token_idxG,:],'s')
plt.show()

NameError: name 'plt' is not defined