In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from transformers import GPT2Model, GPT2Tokenizer

# Load the pretrained 'gpt2' checkpoint: the tokenizer first, then the model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2 = GPT2Model.from_pretrained('gpt2')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Token-embedding matrix (gpt2.wte.weight) as a NumPy array, one row per
# vocabulary entry
embeddings = gpt2.wte.weight.detach().numpy()  # detach() drops the gradient-tracking info before converting to NumPy



In [5]:
# Display the model configuration to look up the size parameters
# (n_embd = embedding dimension, vocab_size = number of tokens)
gpt2.config

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "dtype": "float32",
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [7]:
# Report the sizes stored in the config next to the actual matrix shape
print(
    f'Embediing dimensions: {gpt2.config.n_embd}',
    f'Vocab size: {gpt2.config.vocab_size}',
    f'Size of embeddings matrix: {embeddings.shape}',
    sep='\n',
)

Embediing dimensions: 768
Vocab size: 50257
Size of embeddings matrix: (50257, 768)


## Creating two unembedding matrices
## 1. The transpose of the embedding matrix
## 2. A random unembedding matrix of the same size


In [12]:
# "Real" unembedding matrix: the transpose of the embedding matrix
unembeddings = embeddings.T

# NOTE: .T does NOT copy the data. It returns a new ndarray *object* (which is
# why the two id() values printed below differ), but that object is a view
# onto the same memory buffer as `embeddings`: writing into one array would
# also change the other. Use embeddings.T.copy() if an independent copy is
# ever needed.
print('id of embeddings: ', id(embeddings))
print('id of unembeddings: ', id(unembeddings))

# id() only compares the Python wrapper objects, so check the buffer itself
assert np.shares_memory(embeddings, unembeddings)

id of embeddings:  4398153328
id of unembeddings:  14185713840


In [15]:
# Random unembedding matrix with the same shape as the real one:
# n_embd x vocab_size, i.e. the shape of the transposed embedding matrix.

# Seed NumPy's global generator so that a fresh Restart & Run All draws the
# same random matrix (and the same later draws from the global generator).
SEED = 42
np.random.seed(SEED)

unembedRand = np.random.randn(gpt2.config.n_embd, gpt2.config.vocab_size)

# compare the three shapes side by side
print(f'    Size of embed matrix: {embeddings.shape}')
print(f' Size of random unembed matrix: {unembedRand.shape}')
print(f' Size of real unembedd matrix: {unembeddings.shape}')

    Size of embed matrix: (50257, 768)
 Size of random unembed matrix: (768, 50257)
 Size of real unembedd matrix: (768, 50257)


## California embedding

In [16]:
# pick a seed word (note the leading space, which is part of the token text)
seedword = ' California'

# its token index (encode returns a list of token ids)
seed_idx = tokenizer.encode(seedword)

# make sure it is exactly one token: the cells below index the embedding
# matrix with this list and expect a single row back
assert len(seed_idx) == 1, f'{seedword!r} was split into {len(seed_idx)} tokens'
seed_idx

[3442]

In [22]:
# Look up the seed token's row in the embedding matrix; indexing with the
# id list keeps a leading axis, so the result is 2-D
embed_vector = embeddings[seed_idx]
embed_vector.shape

(1, 768)

In [29]:
# Project the embedding vector onto the random unembedding matrix:
# one dot product per vocabulary entry
dpRand = np.matmul(embed_vector, unembedRand)

# The "next token" is the one with the largest dot product
# (an unscaled cosine similarity, i.e. cosine similarity without the denominator)
nextTokenRand_idx = dpRand.argmax()
nextTokenRand = tokenizer.decode(nextTokenRand_idx)

# sanity-check the shapes of the matrix product
print('embed_vector   X   unembeddings  =  dotproducts')
print(f'{embed_vector.shape}           {unembedRand.shape}      {dpRand.shape}')


embed_vector   X   unembeddings  =  dotproducts
(1, 768)           (768, 50257)      (1, 50257)


In [39]:
# Repeat the projection with the real (transposed-embedding) unembedding matrix
dpReal = np.matmul(embed_vector, unembeddings)

# index and text of the token with the largest dot product
nextTokenReal_idx = dpReal.argmax()
nextTokenReal = tokenizer.decode(nextTokenReal_idx)


In [42]:
# Show the best-matching token under each unembedding matrix
print(
    '** Random unembed matrix:',
    f'    "{tokenizer.decode(seed_idx)}" has largest dot product with: "{nextTokenRand}"',
    sep='\n',
)

print(
    '** real unembed matrix:',
    f'    "{tokenizer.decode(seed_idx)}" has largest dot product with: "{nextTokenReal}"',
    sep='\n',
)

** Random unembed matrix:
    " California" has largest dot product with: " Purpose"
** real unembed matrix:
    " California" has largest dot product with: "California"


In [44]:
## Note that the top prediction above, "California", has no leading space, unlike the seed token " California"

In [48]:
# dpReal is 2-D with a single row, which is why the next cell indexes dpReal[0]
dpReal.shape

(1, 50257)

In [51]:
# Ten tokens with the largest dot products, best first:
# take the tail of the ascending sort, then reverse it
top10 = np.argsort(dpReal[0])[-10:][::-1]
for i in top10:
    print(f'Dot product {dpReal[0,i]:6.3f} for token "{tokenizer.decode(i)}"')

Dot product 10.136 for token "California"
Dot product  9.617 for token " California"
Dot product  8.816 for token " Californ"
Dot product  8.088 for token "Calif"
Dot product  7.718 for token " Calif"
Dot product  7.359 for token " Nevada"
Dot product  7.158 for token "Arizona"
Dot product  7.151 for token "Colorado"
Dot product  6.974 for token "Florida"
Dot product  6.912 for token "Oregon"


## Generating a token sequence

In [59]:
# sequence length (number of words in the final text, seed included)
seq_len = 10


# initial seed
nextword = 'budget'

# string that accumulates the generated text
text = nextword

# Tokenize the seed once. After that the sampled token id is carried forward
# directly: decoding a token to text and re-encoding it is not guaranteed to
# give back the same single id, and only row 0 of the dot products is used,
# so the round trip could silently continue from a different token than the
# one that was sampled.
token = tokenizer.encode(nextword)

# loop to create the sequence
for i in range(seq_len - 1):
    # step 1: get the embedding vector of the current token
    embed_vector = embeddings[token,:]

    # step 2: project onto the unembedding matrix (dot products)
    dp = embed_vector @ unembeddings

    # step 3: find the 10 largest projections (row 0 = first token)
    top10 = np.argsort(dp[0])[::-1][:10]

    # step 4: randomly pick one of them as the next token
    aRandomToken = np.random.choice(top10)
    nextword = tokenizer.decode(aRandomToken)

    # step 5: append it to the text
    text += ' ' + nextword

    # step 6: the sampled id itself is the input of the next iteration
    token = [int(aRandomToken)]

# print the final result
print('Our text:',text)
    

Our text: budget  budgetary  budgets  Budget  Treasurer Reviewer Downloadha 覚醒 Downloadha BuyableInstoreAndOnline


In [62]:
# repeat the generation with the random unembedding matrix
# sequence length (number of words in the final text, seed included)
seq_len = 10


# initial seed
nextword = 'budget'

# string that accumulates the generated text
text = nextword

# Tokenize the seed once. After that the sampled token id is carried forward
# directly: decoding a token to text and re-encoding it is not guaranteed to
# give back the same single id, and only row 0 of the dot products is used,
# so the round trip could silently continue from a different token than the
# one that was sampled.
token = tokenizer.encode(nextword)

# loop to create the sequence
for i in range(seq_len - 1):
    # step 1: get the embedding vector of the current token
    embed_vector = embeddings[token,:]

    # step 2: project onto the random unembedding matrix (dot products)
    dp = embed_vector @ unembedRand

    # step 3: find the 10 largest projections (row 0 = first token)
    top10 = np.argsort(dp[0])[::-1][:10]

    # step 4: randomly pick one of them as the next token
    aRandomToken = np.random.choice(top10)
    nextword = tokenizer.decode(aRandomToken)

    # step 5: append it to the text
    text += ' ' + nextword

    # step 6: the sampled id itself is the input of the next iteration
    token = [int(aRandomToken)]

# print the final result
print('Our text:',text)
    

Our text: budget  Prosecutor ohydrate  edited  transm gars cod 9  ruined bah


In [63]:
## DID WE JUST BUILD A CHATBOT?