<a href="https://colab.research.google.com/github/prompt-auto/prompt-automation/blob/main/00_Tokenization_and_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
# Import required libraries
import pandas as pd
import torch
from transformers import BertModel, AutoTokenizer

In [38]:
# Checkpoint identifier reused by the model and tokenizer cells: BERT-base-cased
model_name = "bert-base-cased"

# Load the pre-trained BERT model for this checkpoint (the tokenizer is
# instantiated separately, in its own cell) and print its module structure
model = BertModel.from_pretrained(model_name)
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [39]:
# Load the tokenizer for the same checkpoint name used to load the model
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [40]:
# Build a DataFrame of the tokenizer's vocabulary (token string -> integer id),
# ordered by id
vocab = tokenizer.vocab
vocab_df = (
    pd.DataFrame(list(vocab.items()), columns=["token", "token_id"])
    .sort_values("token_id")
    .reset_index(drop=True)
)

# Preview a few rows; printing the raw dict would dump every vocabulary entry
vocab_df.head(10)



In [41]:
# Number of entries in the tokenizer's vocabulary; this matches the first
# dimension of the model's word_embeddings layer shown in the model printout
print(len(vocab))

28996


In [42]:
# Set a sentence for analysis
# Sample sentence, split into the tokenizer's sub-word tokens
# (continuation pieces are prefixed with `##`)
sentence = "When life gives you lemons, don't make lemonade."
tokens = tokenizer.tokenize(sentence)

# Show the token strings, then how many there are
print(tokens, len(tokens), sep="\n")

['When', 'life', 'gives', 'you', 'lemon', '##s', ',', 'don', "'", 't', 'make', 'lemon', '##ade', '.']
14


In [43]:

# Map the sentence to vocabulary ids. The first and last ids are the special
# [CLS] and [SEP] tokens, so there are two more ids than tokens.
token_ids = tokenizer.encode(sentence)
print(token_ids, len(token_ids), sep="\n")

[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102]
16


In [44]:
# Call the tokenizer directly to get model-ready PyTorch tensors
# (input_ids, token_type_ids and attention_mask, each with a leading batch dimension)
encoded_inputs = tokenizer(sentence, return_tensors="pt")
print(encoded_inputs)

{'input_ids': tensor([[  101,  1332,  1297,  3114,  1128, 22782,  1116,   117,  1274,   112,
           189,  1294, 22782,  6397,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [45]:
# Forward pass for inference only: disable autograd so no computation graph is
# built and the resulting tensors carry no gradient history
with torch.no_grad():
    # Element 0 of the model output is the last hidden state: one vector per
    # input token, with a leading batch dimension
    embeddings = model(**encoded_inputs)[0]

In [46]:
# Pair each input id with its embedding vector. The ids are read from
# `encoded_inputs` (the exact tensor fed to the model) rather than from the
# separate `tokenizer.encode` call, so the pairing cannot silently drift:
# zip truncates to the shorter sequence without raising.
input_ids = encoded_inputs["input_ids"][0].tolist()
for token_id, embed_vec in zip(input_ids, embeddings[0]):
  print(token_id, ' --- ', embed_vec)

101  ---  tensor([ 5.7520e-01,  4.5968e-02, -2.7510e-02, -1.9917e-01, -4.3701e-02,
        -1.9762e-02,  1.2274e-01,  3.3295e-02, -1.5713e-04, -1.3065e+00,
        -1.2344e-01,  5.3348e-02, -1.1520e-01, -5.4220e-02, -3.5267e-01,
        -8.4895e-02, -5.0679e-02,  4.5851e-02, -4.8741e-01, -1.5633e-01,
         4.2481e-02,  8.4765e-02,  4.4939e-01, -2.9093e-01,  7.5438e-02,
         1.1886e-01,  1.0828e-01,  2.9946e-01, -3.0813e-01,  6.6248e-02,
         1.0494e-02,  1.9561e-01, -4.1229e-01, -5.6246e-02, -3.3441e-01,
         1.2820e-01, -1.1963e-01, -2.1250e-01,  1.2016e-01, -3.0211e-01,
        -5.2153e-01,  7.6712e-03,  5.9753e-01, -1.5611e-01,  1.5947e-01,
        -5.3761e-01, -2.7775e-02,  1.8239e-02,  5.9306e-02,  5.0235e-02,
         5.7883e-02,  4.0236e-01,  1.8010e-01,  8.7487e-02,  3.0202e-01,
         1.7164e-01, -3.5232e-01,  1.1543e-01, -7.1267e-01,  1.7072e-01,
        -1.6500e-01,  3.4772e-02,  2.1914e-01, -8.1813e-04, -3.2057e-02,
         1.2820e-01,  7.8256e-02,  1.9966