In [3]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# load the bert base uncased tokneizer : 12 encode
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print(f'Length of BERT Base vocabulary: {len(tokenizer.vocab)}')

Length of BERT Base vocabulary: 30522


In [9]:
# sample
text = 'A simple sentence'

tokens =tokenizer.encode(text)
print(tokens)

[101, 1037, 3722, 6251, 102]


In [10]:
# back a words
tokenizer.decode(tokens)

2024-04-25 18:31:11.087554: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'[CLS] a simple sentence [SEP]'

In [11]:
# sample 2

text = 'My friend told me about this class and i love it so far! she was right'

tokens = tokenizer.encode(text)

print(tokens)

[101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 999, 2016, 2001, 2157, 102]


In [16]:
print(f'Text {text} , Num tokens {(len(tokens))}')

for t in tokens:
    print(f'Tokens {t}, subword: {tokenizer.decode([t])}')

Text My friend told me about this class and i love it so far! she was right , Num tokens 19
Tokens 101, subword: [CLS]
Tokens 2026, subword: my
Tokens 2767, subword: friend
Tokens 2409, subword: told
Tokens 2033, subword: me
Tokens 2055, subword: about
Tokens 2023, subword: this
Tokens 2465, subword: class
Tokens 1998, subword: and
Tokens 1045, subword: i
Tokens 2293, subword: love
Tokens 2009, subword: it
Tokens 2061, subword: so
Tokens 2521, subword: far
Tokens 999, subword: !
Tokens 2016, subword: she
Tokens 2001, subword: was
Tokens 2157, subword: right
Tokens 102, subword: [SEP]


In [17]:
# check not in words
'sinan' in tokenizer.vocab

False

In [18]:
text_with_unkown_words = 'Sinan loves a beautiful day'
tokens_with_unkown_words = tokenizer.encode(text_with_unkown_words)

for t in tokens_with_unkown_words:
    print(f'Tokens: {t}, subword: {tokenizer.decode([t])}')

Tokens: 101, subword: [CLS]
Tokens: 8254, subword: sin
Tokens: 2319, subword: ##an
Tokens: 7459, subword: loves
Tokens: 1037, subword: a
Tokens: 3376, subword: beautiful
Tokens: 2154, subword: day
Tokens: 102, subword: [SEP]


In [19]:
tokenizer.encode('sinan')

[101, 8254, 2319, 102]

In [20]:
tokenizer.encode('an')

[101, 2019, 102]

In [21]:
text_with_unkown_words = 'Sinan is our instructor for this awesomesouce in class'
tokens_with_unkown_words = tokenizer.encode(text_with_unkown_words)

for t in tokens_with_unkown_words:
    print(f'Tokens: {t}, subword: {tokenizer.decode([t])}')

Tokens: 101, subword: [CLS]
Tokens: 8254, subword: sin
Tokens: 2319, subword: ##an
Tokens: 2003, subword: is
Tokens: 2256, subword: our
Tokens: 9450, subword: instructor
Tokens: 2005, subword: for
Tokens: 2023, subword: this
Tokens: 12476, subword: awesome
Tokens: 6499, subword: ##so
Tokens: 18796, subword: ##uce
Tokens: 1999, subword: in
Tokens: 2465, subword: class
Tokens: 102, subword: [SEP]


In [22]:
text = 'My friend told me about this class and i love it so far'

# encode_plus gives us token ids, attention mask and segment ids (A VS B) ex I like you vs i like you and mom (give a padding)
tokens = tokenizer.encode_plus(text)
print(tokens)

{'input_ids': [101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [23]:
python_pet = tokenizer.encode(' I love my pet')
python_language = tokenizer.encode(' I love my python language')

In [28]:
model =BertModel.from_pretrained('bert-base-uncased')

In [34]:
# contextual embedding
python_pet_embedding = model(torch.tensor(python_pet).unsqueeze(0))[0][:, 5, :].detach().numpy()

python_language_embedding = model(torch.tensor(python_language).unsqueeze(0))[0][:, 5, :].detach().numpy()

snake_alone_embedding = model(torch.tensor(tokenizer.encode('snake')).unsqueeze(0))[0][:, 1, :].detach().numpy()

programming_embedding = model(torch.tensor(tokenizer.encode('programming')).unsqueeze(0))[0][:, 1, :].detach().numpy()


In [36]:
python_pet_embedding.shape

(1, 768)

In [37]:
python_language_embedding.shape

(1, 768)

In [38]:
cosine_similarity(python_language_embedding, snake_alone_embedding)

array([[0.53738034]], dtype=float32)

In [41]:
cosine_similarity(python_pet_embedding, programming_embedding)

array([[0.01107592]], dtype=float32)