In [None]:
!pip install transformers nlp bertviz datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[

In [None]:
# imports
from transformers import pipeline, BertTokenizer, BertModel, BertConfig
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# https://github.com/jessevig/bertviz
from bertviz import head_view

# Bert tokenizer
Más pretrained models: https://huggingface.co/models

In [None]:
# Load the tokenizer that belongs to the bert-base-uncased checkpoint.
# Quick check: what does "uncased" mean?
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# tokenizer.vocab holds the known (sub)word entries; its length is the vocabulary size.
print(f'Length of BERT base vocabulary: {len(tokenizer.vocab)}')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Length of BERT base vocabulary: 30522


In [None]:
# load the bert-base cased tokenizer instead (disabled alternative to the uncased one)
#tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

#print(f'Length of BERT base vocabulary: {len(tokenizer.vocab)}')

In [None]:
# A short example sentence to run through the tokenizer.
text = "A simple sentence!"

# encode() turns the sentence into a list of integer vocabulary ids.
tokens = tokenizer.encode(text)
print(tokens)

[101, 1037, 3722, 6251, 999, 102]


¿Por qué hay 6 tokens?

In [None]:
# decode() rebuilds the sentence from the ids, including the [CLS] and [SEP]
# tokens that encode() added around it
tokenizer.decode(tokens)

'[CLS] a simple sentence! [SEP]'

In [None]:
# A nicer printout of token ids and token strings:
# a summary line first, then one row per token id with the sub-word it decodes to.
print(f'Text: {text}. Num tokens: {len(tokens)}')
token_rows = [f'Token: {t}, subword: {tokenizer.decode([t])}' for t in tokens]
print('\n'.join(token_rows))

Text: A simple sentence!. Num tokens: 6
Token: 101, subword: [CLS]
Token: 1037, subword: a
Token: 3722, subword: simple
Token: 6251, subword: sentence
Token: 999, subword: !
Token: 102, subword: [SEP]


In [None]:
# "alambre" has no entry of its own in the vocabulary :'(

'alambre' in tokenizer.vocab

False

In [None]:
# A sentence whose first word is not a vocabulary entry.
text_with_unknown_words = 'alambre loves a beautiful day'
tokens_with_unknown_words = tokenizer.encode(text_with_unknown_words)

# We see our sub words in action: one row per token id with the piece it decodes to.
print(
    *(f'Token: {t}, subword: {tokenizer.decode([t])}' for t in tokens_with_unknown_words),
    sep='\n',
)

Token: 101, subword: [CLS]
Token: 26234, subword: alam
Token: 13578, subword: ##bre
Token: 7459, subword: loves
Token: 1037, subword: a
Token: 3376, subword: beautiful
Token: 2154, subword: day
Token: 102, subword: [SEP]


In [None]:
# Load the pre-trained weights of a vanilla BERT-base model.
# The checkpoint has to be the uncased one, matching the tokenizer:
# the vocab size / pre-trained vectors of the cased variant are different.
model = BertModel.from_pretrained('bert-base-uncased')

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


A continuación vemos información sobre los parámetros del modelo utilizado:
- se muestran la capa de embeddings, la primera capa del encoder (transformer) y la capa de salida
- el vocabulario tiene 30522 tokens, cada uno representado por un vector de 768 elementos

In [None]:
# Collect every (name, tensor) parameter pair of the model so the list can be sliced by position.
named_params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(named_params)))

# Each entry pairs a header line with the slice of parameters listed under it:
# the first 5 parameters, the next 16, and the last 2.
sections = (
    ('==== Embedding Layer ====\n', named_params[0:5]),
    ('\n==== First Transformer ====\n', named_params[5:21]),
    ('\n==== Output Layer ====\n', named_params[-2:]),
)

for header, section_params in sections:
    print(header)
    # One row per parameter: left-aligned name, right-aligned shape tuple.
    for p in section_params:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 199 different named parameters.

==== Embedding Layer ====

embeddings.word_embeddings.weight                       (30522, 768)
embeddings.position_embeddings.weight                     (512, 768)
embeddings.token_type_embeddings.weight                     (2, 768)
embeddings.LayerNorm.weight                                   (768,)
embeddings.LayerNorm.bias                                     (768,)

==== First Transformer ====

encoder.layer.0.attention.self.query.weight               (768, 768)
encoder.layer.0.attention.self.query.bias                     (768,)
encoder.layer.0.attention.self.key.weight                 (768, 768)
encoder.layer.0.attention.self.key.bias                       (768,)
encoder.layer.0.attention.self.value.weight               (768, 768)
encoder.layer.0.attention.self.value.bias                     (768,)
encoder.layer.0.attention.output.dense.weight             (768, 768)
encoder.layer.0.attention.output.dense.bias                   (

**Ejemplo: el tokenizador asigna el mismo id a una palabra en cualquier frase; es el modelo el que genera un embedding distinto según el contexto de la palabra**

In [None]:
# "python" is the 6th token of both sentences (position 5: don't forget the [CLS] token at position 0!)
python_pet = tokenizer.encode('I love my pet python')
python_language = tokenizer.encode('I love coding in python')

# Print position, vocabulary id and decoded sub-word, one sentence after the other.
for encoded_sentence in (python_pet, python_language):
    for i, t in enumerate(encoded_sentence):
        print(f'Position: {i}, Token: {t}, subword: {tokenizer.decode([t])}')


Position: 0, Token: 101, subword: [CLS]
Position: 1, Token: 1045, subword: i
Position: 2, Token: 2293, subword: love
Position: 3, Token: 2026, subword: my
Position: 4, Token: 9004, subword: pet
Position: 5, Token: 18750, subword: python
Position: 6, Token: 102, subword: [SEP]
Position: 0, Token: 101, subword: [CLS]
Position: 1, Token: 1045, subword: i
Position: 2, Token: 2293, subword: love
Position: 3, Token: 16861, subword: coding
Position: 4, Token: 1999, subword: in
Position: 5, Token: 18750, subword: python
Position: 6, Token: 102, subword: [SEP]


In [None]:
# Contextual embedding of "python" in each sentence, plus the embeddings of
# "snake" and "coding" encoded on their own. Each result is the final hidden
# state of a single token, kept with its batch dimension as a numpy array.

# Look the token up by its vocabulary id instead of hard-coding its position,
# so the cell keeps working if the example sentences are edited.
python_id = tokenizer.convert_tokens_to_ids('python')

# Inference only: no_grad() avoids building the autograd graph, which also
# makes an explicit .detach() unnecessary before .numpy().
with torch.no_grad():
    python_pet_embedding = model(
        torch.tensor(python_pet).unsqueeze(0)
    ).last_hidden_state[:, python_pet.index(python_id), :].numpy()
    python_language_embedding = model(
        torch.tensor(python_language).unsqueeze(0)
    ).last_hidden_state[:, python_language.index(python_id), :].numpy()

    # A single word encodes to [CLS] word [SEP], so the word itself sits at position 1
    # (assumes the word is one vocabulary entry and is not split into sub-words).
    snake_alone_embedding = model(
        torch.tensor(tokenizer.encode('snake')).unsqueeze(0)
    ).last_hidden_state[:, 1, :].numpy()
    coding_alone_embedding = model(
        torch.tensor(tokenizer.encode('coding')).unsqueeze(0)
    ).last_hidden_state[:, 1, :].numpy()

In [None]:
# Cosine similarity between "python" as embedded in the coding sentence and the standalone word "snake"
cosine_similarity(python_language_embedding, snake_alone_embedding)

array([[0.58434784]], dtype=float32)

In [None]:
# Cosine similarity between "python" as embedded in the pet sentence and the standalone word "snake".
# More similar than in the coding sentence!
cosine_similarity(python_pet_embedding, snake_alone_embedding)

array([[0.6928657]], dtype=float32)

**Veamos ahora, a partir de un texto, cómo los transformers son capaces de relacionar cada palabra con las demás según el contexto**

In [None]:
# A longer, two-sentence example.
# NOTE: this rebinds `text` and `tokens`, replacing the values from the earlier "A simple sentence!" demo.
text = "My friend told me about this class and I love it so far! She was right."

# Encode it into vocabulary ids (with [CLS] / [SEP] added) and show them.
tokens = tokenizer.encode(text)
print(tokens)


[101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 999, 2016, 2001, 2157, 1012, 102]


In [None]:
# Wrapping the id list in an outer list adds the batch dimension: shape (1, 20) instead of (20,)
inputs = torch.tensor([tokens])

# Run the model once and report what kind of object comes back and the shapes of its first two items.
outputs = model(inputs)
print(
    f'output type: {type(outputs)}, output length: {len(outputs)}',
    f'first item shape: {outputs[0].shape}',
    f'second item shape: {outputs[1].shape}',
    sep='\n',
)

output type: <class 'transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions'>, output length: 2
first item shape: torch.Size([1, 20, 768])
second item shape: torch.Size([1, 768])


In [None]:
# Re-run the forward pass asking the model to also return its attention weights.
# Read them by name: the positional index of the attentions in the output
# depends on which other optional outputs (e.g. hidden states) are enabled.
with torch.no_grad():  # inference only, no autograd graph needed
    attention = model(inputs, output_attentions=True).attentions


In [None]:
# Push the embedding-layer output through the first encoder layer only and inspect the result's shape.
embedded_inputs = model.embeddings(inputs)
first_encoder_layer = model.encoder.layer[0]
first_encoder_layer(embedded_inputs)[0].shape  # output of first encoder


torch.Size([1, 20, 768])

In [None]:
# Turn the ids of the single sequence in the batch back into token strings to label the view.
sequence_ids = inputs[0].tolist()
tokens_as_list = tokenizer.convert_ids_to_tokens(sequence_ids)

# Render bertviz's head view for the attention weights computed for `inputs`.
head_view(attention, tokens_as_list)

<IPython.core.display.Javascript object>

Primero podemos seleccionar la capa del encoder (layer) que queremos visualizar.

Cada color representa una de las 12 cabezas de atención (attention heads) de la capa seleccionada.



In [None]:
# High-level configuration settings of the BERT model loaded in this notebook.
# Read it from the model itself: BertConfig() only builds a fresh config object
# from library defaults, which is not tied to the checkpoint that was loaded.

config = model.config
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [None]:
# Two sentences that use the word "bank" in different contexts.
bank_sit_txt = "she is sitting on the bank"
bank_money_txt = "i went to the bank but it was closed so i went home"

# Encode both sentences into vocabulary ids.
bank_sit, bank_money = map(tokenizer.encode, (bank_sit_txt, bank_money_txt))

# Print position, vocabulary id and decoded sub-word, one sentence after the other,
# to locate "bank" in each of them.
for encoded_sentence in (bank_sit, bank_money):
    for i, t in enumerate(encoded_sentence):
        print(f'Position: {i}, Token: {t}, subword: {tokenizer.decode([t])}')


Position: 0, Token: 101, subword: [CLS]
Position: 1, Token: 2016, subword: she
Position: 2, Token: 2003, subword: is
Position: 3, Token: 3564, subword: sitting
Position: 4, Token: 2006, subword: on
Position: 5, Token: 1996, subword: the
Position: 6, Token: 2924, subword: bank
Position: 7, Token: 102, subword: [SEP]
Position: 0, Token: 101, subword: [CLS]
Position: 1, Token: 1045, subword: i
Position: 2, Token: 2253, subword: went
Position: 3, Token: 2000, subword: to
Position: 4, Token: 1996, subword: the
Position: 5, Token: 2924, subword: bank
Position: 6, Token: 2021, subword: but
Position: 7, Token: 2009, subword: it
Position: 8, Token: 2001, subword: was
Position: 9, Token: 2701, subword: closed
Position: 10, Token: 2061, subword: so
Position: 11, Token: 1045, subword: i
Position: 12, Token: 2253, subword: went
Position: 13, Token: 2188, subword: home
Position: 14, Token: 102, subword: [SEP]


In [None]:

# Contextual embedding of "bank" in each sentence, plus the embeddings of
# "chair" and "money" encoded on their own. Each result is the final hidden
# state of a single token, kept with its batch dimension as a numpy array.

# Look the token up by its vocabulary id instead of hard-coding its position
# in each sentence, so the cell keeps working if the sentences are edited.
bank_id = tokenizer.convert_tokens_to_ids('bank')

# Inference only: no_grad() avoids building the autograd graph, which also
# makes an explicit .detach() unnecessary before .numpy().
with torch.no_grad():
    bank_sit_embedding = model(
        torch.tensor(bank_sit).unsqueeze(0)
    ).last_hidden_state[:, bank_sit.index(bank_id), :].numpy()
    bank_money_embedding = model(
        torch.tensor(bank_money).unsqueeze(0)
    ).last_hidden_state[:, bank_money.index(bank_id), :].numpy()

    # A single word encodes to [CLS] word [SEP], so the word itself sits at position 1
    # (assumes the word is one vocabulary entry and is not split into sub-words).
    chair_alone_embedding = model(
        torch.tensor(tokenizer.encode('chair')).unsqueeze(0)
    ).last_hidden_state[:, 1, :].numpy()
    money_alone_embedding = model(
        torch.tensor(tokenizer.encode('money')).unsqueeze(0)
    ).last_hidden_state[:, 1, :].numpy()

In [None]:
# Cosine similarity between "bank" as embedded in the sitting sentence and the standalone word "chair"
cosine_similarity(bank_sit_embedding, chair_alone_embedding)

array([[0.5926274]], dtype=float32)

In [None]:
# Cosine similarity between "bank" as embedded in the sitting sentence and the standalone word "money"
cosine_similarity(bank_sit_embedding, money_alone_embedding)

array([[0.55129385]], dtype=float32)

In [None]:
# Cosine similarity between "bank" as embedded in the "went to the bank" sentence and the standalone word "chair"
cosine_similarity(bank_money_embedding, chair_alone_embedding)

array([[0.16394058]], dtype=float32)

In [None]:
# Cosine similarity between "bank" as embedded in the "went to the bank" sentence and the standalone word "money"
cosine_similarity(bank_money_embedding, money_alone_embedding)

array([[0.2362975]], dtype=float32)