# Working with BETO in SMM4h-Spanish

In this notebook a BETO model is fine-tuned for NER. We also generate the word embeddings that we'll use later in the other notebook. Finally, we will extract the word embeddings vectors associate to the entities, to be used through cosine similarity in the final model.

## Requirements

In [None]:
!pip install torch
!pip install transformers
!pip install matplotlib
!pip install spacy
!pip install pandas

## BETO fine-tuning to ProfNER

We fine-tuned the Spanish BERT model, known as BETO and based in https://github.com/dccuchile/beto
This model is trained on the training and validation set. Once the training process is finished, we save the model and the tokenizer.

In [1]:
sst_home = '/home/sergio/Escritorio/ProfNER'

In [None]:
!python -m spacy download es_core_news_md

import spacy
import re
import es_core_news_md

nlp = es_core_news_md.load()

In [3]:
############# BIOES NOTATION #####################
BEGIN = 'B'
INSIDE = 'I'
OUTSIDE = 'O'
END = 'E'
SINGLE = 'S'

def getDictEntities(file_ann, ent_classes = ['PROFESION', 'SITUACION_LABORAL']):
  entities = {}
  with open(file_ann) as anns:
    for ann in anns:
      if ann.split('\t')[1].split(' ')[0] in ent_classes:
          ent = ann[:-1].split('\t')[2]
          #print(ent)
          #ent = [token for token in nlp(ent) if not token.is_stop]
          ent = nlp(ent)
          start = int(ann[:-1].split('\t')[1].split(' ')[1])
          end = int(ann[:-1].split('\t')[1].split(' ')[2])
          if (len(ent) == 1):
            entities[(start, end)] = SINGLE + '_' + ann.split('\t')[1].split(' ')[0]
          else:
            entities[(start, start + len(ent[0].text))] = BEGIN + '_' + ann.split('\t')[1].split(' ')[0]
            entities[(end - len(ent[-1].text)), end] = END + '_' + ann.split('\t')[1].split(' ')[0]
            for i in range(len(ent) - 2):
              spaces = (ent[i + 1].idx) - (ent[i].idx + len(ent[i].text))
              start = start + len(ent[i].text) + spaces
              entities[(start, start + len(ent[i + 1].text))] = INSIDE + '_' + ann.split('\t')[1].split(' ')[0]
            
  return entities


In [4]:
import os
def getProccessText(sst_home):
    sentences = []
    labels = []
    for file in [file[:-4] for file in os.listdir(sst_home) if file.endswith('.txt') and not ' ' in file]:
        file_text = os.path.join(sst_home, file + '.txt')
        file_ann = os.path.join(sst_home, file + '.ann')
        _entities = getDictEntities(file_ann)
        with open(file_text) as f:
          text = f.read()
          spacy_text = nlp(text)
          for sent in spacy_text.sents:
            sentence = []
            sent_labels = []
            for token in sent:
                if not token.like_url:
                    sentence.append(token.text.replace('#',''))
                    entity = _entities.get((token.idx, token.idx + len(token.text)), 'O')
                    sent_labels.append(entity)
            sentences.append(sentence)
            labels.append(sent_labels)
    
    return sentences, labels


In [5]:
sst_home_train = sst_home + '/final-profner-data/subtask-2/brat/train'
sentences, labels = getProccessText(sst_home_train)

In [6]:
tag_values = ['B_SITUACION_LABORAL', 'I_SITUACION_LABORAL', 'E_SITUACION_LABORAL', 'S_SITUACION_LABORAL' ,
      'B_PROFESION', 'I_PROFESION', 'E_PROFESION', 'S_PROFESION',
      'O']
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}


In [7]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

'1.7.1'

In [8]:
MAX_LEN = 75
bs = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [9]:
torch.cuda.get_device_name(0)

'GeForce GTX 1050 Ti'

In [10]:
!pip3 install IProgress
import ipywidgets
import IProgress



In [11]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased', do_lower_case=False)

In [12]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [13]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [14]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]


In [15]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [16]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [17]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [18]:
tr_inputs, tr_tags, tr_masks = input_ids, tags, attention_masks

'''tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)'''

'tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,\n                                                            random_state=2018, test_size=0.1)\ntr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,\n                                             random_state=2018, test_size=0.1)'

In [19]:
tr_inputs = torch.tensor(tr_inputs)
tr_tags = torch.tensor(tr_tags)
tr_masks = torch.tensor(tr_masks)

In [20]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

In [21]:
import transformers
from transformers import BertForTokenClassification, AdamW

transformers.__version__

'4.3.2'

In [None]:
model = BertForTokenClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-cased',
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)


In [23]:
model.cuda()

In [24]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [25]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


In [26]:
from seqeval.metrics import f1_score, accuracy_score

In [27]:
import numpy as np
from tqdm import trange
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    '''
    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    #print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()
    '''

Epoch:  33%|███▎      | 1/3 [09:46<19:32, 586.18s/it]

Average train loss: 0.04524885599088429


Epoch:  67%|██████▋   | 2/3 [19:46<09:54, 594.73s/it]

Average train loss: 0.014281350695130623


Epoch: 100%|██████████| 3/3 [29:48<00:00, 596.32s/it]

Average train loss: 0.005524212891464112





In [28]:
model.bert.save_pretrained('/home/sergio/Escritorio/ProfNER/fine-tuned-bert_3_epochs')

In [None]:
tokenizer.save_pretrained('/home/sergio/Escritorio/ProfNER/fine-tuned-bert_3_epochs')

## BETO Word Embeddings

Using the fine-tuned BETO model, we generate the word-level embeddings that will serve as input to the final model, described in the other notebook. Since in the final model we use the Spacy tokenization, we try to align both tokenizations using the "tokenizations" library. The word-level embeddings of BETO will be generated by the mean of the vectors of the subwords that compose a single word.

In [1]:
sst_home = '/home/sergio/Escritorio/ProfNER'

In [None]:
!pip install spacy==2.3.5

In [32]:
!python -m spacy download es_core_news_md

import spacy
import re
import es_core_news_md

nlp = es_core_news_md.load()

Collecting es_core_news_md==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_md-2.3.1/es_core_news_md-2.3.1.tar.gz (47.4 MB)
[K     |████████████████████████████████| 47.4 MB 3.1 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_md')


In [40]:
sst_home_train = sst_home + '/final-profner-data/subtask-2/test-background-txt-files'
print(sst_home_train)
import re
import os
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
texts = []

for file in [file[:-4] for file in os.listdir(sst_home_train) if file.endswith('.txt') and not ' ' in file]:
    file_text = os.path.join(sst_home_train, file + '.txt')
    with open(file_text) as f:
      doc = f.read()
      #print(doc)
      doc = doc.replace('#', '')
      doc = re.sub(regex, '', doc)
      #print(doc)
    
    spacy_text = nlp(doc)
    text = '[CLS] '
    for sent in spacy_text.sents:
        text = text + sent.text + ' [SEP]'
    
    texts.append((text, [token.text for token in spacy_text], file))
            

/home/sergio/Escritorio/ProfNER/final-profner-data/subtask-2/test-background-txt-files


In [29]:
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/pytorch_weights.tar.gz 
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/vocab.txt 
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/config.json 
!tar -xzvf pytorch_weights.tar.gz
!mv config.json pytorch/.
!mv vocab.txt pytorch/.

--2021-02-13 15:53:41--  https://users.dcc.uchile.cl/~jperez/beto/cased_2M/pytorch_weights.tar.gz
Resolviendo users.dcc.uchile.cl (users.dcc.uchile.cl)... 200.9.99.211, 192.80.24.4
Conectando con users.dcc.uchile.cl (users.dcc.uchile.cl)[200.9.99.211]:443... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 409871727 (391M) [application/x-gzip]
Guardando como: “pytorch_weights.tar.gz”


2021-02-13 15:59:16 (1,17 MB/s) - “pytorch_weights.tar.gz” guardado [409871727/409871727]

--2021-02-13 15:59:16--  https://users.dcc.uchile.cl/~jperez/beto/cased_2M/vocab.txt
Resolviendo users.dcc.uchile.cl (users.dcc.uchile.cl)... 192.80.24.4, 200.9.99.211
Conectando con users.dcc.uchile.cl (users.dcc.uchile.cl)[192.80.24.4]:443... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 242120 (236K) [text/plain]
Guardando como: “vocab.txt”


2021-02-13 15:59:19 (205 KB/s) - “vocab.txt” guardado [242120/242120]

--2021-02-13 15:59:19--  https://users.dcc.uchil

In [34]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('/home/sergio/Escritorio/ProfNER/fine-tuned-bert_3_epochs', do_lower_case=False)
model = BertModel.from_pretrained('/home/sergio/Escritorio/ProfNER/fine-tuned-bert_3_epochs/', output_hidden_states = True)
model.eval()

Some weights of BertModel were not initialized from the model checkpoint at /home/sergio/Escritorio/ProfNER/fine-tuned-bert_3_epochs/ and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(31002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [41]:
import tokenizations

def getVectors(tokens, tokens_vectors, spacy_tokens, BETO_DIM = 1536):
    special_tokens = ['[CLS]', '[SEP]', '[UNK]']
    tweet_data = []
    
    aligment, _ = tokenizations.get_alignments(spacy_tokens, tokens)
    for token_aligment in aligment:
        vector = [tokens_vectors[i] for i in token_aligment if not tokens[i] in special_tokens]
        if vector == []:
            vector = np.zeros(BETO_DIM)
        else:
            mean = torch.mean(torch.stack(vector), dim = 0)
            vector = mean.cpu().detach().numpy()
        tweet_data.append(vector)
          
    return np.array(tweet_data)

In [42]:
data = {}
import numpy as np
# Now test it
for elements in texts:
    text = elements[0]
    spacy_tokens = elements[1]
    file_name = elements[2]
    
    tokens = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([indexed_tokens])

    segments_ids = [1] * len(tokens)
    segments_tensors = torch.tensor([segments_ids])

    with torch.no_grad():

        outputs = model(tokens_tensor, segments_tensors)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        #print(outputs)

        hidden_states = outputs[2]

    # Concatenate the tensors for all layers. We use `stack` here to
    # create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)
    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)
    
    # Stores the token vectors, with shape [22 x 768]
    token_vecs_sum = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Sum the vectors from the last four layers.
        sum_vec_1 = torch.mean(token[-2:], dim=0)
        sum_vec_2 = torch.mean(token[-4:-2], dim=0)
        sum_vec = torch.cat((sum_vec_1, sum_vec_2))
        

        # Use `sum_vec` to represent `token`.
        token_vecs_sum.append(sum_vec)
    #print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))
    data[file_name] = getVectors(tokens, token_vecs_sum, spacy_tokens)

In [11]:
!pip install pickle5

Collecting pickle5
  Using cached pickle5-0.0.11.tar.gz (132 kB)
Building wheels for collected packages: pickle5
  Building wheel for pickle5 (setup.py) ... [?25ldone
[?25h  Created wheel for pickle5: filename=pickle5-0.0.11-cp36-cp36m-linux_x86_64.whl size=247167 sha256=64ce560b25bdf24715a461cb168319b43f52cdfcfacb4d43d5c3714b97ea74b0
  Stored in directory: /home/sergio/.cache/pip/wheels/f9/b7/be/bf9768ab0daa28fa4b386f3ad1bac5dd4d9c349c60e83b24e3
Successfully built pickle5
Installing collected packages: pickle5
Successfully installed pickle5-0.0.11


In [44]:
import pickle5 as pickle
with open('/home/sergio/Escritorio/ProfNER/final-saved_data/bert_test.pickle', 'wb') as file:
    pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL)

## BETO Getting the Entity Vectors

In this section, we store the vectors that correspond with entities found in the training and validation subset to be used in the other notebook along with the cosine similarity.

In [2]:
!python -m spacy download es_core_news_md

import spacy
import re
import es_core_news_md

nlp = es_core_news_md.load()

Collecting es_core_news_md==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_md-2.3.1/es_core_news_md-2.3.1.tar.gz (47.4 MB)
[K     |████████████████████████████████| 47.4 MB 2.0 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_md')


In [4]:
############# BIOES NOTATION #####################
BEGIN = 'B'
INSIDE = 'I'
OUTSIDE = 'O'
END = 'E'
SINGLE = 'S'

def getDictEntities(file_ann, ent_classes = ['PROFESION', 'SITUACION_LABORAL']):
  entities = {}
  with open(file_ann) as anns:
    for ann in anns:
      if ann.split('\t')[1].split(' ')[0] in ent_classes:
          ent = ann[:-1].split('\t')[2]
          #print(ent)
          #ent = [token for token in nlp(ent) if not token.is_stop]
          ent = nlp(ent)
          start = int(ann[:-1].split('\t')[1].split(' ')[1])
          end = int(ann[:-1].split('\t')[1].split(' ')[2])
          if (len(ent) == 1):
            entities[(start, end)] = SINGLE + '_' + ann.split('\t')[1].split(' ')[0]
          else:
            entities[(start, start + len(ent[0].text))] = BEGIN + '_' + ann.split('\t')[1].split(' ')[0]
            entities[(end - len(ent[-1].text)), end] = END + '_' + ann.split('\t')[1].split(' ')[0]
            for i in range(len(ent) - 2):
              spaces = (ent[i + 1].idx) - (ent[i].idx + len(ent[i].text))
              start = start + len(ent[i].text) + spaces
              entities[(start, start + len(ent[i + 1].text))] = INSIDE + '_' + ann.split('\t')[1].split(' ')[0]
            
  return entities


In [5]:
import os
import re
#from sklearn.preprocessing import MultiLabelBinarizer

def getElements(sst_home, max_len_seq, getTags = True):
  _words = dict()
  _doc_tags = {}
  _entities = {}
  _docs = {}
  _docs_offset = {}
  #mlb = MultiLabelBinarizer(classes = classes)
    
  for file in [file[:-4] for file in os.listdir(sst_home) if file.endswith('.txt') and not ' ' in file]:
    file_text = os.path.join(sst_home, file + '.txt')
    if getTags:
      file_ann = os.path.join(sst_home, file + '.ann')
      _entities = getDictEntities(file_ann)
    with open(file_text) as f:
      text = f.read()
      spacy_text = nlp(text)
      #spacy_text = [token for token in spacy_text if not token.is_stop]
    
      _tweet = []
      _tweet_tags = []
    
      for token in spacy_text[0:max_len_seq]:
          if not token.like_url:
              _tweet.append(token.text)
              _entity = _entities.get((token.idx, token.idx + len(token.text)), 'O')
              _tweet_tags.append(_entity)

    _docs[file] = _tweet
    _doc_tags[file] = _tweet_tags

  #_words = list(_words)

  return _docs, _doc_tags

sst_home_train = sst_home + '/final-profner-data/subtask-2/brat/train'
tweets, tags = getElements(sst_home_train, 75, getTags = True)

In [7]:
sst_home_train = sst_home + '/final-profner-data/subtask-2/brat/train'
import re
import os
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
texts = {}

for file in [file[:-4] for file in os.listdir(sst_home_train) if file.endswith('.txt') and not ' ' in file]:
    file_text = os.path.join(sst_home_train, file + '.txt')
    with open(file_text) as f:
      doc = f.read()
      doc = doc.replace('#', '')
      doc = re.sub(regex, '', doc)
    
    spacy_text = nlp(doc)
    text = '[CLS] '
    for sent in spacy_text.sents:
        text = text + sent.text + ' [SEP]'
    
    texts[file] = text

In [None]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('/home/sergio/Escritorio/ProfNER/fine-tuned-bert_3_epochs', do_lower_case=False)
model = BertModel.from_pretrained('/home/sergio/Escritorio/ProfNER/fine-tuned-bert_3_epochs/', output_hidden_states = True)
model.eval()

In [9]:
import tokenizations

def getEntitiesVectors(tokens, tokens_vectors, spacy_tokens, tags, BETO_DIM = 1536):
    special_tokens = ['[CLS]', '[SEP]', '[UNK]']
    entity_vectors = []
    aligment, _ = tokenizations.get_alignments(spacy_tokens, tokens)
    
    for token_aligment, tag in zip(aligment, tags):
        if tag != 'O':
            txtken = [tokens[i] for i in token_aligment if not tokens[i] in special_tokens]
            print(txtken)
            vector = [tokens_vectors[i] for i in token_aligment if not tokens[i] in special_tokens]
            if vector == []:
                vector = np.zeros(BETO_DIM)
            else:
                mean = torch.mean(torch.stack(vector), dim = 0)
                vector = mean.cpu().detach().numpy()
            entity_vectors.append(vector)
          
    return entity_vectors

In [13]:
entity_vectors = []
import numpy as np
for file_name, text in texts.items():
    spacy_tokens = tweets[file_name]
    tweet_tags = tags[file_name]
    
    tokens = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([indexed_tokens])

    segments_ids = [1] * len(tokens)
    segments_tensors = torch.tensor([segments_ids])

    with torch.no_grad():

        outputs = model(tokens_tensor, segments_tensors)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        #print(outputs)

        hidden_states = outputs[2]

    # Concatenate the tensors for all layers. We use `stack` here to
    # create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)
    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)
    
    # Stores the token vectors, with shape [22 x 768]
    token_vecs_sum = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Sum the vectors from the last four layers.
        sum_vec_1 = torch.mean(token[-2:], dim=0)
        sum_vec_2 = torch.mean(token[-4:-2], dim=0)
        sum_vec = torch.cat((sum_vec_1, sum_vec_2))
        

        # Use `sum_vec` to represent `token`.
        token_vecs_sum.append(sum_vec)
    #print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))
    entity_vectors = entity_vectors + getEntitiesVectors(tokens, token_vecs_sum, spacy_tokens, tweet_tags)

['profesionales']
['de']
['Sanidad']
['epidemi', '##ólogo']
['rastre', '##adores']
['sanitarios']
['Diri', '##gentes']
['Rey']
['autoridades']
['sanitarias']
['médicos']
['personal']
['sanitario']
['políticos']
['rey']
['em', '##éri', '##to']
['rastre', '##adores']
['Ministro']
['jugadores']
['políticos']
['periodistas']
['Sep', '##ult', '##urero']
['director']
['médico']
['trabajadores']
['obreros']
['fotógrafo']
['juez']
['santa', '##rios']
['colegiados']
['divul', '##gador']
['ex', '##minist', '##ros']
['juez']
['del']
['Tribunal']
['Supremo']
['neur', '##ólogo']
['Presidente']
['dirigentes']
['de']
['la']
['Comunidad']
['Del']
['.']
['de']
['Gobierno']
['profesionales']
['de']
['los']
['servicios']
['sociales']
['médico']
['profesionales']
['de']
['la']
['información']
['políticos']
['policia']
['Port', '##av', '##oz']
['del']
['Grupo']
['Municipal']
['edi', '##les']
['enfermera']
['ER', '##TE', '##s']
['autónomos']
['parados']
['personas']
['trabajadoras']
['personas']
['de']
['la

['G', '.', 'C', '.']
['ex']
['vicepresidente']
['médicos']
['residentes']
['adjunto', '##s']
['senadores']
['Presidente']
['de']
['la']
['República']
['alcalde']
['sanitarios']
['científicos']
['profesionales']
['sanitarios']
['trabajadores']
['Policía']
['Bomb', '##eros']
['Médico', '##s']
['Enferme', '##ros']
['Auxiliar', '##es']
['taxi', '##stas']
['militares']
['Personal']
['de']
['supermercado', '##s']
['farmacéuticos']
['administrativos']
['Ministros']
['investigadores']
['investigadora', '##s']
['filósofo']
['primera']
['dama']
['escritor']
['trabajadores']
['ER', '##TE']
['Ministra']
['personal']
['presidente']
['de']
['la']
['J', '##unta']
['especul', '##adores']
['sanitarios']
['juez', '##a']
['presidente']
['de']
['la']
['Generalitat']
['medi', '##co']
['personal']
['sanitario']
['autores']
['de']
['este']
['blog']
['profesionales']
['sanitarios']
['ministros']
['presidente']
['jefe']
['de']
['Opin', '##ión']
['Ter', '##tul', '##iano']
['Guardia', '##C', '##iv', '##il']
['Em

['taxi', '##stas']
['sanitarios']
['taxi', '##stas']
['empresario']
['sanitarios']
['gerente']
['jubi', '##lada']
['árbitro', '##s']
['guardia']
['civil']
['guardia']
['civil']
['Trabajadores']
['compañera']
['traba', '##ya', '##dores']
['públicos']
['personal']
['sanitario']
['cantante']
['policia']
['mos', '##sos']
['rastre', '##ador']
['alumnado']
['alcalde', '##sa']
['conductor']
['del']
['bus']
['personas']
['trabajadoras']
['por']
['cuenta']
['propia']
[]
['autónomas']
['pres', '##bí', '##tero']
['dictador']
['FUN', '##CI', '##ONA', '##RI', '##OS']
['DE']
['PR', '##ISI', '##ONES']
['desempleo']
['director']
['en']
['paro']
['sanitarios']
['alumno']
['médicos']
['doctora']
['periodistas']
['autónomos']
['Er', '##tes']
['autónomos']
['trabajadores']
['ER', '##TE']
['diputados']
['diputados']
['operar', '##ios']
['técnicos']
['Papa']
['asesores']
['agentes']
['sociales']
['miembros']
['del']
['Gobierno']
['Guardia']
['Civil']
['Policía']
['Nacional']
['ministro']
['de']
['Sanidad']


['econom', '##ista']
['escritor']
['arzobispo']
['Policía']
['arzobispo']
['policia']
['sin']
['trabajo']
['trabajador']
['Rey']
['Rey']
['guardia', '##ci', '##vil']
['operar', '##io']
['de']
['Protección']
['Civil']
['vir', '##ólogo']
['experto']
['en']
['corona', '##vir', '##us']
['policías']
['guardias']
['civiles']
['personal']
['de']
['riesgo']
['ministro']
['policía']
['alcalde']
['limpia', '##doras']
['de']
['hospitales']
['personal']
['rastre', '##adores']
['minist', '##ra']
['portavoz']
['miembros']
['del']
['gobierno']
['ministro']
['de']
['Sanidad']
['Presidente']
['Policía']
['D', '##J']
['trabajadores']
['investigadores']
['profesionales']
['del']
['sector']
['presentador', '##a']
['Policía', '##s']
['agentes']
['investigador']
['director']
['ladrón']
['policía']
['nin', '##ja']
['agentes']
['de']
['la']
['policía']
['guardia']
['civil']
['médicos']
['compañeros']
['sanitarios']
['jubi', '##lados']
['presidente']
['portavoz']
['de']
['la']
['Generalitat']
['periodistas']
[

['presidente']
['productor']
['de']
['narco']
['series']
['persona']
['del']
['ámbito']
['sanitario']
['portavoz']
['ER', '##TE']
['GC']
['diputado']
['policía']
[]
['Estudi', '##ante']
['poli', '##ticos']
['ladrón']
['ministro']
['políticas']
['políticos']
['presidente']
['jefe']
['de']
['gabinete']
['alum', '##na']
['doctora']
['empresario']
['empresario']
['legi', '##onarios']
['directivos']
['de']
['la']
['industria']
['farmacéutica']
['representantes']
['del']
['arco']
['político']
['Director']
['general']
['de']
['economía']
['Doctor']
['Secretario']
['de']
['Salud']
['del']
['Estado']
['minist', '##ra']
['de']
['educación']
['trabajadores']
['residentes']
['residentes']
['médicos']
['abogado']
['escolta', '##s']
['personal']
['escolta', '##s']
['presidente']
['policia']
['secreta']
['alcalde']
['alcalde']
['alcalde', '##ee']
['policía']
['Far', '##mac', '##é', '##uti', '##cos']
['funcionar', '##ia']
['de']
['prisiones']
['Go', '##b', '.']
['alcalde']
['presidente']
['Médico', '#

['monjas']
['de']
['clausura']
['rastre', '##adores']
['profesionales']
['colaborador', '##a']
['policía']
['minist', '##ras']
['policia']
['doctora']
['médicos']
['sanitarios']
['sanitarios']
['sanitarios']
['trabajadores']
['ficti', '##cios']
['ER', '##TE', '##s']
['consejera']
['de']
['Sanidad']
['sanitarios']
['Capitán']
['personal']
['sanitario']
['pere', '##odi', '##sta']
['Ex']
['director']
['ex', '-', 'asalariados']
['rastre', '##adores']
['estudiantes']
['ministro']
['de']
['Sanidad']
['médicos']
['agentes']
['personal']
['de']
['rastre', '##o']
['sanitarios']
['juez']
['consejero']
['Presidente']
['sanitarios']
['Diri', '##gentes']
['médico']
['Neu', '##ró', '##log', '##os']
['expertos']
['en']
['ce', '##fal', '##eas']
['Reyes']
['gobernantes']
['directora']
['M', '##É', '##DI', '##CA']
['DE']
['AT', '##ENCI', '##Ó', '##N']
['PRIM', '##ARIA']
['J', '##ugador', '##es']
['sanitarios']
['policía']
['porta', '##vo', '##ces']
['Alcalde', '##sa']
['fotógrafo']
['presidente']
['del'

['sanitarios']
['Policía']
['sanitarios']
['dipu', '##tadas']
['presidente']
['políticos']
['Comisario']
['J', '##efe']
['Opera', '##tivo']
['de']
['la']
['Comis', '##aría']
['policia']
['Comisario']
['Inspector']
['J', '##efe']
['policías']
['fijos']
['preca', '##rios']
['despedido', '##s']
['presidente']
['sanitarios']
['putas']
['líderes']
['europeos']
['Agentes']
['sociales']
['alcalde', '##s']
['porta', '##vo', '##ces']
['vicepres', '##identa']
['investigadores']
['investigadores']
['vir', '##óloga']
['personal']
['del']
['avión']
['Médico', '##s']
['periodistas']
['Policía']
['Nacional']
['Policía']
['Nacional']
['comisario', '##s']
['políticos']
['porta', '##vo', '##ces']
['de']
['la']
['Asamblea']
['dem', '##óg', '##ra', '##fa']
['empleados']
['sanitarios']
['Rey']
['ministro']
['de']
['Agricultura']
[',']
['Pesca']
['investigador']
['gene', '##tista']
['desempleo']
['sanitaria']
['reportero']
['Presidente']
['de']
['la']
['República']
['experta']
['policia']
['hac', '##k', '##

In [16]:
import pickle5 as pickle

with open('/home/sergio/Escritorio/ProfNER/final-saved_data/bert_entities.pickle', 'wb') as file:
    pickle.dump(entity_vectors, file, protocol=pickle.HIGHEST_PROTOCOL)