In [1]:
# Hugging Face Hub identifier of the checkpoint used for both the tokenizer
# and the encoder in this notebook.
MODEL = "DeepPavlov/rubert-base-cased-sentence"

In [2]:
import torch
from transformers import BertTokenizer, BertModel
# The checkpoint is a *cased* model (see its name and 119547-entry vocabulary),
# so the input casing must be preserved. With do_lower_case=True BertTokenizer
# also strips accents by default, which rewrites Russian "й"/"ё" as "и"/"е"
# before WordPiece lookup and degrades tokenization against the cased vocab.
tokenizer = BertTokenizer.from_pretrained(MODEL, do_lower_case=False)

import pandas as pd
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects; the cells below rely on
# it to show a progress bar while embedding every row.
tqdm.pandas()

import config

In [3]:
# Load the working dataset from the CSV path configured in the project-local
# `config` module. The text to embed is later read from its `no_references`
# column.
final_df = pd.read_csv(config.final_df)

In [4]:
# Instantiate the pre-trained encoder. output_hidden_states=True makes every
# forward pass also return the hidden states of all layers.
model = BertModel.from_pretrained(MODEL, output_hidden_states=True)

# Use the first CUDA device when one is available, otherwise stay on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
model.to(device)

# Switch to evaluation (inference) mode. Kept as the last expression of the
# cell so the notebook still echoes the model architecture.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [5]:
def texts_from_vectors(text, max_length=512):
    """Embed ``text`` as one fixed-size vector with the notebook's BERT model.

    Despite its (historical) name, this function maps a text to a vector: the
    text is tokenized, wrapped in ``[CLS]`` / ``[SEP]``, run through ``model``
    and the second-to-last hidden layer is averaged over all tokens
    (including the two markers).

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The string to embed. An empty string is valid and yields the
            mean over just the ``[CLS]`` and ``[SEP]`` vectors.
        max_length: Maximum number of tokens fed to the model, markers
            included. Defaults to 512, the size of the model's position
            embedding table.

    Returns:
        A 1-D ``torch.Tensor`` (hidden size) located on ``device``.
    """
    # Truncate at *token* level. Cutting the raw string to 512 characters does
    # not bound the number of word pieces (so the position embeddings could
    # still be overrun) and needlessly discards text that would have fitted.
    room_for_text = max(max_length - 2, 0)
    tokenized_text = ["[CLS]"] + tokenizer.tokenize(text)[:room_for_text] + ["[SEP]"]

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens], device=device)

    # The second positional argument of BertModel.forward is the attention
    # mask (not the segment ids), so pass the all-ones tensor by keyword.
    # Token type ids are left at their default for this single-segment input.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)

    # With output_hidden_states=True the third output holds the hidden states
    # of every layer; take the second-to-last layer of the only batch item.
    hidden_states = outputs[2]
    token_vecs = hidden_states[-2][0]

    # Average over the token axis to obtain a single sentence vector.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding

In [6]:
# Embed every row's `no_references` text (missing values become the empty
# string) and keep the resulting tensors in a new `bert_vectors` column.
texts_to_embed = final_df['no_references'].fillna('')
final_df['bert_vectors'] = texts_to_embed.progress_apply(texts_from_vectors)

100%|██████████| 63018/63018 [13:31<00:00, 77.67it/s] 


In [7]:
# Bring every embedding tensor back to host memory.
final_df['bert_vectors'] = final_df['bert_vectors'].progress_apply(torch.Tensor.cpu)

100%|██████████| 63018/63018 [00:00<00:00, 75809.09it/s]


In [8]:
# Turn each tensor into a plain Python list of floats.
final_df['bert_vectors'] = final_df['bert_vectors'].progress_apply(torch.Tensor.tolist)

100%|██████████| 63018/63018 [00:01<00:00, 55051.85it/s]


In [9]:
# Persist the frame, now including `bert_vectors`, without the index column.
# NOTE: this writes to the same `config.final_df` path the frame was loaded
# from, so the input file is overwritten in place.
final_df.to_csv(config.final_df, index=False)