In [1]:

import torch
from transformers import BertTokenizer, BertModel
import sys
import os
from tqdm import tqdm
sys.path.append("/home/jovyan/20230406_ArticleClassifier/ArticleClassifier")

import src.general.global_variables as gv
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname('data_loader.py'), os.path.pardir)))
from src.data.data_loader import DataLoader

from src.general.utils import cc_path

import pandas as pd
import numpy as np

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


In [3]:
model_version = 'scibert_scivocab_uncased'
do_lower_case = True
# model = BertModel.from_pretrained(model_version)
# model = torch.load(cc_path(f'models/embedders/finetuned_bert_56k_20e_3lay_best_iter.pt'))
model = torch.load(cc_path(f'models/baselines/paula_finetuned_bert_56k_10e_tka.pt'),map_location=torch.device('cpu'))
model = model.base_model

tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [4]:
from sklearn.metrics.pairwise import cosine_similarity
def embed_text(text, model):
    # print(text)
    encoded_text = tokenizer.encode(text, max_length=512, truncation=True)
    input_ids = torch.tensor(encoded_text).unsqueeze(0).to(device)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states 

def get_similarity(em, em2):
    return cosine_similarity(em.detach().cpu().numpy(), em2.detach().cpu().numpy())

In [5]:
# We will use a mean of all word embeddings. To do that we will take mean over dimension 1 which is the sequence length.
coronavirus_em = embed_text(["Coronavirus"]*50000, model).mean(1)
mers_em = embed_text("Middle East Respiratory Virus", model).mean(1)
flu_em = embed_text("Flu", model).mean(1)
bog_em = embed_text("Bog", model).mean(1)
covid_2019 = embed_text("COVID-2019", model).mean(1)
print("Similarity for Coronavirus and Flu:" + str(get_similarity(coronavirus_em, flu_em)))
print("Similarity for Coronavirus and MERs:" + str(get_similarity(coronavirus_em, mers_em)))
print("Similarity for Coronavirus and COVID-2019:" + str(get_similarity(coronavirus_em, covid_2019)))
print("Similarity for Coronavirus and Bog:" + str(get_similarity(coronavirus_em, bog_em)))

Similarity for Coronavirus and Flu:[[0.1371265]]
Similarity for Coronavirus and MERs:[[0.13969515]]
Similarity for Coronavirus and COVID-2019:[[0.0399775]]
Similarity for Coronavirus and Bog:[[0.14438891]]


In [7]:
def scibert_embedding(emb_dat, embedding_dim):
    """Create the SciBERT embedding"""

    print('Initiating DataFrame for saving embedding...')
    embedding_cols = [f'd{i}' for i in range(embedding_dim)]
    embedded_df = pd.DataFrame(columns=['pui'] + embedding_cols)
    embedded_df['pui'] = emb_dat.loc[:, 'pui']
    embedded_df.set_index('pui', inplace=True)

    emb_dat.set_index('pui', inplace=True)

    # create embeddings
    print('Creating embeddings for all documents...')
    for idx, sentence in tqdm(emb_dat.iterrows(), total=len(emb_dat)):
        embedded_df.loc[idx] = embed_text(sentence['title'] + ' ' + ' '.join(sentence['keywords'])  + ' ' + sentence['abstract'], model).mean(1).detach().cpu().numpy()

    embedded_df.reset_index(names='pui', inplace=True)

    return embedded_df

In [8]:
loc_dict = {
        'processed_csv': cc_path('data/processed/canary/articles_cleaned.csv')
    }
data_loader = DataLoader(loc_dict)
processed_df = data_loader.load_processed_csv()
label_columns = processed_df.loc[:, ~processed_df.columns.isin(
    ['file_name', 'pui', 'title', 'keywords', 'abstract', 'abstract_2', 'authors', 'organization', 'chemicals',
     'num_refs', 'date-delivered', 'labels_m', 'labels_a'])]
label_columns = label_columns.astype(int)

embedding_type = 'scibert'

data_for_embedding = processed_df.dropna(subset=['abstract'])
data_for_embedding.loc[:, 'labels_m'] = data_for_embedding.loc[:, 'labels_m'].fillna('')
# data_for_embedding.loc[:, 'list_label'] = data_for_embedding.loc[:, 'labels_m'].str.split(',')

embedding_dim = 768
embedded_df = scibert_embedding(data_for_embedding, embedding_dim)


embedded_df.to_csv(cc_path(f'data/processed/canary/embeddings_{embedding_type}_paula_finetuned_20230430.csv'), index=False)

Initiating DataFrame for saving embedding...
Creating embeddings for all documents...


  0%|          | 39/117310 [00:10<8:33:54,  3.80it/s] 


KeyboardInterrupt: 

In [9]:
print(embedded_df)

               pui        d0        d1        d2        d3        d4   
0        624531411   0.74236 -0.535504  0.036692 -0.416541  -0.75081  \
1        625340088  0.967976 -0.299125  0.064699 -0.364201 -0.022266   
2        625805682  1.133411  0.667837  0.488474 -0.367336  0.449929   
3        626662493  1.099647 -0.859732 -0.199522  1.031542 -0.404239   
4        626822402  1.280313 -0.539932 -0.401579  0.372705 -0.481402   
...            ...       ...       ...       ...       ...       ...   
117305  2011621972  0.418833  -0.86512 -0.347888  1.183458 -1.132746   
117306  2011622024  0.401095  -0.67544 -0.615833  1.649338 -0.555637   
117307  2011622065 -0.066398 -0.604516   0.10489  1.134272  -1.20695   
117308  2011626864  0.510944  -0.35875 -0.455006  0.771072 -1.275665   
117309  2011632199  0.125398 -0.542556 -0.835572  1.569164 -0.376448   

              d5        d6        d7        d8  ...      d758      d759   
0      -0.258733  0.937975  0.529564 -0.371828  ...  0.60683