In [1]:
from transformers import BertModel, BertTokenizer
import torch

# Load ClinicalBERT model and tokenizer
model_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


In [2]:
import numpy as np
from tqdm import tqdm

# Get the list of tokens in the vocabulary
vocab = tokenizer.get_vocab()

# Prepare a dictionary to store token embeddings
token_embeddings = {}

# Iterate over the vocabulary to get embeddings for each token
for token, token_id in tqdm(vocab.items(), desc="Processing tokens", total=len(vocab)):
    # Tokenize and get input IDs
    inputs = tokenizer(token, return_tensors='pt')
    
    # Get hidden states (embeddings) from the model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the embedding for the token (taking the first token's embedding)
    embedding = outputs.last_hidden_state[0][0].numpy()
    
    # Store the token and its corresponding embedding
    token_embeddings[token] = embedding


Processing tokens: 100%|██████████| 28996/28996 [16:04<00:00, 30.06it/s]


In [3]:
# Save the embeddings to a txt file
embedding_file = 'clinicalbert_embeddings.txt'

with open(embedding_file, 'w', encoding='utf-8') as f:
    for token, embedding in token_embeddings.items():
        embedding_str = ' '.join(map(str, embedding))
        f.write(f"{token} {embedding_str}\n")

print(f"Embeddings saved to {embedding_file}")


Embeddings saved to clinicalbert_embeddings.txt


In [7]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np

# Load BlueBERT model and tokenizer
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16")

In [8]:
# Get the list of tokens in the vocabulary
vocab = tokenizer.get_vocab()

# Prepare a dictionary to store token embeddings
from tqdm import tqdm

# Prepare a dictionary to store token embeddings
token_embeddings = {}

# Iterate over the vocabulary to get embeddings for each token
for token, token_id in tqdm(vocab.items(), desc="Extracting Embeddings"):
    # Tokenize and get input IDs
    inputs = tokenizer(token, return_tensors='pt')
    
    # Get hidden states (embeddings) from the model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the embedding for the token (taking the first token's embedding)
    embedding = outputs.last_hidden_state[0][0].numpy()
    
    # Store the token and its corresponding embedding
    token_embeddings[token] = embedding



Extracting Embeddings: 100%|██████████| 28996/28996 [6:19:50<00:00,  1.27it/s]      


In [9]:

# Save the embeddings to a txt file
embedding_file = 'bluebert_embeddings.txt'

with open(embedding_file, 'w', encoding='utf-8') as f:
    for token, embedding in token_embeddings.items():
        embedding_str = ' '.join(map(str, embedding))
        f.write(f"{token} {embedding_str}\n")

print(f"Embeddings saved to {embedding_file}")


Embeddings saved to bluebert_embeddings.txt
