In [11]:
from transformers import AutoTokenizer, AutoModel
import torch
from pathlib import Path
from tqdm import tqdm  # Single additional import

In [12]:

# Load BioBERT
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [13]:
import os
print("Working Directory:", os.getcwd())

Working Directory: /Users/ishaanroopesh/Documents/Final_Year_Project/Ingestion/scripts


In [None]:
# File path
input_file = Path("../embeddings/input_texts.txt") 
output_file = Path("../embeddings/bio_clincalbert_embeddings.pt")

# Load texts
with open(input_file, "r", encoding="utf-8") as f:
    texts = [line.strip() for line in f.readlines() if line.strip()]

# Generate embeddings with simple progress bar
all_embeddings = []
with torch.no_grad():
    for text in tqdm(texts, desc="Generating embeddings"):  # Only this line changed
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        all_embeddings.append(cls_embedding.squeeze(0))

# Stack all into one tensor and save
embeddings_tensor = torch.stack(all_embeddings)
torch.save(embeddings_tensor, output_file)

print(f"✅ Saved {len(all_embeddings)} BioBERT embeddings to {output_file}")

Generating embeddings: 100%|██████████| 70465/70465 [52:56<00:00, 22.19it/s]    


✅ Saved 70465 BioBERT embeddings to ../embeddings/bio_clincalbert_embeddings.pt
