In [11]:
# CELL 1: Mount Google Drive, then read the cleaned RA discharge notes

import os

import pandas as pd
from google.colab import drive

# Make the Drive folder visible to this Colab runtime
drive.mount('/content/drive')

# Root folder of the project on Drive
base_path = "/content/drive/MyDrive/Rahul_DTSC5082_Project/Scenario2_MIMIC"

# Parquet file with the cleaned RA notes, located under <base_path>/outputs
notes_clean_path = os.path.join(base_path, "outputs", "RA_NOTES_CLEANED.parquet")

# Read the notes into a DataFrame
ra_notes = pd.read_parquet(notes_clean_path)

# Report the frame's dimensions, then show the first three rows as the cell output
print("RA notes shape:", ra_notes.shape)
ra_notes.head(3)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
RA notes shape: (4979, 9)


Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,clean_text
0,10010997-DS-9,10010997,20783870,DS,9,2139-05-02 00:00:00,2139-05-07 15:37:00,\nName: ___ Unit No: ___\n \...,admission date: discharge date: date of birth:...
1,10021930-DS-13,10021930,20480646,DS,13,2177-01-14 00:00:00,2177-01-15 07:20:00,\nName: ___ Unit No: ___\...,admission date: discharge date: date of birth:...
2,10028125-DS-14,10028125,29060034,DS,14,2171-07-01 00:00:00,2171-07-02 10:06:00,\nName: ___ Unit No: ___\n \n...,admission date: discharge date: date of birth:...


**2 — Install + Setup**

In [12]:
# CELL 2: Install transformers and check GPU

!pip install transformers sentencepiece accelerate -q

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Using device:", device)


Using device: cuda


**3 — Load ClinicalBERT Model & Tokenizer**

In [13]:
# CELL 3: Load the ClinicalBERT tokenizer and encoder

from transformers import AutoTokenizer, AutoModel

# Hugging Face identifier of the ClinicalBERT checkpoint
clinicalbert_name = "emilyalsentzer/Bio_ClinicalBERT"

clinical_tokenizer = AutoTokenizer.from_pretrained(clinicalbert_name)

# Load the weights, place them on `device` (GPU or CPU), and switch to
# evaluation mode; .to() and .eval() both return the module itself,
# so the three steps can be chained.
clinical_model = AutoModel.from_pretrained(clinicalbert_name).to(device).eval()

print("Loaded ClinicalBERT:", clinicalbert_name)
print("Hidden size:", clinical_model.config.hidden_size)


Loaded ClinicalBERT: emilyalsentzer/Bio_ClinicalBERT
Hidden size: 768


**4 — Improved mean-pooling embedding function**

In [14]:
# CELL 4: Improved embedding function with attention-masked mean pooling

from tqdm.auto import tqdm
import torch
import numpy as np

def compute_bert_embeddings_meanpool(texts, tokenizer, model, batch_size=16, max_length=512,
                                     show_progress=True):
    """
    Embed each text as the attention-masked mean of the model's last hidden state.

    Parameters
    ----------
    texts : iterable of str
        Documents to embed. The iterable is materialised into a list, so a
        tuple, pandas Series or generator is accepted as well as a list.
    tokenizer : callable
        Called as ``tokenizer(batch, padding=True, truncation=True,
        max_length=max_length, return_tensors="pt")``; must return a mapping
        with ``"input_ids"`` and ``"attention_mask"`` tensors.
    model : torch.nn.Module
        Called as ``model(input_ids=..., attention_mask=...)``; the result must
        expose ``last_hidden_state`` of shape (batch, seq_len, hidden).
    batch_size : int, default 16
        Number of texts per forward pass. Must be >= 1.
    max_length : int, default 512
        Forwarded to the tokenizer together with ``truncation=True``.
    show_progress : bool, default True
        Wrap the batch loop in a tqdm progress bar.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(texts), hidden_size), one row per input text in
        input order. For empty input the shape is (0, hidden_size), where
        hidden_size is read from ``model.config.hidden_size`` (0 if absent).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Batches are passed to the tokenizer as plain lists of strings.
    texts = list(texts)

    # np.vstack cannot stack zero arrays, so answer the empty case directly.
    if not texts:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Send inputs to the device the model's weights live on instead of relying
    # on a notebook-level `device` variable that may be stale or undefined.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        # A parameter-free model carries no device information; use the CPU.
        model_device = torch.device("cpu")

    batch_starts = range(0, len(texts), batch_size)
    if show_progress:
        batch_starts = tqdm(batch_starts)

    all_embeddings = []

    model.eval()
    with torch.no_grad():
        for start in batch_starts:
            batch_texts = texts[start : start + batch_size]

            encodings = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )

            input_ids = encodings["input_ids"].to(model_device)
            attention_mask = encodings["attention_mask"].to(model_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden = outputs.last_hidden_state  # (batch, seq_len, hidden)

            # (batch, seq_len, 1) mask; broadcasting zeroes out padded positions.
            mask = attention_mask.unsqueeze(-1).float()

            # Sum the unmasked token vectors and divide by the number of
            # unmasked tokens; the clamp guards against division by zero.
            sum_embeddings = (last_hidden * mask).sum(dim=1)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)  # (batch, 1)

            mean_embeddings = sum_embeddings / token_counts  # (batch, hidden)

            all_embeddings.append(mean_embeddings.cpu().numpy())

    return np.vstack(all_embeddings)


**5 — Recompute ClinicalBERT embeddings with mean pooling**

In [15]:
# NEW CELL 5: Embed every cleaned note with ClinicalBERT (mean pooling) and write the result to parquet

texts = ra_notes['clean_text'].tolist()
print("Total texts to embed:", len(texts))

# 32 notes per forward pass; max_length=512 is forwarded to the tokenizer
clinical_embeddings_mean = compute_bert_embeddings_meanpool(
    texts=texts, tokenizer=clinical_tokenizer, model=clinical_model,
    batch_size=32, max_length=512,
)

print("Embedding array shape (mean-pooled):", clinical_embeddings_mean.shape)

# One column per embedding dimension: cb_mean_0, cb_mean_1, ...
emb_cols = [f"cb_mean_{i}" for i in range(clinical_embeddings_mean.shape[1])]

# float32 embedding columns followed by the identifier columns, which are
# copied positionally (via .values) from ra_notes
clinical_mean_df = pd.DataFrame(
    clinical_embeddings_mean.astype("float32"), columns=emb_cols
).assign(
    note_id=ra_notes["note_id"].values,
    subject_id=ra_notes["subject_id"].values,
    hadm_id=ra_notes["hadm_id"].values,
)

# Output file under <base_path>/outputs
clin_mean_path = os.path.join(base_path, "outputs", "RA_NOTES_EMB_ClinicalBERT_meanpool.parquet")

clinical_mean_df.to_parquet(clin_mean_path, index=False)

# Message and path are printed on separate lines
print("Saved ClinicalBERT (mean-pooled) embeddings to:", clin_mean_path, sep="\n")
print("Final DF shape:", clinical_mean_df.shape)


Total texts to embed: 4979


  0%|          | 0/156 [00:00<?, ?it/s]

Embedding array shape (mean-pooled): (4979, 768)
Saved ClinicalBERT (mean-pooled) embeddings to:
/content/drive/MyDrive/Rahul_DTSC5082_Project/Scenario2_MIMIC/outputs/RA_NOTES_EMB_ClinicalBERT_meanpool.parquet
Final DF shape: (4979, 771)


**6 — Clear ClinicalBERT and Load BioBERT**

In [16]:
# CELL 6: Release ClinicalBERT and bring up BioBERT (mean-pooling setup)

import gc
from transformers import AutoTokenizer, AutoModel

# 1) Drop the ClinicalBERT globals; pop() with a default is a no-op when the
#    name is not defined, so this is safe to run on a fresh kernel.
for var in ("clinical_model", "clinical_tokenizer"):
    globals().pop(var, None)

# Collect the unreferenced objects, then ask PyTorch to release cached CUDA memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# 2) Load BioBERT
biobert_name = "dmis-lab/biobert-base-cased-v1.1"

bio_tokenizer = AutoTokenizer.from_pretrained(biobert_name)

# .to() and .eval() both return the module, so device placement and
# evaluation mode are applied in one chained expression.
bio_model = AutoModel.from_pretrained(biobert_name).to(device).eval()

print("Loaded BioBERT:", biobert_name)
print("Hidden size:", bio_model.config.hidden_size)


Loaded BioBERT: dmis-lab/biobert-base-cased-v1.1
Hidden size: 768


**7 — Compute & Save BioBERT Mean-Pooled Embeddings**

In [17]:
# CELL 7: Embed every cleaned note with BioBERT (mean pooling) and write the result to parquet

texts = ra_notes['clean_text'].tolist()
print("Total texts to embed:", len(texts))

# 32 notes per forward pass; max_length=512 is forwarded to the tokenizer
bio_embeddings_mean = compute_bert_embeddings_meanpool(
    texts=texts, tokenizer=bio_tokenizer, model=bio_model,
    batch_size=32, max_length=512,
)

print("BioBERT embedding array shape (mean-pooled):", bio_embeddings_mean.shape)

# One column per embedding dimension: bb_mean_0, bb_mean_1, ...
bio_emb_cols = [f"bb_mean_{i}" for i in range(bio_embeddings_mean.shape[1])]

# float32 embedding columns followed by the identifier columns (used as merge
# keys later), copied positionally (via .values) from ra_notes
bio_mean_df = pd.DataFrame(
    bio_embeddings_mean.astype("float32"), columns=bio_emb_cols
).assign(
    note_id=ra_notes["note_id"].values,
    subject_id=ra_notes["subject_id"].values,
    hadm_id=ra_notes["hadm_id"].values,
)

# Output file under <base_path>/outputs
bio_mean_path = os.path.join(base_path, "outputs", "RA_NOTES_EMB_BioBERT_meanpool.parquet")

bio_mean_df.to_parquet(bio_mean_path, index=False)

# Message and path are printed on separate lines
print("Saved BioBERT (mean-pooled) embeddings to:", bio_mean_path, sep="\n")
print("Final BioBERT DF shape:", bio_mean_df.shape)


Total texts to embed: 4979


  0%|          | 0/156 [00:00<?, ?it/s]

BioBERT embedding array shape (mean-pooled): (4979, 768)
Saved BioBERT (mean-pooled) embeddings to:
/content/drive/MyDrive/Rahul_DTSC5082_Project/Scenario2_MIMIC/outputs/RA_NOTES_EMB_BioBERT_meanpool.parquet
Final BioBERT DF shape: (4979, 771)


**8 — Clear BioBERT & Load BERT-base**

In [18]:
# CELL 8: Release BioBERT and bring up the generic BERT-base model

import gc
import torch
from transformers import AutoTokenizer, AutoModel

# 1) Drop the BioBERT globals; pop() with a default is a no-op when the name
#    is not defined.
for var in ("bio_model", "bio_tokenizer"):
    globals().pop(var, None)

# Collect the unreferenced objects, then ask PyTorch to release cached CUDA memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# 2) Load generic BERT-base model
bert_name = "bert-base-uncased"

bert_tokenizer = AutoTokenizer.from_pretrained(bert_name)

# .to() and .eval() both return the module, so device placement and
# evaluation mode are applied in one chained expression.
bert_model = AutoModel.from_pretrained(bert_name).to(device).eval()

print("Loaded BERT-base:", bert_name)
print("Hidden size:", bert_model.config.hidden_size)


Loaded BERT-base: bert-base-uncased
Hidden size: 768


**9 — Compute & Save BERT-base Mean-Pooled Embeddings**

In [19]:
# CELL 9: Embed every cleaned note with BERT-base (mean pooling) and write the result to parquet

texts = ra_notes['clean_text'].tolist()
print("Total texts to embed:", len(texts))

# 32 notes per forward pass; max_length=512 is forwarded to the tokenizer
bert_embeddings_mean = compute_bert_embeddings_meanpool(
    texts=texts, tokenizer=bert_tokenizer, model=bert_model,
    batch_size=32, max_length=512,
)

print("BERT-base embedding array shape (mean-pooled):", bert_embeddings_mean.shape)

# One column per embedding dimension: bert_mean_0, bert_mean_1, ...
bert_emb_cols = [f"bert_mean_{i}" for i in range(bert_embeddings_mean.shape[1])]

# float32 embedding columns followed by the identifier columns (used as merge
# keys later), copied positionally (via .values) from ra_notes
bert_mean_df = pd.DataFrame(
    bert_embeddings_mean.astype("float32"), columns=bert_emb_cols
).assign(
    note_id=ra_notes["note_id"].values,
    subject_id=ra_notes["subject_id"].values,
    hadm_id=ra_notes["hadm_id"].values,
)

# Output file under <base_path>/outputs
bert_mean_path = os.path.join(base_path, "outputs", "RA_NOTES_EMB_BERTbase_meanpool.parquet")

bert_mean_df.to_parquet(bert_mean_path, index=False)

# Message and path are printed on separate lines
print("Saved BERT-base (mean-pooled) embeddings to:", bert_mean_path, sep="\n")
print("Final BERT-base DF shape:", bert_mean_df.shape)


Total texts to embed: 4979


  0%|          | 0/156 [00:00<?, ?it/s]

BERT-base embedding array shape (mean-pooled): (4979, 768)
Saved BERT-base (mean-pooled) embeddings to:
/content/drive/MyDrive/Rahul_DTSC5082_Project/Scenario2_MIMIC/outputs/RA_NOTES_EMB_BERTbase_meanpool.parquet
Final BERT-base DF shape: (4979, 771)
