Import Libraries

In [4]:
import pandas as pd
import re
from collections import Counter
import scispacy
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.preprocessing import normalize
print(torch.__version__)

2.6.0+cu118


Extract Progress Notes' Headers

In [8]:
# === CONFIG ===
MIMIC_NOTES_PATH = r'C:\Users\Administrator\Desktop\medllm evn\Data\mimic-iii-clinical-database-1.4\NOTEEVENTS.csv.gz'  # update to your actual path
NOTE_CATEGORIES = ['Discharge summary', 'Physician']
SAMPLE_SIZE = 2000  # adjust for speed vs. coverage

# === 1. Load Data ===
print("Loading notes...")
notes_df = pd.read_csv(MIMIC_NOTES_PATH, dtype={"TEXT": str}, usecols=["CATEGORY", "TEXT"])
notes_df = notes_df[notes_df["CATEGORY"].isin(NOTE_CATEGORIES)].dropna(subset=["TEXT"])
notes_sample = notes_df.sample(n=min(SAMPLE_SIZE, len(notes_df)), random_state=42)["TEXT"]

# === 2. Extract Candidate Section Headers ===
def extract_headers(text):
    # Match UPPERCASE lines (possibly ending with :) as section headers
    pattern = re.compile(r'^\s*([A-Z][A-Z\s\-_/]+):?\s*$', re.MULTILINE)
    return pattern.findall(text)

# === 3. Normalize Headers ===
def normalize_header(header):
    header = header.lower()
    header = header.strip(": ")
    header = header.replace("hx", "history")
    header = re.sub(r'\s+', ' ', header)
    return header

# === 4. Run Extraction ===
header_counter = Counter()
print("Extracting headers...")
for note in notes_sample:
    raw_headers = extract_headers(note)
    norm_headers = [normalize_header(h) for h in raw_headers]
    header_counter.update(norm_headers)

# === 5. Show Results ===
print("\nTop 50 Section Headers:")
for i, (header, count) in enumerate(header_counter.most_common(50), 1):
    print(f"{i:2d}. {header:40s} ({count} occurrences)")


Loading notes...
Extracting headers...

Top 50 Section Headers:
 1. impression                               (456 occurrences)
 2. discharge diagnoses                      (240 occurrences)
 3. past medical history                     (179 occurrences)
 4. discharge medications                    (168 occurrences)
 5. reason for this examination              (126 occurrences)
 6. htn                                      (109 occurrences)
 7. nc                                       (95 occurrences)
 8. leuk-neg                                 (85 occurrences)
 9. medications                              (78 occurrences)
10. imaging                                  (67 occurrences)
11. admission labs                           (65 occurrences)
12. n/a                                      (62 occurrences)
13. medications on discharge                 (61 occurrences)
14. discharge diagnosis                      (61 occurrences)
15. medications on admission                 (57 occurrences)


Create User Notes Embeddings per Visit

In [9]:
SECTION_HEADERS = list(header_counter.keys())

def split_into_sections(text):
    # Convert to lowercase for easier matching
    text = text.lower()
    pattern = '|'.join([fr'\n.*{header}.*\n' for header in SECTION_HEADERS])
    splits = re.split(pattern, text)
    return [s.strip() for s in splits if s.strip()]

nlp = spacy.load('en_core_sci_md')

def split_into_sentences(section_texts):
    sentences = []
    for section in section_texts:
        doc = nlp(section)
        sentences.extend([sent.text.strip() for sent in doc.sents if sent.text.strip()])
    return sentences


tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model.eval()

def encode_sentences(sentences, batch_size=16, max_len=128):
    all_embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
            all_embeddings.append(cls_embeddings)
    return torch.cat(all_embeddings).cpu().numpy()

def aggregate_patient_embedding(note_text):
    sections = split_into_sections(note_text)
    sentences = split_into_sentences(sections)
    sentence_embeddings = encode_sentences(sentences)
    patient_embedding = np.mean(sentence_embeddings, axis=0)
    return normalize(patient_embedding.reshape(1, -1))[0]  # Normalize per patient

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [10]:
notes = pd.read_csv(
    r'...\NOTEEVENTS.csv.gz',
    dtype={4: str, 5: str}  # or int, float, etc. depending on data
)

notes = notes[notes["CATEGORY"].isin(["Discharge summary"])]
notes = notes.dropna(subset=["TEXT", "HADM_ID"])

patient_texts = notes.groupby("HADM_ID")["TEXT"].apply(lambda x: "\n".join(x)).reset_index()

patient_embeddings = []

for _, row in patient_texts.iterrows():
    embedding = aggregate_patient_embedding(row["TEXT"])
    #print(embedding)
    patient_embeddings.append(embedding)

# Resulting matrix
embeddings = np.array(patient_embeddings)


KeyboardInterrupt: 

In [None]:
np.savez(r'...\useremb.npz', array1=embeddings)