In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ✅ Base checkpoint used for the SDOH NER pipeline
model_name = "michiyasunaga/BioLinkBERT-large"

# ✅ Load tokenizer & model
# The tokenizer reports no predefined maximum length for this checkpoint (running the
# pipeline warns "no maximum length is provided ... Default to no truncation"), so the
# limit is set explicitly. 512 is the token limit the chunking code below is built around.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# NOTE(review): loading this checkpoint reports that 'classifier.bias' and
# 'classifier.weight' are newly initialized, i.e. the token-classification head is
# untrained. Entity spans produced by the pipeline are therefore not learned NER
# predictions until the model is fine-tuned or swapped for a fine-tuned checkpoint.

# ✅ Enable GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ✅ Load NER pipeline ("simple" aggregation merges word pieces into entity groups)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1)

print("✅ Model loaded successfully!")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded successfully!


In [43]:
import json

# ✅ Location of the SDOH keyword file (a JSON object; only its keys are used here)
keywords_path = r"C:\Users\subha\Downloads\keywords.json"

# ✅ Parse the JSON object, then keep just its keys as a set
with open(keywords_path, mode="r", encoding="utf-8") as keywords_file:
    keyword_table = json.load(keywords_file)
sdoh_keywords = set(keyword_table.keys())

# ✅ Report how many keywords were loaded and show a handful of them
keyword_preview = list(sdoh_keywords)[:10]
print(f"✅ Loaded {len(sdoh_keywords)} SDOH keywords successfully!")
print("🔹 Sample Keywords:", keyword_preview)


✅ Loaded 1067 SDOH keywords successfully!
🔹 Sample Keywords: ['laid', 'attorney', 'dad', 'use/social', '2-1/2', 'office', '18', 'imposed', 'alcohol-containing', 'outside']


In [44]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")  # Required for sentence tokenization
# NLTK 3.8.2 and later load the "punkt_tab" resource in sent_tokenize instead of the
# pickled "punkt" models, so fetch it as well to keep this cell working after an upgrade.
nltk.download("punkt_tab")

# ✅ SDOH categories and the trigger terms that assign a phrase to each category.
# Written as (category, terms) pairs and converted below into the
# {category: set_of_terms} lookup used by map_to_sdoh_category.
_SDOH_CATEGORY_TERMS = (
    ("SMOKING", ("smoking", "cigarettes", "tobacco", "vaping", "e-cigarettes", "nicotine")),
    ("DRUG_USE", ("cocaine", "heroin", "substance use", "opioids", "marijuana", "methamphetamine", "prescription drugs")),
    ("ALCOHOL_USE", ("alcohol", "binge drinking", "drinking", "wine", "beer", "spirits", "liquor")),
    ("HOUSING", ("homeless", "unstable housing", "shelter", "housing insecurity", "eviction", "foreclosure")),
    ("SOCIAL_SUPPORT", ("lives alone", "no family", "social isolation", "social support", "community", "family")),
    ("EMPLOYMENT", ("unemployed", "job loss", "joblessness", "income", "underemployment", "workplace stress")),
    ("TRANSPORTATION", ("no transportation", "lack of transportation", "public transport", "vehicle access", "transportation barrier")),
    ("MENTAL_HEALTH", ("depression", "anxiety", "PTSD", "mental illness", "stress", "bipolar", "schizophrenia", "mood disorder")),
    ("EDUCATION", ("low education", "no high school", "GED", "college", "education level", "graduation", "dropout")),
    ("FOOD_SECURITY", ("food insecurity", "hunger", "food access", "nutrition", "food desert", "malnutrition", "grocery store access")),
    ("HEALTHCARE_ACCESS", ("lack of healthcare", "health insurance", "uninsured", "underinsured", "medical care", "healthcare access")),
    ("SAFETY", ("violence", "domestic abuse", "bullying", "crime", "unsafe neighborhood", "gun violence", "child abuse", "elder abuse")),
)

# Insertion order follows the table above.
sdoh_mapping = {category: set(terms) for category, terms in _SDOH_CATEGORY_TERMS}

print("✅ SDOH Categories Loaded!")


✅ SDOH Categories Loaded!


[nltk_data] Downloading package punkt to C:\Users\subha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
def map_to_sdoh_category(entity_text):
    """Map an extracted phrase to an SDOH category name.

    Lookup order:
      1. If ``sdoh_keywords`` is a dict and contains the lower-cased phrase,
         the value stored for it is returned.
      2. Otherwise, the first category in ``sdoh_mapping`` that has a term
         occurring in the phrase is returned. Terms are compared
         case-insensitively and must start at a word boundary, so "wine" does
         not match "swine" while "alcohol" still matches "alcoholism".

    Args:
        entity_text (str): Phrase to classify.

    Returns:
        The matched category, or "OTHER" when nothing matches.

    NOTE(review): a later cell defines ``map_to_sdoh_category`` again without
    the ``sdoh_keywords`` lookup; whichever cell runs last wins.
    """
    import re  # local import: no earlier cell in the notebook imports ``re``

    if not entity_text:
        return "OTHER"
    entity_text_lower = entity_text.lower()

    # ``sdoh_keywords`` is built as a plain set of keys, and a set cannot be
    # subscripted (``sdoh_keywords[...]`` raised TypeError). A set carries no
    # category, so only a dict is consulted here.
    # NOTE(review): assumes dict values are category names -- TODO confirm
    # against keywords.json.
    if isinstance(sdoh_keywords, dict) and entity_text_lower in sdoh_keywords:
        return sdoh_keywords[entity_text_lower]

    # ✅ Fall back to the curated category terms
    for category, keywords in sdoh_mapping.items():
        for keyword in keywords:
            # Lower-case the term too: the phrase is lower-cased, so terms such
            # as "PTSD" or "GED" could otherwise never match.
            if re.search(r"(?<!\w)" + re.escape(keyword.lower()), entity_text_lower):
                return category

    return "OTHER"


In [55]:
# ✅ Function to Map Extracted Entities to SDOH Categories
def map_to_sdoh_category(entity_text):
    """Return the first SDOH category whose terms occur in ``entity_text``.

    Categories are tried in ``sdoh_mapping`` order. Terms are compared
    case-insensitively and must start at a word boundary: "alcohol" matches
    "alcoholism", but "wine" does not match "swine" and "stress" does not
    match "distress".

    Args:
        entity_text (str): Phrase to classify.

    Returns:
        str: The matched category name, or "OTHER" when no term matches or the
        phrase is empty.
    """
    import re  # local import: no earlier cell in the notebook imports ``re``

    if not entity_text:
        return "OTHER"
    entity_text_lower = entity_text.lower()
    for category, keywords in sdoh_mapping.items():
        # The term is lower-cased as well: the phrase is lower-cased, so terms
        # stored as "PTSD" or "GED" could otherwise never match.
        if any(re.search(r"(?<!\w)" + re.escape(word.lower()), entity_text_lower) for word in keywords):
            return category
    return "OTHER"

print("✅ SDOH Mapping Function Ready!")

# ✅ Smart Preprocessing: Extract Key Sections (If Available)
def preprocess_note(text):
    """
    Extracts important sections of the note (SOCIAL HISTORY, ASSESSMENT, etc.).
    Falls back to full text if no structure is detected.

    A section starts at one of the recognised headers followed by ":" (matched
    case-sensitively, anywhere in the text) and runs until the next line made
    only of upper-case letters/whitespace followed by ":", or the end of the text.

    Args:
        text (str): Raw note text.

    Returns:
        str: The stripped bodies of all non-empty matched sections joined with
        newlines, or ``text`` unchanged when no section with content was found.
    """
    import re  # local import: `re` is otherwise only imported by a later cell

    pattern = re.compile(r"(SOCIAL HISTORY|HISTORY OF PRESENT ILLNESS|ASSESSMENT|PLAN|REVIEW OF SYSTEMS):(.+?)(?=\n[A-Z\s]+:|\Z)", re.DOTALL)

    # Drop whitespace-only sections: keeping them made the function return an
    # empty string (instead of the full note) when every matched section was blank.
    sections = [body.strip() for _header, body in pattern.findall(text) if body.strip()]

    return "\n".join(sections) if sections else text  # Return extracted sections OR full text

# ✅ Sentence-Based Chunking for NER Processing
def extract_sdoh_from_text(text, max_tokens=512, stride=450):
    """Extract (phrase, SDOH category) pairs from a clinical note.

    The note is reduced to its key sections with ``preprocess_note``, split
    into sentences, and each sentence is passed to ``process_chunk``. A
    sentence that does not fit the token budget is cut into overlapping token
    windows, and each window is decoded back to text and processed separately.

    Args:
        text (str): Raw note text.
        max_tokens (int): Token limit per model input, special tokens included.
        stride (int): Distance in tokens between the starts of consecutive
            windows of an over-long sentence; the remainder of each window
            overlaps the next one.

    Returns:
        list[tuple[str, str]]: Pairs collected from ``process_chunk`` in
        processing order. A phrase that falls in the overlap between two
        windows can be reported twice.

    Raises:
        ValueError: If ``max_tokens`` leaves no room for content tokens.
    """
    text = preprocess_note(text)  # ✅ Extract key sections first
    sentences = nltk.sent_tokenize(text)  # ✅ Split into sentences

    # Windows are measured in content tokens; process_chunk adds the special
    # tokens again when it re-encodes, so their slots are reserved here.
    window = max_tokens - tokenizer.num_special_tokens_to_add()
    if window <= 0:
        raise ValueError("max_tokens must exceed the number of special tokens the tokenizer adds")
    step = max(1, min(stride, window))  # a step larger than the window would leave gaps

    sdoh_entities = []
    for sentence in sentences:
        # ✅ Skip empty sentences. Encoding without special tokens makes this
        # check meaningful: with them the id list is never empty.
        if not sentence.strip():
            continue
        input_ids = tokenizer.encode(sentence, add_special_tokens=False)
        if not input_ids:
            continue

        if len(input_ids) <= window:
            sdoh_entities.extend(process_chunk(sentence))
            continue

        # ✅ Process long sentences in overlapping token windows
        for start in range(0, len(input_ids), step):
            chunk_ids = input_ids[start:start + window]
            chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
            sdoh_entities.extend(process_chunk(chunk_text))
            # Stop once a window reaches the end; later windows would only
            # repeat tokens that were already covered.
            if start + window >= len(input_ids):
                break

    return sdoh_entities

# ✅ Function to Process a Single Chunk (Fixed for 512 Token Limit)
def process_chunk(chunk_text, max_length=512):
    """Runs the NER pipeline on a text chunk and returns extracted entities.

    Args:
        chunk_text (str): Text to analyse.
        max_length (int): Token limit per model input, special tokens included.
            Longer text is cut to the tokens that fit.

    Returns:
        list[tuple[str, str]]: ``(entity_text, category)`` for every entity
        group whose text maps to a category other than "OTHER". Empty for
        blank input.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    # ✅ Nothing to analyse in an empty or whitespace-only chunk
    if not chunk_text or not chunk_text.strip():
        return []

    content_budget = max_length - tokenizer.num_special_tokens_to_add()
    if content_budget <= 0:
        raise ValueError("max_length must exceed the number of special tokens the tokenizer adds")

    # ✅ Ensure input does not exceed the token limit.
    # The text is rebuilt from token ids only when it is actually too long:
    # decode(encode(text)) is not guaranteed to reproduce the original text, so
    # chunks that already fit are handed to the pipeline untouched.
    input_ids = tokenizer.encode(chunk_text, add_special_tokens=False)
    if len(input_ids) > content_budget:
        chunk_text = tokenizer.decode(input_ids[:content_budget], skip_special_tokens=True)

    # ✅ Run NER model and keep only entities that map to an SDOH category
    chunk_entities = []
    for entity in ner_pipeline(chunk_text):
        entity_text = entity["word"]
        entity_type = map_to_sdoh_category(entity_text)
        if entity_type != "OTHER":
            chunk_entities.append((entity_text, entity_type))

    return chunk_entities

print("✅ SDOH Extraction Pipeline Ready!")


✅ SDOH Mapping Function Ready!
✅ SDOH Extraction Pipeline Ready!


In [50]:
import re

# ✅ Smoke test on a short synthetic note (before running on 6,000+ notes)
sample_text = (
    "\n"
    "The patient has a history of smoking and reports alcohol use.\n"
    "He has been unemployed for the last 6 months and has no transportation.\n"
    "There are signs of anxiety and depression, and he lives alone in unstable housing.\n"
)

# ✅ Run the extraction, then list every (phrase, category) pair that was found
sdoh_results = extract_sdoh_from_text(sample_text)

print("\n✅ Extracted SDOH Entities:")
for found_text, found_category in sdoh_results:
    print(f"🔹 Word: {found_text} | SDOH Category: {found_category}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



✅ Extracted SDOH Entities:
🔹 Word: has a history of smoking and | SDOH Category: SMOKING
🔹 Word: reports alcohol use. | SDOH Category: ALCOHOL_USE
🔹 Word: no transportation. | SDOH Category: TRANSPORTATION
🔹 Word: there are signs of anxiety and depression, and he lives | SDOH Category: MENTAL_HEALTH
🔹 Word: alone in unstable housing. | SDOH Category: HOUSING


In [56]:
# ✅ Short synthetic clinical note used to exercise the extraction pipeline
sample_text = (
    "\n"
    "The patient has a history of smoking and reports alcohol use.\n"
    "He has been unemployed for the last 6 months and has no transportation.\n"
    "There are signs of anxiety and depression, and he lives alone in unstable housing.\n"
)

# ✅ Pull the SDOH phrases out of the note
sdoh_results = extract_sdoh_from_text(sample_text)

# ✅ Print one line per (phrase, category) pair
print("\n✅ Extracted SDOH Entities:")
for found_text, found_category in sdoh_results:
    print(f"🔹 Word: {found_text} | SDOH Category: {found_category}")



✅ Extracted SDOH Entities:
🔹 Word: has a history of smoking and | SDOH Category: SMOKING
🔹 Word: reports alcohol use. | SDOH Category: ALCOHOL_USE
🔹 Word: no transportation. | SDOH Category: TRANSPORTATION
🔹 Word: there are signs of anxiety and depression, and he lives | SDOH Category: MENTAL_HEALTH
🔹 Word: alone in unstable housing. | SDOH Category: HOUSING


In [None]:
import os
import pandas as pd
from tqdm import tqdm

# ✅ Input Folder
input_folder = r"C:\Users\subha\Box\Pipeline\Cleaned_Notes\aria"

# ✅ Output File Path
output_path = r"C:\Users\subha\Box\Pipeline\sdoh_aria_results.csv"

# ✅ Output schema and checkpoint cadence (number of notes handled between saves)
RESULT_COLUMNS = ["File", "Extracted_Text", "SDOH_Category"]
SAVE_EVERY = 100


def _append_results(rows):
    """Append rows to the CSV at ``output_path``; the header is written only when the file is new."""
    pd.DataFrame(rows, columns=RESULT_COLUMNS).to_csv(
        output_path, mode="a", header=not os.path.exists(output_path), index=False
    )


# ✅ Recursively find all .txt files in subdirectories
all_files = []
for root, _, files in os.walk(input_folder):
    for file in files:
        if file.endswith(".txt"):
            all_files.append(os.path.join(root, file))  # Save full file path

print(f"✅ Found {len(all_files)} files to process in {input_folder}")

# ✅ Load existing results (to resume if script restarts)
# NOTE(review): resuming only recognises files that produced at least one row; notes
# with no extracted entity are processed again on every restart.
# NOTE(review): files are tracked by base name, so two notes with the same name in
# different subfolders cannot be told apart in the CSV or when resuming.
if os.path.exists(output_path):
    existing_df = pd.read_csv(output_path)
    processed_files = set(existing_df["File"].unique())  # Track already processed files
    print(f"🔄 Resuming from {len(processed_files)} processed files...")
else:
    processed_files = set()
    existing_df = pd.DataFrame(columns=RESULT_COLUMNS)

# ✅ Process All Notes
sdoh_results = []
files_since_save = 0  # notes handled since the last checkpoint

with tqdm(total=len(all_files), desc="Processing Notes") as pbar:
    for idx, file_path in enumerate(all_files):
        file_name = os.path.basename(file_path)  # Get just the file name

        try:
            # ✅ Skip if already processed
            if file_name in processed_files:
                continue

            with open(file_path, "r", encoding="utf-8") as f:
                note_text = f.read().strip()

            if not note_text:
                continue

            # ✅ Extract SDOH
            for entity_text, entity_type in extract_sdoh_from_text(note_text):
                sdoh_results.append([file_name, entity_text, entity_type])

            # ✅ Save progress every SAVE_EVERY handled notes. Counting handled notes
            # (rather than testing idx % 100) keeps the cadence steady even when the
            # note at a round index is skipped, empty, or fails.
            files_since_save += 1
            if files_since_save >= SAVE_EVERY:
                if sdoh_results:
                    _append_results(sdoh_results)
                print(f"✅ Saved progress at {idx + 1} files")
                sdoh_results = []  # Reset list after saving
                files_since_save = 0

        except Exception as e:
            # Best effort: report the failure and move on to the next note.
            print(f"❌ Error in {file_path}: {e}")

        finally:
            # Runs for every file, including skipped and empty ones, so the bar
            # always reaches its total.
            pbar.update(1)

# ✅ Final Save
if sdoh_results:
    _append_results(sdoh_results)

print(f"✅ Final results saved at: {output_path}")


✅ Found 6435 files to process in C:\Users\subha\Box\Pipeline\Cleaned_Notes\aria


Processing Notes:   2%|▏         | 101/6435 [10:20<9:22:22,  5.33s/it] 

✅ Saved progress at 100 files


Processing Notes:   3%|▎         | 201/6435 [17:17<6:14:41,  3.61s/it] 

✅ Saved progress at 200 files


Processing Notes:   5%|▍         | 301/6435 [35:29<37:33:46, 22.05s/it]

✅ Saved progress at 300 files


Processing Notes:   6%|▌         | 401/6435 [53:24<14:15:26,  8.51s/it]

✅ Saved progress at 400 files


Processing Notes:   8%|▊         | 501/6435 [1:09:29<22:48:12, 13.83s/it]

✅ Saved progress at 500 files


Processing Notes:   9%|▉         | 601/6435 [1:22:56<18:15:28, 11.27s/it]

✅ Saved progress at 600 files


Processing Notes:  11%|█         | 701/6435 [1:45:40<49:10:25, 30.87s/it]

✅ Saved progress at 700 files


Processing Notes:  11%|█         | 713/6435 [1:50:00<30:05:17, 18.93s/it]