In [1]:
# Step 1: Imports and setup
import spacy
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
from spacy.language import Language
import pandas as pd
import csv


In [2]:
# Step 2: Initialize spaCy pipeline
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

print("✅ Blank English pipeline and EntityRuler created.")


✅ Blank English pipeline and EntityRuler created.


In [3]:
# Step 3: Define and add clinical NER patterns
patterns = [
    {"label": "MEDICATION", "pattern": "metformin"},
    {"label": "MEDICATION", "pattern": "lisinopril"},
    {"label": "MEDICATION", "pattern": "ibuprofen"},
    {"label": "DOSAGE", "pattern": [{"LIKE_NUM": True}, {"LOWER": {"IN": ["mg", "ml"]}}]},
    {"label": "CONDITION", "pattern": "diabetes"},
    {"label": "CONDITION", "pattern": "hypertension"},
    {"label": "SYMPTOM", "pattern": "chest pain"},
    {"label": "SYMPTOM", "pattern": "headache"},
    {"label": "PROCEDURE", "pattern": "blood test"},
    {"label": "PROCEDURE", "pattern": "MRI"},
    {"label": "PROCEDURE", "pattern": "x-ray"},
]
ruler.add_patterns(patterns)

print("✅ Clinical patterns added to EntityRuler.")


✅ Clinical patterns added to EntityRuler.


In [4]:
# Step 4: Custom pipeline component to simulate negation detection

# Avoid error if re-run
if not Span.has_extension("is_negated"):
    Span.set_extension("is_negated", default=False)

@Language.component("negation_detector")
def detect_negation(doc):
    negation_terms = {"no", "not", "without", "denies"}
    for i, token in enumerate(doc):
        if token.text.lower() in negation_terms:
            for ent in doc.ents:
                if token.i < ent.start:
                    ent._.is_negated = True
    return doc

# Add to pipeline
nlp.add_pipe("negation_detector", after="entity_ruler")

print("🧠 Negation detector added to pipeline.")


🧠 Negation detector added to pipeline.


In [5]:
import re

# Define section header patterns
section_patterns = {
    "HPI": r"history of present illness[:\-]",
    "MEDICATIONS": r"medications[:\-]",
    "ALLERGIES": r"allergies[:\-]",
    "ASSESSMENT": r"assessment[:\-]",
    "PLAN": r"plan[:\-]",
    "PROCEDURES": r"procedures[:\-]",
    "DIAGNOSIS": r"diagnosis[:\-]",
}

def detect_section(text):
    """
    Assigns section headers to each sentence based on simple regex matches.
    """
    sections = []
    current_section = "UNKNOWN"
    for line in text.split("\n"):
        stripped = line.strip().lower()
        for label, pattern in section_patterns.items():
            if re.match(pattern, stripped):
                current_section = label
                break
        sections.append((line.strip(), current_section))
    return sections


In [6]:
# Step 5: Sample clinical text
texts = [
    "Patient is prescribed metformin 500 mg for diabetes.",
    "She complains of chest pain and is scheduled for a blood test.",
    "MRI was recommended due to persistent headache.",
    "Lisinopril 10 mg is taken daily to manage hypertension.",
    "No signs of headache after taking ibuprofen."
]

results = []

for text in texts:
    doc = nlp(text)
    print(f"\n🔎 Text: {text}")
    for ent in doc.ents:
        neg_status = "NEGATED" if ent._.is_negated else "affirmed"
        print(f"  → ENTITY: {ent.text:<20} | LABEL: {ent.label_:<12} | Negation: {neg_status}")
        results.append((text, ent.text, ent.label_, neg_status))



🔎 Text: Patient is prescribed metformin 500 mg for diabetes.
  → ENTITY: metformin            | LABEL: MEDICATION   | Negation: affirmed
  → ENTITY: 500 mg               | LABEL: DOSAGE       | Negation: affirmed
  → ENTITY: diabetes             | LABEL: CONDITION    | Negation: affirmed

🔎 Text: She complains of chest pain and is scheduled for a blood test.
  → ENTITY: chest pain           | LABEL: SYMPTOM      | Negation: affirmed
  → ENTITY: blood test           | LABEL: PROCEDURE    | Negation: affirmed

🔎 Text: MRI was recommended due to persistent headache.
  → ENTITY: MRI                  | LABEL: PROCEDURE    | Negation: affirmed
  → ENTITY: headache             | LABEL: SYMPTOM      | Negation: affirmed

🔎 Text: Lisinopril 10 mg is taken daily to manage hypertension.
  → ENTITY: 10 mg                | LABEL: DOSAGE       | Negation: affirmed
  → ENTITY: hypertension         | LABEL: CONDITION    | Negation: affirmed

🔎 Text: No signs of headache after taking ibuprofen.
  → EN

In [7]:
# Step 6: Export results
csv_path = "clinical_ner_results_expanded.csv"

with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Original Text", "Entity", "Label", "Negation Status"])
    writer.writerows(results)

print(f"\n📁 Results saved to: {csv_path}")

# Load and display
df = pd.read_csv(csv_path)
df.head()



📁 Results saved to: clinical_ner_results_expanded.csv


Unnamed: 0,Original Text,Entity,Label,Negation Status
0,Patient is prescribed metformin 500 mg for dia...,metformin,MEDICATION,affirmed
1,Patient is prescribed metformin 500 mg for dia...,500 mg,DOSAGE,affirmed
2,Patient is prescribed metformin 500 mg for dia...,diabetes,CONDITION,affirmed
3,She complains of chest pain and is scheduled f...,chest pain,SYMPTOM,affirmed
4,She complains of chest pain and is scheduled f...,blood test,PROCEDURE,affirmed


In [8]:
# Step 6: Test section detection on a sample clinical note
sample_note = """
History of Present Illness:
Patient is prescribed metformin 500 mg for diabetes.

Medications:
Lisinopril 10 mg daily

Assessment:
Hypertension under control

Plan:
Schedule a blood test and MRI
"""

# Run section detection on each line
sectioned_lines = detect_section(sample_note)

# Print each line with its identified section
for line, section in sectioned_lines:
    print(f"[{section}] {line}")


[UNKNOWN] 
[HPI] History of Present Illness:
[HPI] Patient is prescribed metformin 500 mg for diabetes.
[HPI] 
[MEDICATIONS] Medications:
[MEDICATIONS] Lisinopril 10 mg daily
[MEDICATIONS] 
[ASSESSMENT] Assessment:
[ASSESSMENT] Hypertension under control
[ASSESSMENT] 
[PLAN] Plan:
[PLAN] Schedule a blood test and MRI
[PLAN] 


In [9]:
# Step 7: Combine NER, negation, and section detection
combined_results = []

for text in texts:
    doc = nlp(text)
    sections = detect_section(text)

    for ent in doc.ents:
        matched_section = "UNKNOWN"
        for line, section in sections:
            start_index = text.find(line)
            if start_index != -1:
                end_index = start_index + len(line)
                if start_index <= ent.start_char < end_index:
                    matched_section = section
                    break
        combined_results.append((text, ent.text, ent.label_, ent._.is_negated, matched_section))

# Save to updated CSV
csv_path_combined = "clinical_ner_with_negation_and_section.csv"
with open(csv_path_combined, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Original Text", "Entity", "Label", "Negated", "Section"])
    writer.writerows(combined_results)

print(f"📄 Final results saved to: {csv_path_combined}")


📄 Final results saved to: clinical_ner_with_negation_and_section.csv


In [10]:
# View results using pandas
df_combined = pd.read_csv("clinical_ner_with_negation_and_section.csv")
df_combined.head()


Unnamed: 0,Original Text,Entity,Label,Negated,Section
0,Patient is prescribed metformin 500 mg for dia...,metformin,MEDICATION,False,UNKNOWN
1,Patient is prescribed metformin 500 mg for dia...,500 mg,DOSAGE,False,UNKNOWN
2,Patient is prescribed metformin 500 mg for dia...,diabetes,CONDITION,False,UNKNOWN
3,She complains of chest pain and is scheduled f...,chest pain,SYMPTOM,False,UNKNOWN
4,She complains of chest pain and is scheduled f...,blood test,PROCEDURE,False,UNKNOWN


In [12]:
from spacy import displacy
from IPython.display import HTML, display

# Define colors per label
LABEL_COLORS = {
    "MEDICATION": "#a6cee3",
    "DOSAGE": "#1f78b4",
    "CONDITION": "#b2df8a",
    "SYMPTOM": "#33a02c",
    "PROCEDURE": "#fb9a99",
    "NEGATED": "#999999"  # gray for negated entities
}

def visualize_with_negation_polished(doc):
    ents = []
    for ent in doc.ents:
        label = f"{ent.label_} (neg)" if ent._.is_negated else ent.label_
        color = LABEL_COLORS["NEGATED"] if ent._.is_negated else LABEL_COLORS.get(ent.label_, "#dddddd")

        ents.append({
            "start": ent.start_char,
            "end": ent.end_char,
            "label": label,
        })

    example = {
        "text": doc.text,
        "ents": ents,
        "title": "Negation-Aware Entity Visualizer"
    }

    html = displacy.render(example, style="ent", manual=True, options={"colors": LABEL_COLORS})
    display(HTML(html))

# Re-run on all clinical texts
print("🖼️ Highlighted Clinical Entities (with negation tags):")
for text in texts:
    doc = nlp(text)
    visualize_with_negation_polished(doc)



🖼️ Highlighted Clinical Entities (with negation tags):


<IPython.core.display.HTML object>

<IPython.core.display.HTML object>

<IPython.core.display.HTML object>

<IPython.core.display.HTML object>

<IPython.core.display.HTML object>