In [None]:
# In a notebook cell
!pip install -U spacy[transformers]
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# Stock transformer pipeline; serves as the baseline NER for this notebook.
nlp = spacy.load("en_core_web_trf")

# Baseline check: which entities does the untouched model find in a typical query?
prompt = "Show me all diabetic patients over 50"
doc = nlp(prompt)
print(
    [
        (span.text, span.label_)
        for span in doc.ents
    ]
)

[('50', 'DATE')]


Okay, now I know that my baseline only detects the age, and labels it as a DATE :/

Let's focus on detecting tags.

The goal is to make the model able to identify the specific search parameters for both the Patient and Condition resources.

Let's start with Patient:
The Patient resource has these specific search parameters according to HL7:
- **Demographics** (name, birthdate, gender, deceased)
- **Identifiers** (ID, MRN, SSN, etc.)
- **Contact** (phone, email, address parts)
- **Relations** (organization, GP, linked patient)

Some require synthetic data, some can be identified through regex and shape.

Let's start simple -> RegEx

In [None]:
# Token-level regular expressions for spaCy's `REGEX` matcher operator.
# Each expression is tested against the text of ONE token with search
# semantics, so every expression is anchored with ^...$; otherwise it would
# also fire on longer tokens that merely contain a matching substring.

phone_patterns = [
    # International format: "+", country code, then three digit groups that may
    # be separated by a space or hyphen inside the same token.
    r'^\+\d{1,3}[\s-]?\d{3,4}[\s-]?\d{3,4}[\s-]?\d{3,4}$',
    # A bare run of 10-15 digits is usually a phone number.
    r'^\d{10,15}$',
    # Any number written with a leading "+".
    r'^\+\d+$',
]

identifier_patterns = [
    # Exactly nine digits, e.g. 123456789.
    r'^\d{9}$',
    # General format: 6-10 upper-case letters/digits containing at least one
    # digit, so ordinary upper-case words (e.g. "PATIENTS") are not taken for IDs.
    r'^(?=[A-Z0-9]*\d)[A-Z0-9]{6,10}$',
]


In [None]:
# Rule list for the EntityRuler: one e-mail rule first, followed by a
# single-token REGEX rule for every phone and identifier expression.
patterns = [
    {
        "label": "EMAIL",
        "pattern": [{"TEXT": {"REGEX": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"}}],
    }
]

patterns += [
    {"label": "PHONE", "pattern": [{"TEXT": {"REGEX": phone_regex}}]}
    for phone_regex in phone_patterns
]

patterns += [
    {"label": "IDENTIFIER", "pattern": [{"TEXT": {"REGEX": identifier_regex}}]}
    for identifier_regex in identifier_patterns
]

**age-related distinctions**

I'll build my RegEx based on the following examples:


*   Find patients born in 1985 → Patients?birthdate=1985
*   Show me patients born between 1990 and 2000 → Patients?birthdate=ge1990&birthdate=le2000
*   Get patients born on January 15, 1995 → Patients?birthdate=1995-01-15
*   Show patients named Fatima, born in 1995, living in Cairo.
*   Find all pediatric patients (born after 2010) → Patients?birthdate=gt2010-01-01
*   Show elderly patients (born before 1950) → Patients?birthdate=lt1950-01-01
*   Find patients born this century → Patients?birthdate=ge2000-01-01
*   Show patients in their 30s → Patients?birthdate=ge1990-01-01&birthdate=le1999-12-31





My first approach was to enumerate the ways a birth year can be expressed. But a year can also refer to a death, the timing of a condition, and many other things.

The plan is that whenever a birth keyword is followed by a year, the year will be flagged later, during the construction of the query, as a birth year.

- Find patients who died in 2023 → Patients?deceased=2023
- Get patients deceased on specific 2023-03-15 → Patients?deceased=2023-03-15
- Show patients who died after 2020 → Patients?deceased=gt2020-12-31

All these examples follow the same shape: a death keyword followed by a year or a date.

In [None]:
# Birth/death keyword triggers plus year expressions. The keyword tells a later
# step which date field the neighbouring year expression belongs to.

# A year is a token made of exactly four digits. The expression is anchored
# because the REGEX operator searches within the token text. Do not write
# r"\d{4}+": that is a possessive quantifier, which only compiles on
# Python 3.11+ and raises "multiple repeat" on older interpreters.
YEAR_REGEX = r"^\d{4}$"

patterns.extend([
    # Words that identify we're talking about death
    {"label": "DEATH_KEYWORD", "pattern": [{"LOWER": "died"}]},
    {"label": "DEATH_KEYWORD", "pattern": [{"LOWER": "deceased"}]},
    {"label": "DEATH_KEYWORD", "pattern": [{"LOWER": "passed"}, {"LOWER": "away"}]},

    # Words that identify we're talking about birth
    {"label": "BIRTH_KEYWORD", "pattern": [{"LOWER": "born"}]},
    {"label": "BIRTH_KEYWORD", "pattern": [{"LOWER": "birth"}]},
    {"label": "BIRTH_KEYWORD", "pattern": [{"LOWER": "dob"}]},

    # Ranges: 1990 to 2000
    {"label": "YEAR_RANGE", "pattern": [
        {"TEXT": {"REGEX": YEAR_REGEX}},
        {"LOWER": {"IN": ["to", "and", "-"]}},
        {"TEXT": {"REGEX": YEAR_REGEX}},
    ]},

    # Keywords: after / before. The optional "than" lets "greater than 2000"
    # and "less than 1950" match in addition to the bare "gt 2000" form.
    {"label": "YEAR_AFTER", "pattern": [
        {"LOWER": {"IN": ["after", "gt", "greater"]}},
        {"LOWER": "than", "OP": "?"},
        {"TEXT": {"REGEX": YEAR_REGEX}},
    ]},
    {"label": "YEAR_BEFORE", "pattern": [
        {"LOWER": {"IN": ["before", "lt", "less"]}},
        {"LOWER": "than", "OP": "?"},
        {"TEXT": {"REGEX": YEAR_REGEX}},
    ]},

    # Exact year
    {"label": "ON_YEAR", "pattern": [
        {"LOWER": {"IN": ["in", "on"]}},
        {"TEXT": {"REGEX": YEAR_REGEX}},
    ]},
])

Genders can be covered easily as well.

In [None]:
# Gender rules. Each label gets one single-token rule whose LOWER value may be
# any of the listed surface forms (singular and plural).
patterns.extend([
    # Male
    {"label": "GENDER_MALE", "pattern": [
        {"LOWER": {"IN": ["male", "males", "m", "man", "men", "boy", "boys"]}}
    ]},

    # Female
    {"label": "GENDER_FEMALE", "pattern": [
        {"LOWER": {"IN": ["female", "females", "f", "woman", "women", "girl", "girls"]}}
    ]},

    # Other / non-binary
    {"label": "GENDER_OTHER", "pattern": [
        {"LOWER": {"IN": [
            "non-binary", "nonbinary", "nb", "transgender", "genderqueer",
            "agender", "other", "unknown", "lgbtq",
        ]}}
    ]},
    # The default English tokenizer splits intra-word hyphens, so "non-binary"
    # normally arrives as three tokens ("non", "-", "binary"); the single-token
    # form above only fires if the tokenizer keeps the word whole.
    {"label": "GENDER_OTHER", "pattern": [
        {"LOWER": "non"}, {"ORTH": "-"}, {"LOWER": "binary"}
    ]},
])

Let's include age as well

In [None]:
# Age rules: a comparison/age keyword followed by a 1-3 digit number.

# Anchored so that only tokens made of one to three digits count as an age;
# unanchored, the expression would also fire on four-digit years ("less 1950")
# and collide with the year rules.
AGE_REGEX = r"^\d{1,3}$"

patterns.extend([
    # "over 65", "older than 65", "more than 50", ...
    {"label": "AGE_OVER", "pattern": [
        {"LOWER": {"IN": ["over", "older", "above", "greater", "more"]}},
        {"LOWER": "than", "OP": "?"},
        {"TEXT": {"REGEX": AGE_REGEX}},
    ]},
    # "under 18", "younger than 18", "less than 5", ...
    {"label": "AGE_UNDER", "pattern": [
        {"LOWER": {"IN": ["under", "younger", "below", "less"]}},
        {"LOWER": "than", "OP": "?"},
        {"TEXT": {"REGEX": AGE_REGEX}},
    ]},
    # "aged 40", "age 40", ...
    {"label": "AGE_EXACT", "pattern": [
        {"LOWER": {"IN": ["age", "aged", "is", "years", "year"]}},
        {"TEXT": {"REGEX": AGE_REGEX}},
    ]},
])

Names are already identified as "PERSON" by the NER. I need to differentiate between middle, last and initial names.

In [None]:
# First / Given name triggers
def _lower_phrase_rules(label, phrases):
    """Return one rule per phrase; each word becomes a case-insensitive LOWER token."""
    return [
        {"label": label, "pattern": [{"LOWER": word} for word in phrase.split()]}
        for phrase in phrases
    ]


# First / Given name triggers
given_name_patterns = _lower_phrase_rules(
    "GIVEN_NAME_TRIGGER",
    ["first name", "given name", "forename", "personal name"],
)

# Last / Family name triggers
family_name_patterns = _lower_phrase_rules(
    "FAMILY_NAME_TRIGGER",
    ["last name", "family name", "surname", "second name", "clan name", "maiden name"],
)

# General name references
general_name_patterns = _lower_phrase_rules(
    "NAME_GENERAL",
    ["name", "named", "called", "known as", "alias", "aka", "a.k.a.", "middle name"],
)

# Register the triggers in the same order: given, family, general.
patterns.extend([*given_name_patterns, *family_name_patterns, *general_name_patterns])

To identify the general practitioner (GP), I picked specific phrases that usually come before the name or ID (the information that identifies the GP) and use them as triggers.

In [None]:
 # General-practitioner triggers: phrases that usually precede the name or ID
 # of the practitioner.
 patterns.extend([
     # Titles / prefixes
     # Anchored: the REGEX operator searches within the token, so a bare
     # "dr\.?" would also fire on words such as "children" or "address".
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": {"REGEX": r"^dr\.?$"}}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "doctor"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "physician"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "gp"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "family"}, {"LOWER": "doctor"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "primary"}, {"LOWER": "care"}, {"LOWER": "provider"}]},

     # Care assignment phrases
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "patients"}, {"LOWER": "of"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "under"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "seeing"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "assigned"}, {"LOWER": "to"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "managed"}, {"LOWER": "by"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "supervised"}, {"LOWER": "by"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "attending"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "covering"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "co-managing"}]},
     # The default English tokenizer splits intra-word hyphens, so "co-managing"
     # normally arrives as three tokens; the single-token rule above only fires
     # if the tokenizer keeps the word whole.
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "co"}, {"ORTH": "-"}, {"LOWER": "managing"}]},

     # IDs / reference numbers
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "npi"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "dea"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "provider"}, {"LOWER": "code"}]},

     # More references to GPs ("patients of" is already registered above)
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "patients"}, {"LOWER": "seeing"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "who"}, {"LOWER": "go"}, {"LOWER": "to"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "primary"}, {"LOWER": "care"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "primary"}, {"LOWER": "medical"}, {"LOWER": "provider"}]},
     {"label": "GP_TRIGGER", "pattern": [{"LOWER": "primary"}, {"LOWER": "healthcare"}, {"LOWER": "provider"}]},
 ])

In [None]:
# (Re)build the rule-based component from the current `patterns` list.
# Dropping an existing ruler first keeps this cell idempotent: re-running it
# after editing `patterns` replaces the old rules instead of stacking the new
# ones on top of stale ones.
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")

# Placed before "ner" so the rule-based spans are set before the statistical NER runs.
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

In [None]:
# Smoke test for the e-mail, phone and identifier rules.
phone_identifiers_prompts = [
    "Find patients with email sara@example.com",
    "Get the patient with MRN 445566.",
    "Show patients with mobile +971501234567",
    "Get patient by hospital ID H789012",
    "Find multiple patients by MRNs 123456, 789012, 345678",
]

# Run every prompt through the pipeline, then list the (text, label) pairs found.
documents = list(map(nlp, phone_identifiers_prompts))
for document in documents:
    print([(span.text, span.label_) for span in document.ents])

[('sara@example.com', 'EMAIL')]
[('445566', 'IDENTIFIER')]
[('+971501234567', 'PHONE')]
[('H789012', 'IDENTIFIER')]
[('123456', 'IDENTIFIER'), ('789012', 'IDENTIFIER'), ('345678', 'IDENTIFIER')]


So far using RegEx can cover most cases related to identifiers and phone numbers as well as emails.

What it can't do:
- Identify SSN and phone numbers with spaces and dashes in between
- Wildcard searches: Using * for partial matches

In [None]:
# Smoke test for the birth keyword and year rules.
born_prompts = [
    "Find patients born in 1985",
    "Show me patients born between 1990 and 2000",
    "Get patients born on JAN 15 1995",
    "Find all pediatric patients (born after 2010)",
    "Show elderly patients (born before 1950)",
    "Find patients born this century ",
    "Find patients born in 15 Sep 1995",
]

# Run every prompt through the pipeline, then list the (text, label) pairs found.
documents = list(map(nlp, born_prompts))
for document in documents:
    print([(span.text, span.label_) for span in document.ents])

[('born', 'BIRTH_KEYWORD'), ('in 1985', 'ON_YEAR')]
[('born', 'BIRTH_KEYWORD'), ('1990 and 2000', 'YEAR_RANGE')]
[('born', 'BIRTH_KEYWORD'), ('JAN 15 1995', 'DATE')]
[('born', 'BIRTH_KEYWORD'), ('after 2010', 'YEAR_AFTER')]
[('born', 'BIRTH_KEYWORD'), ('before 1950', 'YEAR_BEFORE')]
[('born', 'BIRTH_KEYWORD'), ('this century', 'DATE')]
[('born', 'BIRTH_KEYWORD'), ('15 Sep 1995', 'DATE')]


This is not bad! It covers a considerably good range of possibilities when I'm only using the year. But here's a list of our limitations:
- Find patients born this century -> it's not smart enough to detect decades or any linguistic expression that doesn't include numbers

In [None]:
# Smoke test for the death keyword and year rules.
death_prompts = [
    "Find patients who died in 2023",
    "Get patients deceased on 2023-03-15",
    "Show patients who died after 2020",
]

# Run every prompt through the pipeline, then list the (text, label) pairs found.
documents = list(map(nlp, death_prompts))
for document in documents:
    print([(span.text, span.label_) for span in document.ents])

[('died', 'DEATH_KEYWORD'), ('in 2023', 'ON_YEAR')]
[('deceased', 'DEATH_KEYWORD'), ('on 2023', 'ON_YEAR'), ('03-15', 'DATE')]
[('died', 'DEATH_KEYWORD'), ('after 2020', 'YEAR_AFTER')]


Limitations:
- Show me all living patients → Patients?deceased=false

In [None]:
# Smoke test for the gender and age rules.
gender_prompts = [
    "Find all male patients",
    "Show all female patients",
    "Get all female patients over 65",
    "Find unknown gender patients",
    "Show me all diabetic patients over 50",
]

# Run every prompt through the pipeline, then list the (text, label) pairs found.
documents = list(map(nlp, gender_prompts))
for document in documents:
    print([(span.text, span.label_) for span in document.ents])

[('male', 'GENDER_MALE')]
[('female', 'GENDER_FEMALE')]
[('female', 'GENDER_FEMALE'), ('over 65', 'AGE_OVER')]
[('unknown', 'GENDER_OTHER')]
[('over 50', 'AGE_OVER')]


For names, NER already recognizes those as PERSON. Let's test that:

In [None]:
# Smoke test for the name triggers combined with the model's own PERSON entities.
names_prompts = [
    "Find patients with first name Sarah",
    "Show me all patients with family name Johnson",
    "Find patients named Maria Garcia",
    "Get all patients with last name starting with 'Al'",
    "Show patients named John Smith or John Doe",
    "Show me patients named Ahmed Ali",
    "Show patients named Fatima, born in 1995, living in Cairo.",
]

# Run every prompt through the pipeline, then list the (text, label) pairs found.
documents = list(map(nlp, names_prompts))
for document in documents:
    print([(span.text, span.label_) for span in document.ents])

[('first name', 'GIVEN_NAME_TRIGGER'), ('Sarah', 'PERSON')]
[('family name', 'FAMILY_NAME_TRIGGER'), ('Johnson', 'PERSON')]
[('named', 'NAME_GENERAL'), ('Maria Garcia', 'PERSON')]
[('last name', 'FAMILY_NAME_TRIGGER'), ('Al', 'PERSON')]
[('named', 'NAME_GENERAL'), ('John Smith', 'PERSON'), ('John Doe', 'PERSON')]
[('named', 'NAME_GENERAL'), ('Ahmed Ali', 'PERSON')]
[('named', 'NAME_GENERAL'), ('Fatima', 'PERSON'), ('born', 'BIRTH_KEYWORD'), ('in 1995', 'ON_YEAR'), ('Cairo', 'GPE')]


This captures most of the prompts with triggers that help identify the category of the name after.

Limitations:
- Wildcard searches: Using * for partial matches

In [None]:

# Smoke test for organization and address mentions.
patient_query = [
    "Find patients at Mayo Clinic",
    "Show patients managed by Johns Hopkins",
    "Get patients from Dubai Hospital",
    "Find patients on Main Street",
    "Show patients at apartment buildings",  # won't be identified
    "Get patients on 5th Avenue",
    "Find patients in Dubai, UAE",
    "Show patients living in New York",
    "Get patients from California",
    "List patients treated at Cleveland Clinic",
    "Show all patients from Memorial Hospital",
    "Find patients under Mercy Health System",
    "Show patients living on Oak Street",
    "Find patients in downtown area",  # won't be identified
    "Get patients from residential complexes",  # won't be identified
    "Find patients in Los Angeles, California",
    "Show patients from Miami, Florida",
    "Get patients in Toronto, Canada",
]

# Run every prompt through the pipeline, then list the (text, label) pairs found.
documents = list(map(nlp, patient_query))
for document in documents:
    print([(span.text, span.label_) for span in document.ents])

[('Mayo Clinic', 'ORG')]
[('managed by', 'GP_TRIGGER'), ('Johns Hopkins', 'ORG')]
[('Dubai Hospital', 'ORG')]
[('Main Street', 'FAC')]
[]
[('5th Avenue', 'FAC')]
[('Dubai', 'GPE'), ('UAE', 'GPE')]
[('New York', 'GPE')]
[('California', 'GPE')]
[('Cleveland Clinic', 'ORG')]
[('Memorial Hospital', 'ORG')]
[('under', 'GP_TRIGGER'), ('Mercy Health System', 'ORG')]
[('Oak Street', 'FAC')]
[]
[]
[('Los Angeles', 'GPE'), ('California', 'GPE')]
[('Miami', 'GPE'), ('Florida', 'GPE')]
[('Toronto', 'GPE'), ('Canada', 'GPE')]


I think it's difficult to differentiate between city, state and country without LLMs.

In [None]:
# Smoke test for the general-practitioner triggers.
GP_prompts = [
    "List patients of Dr Ahmed Al-Rashid",
    "Find patients under Dr. Maria Garcia",
    "Show patients of Dr. Williams",
    "Show patients under GP ID DOC001",
    "Find patients with physician ID MD123",
    "Show patients under family physician Taylor",
    "List patients with primary care Miller",
    "Get patients with family doctors Johnson, Williams, Brown",
    "Who are Dr. Baker's patients?",
    "Show me patients seeing Dr. Cooper",
    "Which patients go to Dr. Murphy?",
    "Get patients with primary medical provider Dr. Bell",
    "List patients of primary healthcare provider Dr. Gray",
]

# Run every prompt through the pipeline, then list the (text, label) pairs found.
documents = list(map(nlp, GP_prompts))
for document in documents:
    print([(span.text, span.label_) for span in document.ents])

[('patients of', 'GP_TRIGGER'), ('Dr', 'GP_TRIGGER'), ('Ahmed Al', 'PERSON')]
[('under', 'GP_TRIGGER'), ('Dr.', 'GP_TRIGGER'), ('Maria Garcia', 'PERSON')]
[('patients of', 'GP_TRIGGER'), ('Dr.', 'GP_TRIGGER'), ('Williams', 'PERSON')]
[('under', 'GP_TRIGGER'), ('GP', 'GP_TRIGGER'), ('DOC001', 'IDENTIFIER')]
[('physician', 'GP_TRIGGER')]
[('under', 'GP_TRIGGER'), ('physician', 'GP_TRIGGER'), ('Taylor', 'PERSON')]
[('primary care', 'GP_TRIGGER'), ('Miller', 'PERSON')]
[('Johnson', 'PERSON'), ('Williams', 'PERSON'), ('Brown', 'PERSON')]
[('Family Medical Group', 'ORG')]
[('Dr.', 'GP_TRIGGER'), ('Baker', 'PERSON')]
[('patients seeing', 'GP_TRIGGER'), ('Dr.', 'GP_TRIGGER'), ('Cooper', 'PERSON')]
[('Dr.', 'GP_TRIGGER'), ('Murphy', 'PERSON')]
[('primary medical provider', 'GP_TRIGGER'), ('Dr.', 'GP_TRIGGER'), ('Bell', 'PERSON')]
[('patients of', 'GP_TRIGGER'), ('primary healthcare provider', 'GP_TRIGGER'), ('Dr.', 'GP_TRIGGER'), ('Gray', 'PERSON')]


Now the NER matches most of the filters used as search parameters in the Patient resource related SEARCH query.

Limitations (prioritized to address):
- conditions
- state/city/country as addresses

These require training the NER with relevant data that is annotated, because they’re highly context-dependent.