In [4]:
# import dependencies
import os, re, random
import numpy as np
import pandas as pd
from tqdm import tqdm

import spacy
from spacy import displacy
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# set deterministic seed
random.seed(42); np.random.seed(42)
tqdm.pandas()

# Load Note Events, Diagnoses ICD and Dict tables

Load the data necessary to build models. I included the `D_ICD_DIAGNOSES` table just so I could see the description of the ICD9 codes.

In [7]:
# load tables
NOTE_EVENTS_PATH = "../mimic-iii/NOTEEVENTS.csv.gz"
df_noteevents = pd.read_csv(NOTE_EVENTS_PATH, compression='gzip')
df_noteevents.info()

DIAGNOSES_PATH = "../mimic-iii/DIAGNOSES_ICD.csv.gz"
df_diagnoses_icd = pd.read_csv(DIAGNOSES_PATH, compression='gzip', dtype={'SEQ_NUM': 'Int64'})
df_diagnoses_icd.info()

DIAG_DICT_PATH = "../mimic-iii/D_ICD_DIAGNOSES.csv.gz"
df_diagnoses_dict = pd.read_csv(DIAG_DICT_PATH, compression='gzip', dtype={'SEQ_NUM': 'Int64'})
df_diagnoses_dict.info()

  df_noteevents = pd.read_csv(NOTE_EVENTS_PATH, compression='gzip')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2083180 entries, 0 to 2083179
Data columns (total 11 columns):
 #   Column       Dtype  
---  ------       -----  
 0   ROW_ID       int64  
 1   SUBJECT_ID   int64  
 2   HADM_ID      float64
 3   CHARTDATE    object 
 4   CHARTTIME    object 
 5   STORETIME    object 
 6   CATEGORY     object 
 7   DESCRIPTION  object 
 8   CGID         float64
 9   ISERROR      float64
 10  TEXT         object 
dtypes: float64(3), int64(2), object(6)
memory usage: 174.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651047 entries, 0 to 651046
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ROW_ID      651047 non-null  int64 
 1   SUBJECT_ID  651047 non-null  int64 
 2   HADM_ID     651047 non-null  int64 
 3   SEQ_NUM     651000 non-null  Int64 
 4   ICD9_CODE   651000 non-null  object
dtypes: Int64(1), int64(3), object(1)
memory usage: 25.5+ MB
<class 'pandas.core.frame.D

# Determine diagnoses on which to focus

Choose a diagnoses to focus on by listing the more common diagnoses, but choosing one with a manageable number of corresponding admission records.

In [9]:
# display the top 20 most common diagnoses
top_icd9_counts = df_diagnoses_icd['ICD9_CODE'].value_counts().head(20)
print("Top 10 ICD9_CODE counts:")
for code, count in top_icd9_counts.items():
    print(f"{code} ({df_diagnoses_dict[df_diagnoses_dict['ICD9_CODE'] == code]['SHORT_TITLE'].values[0]}): {count}")

# filter rows with ICD9_CODE
ICD_FILTER = ['99592']
df_filtered = df_diagnoses_icd[df_diagnoses_icd['ICD9_CODE'].isin(ICD_FILTER)]
admissions_with_diagnoses = df_filtered['HADM_ID'].unique()
print(f"Number of ICD-9 {ICD_FILTER} admissions: {admissions_with_diagnoses.size}")

# filter notes for admissions with diagnoses
df_noteevents_filtered = df_noteevents[df_noteevents['HADM_ID'].isin(admissions_with_diagnoses)]
df_noteevents_filtered.info()

df_noteevents_filtered['CATEGORY'].unique()
df_diag_notes = df_noteevents_filtered[df_noteevents_filtered['CATEGORY'] == 'Respiratory ']
df_diag_notes.info()



Top 10 ICD9_CODE counts:
4019 (Hypertension NOS): 20703
4280 (CHF NOS): 13111
42731 (Atrial fibrillation): 12891
41401 (Crnry athrscl natve vssl): 12429
5849 (Acute kidney failure NOS): 9119
25000 (DMII wo cmp nt st uncntr): 9058
2724 (Hyperlipidemia NEC/NOS): 8690
51881 (Acute respiratry failure): 7497
5990 (Urin tract infection NOS): 6555
53081 (Esophageal reflux): 6326
2720 (Pure hypercholesterolem): 5930
V053 (Need prphyl vc vrl hepat): 5779
V290 (NB obsrv suspct infect): 5519
2859 (Anemia NOS): 5406
2449 (Hypothyroidism NOS): 4917
486 (Pneumonia, organism NOS): 4839
2851 (Ac posthemorrhag anemia): 4552
2762 (Acidosis): 4528
496 (Chr airway obstruct NEC): 4431
99592 (Severe sepsis): 3912
Number of ICD-9 ['99592'] admissions: 3912
<class 'pandas.core.frame.DataFrame'>
Index: 239980 entries, 26 to 2066679
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ROW_ID       239980 non-null  int64  
 1   SUBJECT_ID   2

# Examine Respiratory Notes for Severe Sepsis

I chose Severe Sepsis diagnosis, focusing on notes under the Respiratory category. This yielded `9895` entries.

## Load SpaCy and SciSpaCy Models

For comparison, I'm using the general model `en_core_web_sm` and `scispacy` model `en_core_sci_md`.

In [10]:
nlp_gen = spacy.load("en_core_web_sm")
nlp_sci_md = spacy.load("en_core_sci_md")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


## Extract Entities

Define a function that can be reused to extract entities using any `spacy` model

In [12]:
# define function to extract entities
def extract_ents(nlp, texts, batch_size=128):
    docs = []
    # only run the NER pipe for speed
    with nlp.select_pipes(enable=["ner"]):
        for doc in tqdm(
            nlp.pipe(texts, batch_size=batch_size),
            total=len(texts), desc=f"NER ({nlp.meta.get('name','model')})"
        ):
            docs.append(doc)
    return docs



Use the function to extract entities with the two loaded models

In [15]:
notes = df_diag_notes["TEXT"].tolist()
ents_gen = extract_ents(nlp_gen, notes)
ents_sci = extract_ents(nlp_sci_md, notes)

NER (core_web_sm):   0%|          | 0/9895 [00:00<?, ?it/s]

NER (core_web_sm): 100%|██████████| 9895/9895 [01:45<00:00, 93.35it/s] 
NER (core_sci_md): 100%|██████████| 9895/9895 [02:13<00:00, 73.93it/s]


In [16]:
# show entities for first text sample from each model
displacy.render(ents_gen[0], style="ent", jupyter=True)
displacy.render(ents_sci[0], style="ent", jupyter=True)