In [None]:
import pandas as pd
import numpy as np
import duckify as db
import re

In [146]:
import scispacy
import spacy
from spacy import displacy

# Get the Note Events Only for One Diagnosis

Someone close to me has a condition called "agenesis of the corpus callosum" along with developmental delays, so I wanted to research this potential diagnosis and see what else might be available about it in this dataset.

In [115]:
# Search the ICD-9 dictionary table for long titles matching CONG...BRAIN,
# plus the row for code 3159; the query caps the result at 10 rows.
icd_lookup_sql = """
select * 
from D_ICD_DIAGNOSES 
where upper(LONG_TITLE) like '%CONG%BRAIN%'
or ICD9_CODE = '3159'
limit 10
"""
db.run_query(icd_lookup_sql)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,3058,3159,Development delay NOS,Unspecified delay in development
1,11811,7422,"Reduction deform, brain",Congenital reduction deformities of brain
2,11813,7424,Brain anomaly NEC,Other specified congenital anomalies of brain
3,11818,7429,Nervous system anom NOS,"Unspecified congenital anomaly of brain, spina..."


In [117]:
# Admissions (HADM_ID) that carry BOTH ICD-9 code 7424 and code 3159:
# the INTERSECT keeps only HADM_IDs present in each of the two selects.
both_codes_sql = """
select 
HADM_ID
from DIAGNOSES_ICD
where ICD9_CODE = '7424'
intersect
select
HADM_ID
from DIAGNOSES_ICD
where ICD9_CODE = '3159'
"""
db.run_query(both_codes_sql)

Unnamed: 0,HADM_ID


Since no admission has both codes (the intersect query above returns no rows), I will focus on the code associated with the brain abnormality (7424).

In [112]:
# Fetch every NOTEEVENTS row belonging to an admission that has
# ICD-9 code 7424 recorded in DIAGNOSES_ICD.
notes_for_7424_sql = """
select
*
from NOTEEVENTS
where hadm_id in (
    select hadm_id
    from DIAGNOSES_ICD
    where ICD9_CODE = '7424'
)
"""
notes_df = db.run_query(notes_for_7424_sql)

In [114]:
# Summarise the retrieved notes (row count, column dtypes, non-null counts)
# before working with the TEXT column.
notes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6959 entries, 0 to 6958
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   ROW_ID       6959 non-null   int64         
 1   SUBJECT_ID   6959 non-null   int64         
 2   HADM_ID      6959 non-null   float64       
 3   CHARTDATE    6959 non-null   datetime64[ns]
 4   CHARTTIME    6833 non-null   datetime64[ns]
 5   STORETIME    6392 non-null   datetime64[ns]
 6   CATEGORY     6959 non-null   object        
 7   DESCRIPTION  6959 non-null   object        
 8   CGID         6392 non-null   float64       
 9   ISERROR      0 non-null      float64       
 10  TEXT         6959 non-null   object        
dtypes: datetime64[ns](3), float64(3), int64(2), object(3)
memory usage: 598.2+ KB


In [None]:
import re

In [None]:
# Build a plain Python list of note texts for the NLP pipelines below.
# Every character outside the printable ASCII range \x20-\x7E (this includes
# newlines and tabs) is replaced by a single space. The vectorised `.str`
# accessor is used instead of a row-wise `apply` + `re.sub`; the pattern and
# replacement are unchanged.
notes_list = (
    notes_df["TEXT"]
    .str.replace(r'[^\x20-\x7E]', ' ', regex=True)
    .to_list()
)

# Spacy Extracted Entities

In [147]:
# Load the "en_core_web_sm" spaCy pipeline. Note that `nlp` is rebound to a
# different model in the SciSpacy section further down.
nlp = spacy.load("en_core_web_sm")

In [148]:
# Run the current `nlp` pipeline over the first note and print, one token per
# line, its text, lemma_, pos_, tag_ and dep_ attributes separated by spaces.
token_fields = ("text", "lemma_", "pos_", "tag_", "dep_")
for token in nlp(notes_list[0]):
    print(*(getattr(token, field) for field in token_fields))

Attending attend VERB VBG compound
Note Note PROPN NNP dobj
Day Day PROPN NNP dobj
of of ADP IN prep
life life NOUN NN pobj
15 15 NUM CD nummod
PMA PMA PROPN NNP nsubj
31 31 NUM CD compound
1/7 1/7 NUM CD nummod
CPAP cpap NOUN NN nmod
5 5 NUM CD nummod
FiO2 FiO2 PROPN NNP appos
21 21 NUM CD nummod
% % NOUN NN appos
after after ADP IN prep
being be VERB VBG pcomp
off off ADP RP prt
for for ADP IN prep
[ [ PUNCT -LRB- pobj
* * PUNCT NFP punct
* * PUNCT NFP punct
4 4 NUM CD pobj
- - SYM SYM punct
14 14 NUM CD prep
* * PUNCT NFP punct
* * PUNCT NFP punct
] ] PUNCT -RRB- punct
day day NOUN NN pobj
RR rr NOUN NN appos
30 30 NUM CD nummod
- - SYM SYM punct
40 40 NUM CD prep
's be AUX VBZ ROOT
on on ADP IN prep
caffeine caffeine NOUN NN pobj
one one NUM CD nummod
spell spell NOUN NN npadvmod
in in ADP IN prep
24 24 NUM CD nummod
hours hour NOUN NNS pobj
HR hr NOUN NN appos
130 130 NUM CD nummod
- - SYM SYM punct
160 160 NUM CD prep
's 's PART POS case
BP BP PROPN NNP nmod
72/38 72/38 NUM CD nu

# SciSpacy Extracted Entities

## en_core_sci_md

In [None]:
import en_core_sci_md

In [None]:
# NOTE: this rebinds `nlp`, replacing the "en_core_web_sm" pipeline loaded
# earlier. Cells that call `nlp` use whichever load cell ran most recently,
# so run the cells in order.
nlp = en_core_sci_md.load()

In [None]:
# Process the second note with the current `nlp` pipeline and print each
# detected entity span followed by its label, one entity per line.
second_note_doc = nlp(notes_list[1])
for entity in second_note_doc.ents:
    print(f"{entity.text} {entity.label_}")

NICU Fellow Physical Exam Please ENTITY
attending ENTITY
hospital course ENTITY
plan ENTITY
Prone ENTITY
CPAP ENTITY
p130 ENTITY
CPAP ENTITY
Anterior fontanelle soft open ENTITY
flat ENTITY
Equal ENTITY
clear breath sounds ENTITY
bilaterally ENTITY
Mild subcostal retractions ENTITY
Regular rhythm ENTITY
normal rate ENTITY
no murmur ENTITY
Abdomen soft ENTITY
nondistended ENTITY
Warm ENTITY
pink ENTITY


In [None]:
# Visualise the entities (style="ent") that the current `nlp` pipeline finds in
# the second note, rendered inline in the notebook (jupyter=True).
displacy.render(nlp(notes_list[1]), style="ent", jupyter=True)

## en_core_sci_lg

## en_ner_craft_md

In [None]:
import en_ner_craft_md

## en_ner_jnlpba_md

In [None]:
import en_ner_jnlpba_md

## en_ner_bionlp13cg_md

In [None]:
import en_ner_bionlp13cg_md

## en_ner_bc5cdr_md

In [None]:
import en_ner_bc5cdr_md

# Word2Vec Plots

# tSNE Plots