# MIMIC Notes Pre-Processing

Pre-processing MIMIC notes for language model and CUIs

## Imports and Inits

In [29]:
import pandas as pd
import psycopg2
import numpy as np
import re
import random

from pathlib import Path

In [30]:
# Relative 'data' directory; not referenced by the cells shown here —
# presumably where input/output files are kept (TODO confirm).
PATH = Path('data')

## Functions

## Grab sample data from MIMIC

In [31]:
# Per-category note counts; the 'category' and 'number_of_notes' columns are read below.
cats = pd.read_csv('cats.csv')
# Upper bound on notes fetched per category. With max_limit <= 0 the limit
# falls back to the category's own note count.
max_limit = 10

queries = []
for category, n_notes in zip(cats['category'], cats['number_of_notes']):
    if max_limit > 0:
        limit = min(max_limit, n_notes)
    else:
        limit = n_notes

    if limit != max_limit:
        # The cap was not reached for this category: fetch all of its notes.
        q = f"""
        select category, text from correctnotes where category=\'{category}\';
        """
    else:
        # The cap applies: fetch a random sample of `limit` notes.
        q = f"""
        select category, text from correctnotes where category=\'{category}\' order by random() limit {limit};
        """
    queries.append(q)

In [32]:
# Disabled alternative: sample `limit` random notes from a single category
# (row 7 of `cats`) instead of querying every category.
# limit = 50
# queries = [
#     f"""
#     select category, text from correctnotes where category=\'{cats.iloc[7]['category']}\' order by random() limit {limit}
#     """
# ]

In [33]:
%%time
dfs = []

con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')
for q in queries:
    df = pd.read_sql_query(q, con)
    dfs.append(df)
con.close()
    
df = pd.concat(dfs)
# df.set_index('row_id', inplace=True)
print(df.shape)

(140, 2)
CPU times: user 25.9 ms, sys: 555 µs, total: 26.5 ms
Wall time: 2.33 s


1. Get the list of redacted types using `re`
2. Replace each one with an appropriate placeholder token

Redacted items:
* ~~First Name~~
* ~~Last Name~~
* Hospital
* Date of format M-DD
* Date of format YYYY-M-DD
* ~~Initials~~
* ~~Name~~
* NULL
* ~~Known lastname~~
* ~~Doctor First Name~~
* Doctor Last Name
* Month (only)
* Just numbers
* Location
* Month/Day
* Telephone/Fax
* Wardname
* Pager Numeric Identifier
* Pharmacy MD Number

Redacted items:
* [x] First Name: `[**First Name (Titles) 137**]`, `t_name_first`
* [x] Last Name: `[**Last Name (Titles) **]`, `t_name_last`
* [x] Initials: `[**Initials (NamePattern4) **]`, `t_name_inits`
* [x] Name: `[**Name (NI) **]`, `t_name`
* [x] Doctor First Name: `[**Doctor First Name 1266**]`, `t_name_doc_first`
* [x] Doctor Last Name: `[**Doctor Last Name 1266**]`, `t_name_doc_last`
* [x] Known Last Name: `[**Known lastname 658**]`, `t_name_last`
* Hospital
* Date of format M-DD
* Date of format YYYY-M-DD
* NULL
* Month (only)
* Just numbers
* Location
* Month/Day
* Telephone/Fax
* Wardname
* Pager Numeric Identifier
* Pharmacy MD Number

In [28]:
def replace_name(match):
    """Return the replacement token for one matched ``[** ... **]`` placeholder.

    Intended as the ``repl`` callable of ``re.sub``. The decision is made on the
    lower-cased matched text:

    * no "name" substring          -> the matched text, unchanged
    * "last"  (with "doctor")      -> ``t_name_doc_last``
    * "last"  (without "doctor")   -> ``t_name_last``
    * "first" (with "doctor")      -> ``t_name_doc_first``
    * "first" (without "doctor")   -> ``t_name_first``
    * "initials"                   -> ``t_name_inits``
    * anything else with "name"    -> ``t_name``

    "last" is checked before "first", and "first" before "initials". Because the
    test is a plain substring check, any placeholder containing "name"
    (e.g. "Wardname") is treated as a name.
    """
    original = match.group()
    tag = original.lower()

    # Not a name-type placeholder: leave it in the text as is.
    if 'name' not in tag:
        return original

    is_doctor = 'doctor' in tag
    if 'last' in tag:
        return 't_name_doc_last' if is_doctor else 't_name_last'
    if 'first' in tag:
        return 't_name_doc_first' if is_doctor else 't_name_first'
    if 'initials' in tag:
        return 't_name_inits'
    return 't_name'

In [None]:
def replace_redacted(text):
    """Return ``text`` with name-type placeholders swapped for name tokens.

    Every ``[** ... **]`` span found in ``text`` is handed to ``replace_name``,
    which supplies the replacement for that span.
    """
    return re.sub(r'\[\*\*(.*?)\*\*\]', replace_name, text, flags=re.IGNORECASE)

In [42]:
# Pick one note at random from the whole frame. Bounding the draw by len(df)
# avoids an out-of-range position when 50 or fewer notes were loaded and lets
# notes beyond position 50 be sampled too.
test = df.iloc[random.randrange(len(df))]['text']
print(test)

TITLE:
   DIVISION OF CARDIOLOGY COMPREHENSIVE NOTE
                 Initial Visit, Electrophysiology Service
   .
   Reason for consult: ?AVNRT vs AVRT
   .
   EVENTS / HISTORY OF PRESENTING ILLNESS:
   73 yo Spanish speaking F with COPD on home O2, diastolic CHF, right TKR
   one month ago who presented to the ED with shortness of breath. In the
   ED she had an episode of SVT (of which she had another episode as
   recently [**3-26**] in the ED, broken with adenosine) that broke with high
   flow oxygen therapy alone. She was initially admitted to the MICU where
   she had another episode of SVT that also broke with O2 therapy.
   [**Hospital **] transfered to the floor but again had an episode of SVT to
   120's and respiratory distress and transfered back to the MICU. In the
   MICU she was noted to have this episoded of SVT in the setting of cough
   spells despite having started diltiazem 30 mg PO QID on [**2153-4-8**]. Given
   her history of dCHF the medicine team is worried t

In [43]:
# Matches any [** ... **] placeholder; the lazy group captures its content.
pat_all = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

# Print every placeholder match found in the sampled note.
for m in pat_all.finditer(test):
    print(m)

<_sre.SRE_Match object; span=(431, 441), match='[**3-26**]'>
<_sre.SRE_Match object; span=(642, 657), match='[**Hospital **]'>
<_sre.SRE_Match object; span=(925, 939), match='[**2153-4-8**]'>
<_sre.SRE_Match object; span=(1225, 1235), match='[**3-26**]'>
<_sre.SRE_Match object; span=(1644, 1661), match='[**Hospital1 7**]'>
<_sre.SRE_Match object; span=(1775, 1792), match='[**Hospital1 7**]'>
<_sre.SRE_Match object; span=(2058, 2075), match='[**Hospital1 7**]'>
<_sre.SRE_Match object; span=(2917, 2934), match='[**Hospital1 7**]'>
<_sre.SRE_Match object; span=(3515, 3528), match='[**12/2151**]'>
<_sre.SRE_Match object; span=(3682, 3696), match='[**2153-3-5**]'>
<_sre.SRE_Match object; span=(4605, 4619), match='[**2153-4-8**]'>
<_sre.SRE_Match object; span=(4727, 4743), match='[**2151-12-16**]'>
<_sre.SRE_Match object; span=(6068, 6097), match='[**Last Name (STitle) 5152**]'>
<_sre.SRE_Match object; span=(6202, 6231), match='[**Last Name (STitle) 5152**]'>
<_sre.SRE_Match object; span=(66

In [44]:
# Scrub the sampled note. The commented line is the same substitution written
# out directly with pat_all and replace_name.
# out = pat_all.sub(replace_name, test)
out = replace_redacted(test)

In [45]:
# Show the scrubbed note so it can be compared with the original text.
print(out)

TITLE:
   DIVISION OF CARDIOLOGY COMPREHENSIVE NOTE
                 Initial Visit, Electrophysiology Service
   .
   Reason for consult: ?AVNRT vs AVRT
   .
   EVENTS / HISTORY OF PRESENTING ILLNESS:
   73 yo Spanish speaking F with COPD on home O2, diastolic CHF, right TKR
   one month ago who presented to the ED with shortness of breath. In the
   ED she had an episode of SVT (of which she had another episode as
   recently [**3-26**] in the ED, broken with adenosine) that broke with high
   flow oxygen therapy alone. She was initially admitted to the MICU where
   she had another episode of SVT that also broke with O2 therapy.
   [**Hospital **] transfered to the floor but again had an episode of SVT to
   120's and respiratory distress and transfered back to the MICU. In the
   MICU she was noted to have this episoded of SVT in the setting of cough
   spells despite having started diltiazem 30 mg PO QID on [**2153-4-8**]. Given
   her history of dCHF the medicine team is worried t

In [None]:
# Same pattern and listing as the earlier pat_all cell: matches any
# [** ... **] placeholder and prints each match found in the sampled note.
pat_all = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

for m in pat_all.finditer(test):
    print(m)

In [None]:
# Matches a single [** ... **] placeholder whose content contains "name"
# (case-insensitive). Both tempered dots stop at any "[**" or "**]", so a match
# cannot start at one placeholder, run through plain text that happens to
# contain "name", and end at a later placeholder's "**]".
pat_name = re.compile(
    r'(\[\*\*(?:(?!\[\*\*|\*\*\]).)*?name(?:(?!\[\*\*|\*\*\]).)*?\*\*\])',
    re.IGNORECASE,
)

# Anything printed here is a name-type placeholder left in the scrubbed sample.
for m in pat_name.finditer(out):
    print(m)

In [None]:
# Scrub every note; the result goes in a new 'scrubbed' column and 'text' is kept as is.
df['scrubbed'] = df['text'].apply(replace_redacted)

In [None]:
# True only when no scrubbed note has any pat_name match left.
np.all(df['scrubbed'].apply(lambda t: len(pat_name.findall(t))).values == 0)

In [None]:
# Print, for each note, how many pat_name matches remain in its scrubbed text.
for _, row in df.iterrows():
    print(len(pat_name.findall(row['scrubbed'])))

In [None]:
# List any pat_name matches still present in the scrubbed sample note...
for m in pat_name.finditer(out):
    print(m)

# ...then print the scrubbed sample itself.
print(out)