# MIMIC Notes Pre-Processing

Pre-processing MIMIC notes for language model and CUIs

## Imports and Inits

In [1]:
import pandas as pd
import psycopg2
import numpy as np
import re
import random

from pathlib import Path

In [2]:
# Relative path to the `data` directory (resolved against the working directory).
PATH = Path('data')

## Functions

## Grab sample data from MIMIC

In [3]:
# Per-category note counts; the `category` and `number_of_notes` columns are read below.
cats = pd.read_csv('cats.csv')
# Cap on notes fetched per category; with max_limit <= 0 the cap is bypassed
# (limit falls back to the category's own note count).
max_limit = 4

queries = []
for category, n_notes in zip(cats['category'], cats['number_of_notes']):
    # Double any embedded single quote so the value stays a valid SQL string
    # literal (a raw quote would terminate the literal and break the query).
    category_sql = str(category).replace("'", "''")
    limit = min(max_limit, n_notes) if max_limit > 0 else n_notes
    if limit == max_limit:
        # The category has at least `max_limit` notes: draw a random sample.
        q = f"""
        select category, text from correctnotes where category='{category_sql}' order by random() limit {limit};
        """
    else:
        # Fewer notes than the cap (or no cap): fetch every note.
        q = f"""
        select category, text from correctnotes where category='{category_sql}';
        """
    queries.append(q)

In [4]:
# limit = 50
# queries = [
#     f"""
#     select category, text from correctnotes where category=\'{cats.iloc[7]['category']}\' order by random() limit {limit}
#     """
# ]

In [5]:
# Run every prepared query against the `mimic` PostgreSQL database and stack
# the per-query results into a single frame.
con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')
try:
    # One result frame per query; the name `df` is reserved for the combined frame.
    dfs = [pd.read_sql_query(q, con) for q in queries]
finally:
    # Release the connection even when a query raises.
    con.close()

df = pd.concat(dfs)
# df.set_index('row_id', inplace=True)
df.shape

(56, 2)

1. Get the list of redacted placeholder types using `re`
2. Replace each of them with an appropriate placeholder token

Redacted items:
* ~~First Name~~
* ~~Last Name~~
* Hospital
* Date of format M-DD
* Date of format YYYY-M-DD
* ~~Initials~~
* ~~Name~~
* NULL
* ~~Known lastname~~
* ~~Doctor First Name~~
* Doctor Last Name
* Month (only)
* Just numbers
* Location
* Month/Day
* Telephone/Fax
* Wardname
* Pager Numeric Identifier
* Pharmacy MD Number

In [27]:
# Pick one note at random. Sampling over the actual row count keeps every row
# reachable and avoids an IndexError when the frame has fewer than 51 rows.
test = df.iloc[random.randrange(len(df))]['text']
print(test)

Chief Complaint:
   24 Hour Events:
   - increased lamivudine to 150mg [**Hospital1 7**] (appropriate for his CrCl)
   - diet advanced to clears
   - called out to medical floor, but then called back in for
   bradycardia/hypotension
   - given one dose of nadolol 20mg at 5pm, then HRs decreasing to high
   40s-50s, SBP decreased to 70s, nadolol discontinued, given 1L NS bolus
   without response in BP so started on low dose levophed, BCx drawn. Hct
   down from 25-27 range down to 23.9 -> 1 unit pRBCs
   History obtained from Patient
   Allergies:
   History obtained from PatientSulfa (Sulfonamide Antibiotics)
   Rash;
   Penicillins
   Unknown;
   Last dose of Antibiotics:
   Ciprofloxacin - [**2200-3-18**] 06:41 PM
   Ceftriaxone - [**2200-3-19**] 08:11 AM
   Infusions:
   Norepinephrine - 0.06 mcg/Kg/min
   Other ICU medications:
   Pantoprazole (Protonix) - [**2200-3-19**] 08:00 PM
   Other medications:
   Changes to medical and family history:
   Review of systems is unchanged fr

In [28]:
# Generic pattern: any `[** ... **]` placeholder, capturing the enclosed text.
pat_all = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

# List every placeholder occurrence found in the sample note.
for placeholder in pat_all.finditer(test):
    print(placeholder)

<_sre.SRE_Match object; span=(71, 88), match='[**Hospital1 7**]'>
<_sre.SRE_Match object; span=(703, 718), match='[**2200-3-18**]'>
<_sre.SRE_Match object; span=(745, 760), match='[**2200-3-19**]'>
<_sre.SRE_Match object; span=(875, 890), match='[**2200-3-19**]'>
<_sre.SRE_Match object; span=(1082, 1097), match='[**2200-3-20**]'>
<_sre.SRE_Match object; span=(1309, 1317), match='[**01**]'>
<_sre.SRE_Match object; span=(3520, 3535), match='[**2200-3-18**]'>
<_sre.SRE_Match object; span=(3575, 3590), match='[**2200-3-18**]'>
<_sre.SRE_Match object; span=(3630, 3645), match='[**2200-3-18**]'>
<_sre.SRE_Match object; span=(3685, 3700), match='[**2200-3-19**]'>
<_sre.SRE_Match object; span=(3740, 3755), match='[**2200-3-19**]'>
<_sre.SRE_Match object; span=(3795, 3810), match='[**2200-3-19**]'>
<_sre.SRE_Match object; span=(3850, 3865), match='[**2200-3-19**]'>
<_sre.SRE_Match object; span=(3905, 3920), match='[**2200-3-20**]'>
<_sre.SRE_Match object; span=(4721, 4738), match='[**Hospital1 

In [29]:
# Name-type placeholders: `[** ... **]` whose enclosed text contains "name"
# (any case). The tempered prefix `(?:(?!\*\*\]).)*?` cannot run past a closing
# `**]`, so a match never starts at an earlier, unrelated placeholder that
# happens to sit on the same line.
pat_name = re.compile(r'\[\*\*((?:(?!\*\*\]).)*?Name.*?)\*\*\]', re.IGNORECASE)

for m in pat_name.finditer(test):
    print(m)

<_sre.SRE_Match object; span=(5060, 5089), match='[**Last Name (STitle) 1174**]'>
<_sre.SRE_Match object; span=(5623, 5639), match='[**Name (NI) **]'>
<_sre.SRE_Match object; span=(5674, 5705), match='[**Last Name (NamePattern4) **]'>
<_sre.SRE_Match object; span=(5707, 5736), match='[**Last Name (STitle) 2956**]'>
<_sre.SRE_Match object; span=(6578, 6597), match='[**Name (NI) 135**]'>
<_sre.SRE_Match object; span=(6598, 6614), match='[**Name (NI) **]'>
<_sre.SRE_Match object; span=(6615, 6634), match='[**Name (NI) 151**]'>


In [6]:
def replace_name(match):
    """Return the replacement token for a matched name placeholder.

    The keywords are looked up case-insensitively in the full matched text,
    which keeps this function consistent with the case-insensitive pattern
    that feeds it.

    Args:
        match: regex match object whose ``group()`` is the whole placeholder.

    Returns:
        ``'t_name_doc_last'`` / ``'t_name_last'`` when the text contains
        "last" (with / without "doctor"), ``'t_name_doc_first'`` /
        ``'t_name_first'`` when it contains "first", ``'t_name_inits'`` when
        it contains "initials", and ``'t_name'`` otherwise. "last" takes
        precedence over "first", which takes precedence over "initials".
    """
    # Lowercase once instead of re-reading match.group() for each keyword.
    label = match.group().lower()
    is_doctor = 'doctor' in label

    if 'last' in label:
        return 't_name_doc_last' if is_doctor else 't_name_last'
    if 'first' in label:
        return 't_name_doc_first' if is_doctor else 't_name_first'
    if 'initials' in label:
        return 't_name_inits'
    return 't_name'

In [7]:
# Name-type placeholder: `[** ... **]` whose enclosed text contains "name"
# (any case). The tempered prefix `(?:(?!\*\*\]).)*?` cannot cross a closing
# `**]`; with a plain `.*?` a match could begin at an earlier non-name
# placeholder on the same line and the substitution would swallow that
# placeholder together with all the text up to the name placeholder.
_PAT_NAME = re.compile(r'\[\*\*((?:(?!\*\*\]).)*?Name.*?)\*\*\]', re.IGNORECASE)


def replace_redacted(text):
    """Replace redacted name placeholders in ``text`` with name tokens.

    Each `[** ... **]` placeholder whose text contains "name" is replaced by
    the token that ``replace_name`` returns for it; everything else in the
    text is left unchanged.

    NOTE(review): any placeholder containing "name" matches, so e.g. a ward
    name placeholder would also be rewritten to a name token -- confirm this
    is intended.

    Args:
        text: the note text to scrub.

    Returns:
        The text with name placeholders substituted.
    """
    # replace name types with unique tokens
    return _PAT_NAME.sub(replace_name, text)

In [15]:
# New `scrubbed` column: every note text passed through `replace_redacted`.
df['scrubbed'] = df['text'].map(replace_redacted)

In [32]:
# Name-type placeholders: `[** ... **]` whose enclosed text contains "name"
# (any case). The tempered prefix `(?:(?!\*\*\]).)*?` cannot cross a closing
# `**]`, so a non-name placeholder followed by the word "name" on the same
# line is not reported as a leftover name placeholder.
pat_name = re.compile(r'\[\*\*((?:(?!\*\*\]).)*?Name.*?)\*\*\]', re.IGNORECASE)

In [34]:
# Sanity check: print, per note, how many name placeholders survive scrubbing.
for scrubbed_text in df['scrubbed']:
    remaining = pat_name.findall(scrubbed_text)
    print(len(remaining))

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [None]:
# Scrub the single sample note; the commented-out line below is an earlier
# variant that called the substitution on `pat_name` directly.
# out = pat_name.sub(replace_name, test)
out = replace_redacted(test)

In [None]:
# Any name placeholder still present in the scrubbed sample is listed here.
for leftover in pat_name.finditer(out):
    print(leftover)

# Show the scrubbed note in full.
print(out)