# MIMIC Notes Pre-Processing

Pre-processing MIMIC notes for language model and CUIs

## Imports and Inits

In [1]:
import pandas as pd
import psycopg2
import numpy as np
import re
import random

from pathlib import Path

In [2]:
# Relative path object for the "data" directory under the current working directory.
PATH = Path('data')

## Functions

## Grab sample data from MIMIC

In [3]:
# Per-category note counts; the loop below reads the `category` and
# `number_of_notes` columns of this file.
cats = pd.read_csv('cats.csv')
# Cap on notes sampled per category; when it is not positive, `limit` falls
# back to the category's full note count.
max_limit = 10

queries = []
for category, n_notes in zip(cats['category'], cats['number_of_notes']):
    if max_limit > 0:
        limit = min(max_limit, n_notes)
    else:
        limit = n_notes

    if limit != max_limit:
        # The category does not reach the cap: fetch all of its notes.
        q = f"""
        select category, text from correctnotes where category=\'{category}\';
        """
    else:
        # The category reaches the cap: draw a random sample of `limit` notes.
        q = f"""
        select category, text from correctnotes where category=\'{category}\' order by random() limit {limit};
        """
    queries.append(q)

In [4]:
# limit = 50
# queries = [
#     f"""
#     select category, text from correctnotes where category=\'{cats.iloc[7]['category']}\' order by random() limit {limit}
#     """
# ]

In [5]:
%%time
# Run every per-category query over a single connection. The try/finally makes
# sure the connection is closed even when one of the queries raises; without it
# a failing query would leave the connection open for the rest of the session.
con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')
try:
    dfs = [pd.read_sql_query(q, con) for q in queries]
finally:
    con.close()

# Stack the per-category frames into one frame (original per-query row labels are kept).
df = pd.concat(dfs)
# df.set_index('row_id', inplace=True)
print(df.shape)

(140, 2)
CPU times: user 26.1 ms, sys: 644 µs, total: 26.7 ms
Wall time: 2.09 s


1. Get the list of redacted types using `re`
2. Replace each one with an appropriate placeholder token

Redacted items:
* ~~First Name~~
* ~~Last Name~~
* Hospital
* Date of format M-DD
* Date of format YYYY-M-DD
* ~~Initials~~
* ~~Name~~
* NULL
* ~~Known lastname~~
* ~~Doctor First Name~~
* Doctor Last Name
* Month (only)
* Just numbers
* Location
* Month/Day
* Telephone/Fax
* Wardname
* Pager Numeric Identifier
* Pharmacy MD Number

Redacted items:
* [x] First Name: `[**First Name (Titles) 137**]`, `t_firstname`
* [x] Last Name: `[**Last Name (Titles) **]`, `t_lastname`
* [x] Initials: `[**Initials (NamePattern4) **]`, `t_initials`
* [x] Name: `[**Name (NI) **]`, `t_name`
* [x] Doctor First Name: `[**Doctor First Name 1266**]`, `t_doctor_firstname`
* [x] Doctor Last Name: `[**Doctor Last Name 1266**]`, `t_doctor_lastname`
* [x] Known Last Name: `[**Known lastname 658**]`, `t_lastname`
* [x] Hospital: `[**Hospital1 **]`, `t_hospital`
* [x] Company: `[**Company 12924**]`, `t_workplace`
* Country: `[**Country 9958**]`
* Date of format YYYY-M-DD: `[**2112-4-18**]`
* Year: `[**Year (4 digits) **]`
* Date of format M-DD: `[**6-12**]`
* Month/Day: `[**Month/Day (2) 509**]`
* Month (only): `[**Month (only) 51**]`
* Just numbers
* Location
* Month/Day
* Telephone/Fax: `[**Telephone/Fax (3) 8049**]`
* Wardname
* Pager Numeric Identifier
* Pharmacy MD Number
* Clip Number: `[**Clip Number (Radiology) 29923**]`
* NULL

In [6]:
# def replace_name(match):
#     r = match.group()
#     grp = match.group().lower()
#     if 'name' in grp:
#         r = 't_name'
#         if 'last' in grp:
#             if 'doctor' in grp:
#                 r = 't_doctor_lastname'
#             else:
#                 r = 't_lastname'
#         elif 'first' in grp:
#             if 'doctor' in grp:
#                 r = 't_doctor_firstname'
#             else:
#                 r = 't_firstname'
#         elif 'initials' in grp:
#             r = 't_initials'
#     return r

# def replace_place(match):
#     r = match.group()
#     grp = match.group().lower()
#     if 'hospital' in grp:
#         r = 't_hospital'
#     elif 'company' in grp:
#         r = 't_workplace'
#     elif 'location' in grp:
#         r = 't_location'
#     return r

In [68]:
def redacorator(func):
    """Adapt a ``func(text, ori)`` replacer into a callback usable with ``re.sub``.

    The returned callable takes a regex match object and calls ``func`` with
    the lower-cased matched string as ``text`` and the unmodified matched
    string as ``ori``, returning whatever ``func`` returns.
    """
    def replace(match):
        matched = match.group()
        return func(matched.lower(), matched)
    return replace

@redacorator
def replace_name(text, ori):
    """Map a name-type placeholder to its token.

    ``text`` is the lower-cased placeholder and ``ori`` the original one.
    Placeholders that do not contain "name" are returned unchanged. Otherwise
    the token is chosen by the first of "last", "first", "initials" found in
    the text (with a doctor-specific variant for last/first names), falling
    back to the generic ``t_name``.
    """
    if 'name' not in text:
        return ori

    is_doctor = 'doctor' in text
    if 'last' in text:
        return 't_doctor_lastname' if is_doctor else 't_lastname'
    if 'first' in text:
        return 't_doctor_firstname' if is_doctor else 't_firstname'
    if 'initials' in text:
        return 't_initials'
    return 't_name'

@redacorator
def replace_place(text, ori):
    """Map a place-type placeholder to its token.

    ``text`` is the lower-cased placeholder and ``ori`` the original one.
    The keywords are checked in order (hospital, company, location) and the
    first one found decides the token; with no keyword present, ``ori`` is
    returned unchanged.
    """
    keyword_tokens = (
        ('hospital', 't_hospital'),
        ('company', 't_workplace'),
        ('location', 't_location'),
    )
    for keyword, token in keyword_tokens:
        if keyword in text:
            return token
    return ori

@redacorator
def replace_dates(text, ori):
    """Map a date-type placeholder to its token.

    ``text`` is the lower-cased placeholder and ``ori`` the original one.
    Checks run in this order, first hit wins:

    * contains "year"                  -> ``t_year``
    * contains a YYYY-M-DD style date  -> ``t_fulldate``
    * contains an M-DD style date or "month/day" -> ``t_monthday``
    * contains "month"                 -> ``t_month``

    Anything else is returned unchanged as ``ori``.
    """
    if 'year' in text:
        return 't_year'
    # Each date field needs at least one digit. The former `{0,2}` quantifiers
    # also accepted empty fields, so a bare "-" (as in "e-mail") counted as a
    # month-day date and "2107--" counted as a full date.
    if re.search(r'\d{4}-\d{1,2}-\d{1,2}', text):
        return 't_fulldate'
    # Word boundaries keep the M-DD check from firing inside longer digit runs
    # such as "123-45".
    if 'month/day' in text or re.search(r'\b\d{1,2}-\d{1,2}\b', text):
        return 't_monthday'
    if 'month' in text:
        return 't_month'
    return ori

In [69]:
def replace_redacted(text):
    """Replace ``[**...**]`` placeholders in ``text`` with holder tokens.

    The name, place and date replacers are applied as three successive
    substitution passes. A placeholder rewritten by an earlier pass no longer
    matches the pattern, so earlier replacers take precedence; placeholders no
    replacer recognises are left as they are. Returns the rewritten string.
    """
    placeholder = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

    for replacer in (replace_name, replace_place, replace_dates):
        text = placeholder.sub(replacer, text)

    return text

In [None]:
# Pick one note at random. `randrange(len(df))` always yields a valid position,
# whereas `randint(0, 140)` includes 140 itself, which is out of bounds for a
# 140-row frame and breaks if the sample size changes.
test = df.iloc[random.randrange(len(df))]['text']

In [70]:
# Matches every placeholder of the form [**...**]; non-greedy, and `.` does not
# cross newlines, so a placeholder must sit on a single line to match.
pat = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

# List each placeholder found in the sample note together with its span.
for m in pat.finditer(test):
    print(m)

<_sre.SRE_Match object; span=(64, 78), match='[**2107-1-4**]'>
<_sre.SRE_Match object; span=(121, 135), match='[**2107-1-4**]'>
<_sre.SRE_Match object; span=(165, 179), match='[**2107-1-4**]'>
<_sre.SRE_Match object; span=(219, 233), match='[**2107-1-4**]'>
<_sre.SRE_Match object; span=(264, 278), match='[**2107-1-4**]'>
<_sre.SRE_Match object; span=(308, 322), match='[**2107-1-5**]'>
<_sre.SRE_Match object; span=(819, 836), match='[**Hospital 75**]'>
<_sre.SRE_Match object; span=(942, 956), match='[**2107-1-4**]'>
<_sre.SRE_Match object; span=(982, 996), match='[**2107-1-4**]'>
<_sre.SRE_Match object; span=(1043, 1057), match='[**2107-1-5**]'>
<_sre.SRE_Match object; span=(1259, 1273), match='[**2107-1-4**]'>
<_sre.SRE_Match object; span=(1312, 1326), match='[**2107-1-5**]'>
<_sre.SRE_Match object; span=(1637, 1651), match='[**2107-1-5**]'>
<_sre.SRE_Match object; span=(1863, 1871), match='[**08**]'>
<_sre.SRE_Match object; span=(4395, 4409), match='[**2107-1-4**]'>
<_sre.SRE_Match ob

In [71]:
# out = pat_all.sub(replace_name, test)
# Scrub the sample note, then list the placeholders that none of the replacers handled.
out = replace_redacted(test)
for m in pat.finditer(out):
    print(m)

<_sre.SRE_Match object; span=(1808, 1816), match='[**08**]'>
<_sre.SRE_Match object; span=(4994, 5023), match='[**Telephone/Fax (3) 13007**]'>


In [73]:
# Show the original sample note, for comparison with its scrubbed version.
print(test)

Chief Complaint: sepsis
   24 Hour Events:
 BLOOD CULTURED - At [**2107-1-4**] 08:30 AM
   from CVL
 BLOOD CULTURED - At [**2107-1-4**] 09:45 AM
 URINE CULTURE - At [**2107-1-4**] 10:00 AM
 INVASIVE VENTILATION - START [**2107-1-4**] 01:15 PM
 SPUTUM CULTURE - At [**2107-1-4**] 10:06 PM
 STOOL CULTURE - At [**2107-1-5**] 03:23 AM
   -intubated yesterday am
   -on vanc zosyn and cipro
   -a-line unable to be places
   -insulin increased to 24 then to 32, plus 10 x2, sugars persistantly
   over 350, however gap closing
   -on max neo, cvp 10-12, edematous, BPs
   -ECG ordered for K 5.6, no peaked t waves, now resolved, got one dose
   kayexylate
   -kub for firmer abd, film difficult to interpret but had BM and seemed
   stable to improved, with improved bowel sounds
   History obtained from Patient, Family / [**Hospital 75**] Medical records
   Allergies:
   No Known Drug Allergies
   Last dose of Antibiotics:
   Ciprofloxacin - [**2107-1-4**] 05:00 PM
   Vancomycin - [**2107-1-4**] 08:

In [72]:
# Show the sample note after replace_redacted has been applied.
print(out)

Chief Complaint: sepsis
   24 Hour Events:
 BLOOD CULTURED - At t_fulldate 08:30 AM
   from CVL
 BLOOD CULTURED - At t_fulldate 09:45 AM
 URINE CULTURE - At t_fulldate 10:00 AM
 INVASIVE VENTILATION - START t_fulldate 01:15 PM
 SPUTUM CULTURE - At t_fulldate 10:06 PM
 STOOL CULTURE - At t_fulldate 03:23 AM
   -intubated yesterday am
   -on vanc zosyn and cipro
   -a-line unable to be places
   -insulin increased to 24 then to 32, plus 10 x2, sugars persistantly
   over 350, however gap closing
   -on max neo, cvp 10-12, edematous, BPs
   -ECG ordered for K 5.6, no peaked t waves, now resolved, got one dose
   kayexylate
   -kub for firmer abd, film difficult to interpret but had BM and seemed
   stable to improved, with improved bowel sounds
   History obtained from Patient, Family / t_hospital Medical records
   Allergies:
   No Known Drug Allergies
   Last dose of Antibiotics:
   Ciprofloxacin - t_fulldate 05:00 PM
   Vancomycin - t_fulldate 08:51 PM
   Piperacillin/Tazobactam (Zosyn

In [None]:
# Apply the placeholder replacement to every note and store the result in a new `scrubbed` column.
df['scrubbed'] = df['text'].apply(replace_redacted)

In [None]:
# Sanity check: no placeholder containing "name" should survive scrubbing.
# Both tempered tokens refuse to step over a closing `**]`, so a match is always
# confined to a single placeholder. The earlier pattern only guarded against
# `[**`, which let a match start at a leftover tag such as `[**08**]`, run
# through ordinary text containing "name", and end at a later tag.
pat_name = re.compile(r'(\[\*\*(?:(?!\*\*\]).)*?name(?:(?!\*\*\]).)*?\*\*\])', re.IGNORECASE)
print(np.all(df['scrubbed'].apply(lambda t: len(pat_name.findall(t))).values == 0))

# pat_hos = re.compile(r'(\[\*\*(?:(?!\[\*\*).)*?hospital.*?\*\*\])', re.IGNORECASE)
# print(np.all(df['scrubbed'].apply(lambda t: len(pat_hos.findall(t))).values == 0))

In [None]:
# Print the surviving name-type placeholders of every note that still has any.
for _, row in df.iterrows():
    leftover_tags = pat_name.findall(row['scrubbed'])
    if leftover_tags:
        print(leftover_tags)

In [58]:
# Sample full-date string used to try out the date regex in the next cell.
x = "2118-4-18"

In [60]:
# Check the full-date pattern against the sample string. Note that `{0,2}` also
# accepts empty month/day fields (e.g. "2118--"), and IGNORECASE has no effect
# on a pattern made only of digits and hyphens.
re.search(r'\d{4}-\d{0,2}-\d{0,2}', x, re.IGNORECASE)

<_sre.SRE_Match object; span=(0, 9), match='2118-4-18'>