# MIMIC Notes Pre-Processing

Pre-processing MIMIC notes for language model and CUIs

## Imports and Inits

In [1]:
import pandas as pd
import psycopg2
import numpy as np
import re
import random

from pathlib import Path

In [2]:
# Relative path to a 'data' directory; no cell visible in this notebook reads it yet.
PATH = Path('data')

## Functions

## Grab sample data from MIMIC

In [3]:
# cats.csv supplies one row per note category, with its name and note count.
cats = pd.read_csv('cats.csv')
# Per-category cap on the number of notes to pull; a value <= 0 disables the cap.
max_limit = 10

# One SQL statement per category, in the order the categories appear in cats.csv.
queries = []
for category, n_notes in zip(cats['category'], cats['number_of_notes']):
    if max_limit > 0:
        limit = min(max_limit, n_notes)
    else:
        limit = n_notes

    if limit != max_limit:
        # The cap does not bind for this category: select every note.
        q = f"""
        select category, text from correctnotes where category=\'{category}\';
        """
    else:
        # The cap binds: select a random sample of `limit` notes.
        q = f"""
        select category, text from correctnotes where category=\'{category}\' order by random() limit {limit};
        """
    queries.append(q)

In [4]:
# limit = 50
# queries = [
#     f"""
#     select category, text from correctnotes where category=\'{cats.iloc[7]['category']}\' order by random() limit {limit}
#     """
# ]

In [5]:
%%time
dfs = []

con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')
for q in queries:
    df = pd.read_sql_query(q, con)
    dfs.append(df)
con.close()
    
df = pd.concat(dfs)
# df.set_index('row_id', inplace=True)
print(df.shape)

(140, 2)
CPU times: user 26.1 ms, sys: 644 µs, total: 26.7 ms
Wall time: 2.09 s


1. Get list of redacted types using `re`
2. Replace each one with the appropriate placeholder token

Redacted items:
* [x] First Name: `[**First Name (Titles) 137**]`, `t_firstname`
* [x] Last Name: `[**Last Name (Titles) **]`, `t_lastname`
* [x] Initials: `[**Initials (NamePattern4) **]`, `t_initials`
* [x] Name: `[**Name (NI) **]`, `t_name`
* [x] Doctor First Name: `[**Doctor First Name 1266**]`, `t_doctor_firstname`
* [x] Doctor Last Name: `[**Doctor Last Name 1266**]`, `t_doctor_lastname`
* [x] Known Last Name: `[**Known lastname 658**]`, `t_lastname`
* [x] Hospital: `[**Hospital1 **]`, `t_hospital`
* [x] Company: `[**Company 12924**]`, `t_workplace`
* [x] Date of format YYYY-M-DD: `[**2112-4-18**]`, `t_fulldate`
* [x] Year: `[**Year (4 digits) **]`, `t_year`
* [x] Date of format M-DD: `[**6-12**]`, `t_monthday`
* [x] Month/Day: `[**Month/Day (2) 509**]`, `t_monthday`
* [x] Month (only): `[**Month (only) 51**]`, `t_month`
* [x] Country: `[**Country 9958**]`, `t_country`
* [x] Location: `[**Location (un) 2432**]`, `t_location`
* [x] Telephone/Fax: `[**Telephone/Fax (3) 8049**]`, `t_phone`
* [x] Clip Number: `[**Clip Number (Radiology) 29923**]`, `t_radclip_id`
* [x] Pager Numeric Identifier: `[**Numeric Identifier 6403**]`, `t_pager_id`
* Just numbers: `[** 7901**]`
* Wardname
* Pharmacy MD Number
* Age over: `[**Age over 90 212**]`
* NULL

In [171]:
def redacorator(func):
    """Adapt a ``(text, ori)`` replacer into an ``re`` substitution callback.

    The returned callable accepts a match object and invokes ``func`` with the
    lower-cased matched string as ``text`` and the matched string as-is as
    ``ori``, returning whatever ``func`` returns.
    """
    def replace(match):
        matched = match.group()
        return func(matched.lower(), matched)
    return replace

@redacorator
def replace_name(text, ori):
    """Map a name-type redaction to its placeholder token.

    ``text`` is the lower-cased redaction and ``ori`` the redaction as found.
    Returns ``ori`` unchanged when ``text`` does not contain 'name'.
    Otherwise 'last' is checked before 'first', each with a doctor-specific
    variant when 'doctor' is present, then 'initials'; anything else that
    contains 'name' becomes ``t_name``.
    """
    if 'name' not in text:
        return ori

    is_doctor = 'doctor' in text
    if 'last' in text:
        return 't_doctor_lastname' if is_doctor else 't_lastname'
    if 'first' in text:
        return 't_doctor_firstname' if is_doctor else 't_firstname'
    if 'initials' in text:
        return 't_initials'
    return 't_name'

@redacorator
def replace_place(text, ori):
    """Map a place-type redaction to its placeholder token.

    ``text`` is the lower-cased redaction and ``ori`` the redaction as found.
    Returns ``ori`` unchanged when none of the keywords occurs in ``text``.
    """
    # Checked in order; the first keyword found in the text decides the token.
    keyword_tokens = (
        ('hospital', 't_hospital'),
        ('company', 't_workplace'),
        ('location', 't_location'),
        ('country', 't_country'),
        ('street address', 't_address'),
    )
    for keyword, token in keyword_tokens:
        if keyword in text:
            return token
    return ori

@redacorator
def replace_dates(text, ori):
    """Map a date-type redaction to its placeholder token.

    ``text`` is the lower-cased redaction and ``ori`` the redaction as found.
    Checks run in this order: 'year' -> ``t_year``; a four-digit year followed
    by two hyphen-separated parts -> ``t_fulldate``; a hyphen with a digit on
    at least one side, or the label 'month/day' -> ``t_monthday``; 'month' ->
    ``t_month``. Returns ``ori`` unchanged when nothing applies.
    """
    if 'year' in text:
        return 't_year'
    if re.search(r'\d{4}-\d{0,2}-\d{0,2}', text):
        return 't_fulldate'
    # The previous pattern, \d{0,2}-\d{0,2}, also matched a lone '-', so any
    # redaction containing a hyphen (for instance a hyphenated label) was
    # tagged as a month/day. Requiring a digit next to the hyphen still accepts
    # partial dates such as '6-12' or '1-/7436'.
    if re.search(r'\d-|-\d', text) or 'month/day' in text:
        return 't_monthday'
    if 'month' in text:
        return 't_month'
    return ori

@redacorator
def replace_comm(text, ori):
    """Map a telephone/fax redaction to ``t_phone``.

    ``text`` is the lower-cased redaction and ``ori`` the redaction as found;
    ``ori`` is returned unchanged when 'telephone/fax' is not in ``text``.
    """
    return 't_phone' if 'telephone/fax' in text else ori

@redacorator
def replace_ids(text, ori):
    """Map an identifier-type redaction to its placeholder token.

    ``text`` is the lower-cased redaction and ``ori`` the redaction as found.
    'numeric identifier' is checked before '(radiology)'; ``ori`` is returned
    unchanged when neither occurs in ``text``.
    """
    if 'numeric identifier' in text:
        return 't_pager_id'
    if '(radiology)' in text:
        return 't_radclip_id'
    return ori

In [185]:
def replace_redacted(text):
    """Replace ``[** ... **]`` redaction markers in ``text`` with placeholder tokens.

    Each replacer is applied in its own pass over the text, in the order
    names, places, dates, communication, ids. A marker that no replacer
    recognises is left in the text as it was.
    """
    redaction = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

    replacers = (
        replace_name,   # name types
        replace_place,  # place types
        replace_dates,  # date types
        replace_comm,   # communication types
        replace_ids,    # id types
    )
    for replacer in replacers:
        text = redaction.sub(replacer, text)

    return text

def misc_scrub(text):
    """Normalise a few common clinical abbreviations in ``text``.

    Applied in order:
      1. "year old" variants (y.o., y/o, years old, year old, yearold)
         become ``year_old`` (case-insensitive).
      2. yr, yrs, yr's become ``years`` (case-insensitive, every occurrence).
      3. Pt / pt (with an optional abbreviation period) and IN PT / OUT PT /
         OT PT become ``patient``.

    Returns the rewritten string.
    """
    # replace different types of "year old" with year_old
    # matches: y.o., y/o, years old. year old, yearold
    text = re.sub(r'\byears? ?old\b|\by(?:o|r)*[ ./-]*o(?:ld)?\b', 'year_old',
                  text, flags=re.IGNORECASE)

    # replaces yr, yr's, yrs with years
    # flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.IGNORECASE capped the number of
    # substitutions at 2 and left the match case-sensitive.
    text = re.sub(r'\byr[\'s]*\b', 'years', text, flags=re.IGNORECASE)

    # replace Pt and pt with patient, and IN/OUT/OT PT with patient
    # Note: PT also refers to physical therapy and physical therapist
    # The trailing \b stops this from matching inside longer words such as
    # "ptosis", and only a literal '.' is consumed after the abbreviation, so
    # the following space or letter is no longer swallowed.
    text = re.sub(r'\b[Pp]t\b\.?|\b(?:IN|OU?T) PT\b', 'patient', text)

    return text

In [181]:
def scrub_text(text):
    """Run the full scrubbing pipeline on one note and return the result.

    Redaction markers are replaced with placeholder tokens first; the
    miscellaneous abbreviation scrubbing then runs on that output.
    """
    return misc_scrub(replace_redacted(text))

In [None]:
# Matches one redaction marker: '[**', the shortest possible run of characters
# on the same line, then '**]'. This is the same pattern replace_redacted uses;
# re.IGNORECASE has no effect because the pattern contains no letters.
pat = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

In [239]:
# Pick one note at random and list every redaction marker it contains.
# random.randint(0, 140) is inclusive at both ends, so on the 140-row frame it
# could return 140 and raise IndexError; randrange(len(df)) always stays in
# range and follows the frame's actual size.
test = df.iloc[random.randrange(len(df))]['text']
for m in pat.finditer(test):
    print(m)

<_sre.SRE_Match object; span=(38, 58), match='[**Hospital3 6401**]'>
<_sre.SRE_Match object; span=(62, 86), match='[**Location (un) 3356**]'>
<_sre.SRE_Match object; span=(231, 242), match='[**10-29**]'>
<_sre.SRE_Match object; span=(244, 254), match='[**11-2**]'>
<_sre.SRE_Match object; span=(258, 293), match='[**First Name8 (NamePattern2) 16**]'>
<_sre.SRE_Match object; span=(297, 328), match='[**Last Name (NamePattern1) **]'>
<_sre.SRE_Match object; span=(329, 360), match='[**Last Name (Prefixes) 6402**]'>
<_sre.SRE_Match object; span=(385, 398), match='[**1-/7436**]'>
<_sre.SRE_Match object; span=(408, 437), match='[**Numeric Identifier 6403**]'>


In [233]:
# out = pat_all.sub(replace_name, test)
# Scrub the sample note, then list any redaction markers that scrub_text left
# in place (markers that none of the replacers recognised).
out = scrub_text(test)
for m in pat.finditer(out):
    print(m)

<_sre.SRE_Match object; span=(47, 58), match='[** 7901**]'>


In [234]:
# Show the sample note before scrubbing, for comparison with `out`.
print(test)

Family meeting held again today with pt
s [** 7901**] [**Name (NI) 11659**] and
   [**Name (NI) 11660**].  SICU and neurology teams updated family on pt
s condition
   and asked for their direction in terms of moving forward.  [**Name (NI) **]
   will discuss today
s meeting with the extended family, pt has never
   talked about end of life issue
s with her [**Name (NI) 7901**].    [**Name (NI) **]
   appropriately tearful, asking good questions for clarification and
   working well with the teams.



In [235]:
# Show the sample note after scrub_text, for comparison with `test`.
print(out)

Family meeting held again today with patient
s [** 7901**] t_name and
   t_name.  SICU and neurology teams updated family on patient
s condition
   and asked for their direction in terms of moving forward.  t_name
   will discuss today
s meeting with the extended family, patienthas never
   talked about end of life issue
s with her t_name.    t_name
   appropriately tearful, asking good questions for clarification and
   working well with the teams.



In [None]:
# Adds a 'scrubbed' column holding each note after replace_redacted only;
# scrub_text is not used here, so misc_scrub's abbreviation rewrites are not
# applied to this column.
df['scrubbed'] = df['text'].apply(replace_redacted)

In [None]:
# A redaction marker whose body contains "name" (any case), without spanning
# into a following marker.
pat_name = re.compile(r'(\[\*\*(?:(?!\[\*\*).)*?name.*?\*\*\])', re.IGNORECASE)

# Prints True when no scrubbed note still contains a name-type marker.
leftover_name_counts = df['scrubbed'].apply(lambda t: len(pat_name.findall(t)))
print(np.all(leftover_name_counts.values == 0))

# pat_hos = re.compile(r'(\[\*\*(?:(?!\[\*\*).)*?hospital.*?\*\*\])', re.IGNORECASE)
# print(np.all(df['scrubbed'].apply(lambda t: len(pat_hos.findall(t))).values == 0))

In [None]:
# Print the name-type markers still present in each scrubbed note, skipping
# notes that have none.
for _, row in df.iterrows():
    leftover = pat_name.findall(row['scrubbed'])
    if leftover:
        print(leftover)

In [58]:
# Sample full-date string used to try out the date regex in the next cell.
x = "2118-4-18"

In [60]:
# Check that the full-date pattern used in replace_dates matches the sample
# string. For re.search the third positional argument is `flags`, so passing
# re.IGNORECASE here is valid, though it has no effect on a pattern without letters.
re.search(r'\d{4}-\d{0,2}-\d{0,2}', x, re.IGNORECASE)

<_sre.SRE_Match object; span=(0, 9), match='2118-4-18'>