# MIMIC Notes Pre-Processing

Pre-processing MIMIC notes for language model and CUIs

## Imports and Inits

In [1]:
import pandas as pd
import psycopg2
import numpy as np
import re
import random
import datetime

from pathlib import Path

In [2]:
from process_notes import *

In [3]:
# Relative directory 'data'; presumably the root for this notebook's data files
# (not referenced by any cell visible here) — TODO confirm.
PATH = Path('data')

## Functions

## Grab sample data from MIMIC

In [4]:
# One row per note category, with the number of notes available in it.
cats = pd.read_csv('cats.csv')
# Per-category sample size; a value <= 0 disables sampling.
max_limit = 100

# Build one SQL query per category: a random sample of `max_limit` notes when
# the cap applies, otherwise every note in the category.
# NOTE(review): `category` is interpolated straight into the SQL text, so a
# category containing a single quote would break the statement.
queries = []
for category, n_notes in zip(cats['category'], cats['number_of_notes']):
    limit = n_notes if max_limit <= 0 else min(max_limit, n_notes)
    if limit != max_limit:
        q = f"""
        select category, text from correctnotes where category=\'{category}\';
        """
    else:
        q = f"""
        select category, text from correctnotes where category=\'{category}\' order by random() limit {limit};
        """
    queries.append(q)

In [5]:
# limit = 50
# queries = [
#     f"""
#     select category, text from correctnotes where category=\'{cats.iloc[7]['category']}\' order by random() limit {limit}
#     """
# ]

In [6]:
%%time
# Run every per-category query and collect one DataFrame per query.
dfs = []

con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')
try:
    for q in queries:
        dfs.append(pd.read_sql_query(q, con))
finally:
    # Release the connection even when one of the queries fails part-way.
    con.close()

# Stack the per-category frames and renumber the rows 0..n-1.
df = pd.concat(dfs)
df.reset_index(inplace=True, drop=True)
# df.set_index('row_id', inplace=True)
print(df.shape)

(1398, 2)
CPU times: user 21.9 ms, sys: 11.9 ms, total: 33.8 ms
Wall time: 3.05 s


1. Get the list of redacted item types using `re`.
2. Replace each redacted item with the appropriate placeholder token.

Below is a list of redacted items with an example and the replacement token.

Redacted items:
* [x] First Name: `[**First Name (Titles) 137**]`, `t_firstname`
* [x] Last Name: `[**Last Name (Titles) **]`, `t_lastname`
* [x] Initials: `[**Initials (NamePattern4) **]`, `t_initials`
* [x] Name: `[**Name (NI) **]`, `t_name`
* [x] Doctor First Name: `[**Doctor First Name 1266**]`, `t_doctor_firstname`
* [x] Doctor Last Name: `[**Doctor Last Name 1266**]`, `t_doctor_lastname`
* [x] Known Last Name: `[**Known lastname 658**]`, `t_lastname`
* [x] Hospital: `[**Hospital1 **]`, `t_hospital`
* [x] Hospital Unit Name: `[**Hospital Unit Name 10**]`, `t_hospital`
* [x] Company: `[**Company 12924**]`, `t_workplace`
* [x] University/College: `[**University/College **]`, `t_workplace`
* [x] Date of format YYYY-M-DD: `[**2112-4-18**]`, `t_fulldate`
* [x] Year: `[**Year (4 digits) **]`, `t_year`
* [x] Year YYYY format: `[**2119**]`, `t_year` - I use a regex `\b\d{4}\b` that will match **any** 4 digits which might be problematic, but for the most part 4 digits by itself seems to indicate a year.
* [x] Date of format M-DD: `[**6-12**]`, `[**12/2151**]`, `t_monthday`
* [x] Month/Day: `[**Month/Day (2) 509**]`, `t_monthday`
* [x] Month (only): `[**Month (only) 51**]`, `t_month`
* [x] Holiday: `[**Holiday 3470**]`, `t_holiday`
* [x] Date Range: `[**Date range (1) 7610**]`, `t_daterange`
* [x] Country: `[**Country 9958**]`, `t_country`
* [x] State: `[**State 3283**]`, `t_state`
* [x] Location: `[**Location (un) 2432**]`, `t_location`
* [x] Telephone/Fax: `[**Telephone/Fax (3) 8049**]`, `t_phone`
* [x] Clip Number: `[**Clip Number (Radiology) 29923**]`, `t_radclip_id`
* [x] Pager Numeric Identifier: `[**Numeric Identifier 6403**]`, `t_pager_id`
* [x] Pager Number: `[**Pager number 13866**]`, `t_pager_id`
* [x] Social Security Number: `[**Security Number 10198**]`, `t_ssn`
* [x] Serial Number: `[**Serial Number 3567**]`, `t_sn`
* [x] Medical Record Number: `[**Medical Record Number **]`, `t_mrn`
* [x] Provider Number: `[**Provider Number 12521**]`, `t_provider_no`
* [x] Age over 90: `[**Age over 90 **]`, `t_oldage`
* [x] Time: `12:52 PM` - the day is split into six 4-hour segments and each time is replaced with the token for its segment: `t_midnight`, `t_dawn`, `t_forenoon`, `t_afternoon`, `t_dusk`, `t_night`
* Just numbers: `[** 7901**]`
* Wardname
* Pharmacy MD Number

In [9]:
# Matches one redaction placeholder, e.g. "[**First Name 12**]"; group 1 is the
# text between the "[**" and "**]" markers (non-greedy, so one placeholder per match).
pat = re.compile(r'\[\*\*(.*?)\*\*\]', flags=re.IGNORECASE)

In [10]:
# Matches a redacted two-digit hour followed by an AM/PM marker, e.g.
# "[**12**] PM" or "[**07**] a.m."; group 1 is the two digits.
# The character class is [AP] (a "|" inside [] is a literal pipe) and only an
# optional "." is allowed after each letter, so abbreviations such as "PMH"
# are not mistaken for a meridiem.
tpat = re.compile(r'\[\*\*(\d{2})\*\*\] \b[AP]\.?M\.?\b', re.IGNORECASE)

In [7]:
%%time
# Scrub every raw note and store the result in a new 'scrubbed' column next to
# the original text. process_note is not defined in this notebook; presumably it
# comes from the `from process_notes import *` star import — confirm.
df['scrubbed'] = df['text'].apply(process_note)

CPU times: user 757 ms, sys: 0 ns, total: 757 ms
Wall time: 756 ms


In [None]:
# Print the row index and contents of every placeholder that is still present
# in the scrubbed text.
for i, row in df.iterrows():
    leftovers = pat.findall(row['scrubbed'])
    if leftovers:
        print(i, leftovers)

In [12]:
# Take the raw text of the note at positional row 1049 as a worked example.
test = df['text'].iloc[1049]
# for m in pat.finditer(test):
#     print(m)

print(test)

[**2107-1-22**] 11:31 AM
 UNILAT LOWER EXT VEINS LEFT PORT                                Clip # [**Clip Number (Radiology) 52556**]
 Reason: please evaluate for DVT focusing on Left lower extremity.
 Admitting Diagnosis: CRF/SDA
 ______________________________________________________________________________
 [**Hospital 4**] MEDICAL CONDITION:
  47 year old woman s/p renal transplant now with HUS/TTP now with edema LLE.
 REASON FOR THIS EXAMINATION:
  please evaluate for DVT focusing on Left lower extremity.
 ______________________________________________________________________________
                                 FINAL REPORT
 INDICATION:  47-year-old, post-renal transplant with left lower extremity
 edema.

 Grayscale and pulse color Doppler images of the left common femoral,
 superficial femoral, popliteal, and greater saphenous veins was performed.
 The examination was somewhat technically difficult due to patient's leg
 swelling.  There is however normal compressibility, col

In [14]:
# Scrub the sample note and print it for a side-by-side comparison with the raw
# text printed in the previous cell.
out = process_note(test)
# for m in pat.finditer(out):
#     print(m)

print(out)

t_fulldate t_forenoon
 UNILAT LOWER EXT VEINS LEFT PORT                                Clip # t_radclip_id
 Reason: please evaluate for DVT focusing on Left lower extremity.
 Admitting Diagnosis: CRF/SDA
 ______________________________________________________________________________
 t_hospital MEDICAL CONDITION:
  47 t_year_old woman s/p renal transplant now with HUS/TTP now with edema LLE.
 REASON FOR THIS EXAMINATION:
  please evaluate for DVT focusing on Left lower extremity.
 ______________________________________________________________________________
                                 FINAL REPORT
 INDICATION:  47-year-old, post-renal transplant with left lower extremity
 edema.

 Grayscale and pulse color Doppler images of the left common femoral,
 superficial femoral, popliteal, and greater saphenous veins was performed.
 The examination was somewhat technically difficult due to patient's leg
 swelling.  There is however normal compressibility, color flow, waveforms, and
 augme

In [None]:
# Print the row index and the redacted hour digits found in the raw text.
for i, row in df.iterrows():
    hours = tpat.findall(row['text'])
    if hours:
        print(i, hours)

In [None]:
# Print the row index and any redacted hour digits that are still present after
# scrubbing.
for i, row in df.iterrows():
    hours = tpat.findall(row['scrubbed'])
    if hours:
        print(i, hours)

In [None]:
def redacorator(func):
    """Adapt a ``func(text, ori)`` rule into a ``re.sub`` replacement callback.

    The returned callable takes a regex match and calls ``func`` with:

    * ``text`` - the whole matched placeholder, lower-cased, for keyword tests;
    * ``ori``  - the matched placeholder unchanged, or ``''`` when the
      placeholder holds nothing but brackets, asterisks and spaces
      (e.g. ``[** **]`` or ``[****]``), so that returning ``ori`` drops it.

    Args:
        func: callable ``(text, ori) -> str`` producing the replacement.

    Returns:
        A callable ``(match) -> str`` suitable as the ``repl`` of ``re.sub``.
    """
    from functools import wraps

    marker_chars = set(' *][')

    @wraps(func)
    def replace(match):
        ori = match.group()
        text = ori.lower()
        # Subset test (not equality): an empty placeholder need not contain a
        # space, so "[****]" must be treated the same as "[** **]".
        if set(ori) <= marker_chars:
            ori = ''
        return func(text, ori)
    return replace

@redacorator
def replace_name(text, ori):
    """Map a person-name placeholder to its token.

    Args:
        text: lower-cased placeholder, e.g. ``[**doctor last name 12**]``.
        ori: the placeholder as it appeared in the note (``''`` if empty).

    Returns:
        ``t_doctor_lastname`` / ``t_lastname`` / ``t_doctor_firstname`` /
        ``t_firstname`` / ``t_initials`` / ``t_name`` for name placeholders,
        otherwise ``ori`` unchanged.
    """
    # "Hospital Unit Name" style placeholders contain the word "name" but
    # describe a place; leave them untouched so the hospital rule can claim them.
    if 'name' not in text or 'hospital' in text:
        return ori

    prefix = 't_doctor_' if 'doctor' in text else 't_'
    # "last" is tested before "first", matching the precedence callers rely on.
    if 'last' in text:
        return prefix + 'lastname'
    if 'first' in text:
        return prefix + 'firstname'
    if 'initials' in text:
        return 't_initials'
    return 't_name'

@redacorator
def replace_place(text, ori):
    """Map a place-type placeholder to its token.

    Args:
        text: lower-cased placeholder.
        ori: the placeholder as it appeared in the note (``''`` if empty).

    Returns:
        The token of the first rule whose keyword occurs in ``text``, or
        ``ori`` when no rule applies.
    """
    # Checked top to bottom; the first rule with a matching keyword wins.
    rules = (
        (('hospital',), 't_hospital'),
        (('company', 'university/college'), 't_workplace'),
        (('location',), 't_location'),
        (('country',), 't_country'),
        (('state',), 't_state'),
        (('address', 'po box'), 't_address'),
    )
    for keywords, token in rules:
        if any(keyword in text for keyword in keywords):
            return token
    return ori

@redacorator
def replace_dates(text, ori):
    """Map a date-type placeholder to its token.

    Numeric patterns are matched against the *whole* placeholder content, so
    a stray ``/`` or a four-digit id inside an unrelated placeholder (for
    example ``[**Telephone/Fax (3) 8049**]`` or ``[**Holiday 3470**]``) is no
    longer mistaken for a date or a year.

    Args:
        text: lower-cased placeholder, e.g. ``[**2112-4-18**]``.
        ori: the placeholder as it appeared in the note (``''`` if empty).

    Returns:
        ``t_fulldate``, ``t_monthday``, ``t_year``, ``t_month``, ``t_holiday``
        or ``t_daterange`` for date placeholders, otherwise ``ori`` unchanged.
    """
    # Classify by the content only, without the "[**" / "**]" wrapper.
    inner = text.strip('[]*').strip()

    if re.fullmatch(r'\d{4}-\d{0,2}-\d{0,2}', inner):
        return 't_fulldate'

    # Purely numeric partial dates such as "6-12" or "12/2151": only digits and
    # separators, with at least one separator next to a digit.
    numeric_partial = (re.fullmatch(r'[\d/-]+', inner)
                       and re.search(r'\d[/-]|[/-]\d', inner))
    # Labelled partial dates such as "Month/Day (2) 509" or "Month/Year".
    labelled_partial = re.search(r'\b(?:month|day|year)/(?:month|day|year)\b', inner)
    if numeric_partial or labelled_partial:
        return 't_monthday'

    if 'year' in inner or re.fullmatch(r'\d{4}', inner):
        return 't_year'
    if 'month' in inner:
        return 't_month'
    if 'holiday' in inner:
        return 't_holiday'
    if 'date range' in inner:
        return 't_daterange'
    return ori

@redacorator
def replace_identifiers(text, ori):
    """Map an identifier-type placeholder to its token.

    Args:
        text: lower-cased placeholder.
        ori: the placeholder as it appeared in the note (``''`` if empty).

    Returns:
        The token of the first keyword found in ``text``, or ``ori`` when none
        of the keywords occurs.
    """
    # (keyword, token) pairs, tested in order; the first hit decides.
    keyword_tokens = (
        ('numeric identifier', 't_pager_id'),
        ('pager number', 't_pager_id'),
        ('(radiology)', 't_radclip_id'),
        ('social security number', 't_ssn'),
        ('medical record number', 't_mrn'),
        ('age over 90', 't_oldage'),
        ('serial number', 't_sn'),
        ('unit number', 't_unit_no'),
        ('md number', 't_md_no'),
        ('telephone/fax', 't_phone'),
        ('provider number', 't_provider_no'),
        ('contact info', 't_contact_info'),
    )
    return next((token for keyword, token in keyword_tokens if keyword in text), ori)

def replace_redacted(text):
    """Replace ``[**...**]`` redaction placeholders in a note with tokens.

    The name, place, date and identifier rules are applied as four successive
    passes over the text; a pass returns a placeholder unchanged when it does
    not recognise it, leaving it for the following passes. Finally a bare
    two-digit placeholder followed by an AM/PM marker becomes ``t_hour``.

    Args:
        text: the note text.

    Returns:
        The text with recognised placeholders replaced by tokens.
    """
    pat = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

    for replacer in (replace_name, replace_place, replace_dates, replace_identifiers):
        text = pat.sub(replacer, text)

    # flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.IGNORECASE would cap the number of
    # substitutions at 2 and leave the match case-sensitive.
    # [AP] with an optional "." after each letter accepts AM/PM/a.m./p.m. but
    # not abbreviations such as "PMH".
    text = re.sub(r'\[\*\*(\d{2})\*\*\] \b[AP]\.?M\.?\b', 't_hour', text,
                  flags=re.IGNORECASE)
    return text

def replace_time(match):
    """Turn a matched clock time such as ``12:52 PM`` into a part-of-day token.

    The day is divided into six 4-hour segments: ``t_midnight`` (00-03),
    ``t_dawn`` (04-07), ``t_forenoon`` (08-11), ``t_afternoon`` (12-15),
    ``t_dusk`` (16-19) and ``t_night`` (20-23).

    Args:
        match: regex match whose text has the form ``H:M <meridiem>``.

    Returns:
        The segment token, or the matched text unchanged when it cannot be
        parsed as a time.
    """
    segments = ('t_midnight', 't_dawn', 't_forenoon',
                't_afternoon', 't_dusk', 't_night')
    original = match.group()
    try:
        clock, meridiem = original.strip().lower().split()
        # Normalise "a.m", "am." etc. to the form strptime understands.
        meridiem = 'am' if meridiem[0] == 'a' else 'pm'
        hour, minute = clock.split(':')
        if hour == '' or int(hour) == 0:
            # ":30", "0:30" and "00:30" all denote the 12 o'clock hour on a
            # 12-hour clock; a missing minute is read as ":00" in this case.
            if minute == '':
                minute = '00'
            hour = '12'
        elif int(hour) > 12:
            # 24-hour style hour paired with a meridiem: fold it back to 1-11.
            hour = str(int(hour) % 12)
        parsed = datetime.datetime.strptime(f'{hour}:{minute} {meridiem}', '%I:%M %p')
    except ValueError:
        # Anything that is not a valid clock time is left as written.
        return original
    return segments[parsed.hour // 4]

def misc_scrub(text):
    """Normalise common clinical shorthand and clock times in a note.

    Applies, in order:

    1. "year old" variants (y.o., y/o, years old, year old, yearold)
       -> ``t_year_old``;
    2. ``yr`` / ``yrs`` / ``yr's`` -> ``years``;
    3. ``Pt`` / ``pt`` (with an optional trailing period) and
       ``IN PT`` / ``OUT PT`` / ``OT PT`` -> ``patient``;
    4. clock times such as ``12:52 PM`` -> a part-of-day token via
       ``replace_time``.

    Args:
        text: the note text.

    Returns:
        The scrubbed text.
    """
    # replace different types of "year old" with t_year_old
    # matches: y.o., y/o, years old, year old, yearold
    text = re.sub(r'\byears? ?old\b|\by(?:o|r)*[ ./-]*o(?:ld)?\b', 't_year_old',
                  text, flags=re.IGNORECASE)

    # replaces yr, yr's, yrs with years
    # flags must be a keyword argument: passed positionally, re.IGNORECASE is
    # taken as `count` (at most 2 replacements, case-sensitive).
    text = re.sub(r'\byr[\'s]*\b', 'years', text, flags=re.IGNORECASE)

    # replace Pt and pt with patient, and IN/OUT/OT PT with patient
    # Note: PT also refers to physical therapy and physical therapist, so a
    # stand-alone upper-case "PT" is deliberately left alone.
    # "\b" after "t" keeps words that merely start with "pt" (e.g. "ptosis")
    # intact, and only a literal period may be absorbed after the abbreviation,
    # so the following space or apostrophe is preserved.
    text = re.sub(r'\b[Pp]t\b\.?|\b(?:IN|OU?T) PT\b', 'patient', text)

    # clock time followed by AM/PM (any case, optional periods)
    text = re.sub(r'\d{0,2}:\d{0,2} \b[AP]\.?M\.?\b', replace_time, text,
                  flags=re.IGNORECASE)
    return text

def scrub_text(text):
    """Scrub a note: swap redaction placeholders for tokens, then run the
    miscellaneous shorthand/time clean-up on the result.

    Args:
        text: the note text.

    Returns:
        The fully scrubbed text.
    """
    without_placeholders = replace_redacted(text)
    return misc_scrub(without_placeholders)