# MIMIC Notes Pre-Processing

Pre-processing MIMIC notes for language model and CUIs

## Imports and Inits

In [1]:
import pandas as pd
import psycopg2
import numpy as np
import re
import random

from pathlib import Path

In [2]:
# Relative `data` directory as a pathlib.Path.
# NOTE(review): not referenced in the cells visible here — presumably used later; confirm.
PATH = Path('data')

## Functions

## Grab sample data from MIMIC

In [3]:
# Build one SQL query per note category listed in cats.csv.
# cats.csv must provide the columns `category` and `number_of_notes`.
cats = pd.read_csv('cats.csv')
# Cap on the notes sampled per category; a value <= 0 disables the cap.
max_limit = 1000

queries = []
for category, n_notes in zip(cats['category'], cats['number_of_notes']):
    limit = n_notes if max_limit <= 0 else min(max_limit, n_notes)
    if limit != max_limit:
        # The cap does not apply to this category: fetch every note.
        q = f"""
        select category, text from correctnotes where category=\'{category}\';
        """
    else:
        # The category reaches the cap: draw a random sample of `limit` notes.
        q = f"""
        select category, text from correctnotes where category=\'{category}\' order by random() limit {limit};
        """
    queries.append(q)

In [4]:
# limit = 50
# queries = [
#     f"""
#     select category, text from correctnotes where category=\'{cats.iloc[7]['category']}\' order by random() limit {limit}
#     """
# ]

In [5]:
%%time
# Run every per-category query against the local MIMIC database and stack
# the results into a single frame with the columns selected by the queries.
con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')
try:
    dfs = [pd.read_sql_query(q, con) for q in queries]
finally:
    # Close explicitly even when a query fails; psycopg2's `with con:` only
    # scopes a transaction and does not close the connection.
    con.close()

# Each per-query frame keeps its own 0-based index, so labels repeat in `df`.
df = pd.concat(dfs)
# df.set_index('row_id', inplace=True)
print(df.shape)

(12152, 2)
CPU times: user 121 ms, sys: 17.8 ms, total: 139 ms
Wall time: 45.8 s


1. Find the redacted spans (`[** ... **]`) using `re`.
2. Replace each one with the appropriate placeholder token.

Below is a list of redacted items with an example and the replacement token.

Redacted items:
* [x] First Name: `[**First Name (Titles) 137**]`, `t_firstname`
* [x] Last Name: `[**Last Name (Titles) **]`, `t_lastname`
* [x] Initials: `[**Initials (NamePattern4) **]`, `t_initials`
* [x] Name: `[**Name (NI) **]`, `t_name`
* [x] Doctor First Name: `[**Doctor First Name 1266**]`, `t_doctor_firstname`
* [x] Doctor Last Name: `[**Doctor Last Name 1266**]`, `t_doctor_lastname`
* [x] Known Last Name: `[**Known lastname 658**]`, `t_lastname`
* [x] Hospital: `[**Hospital1 **]`, `t_hospital`
* [x] Hospital Unit Name: `[**Hospital Unit Name 10**]`, `t_hospital`
* [x] Company: `[**Company 12924**]`, `t_workplace`
* [x] University/College: `[**University/College **]`, `t_workplace`
* [x] Date of format YYYY-M-DD: `[**2112-4-18**]`, `t_fulldate`
* [x] Year: `[**Year (4 digits) **]`, `t_year`
* [x] Date of format M-DD or M/YYYY: `[**6-12**]`, `[**12/2151**]`, `t_monthday`
* [x] Month/Day: `[**Month/Day (2) 509**]`, `t_monthday`
* [x] Month (only): `[**Month (only) 51**]`, `t_month`
* [x] Holiday: `[**Holiday 3470**]`, `t_holiday`
* [x] Date Range: `[**Date range (1) 7610**]`, `t_daterange`
* [x] Country: `[**Country 9958**]`, `t_country`
* [x] State: `[**State 3283**]`, `t_state`
* [x] Location: `[**Location (un) 2432**]`, `t_location`
* [x] Telephone/Fax: `[**Telephone/Fax (3) 8049**]`, `t_phone`
* [x] Clip Number: `[**Clip Number (Radiology) 29923**]`, `t_radclip_id`
* [x] Pager Numeric Identifier: `[**Numeric Identifier 6403**]`, `t_pager_id`
* [x] Pager Number: `[**Pager number 13866**]`, `t_pager_id`
* [x] Social Security Number: `[**Security Number 10198**]`, `t_ssn`
* [x] Serial Number: `[**Serial Number 3567**]`, `t_sn`
* [x] Medical Record Number: `[**Medical Record Number **]`, `t_mrn`
* [x] Age over 90: `[**Age over 90 **]`, `t_oldage`
* Just numbers: `[** 7901**]`
* Wardname
* Pharmacy MD Number
* Age over 90 (with a numeric id): `[**Age over 90 212**]`
* NULL

In [130]:
# Characters found in a redaction marker that has nothing inside it,
# e.g. `[** **]` or `[****]`.
_EMPTY_REDACTION_CHARS = frozenset(' *][')

# Numeric date shapes.  At least one digit is required on each side of the
# separator so that a bare '-' or '/' (as in "Telephone/Fax") is not a date.
_FULL_DATE_RE = re.compile(r'\d{4}-\d{1,2}-\d{1,2}')
_PARTIAL_DATE_RE = re.compile(r'\d{1,2}[-/]\d{1,2}')


def redacorator(func):
    """Turn a ``(text, ori)`` classifier into a ``re.sub`` replacement callback.

    The returned callback receives a regex match object and calls ``func``
    with:

    * ``text`` -- the full matched string, lower-cased, for keyword tests;
    * ``ori``  -- the full matched string unchanged, or ``''`` when the
      match consists only of the marker characters ``[``, ``]``, ``*`` and
      spaces (an empty redaction), so that such markers are dropped.

    ``func`` must return the replacement string; returning ``ori`` leaves
    the match untouched.
    """
    from functools import wraps

    @wraps(func)
    def replace(match):
        ori = match.group()
        text = ori.lower()
        # Subset test (not equality) so `[****]`, which has no space,
        # is also recognised as an empty redaction.
        if set(ori) <= _EMPTY_REDACTION_CHARS:
            ori = ''
        return func(text, ori)
    return replace


@redacorator
def replace_name(text, ori):
    """Map person-name redactions to name tokens.

    Returns one of ``t_doctor_lastname``, ``t_lastname``,
    ``t_doctor_firstname``, ``t_firstname``, ``t_initials`` or ``t_name``;
    returns ``ori`` when the redaction is not a person name.
    """
    # "Hospital Unit Name" contains 'name' but is a place; leave it for
    # replace_place so it becomes t_hospital.
    if 'name' not in text or 'hospital' in text:
        return ori
    if 'last' in text:
        return 't_doctor_lastname' if 'doctor' in text else 't_lastname'
    if 'first' in text:
        return 't_doctor_firstname' if 'doctor' in text else 't_firstname'
    if 'initials' in text:
        return 't_initials'
    return 't_name'


@redacorator
def replace_place(text, ori):
    """Map place redactions to place tokens.

    Returns ``t_hospital``, ``t_workplace``, ``t_location``, ``t_country``,
    ``t_state`` or ``t_address`` (first matching keyword wins, in that
    order); returns ``ori`` when no place keyword is present.
    """
    if 'hospital' in text:
        return 't_hospital'
    if ('company' in text) or ('university/college' in text):
        return 't_workplace'
    if 'location' in text:
        return 't_location'
    if 'country' in text:
        return 't_country'
    if 'state' in text:
        return 't_state'
    if ('address' in text) or ('po box' in text):
        return 't_address'
    return ori


@redacorator
def replace_dates(text, ori):
    """Map date redactions to date tokens.

    Returns ``t_year``, ``t_fulldate``, ``t_monthday``, ``t_month``,
    ``t_holiday`` or ``t_daterange`` (first matching rule wins, in that
    order); returns ``ori`` when the redaction is not a date.
    """
    if 'year' in text:
        return 't_year'
    if _FULL_DATE_RE.search(text):
        return 't_fulldate'
    # The numeric check must see real digits: a keyword such as
    # "telephone/fax" contains a '/' but is not a date.
    if _PARTIAL_DATE_RE.search(text) or ('month/day' in text):
        return 't_monthday'
    if 'month' in text:
        return 't_month'
    if 'holiday' in text:
        return 't_holiday'
    if 'date range' in text:
        return 't_daterange'
    return ori


@redacorator
def replace_identifiers(text, ori):
    """Map identifier redactions (phone, pager, SSN, MRN, ...) to tokens.

    Returns ``t_phone``, ``t_pager_id``, ``t_radclip_id``, ``t_ssn``,
    ``t_mrn``, ``t_oldage``, ``t_sn``, ``t_unit_no`` or ``t_md_no``;
    returns ``ori`` when no identifier keyword is present.
    """
    # Telephone/fax takes precedence over every other identifier keyword.
    if 'telephone/fax' in text:
        return 't_phone'
    if ('numeric identifier' in text) or ('pager number' in text):
        return 't_pager_id'
    if '(radiology)' in text:
        return 't_radclip_id'
    # Matches both "Social Security Number" and plain "Security Number".
    if 'security number' in text:
        return 't_ssn'
    if 'medical record number' in text:
        return 't_mrn'
    if 'age over 90' in text:
        return 't_oldage'
    if 'serial number' in text:
        return 't_sn'
    if 'unit number' in text:
        return 't_unit_no'
    if 'md number' in text:
        return 't_md_no'
    return ori

In [131]:
def replace_redacted(text):
    """Replace `[** ... **]` redaction markers in `text` with placeholder tokens.

    Four substitution passes run over the text in a fixed order: names,
    places, dates, then person identifiers.  A marker that a pass does not
    recognise is left in place for the following passes.

    Returns the text after all four passes.
    """
    redaction = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

    passes = (
        replace_name,         # name types
        replace_place,        # place types
        replace_dates,        # date types
        replace_identifiers,  # person identifier types
    )
    for substitute in passes:
        text = redaction.sub(substitute, text)

    return text

def misc_scrub(text):
    """Normalise a few common clinical abbreviations in `text`.

    * "y.o.", "y/o", "yo", "year old", "years old" ... -> ``year_old``
    * "yr", "yrs", "yr's" (any case)                   -> ``years``
    * "Pt", "pt", "Pt.", "IN PT", "OUT PT", "OT PT"    -> ``patient``

    Upper-case "PT" on its own is left alone because it also stands for
    physical therapy / physical therapist.

    Returns the scrubbed text.
    """
    # replace different types of "year old" with year_old
    # matches: y.o., y/o, years old. year old, yearold
    text = re.sub(r'\byears? ?old\b|\by(?:o|r)*[ ./-]*o(?:ld)?\b', 'year_old',
                  text, flags=re.IGNORECASE)

    # replaces yr, yr's, yrs with years
    # `flags=` must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, not `flags`.
    text = re.sub(r'\byr[\'s]*\b', 'years', text, flags=re.IGNORECASE)

    # replace Pt and pt (optionally followed by a period) with patient, and
    # IN/OUT/OT PT with patient.  The word boundary after "t" keeps words
    # that merely start with "pt" (e.g. "ptosis") intact.
    text = re.sub(r'\b(?:IN|OU?T) PT\b|\b[Pp]t\b\.?', 'patient', text)

    return text

In [132]:
def scrub_text(text):
    """Scrub one note: redaction markers first, then abbreviation clean-up.

    Runs `replace_redacted` on `text` and feeds the result to `misc_scrub`;
    returns the output of the latter.
    """
    tokenised = replace_redacted(text)
    return misc_scrub(tokenised)

In [9]:
# Matches one `[** ... **]` redaction marker; group 1 is the text between the
# asterisks (non-greedy, so adjacent markers are matched separately).
pat = re.compile(r'\[\*\*(.*?)\*\*\]', re.IGNORECASE)

In [117]:
# Take the note at position 500 and print every redaction match found in it.
test = df.iloc[500]['text']
for m in pat.finditer(test):
    print(m)

<_sre.SRE_Match object; span=(43, 50), match='[** **]'>
<_sre.SRE_Match object; span=(81, 100), match='[**Name (NI) 129**]'>
<_sre.SRE_Match object; span=(148, 164), match='[**Name (NI) **]'>
<_sre.SRE_Match object; span=(167, 184), match='[**Name2 (NI) **]'>
<_sre.SRE_Match object; span=(248, 274), match='[**Doctor Last Name 129**]'>
<_sre.SRE_Match object; span=(323, 345), match='[**Location (un) 23**]'>
<_sre.SRE_Match object; span=(348, 368), match='[**Location (un) **]'>
<_sre.SRE_Match object; span=(393, 413), match='[**Location (un) **]'>


In [129]:
# Scrub the sample note; the commented-out lines are earlier experiments.
# out = pat_all.sub(replace_name, test)
out = scrub_text(test)
# for m in pat.finditer(out):
#     print(m)

{' ', '*', ']', '['} [** **]


In [108]:
# Scratch values: two strings built from the same two characters ('*' and ' ').
x = '** **'
y = '* '

In [111]:
# Sets ignore repetition and order, so the two compare equal.
set(x) == set(y)

True

In [None]:
# Show the sample note before scrubbing.
print(test)

In [None]:
# Show the sample note after scrubbing.
print(out)

In [133]:
# Scrub every note; the result is stored next to the raw text in a new column.
df['scrubbed'] = df['text'].apply(scrub_text)

In [134]:
# Report every scrubbed note that still contains `[** ... **]` markers,
# printing the row label and the text captured inside each leftover marker.
for i, row in df.iterrows():
    leftover = pat.findall(row['scrubbed'])  # scan each note once
    if leftover:
        print(i, leftover)

22 ['2141']
27 ['2104']
56 ['2155']
85 ['2175']
117 ['2148']
140 ['2187']
207 ['2150']
295 ['2119']
350 ['2127']
366 ['2121']
459 ['2121', '2121']
491 ['2196']
558 ['2073']
590 [' 122']
643 ['2153']
680 ['2164']
691 ['60']
692 ['2179']
736 ['2181']
738 ['2181']
761 ['2144']
814 [' 2880']
865 ['2114']
882 ['2184']
888 ['2184']
891 ['2142']
893 ['2142']
905 ['2169']
910 ['2142']
919 ['2172']
10 ['2104', '2104', '2103', '2103', '2104']
11 ['2104', '2104', '2103', '2103', '2104']
12 ['2141', '2142']
14 ['2101', '2093', '2101', '2103']
15 ['57']
16 ['2179', '2179']
17 ['30']
18 ['2149', '57', '2129', '2145']
36 ['2122', '2122', '2123', '2123', '2122', '2122', '2130', '2120', '2122', '2130']
53 ['2106', '2096']
54 ['10']
55 ['08']
56 ['2076', '2137']
57 ['65']
58 ['2091', '2091', '2099', '2091', '2096', '2099']
63 ['01']
65 ['71']
66 ['16']
67 ['2190']
68 ['2190']
69 ['16']
70 ['01']
71 ['52']
72 ['2167', '2175', '85']
73 ['52']
75 ['2145']
76 ['2145']
80 ['2154', '2154']
81 ['2154', '2154']