# MIMIC Notes Pre-Processing

Pre-processing MIMIC notes for language model and CUIs

## Imports and Inits

In [None]:
import pandas as pd
import psycopg2
import numpy as np
import re
import random

from pathlib import Path

In [None]:
# Relative path to the 'data' directory (resolved against the current working directory).
PATH = Path('data')

## Functions

## Grab sample data from MIMIC

In [None]:
# One row per note category, with the number of notes available in that category.
cats = pd.read_csv('cats.csv')
# Maximum number of notes sampled per category; a value <= 0 disables the cap.
max_limit = 10

# Build one SELECT statement (a plain SQL string) per category.
queries = []
for category, n_notes in zip(cats['category'], cats['number_of_notes']):
    # Double any embedded single quote so the category value cannot terminate
    # the SQL string literal early and produce a malformed statement.
    category_sql = str(category).replace("'", "''")
    limit = min(max_limit, n_notes) if max_limit > 0 else n_notes
    if limit == max_limit:
        # The category has at least max_limit notes: draw a random sample of that size.
        q = f"""
        select category, text from correctnotes where category='{category_sql}' order by random() limit {limit};
        """
    else:
        # Fewer notes than the cap (or no cap at all): fetch every note.
        q = f"""
        select category, text from correctnotes where category='{category_sql}';
        """
    queries.append(q)

In [None]:
# limit = 50
# queries = [
#     f"""
#     select category, text from correctnotes where category=\'{cats.iloc[7]['category']}\' order by random() limit {limit}
#     """
# ]

In [None]:
%%time
dfs = []

con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')
for q in queries:
    df = pd.read_sql_query(q, con)
    dfs.append(df)
con.close()
    
df = pd.concat(dfs)
# df.set_index('row_id', inplace=True)
print(df.shape)

1. Get the list of redacted types using `re`
2. Replace each one with an appropriate placeholder token

Redacted items:
* ~~First Name~~
* ~~Last Name~~
* Hospital
* Date of format M-DD
* Date of format YYYY-M-DD
* ~~Initials~~
* ~~Name~~
* NULL
* ~~Known lastname~~
* ~~Doctor First Name~~
* Doctor Last Name
* Month (only)
* Just numbers
* Location
* Month/Day
* Telephone/Fax
* Wardname
* Pager Numeric Identifier
* Pharmacy MD Number

Redacted items:
* `[**First Name (Titles) 137**]`
* `[**Last Name (Titles) **]`
* `[**Initials (NamePattern4) **]`
* `[**Name (NI) **]`
* `[**Doctor First Name **]`
* `[**Doctor Last Name 1266**]`
* `[**Known lastname 658**]`
* Hospital
* Date of format M-DD
* Date of format YYYY-M-DD
* NULL
* Month (only)
* Just numbers
* Location
* Month/Day
* Telephone/Fax
* Wardname
* Pager Numeric Identifier
* Pharmacy MD Number

In [44]:
# Pick one note at random. randrange(len(df)) always yields a valid positional
# index, whereas a fixed inclusive upper bound raises IndexError on a smaller
# frame and never samples rows beyond that bound on a larger one.
test = df.iloc[random.randrange(len(df))]['text']
print(test)

Planned Discharge Date: [**2138-3-21**]
   Insurance Update
   Primary insurance / reviewer: Not [**Hospital 8**]
   Hospital days authorized to:
   Current Discharge Plan: Home with services
   A family has been held and it was determined Mr. [**Known lastname 658**] wishes to go
   home to [**Location (un) 784**] with hospice care. Mrs [**Known lastname 658**] did not have a
   preference on which agency was called to provide care for her husband.
   [**Name (NI) 7**](s) To Discharge:
   Family Meeting: Yes
   Referrals:
   1)       VNA of [**Location (un) 785**] East [**Telephone/Fax (1) 786**]  Fax: [**Telephone/Fax (1) 786**]
   Narrative / Plan:
   The VNA of [**Location (un) 785**] has nursing availablity on Sunday. The equipment:
   bed, mattress, wheelchair have been ordered and will be delievered on
   Saturday. I have relayed this information to the team. Once the
   equipment is delivered on Saturday Mr. [**Known lastname 658**] could go home and have
   the Hospice visit o

In [45]:
# Matches every de-identification placeholder of the form [** ... **];
# the capture group holds the text between the markers.
pat_all = re.compile(r'\[\*\*(.*?)\*\*\]', flags=re.IGNORECASE)

# List each placeholder found in the sample note together with its span.
for placeholder in pat_all.finditer(test):
    print(placeholder)

<_sre.SRE_Match object; span=(24, 39), match='[**2138-3-21**]'>
<_sre.SRE_Match object; span=(97, 113), match='[**Hospital 8**]'>
<_sre.SRE_Match object; span=(244, 268), match='[**Known lastname 658**]'>
<_sre.SRE_Match object; span=(293, 316), match='[**Location (un) 784**]'>
<_sre.SRE_Match object; span=(340, 364), match='[**Known lastname 658**]'>
<_sre.SRE_Match object; span=(457, 474), match='[**Name (NI) 7**]'>
<_sre.SRE_Match object; span=(548, 571), match='[**Location (un) 785**]'>
<_sre.SRE_Match object; span=(577, 604), match='[**Telephone/Fax (1) 786**]'>
<_sre.SRE_Match object; span=(611, 638), match='[**Telephone/Fax (1) 786**]'>
<_sre.SRE_Match object; span=(674, 697), match='[**Location (un) 785**]'>
<_sre.SRE_Match object; span=(930, 954), match='[**Known lastname 658**]'>


In [48]:
# Match only placeholders whose own label contains "name" (any case). The
# tempered prefix refuses to step over "[**" or "**]", so a match cannot begin
# at an unrelated placeholder and run on to a later occurrence of "name".
pat_name = re.compile(r'(\[\*\*(?:(?!\[\*\*|\*\*\]).)*?name.*?\*\*\])', re.IGNORECASE)

In [49]:
# Select the pattern under inspection, then list its matches in the sample note.
pat = pat_name
for hit in pat.finditer(test):
    print(hit)

<_sre.SRE_Match object; span=(244, 268), match='[**Known lastname 658**]'>
<_sre.SRE_Match object; span=(340, 364), match='[**Known lastname 658**]'>
<_sre.SRE_Match object; span=(457, 474), match='[**Name (NI) 7**]'>
<_sre.SRE_Match object; span=(930, 954), match='[**Known lastname 658**]'>


In [None]:
# NOTE(review): replace_redacted is defined in a later cell, so on a fresh kernel
# run top-to-bottom this line raises NameError; run the definition cells first.
out = replace_redacted(test)

In [None]:
# Show the sample note as returned by replace_redacted.
print(out)

In [None]:
def replace_name(match):
    """Return the replacement token for one redacted-name placeholder.

    Intended as the callback for ``re.sub``. The keyword checks are
    case-insensitive so they agree with the IGNORECASE patterns that feed
    this function.

    Parameters
    ----------
    match : re.Match
        Match whose full text is the placeholder, e.g. ``[**Doctor Last Name 12**]``.

    Returns
    -------
    str
        ``t_name_doc_last`` / ``t_name_last`` when the placeholder mentions "last",
        ``t_name_doc_first`` / ``t_name_first`` when it mentions "first"
        (the ``doc`` variants when it also mentions "doctor"),
        ``t_name_inits`` when it mentions "initials", otherwise ``t_name``.
    """
    # Lower-case once so every keyword test ignores case.
    tag = match.group().lower()
    # "last" is tested before "first", matching the original precedence.
    if 'last' in tag:
        return 't_name_doc_last' if 'doctor' in tag else 't_name_last'
    if 'first' in tag:
        return 't_name_doc_first' if 'doctor' in tag else 't_name_first'
    if 'initials' in tag:
        return 't_name_inits'
    return 't_name'

In [None]:
def replace_redacted(text):
    """Replace redacted-name placeholders in ``text`` with name tokens.

    A placeholder has the form ``[** label **]``. Only placeholders whose own
    label contains "name" (any case) are replaced; the token for each one is
    chosen by ``replace_name``. Everything else, including other kinds of
    placeholder, is returned unchanged.

    Parameters
    ----------
    text : str
        Note text to scrub.

    Returns
    -------
    str
        The text with every name placeholder substituted.
    """
    # The prefix before "Name" is tempered so it cannot step over "[**" or "**]".
    # Without that, a match could begin at an unrelated placeholder earlier on the
    # line and extend to a later name placeholder, deleting the text in between.
    # NOTE(review): labels such as "Wardname" also contain "name" and are therefore
    # replaced as well - confirm that is intended.
    pat_name = re.compile(
        r'\[\*\*((?:(?!\[\*\*|\*\*\]).)*?Name.*?)\*\*\]', re.IGNORECASE)
    return pat_name.sub(replace_name, text)

In [None]:
# Run replace_redacted on every note; the result is stored in a new 'scrubbed'
# column and the original 'text' column is left as it was.
df['scrubbed'] = df['text'].apply(replace_redacted)

In [None]:
# Detects name placeholders that survived scrubbing. The prefix before "Name" is
# tempered so it cannot step over "[**" or "**]": the replacement tokens
# (t_name, t_name_last, ...) themselves contain "name", and an untempered pattern
# would report a false leftover spanning from one untouched placeholder, across
# such a token, to the next placeholder on the same line.
pat_name = re.compile(r'\[\*\*((?:(?!\[\*\*|\*\*\]).)*?Name.*?)\*\*\]', re.IGNORECASE)

In [None]:
# True only when no scrubbed note still contains a match for pat_name.
np.all(
    df['scrubbed'].map(lambda note: len(pat_name.findall(note))).to_numpy() == 0
)

In [None]:
# Print, one line per note, how many pat_name matches remain after scrubbing.
for scrubbed_note in df['scrubbed']:
    remaining = pat_name.findall(scrubbed_note)
    print(len(remaining))

In [None]:
# out = pat_name.sub(replace_name, test)
# Scrub the single sample note through the same function applied to the whole frame.
out = replace_redacted(test)

In [None]:
# Anything printed by this loop is a pat_name match still present in the scrubbed sample.
for leftover in pat_name.finditer(out):
    print(leftover)

# Then show the scrubbed sample itself.
print(out)