# Dataset Setup for First ICU Prediction

## Imports & Inits

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import psycopg2
import math
import spacy
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split

In [3]:
from remove_redacted import process_notes

In [4]:
# Base directory for data files (data.csv is written here at the end of the notebook).
path = Path('./data')
# spaCy pipeline used by tokenize_text; the parser, NER and tagger components
# are disabled at load time.
nlp = spacy.load('en_core_sci_sm', disable=['parser', 'ner', 'tagger'])

## Functions

In [5]:
def test(a,b,cmp,cname=None):
    if cname is None: cname=cmp.__name__
    assert cmp(a,b),f"{cname}:\n{a}\n{b}"

def near(a,b):
    """Return whether `a` and `b` are element-wise close (rtol=1e-3, atol=1e-5)."""
    return np.allclose(a, b, atol=1e-5, rtol=1e-3)
def test_near(a,b):
    """Assert that `a` and `b` are close according to `near`."""
    test(a, b, cmp=near)

def fix_df(df):
    """Deduplicate the notes and merge their text fields into one `note` column.

    `category`, `description` and `text` are joined with newlines (in that
    order) into a new `note` column, and the three source columns are dropped.
    In the result, `note` comes after every other column, followed by
    `class_label` when that column is present.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        df: DataFrame containing the string columns `category`,
            `description` and `text`.

    Returns:
        A new DataFrame holding the de-duplicated rows (original index labels
        are kept) with the merged `note` column.

    Raises:
        KeyError: if any of `category`, `description` or `text` is missing.
    """
    deduped = df.drop_duplicates()
    note = (
        deduped['category']
        .str.cat(deduped['description'], sep='\n')
        .str.cat(deduped['text'], sep='\n')
    )
    merged = deduped.drop(['category', 'description', 'text'], axis=1).assign(note=note)

    # Order the trailing columns by name instead of overwriting the last two
    # positions: positional renaming loses or duplicates columns whenever
    # `class_label` is not the final column of the incoming frame.
    tail = ['note', 'class_label'] if 'class_label' in merged.columns else ['note']
    lead = [col for col in merged.columns if col not in tail]
    return merged[lead + tail]

def set_splits(df, val_pct, test_pct=0.0, sample_pct=0.0, seed=None):
    """Label every row of `df` as train/val/test and optionally flag a training sample.

    A `split` column holding 'train', 'val' or 'test' is added to `df` in
    place. When `sample_pct` is positive, a boolean `is_sample` column is also
    added that marks a random subset of the training rows.

    Args:
        df: DataFrame to annotate; it is modified in place.
        val_pct: Fraction of all rows assigned to the validation split.
        test_pct: Fraction of all rows assigned to the test split. May be 0.0.
        sample_pct: Fraction of the *training* rows to flag in `is_sample`.
            The column is only created when this is greater than 0.0.
        seed: Random state forwarded to every `train_test_split` call,
            including the sample draw, so that the result is reproducible.

    Returns:
        The same `df` object, with the new column(s).

    Raises:
        AssertionError: if a realised split fraction, rounded to two
            decimals, is not close to the requested one.
    """
    holdout_pct = val_pct + test_pct
    train_pct = 1 - holdout_pct
    row_idxs = np.arange(len(df))
    no_idxs = row_idxs[:0]

    # train_test_split raises ValueError for a float test_size of 0.0, so the
    # degenerate cases (no hold-out at all, or only one of val/test requested)
    # are handled without calling it.
    if holdout_pct > 0.0:
        train_idxs, inter = train_test_split(row_idxs, test_size=holdout_pct, random_state=seed)
    else:
        train_idxs, inter = row_idxs, no_idxs

    if val_pct > 0.0 and test_pct > 0.0:
        # Share of the held-out rows that goes to the test split.
        new_test_pct = np.around(test_pct / holdout_pct, 2)
        val_idxs, test_idxs = train_test_split(inter, test_size=new_test_pct, random_state=seed)
    elif test_pct > 0.0:
        val_idxs, test_idxs = no_idxs, inter
    else:
        val_idxs, test_idxs = inter, no_idxs

    df['split'] = None
    split_col = df.columns.get_loc('split')
    df.iloc[train_idxs, split_col] = 'train'
    df.iloc[val_idxs, split_col] = 'val'
    df.iloc[test_idxs, split_col] = 'test'

    if sample_pct > 0.0:
        df['is_sample'] = False
        # Pass the seed here as well; otherwise the sample differs on every run
        # even when a seed is supplied.
        _, sample_idxs = train_test_split(train_idxs, test_size=sample_pct, random_state=seed)
        df.iloc[sample_idxs, df.columns.get_loc('is_sample')] = True
        test_near(round(len(df[df['is_sample']])/len(df), 2), round(sample_pct * train_pct, 2))

    # Sanity-check the realised proportions against the requested ones.
    test_near(round(len(df[df['split'] == 'train'])/len(df), 2), train_pct)
    test_near(round(len(df[df['split'] == 'val'])/len(df), 2), val_pct)
    test_near(round(len(df[df['split'] == 'test'])/len(df), 2), test_pct)

    return df

In [6]:
def tokenize_text(text):
    """Tokenize `text` with the module-level `nlp` pipeline and return the
    token texts joined by single spaces."""
    return ' '.join(token.text for token in nlp(text))

## Grab Data from Database

Grab the data fields from the MIMIC postgres database. This query assumes that a view/table/materialized view called **co** is already set up. We discard the notes with *class_label* = -1, as these fall in a time period that is not used. We also discard notes that are shorter than 100 characters or longer than 8500 characters.

In [7]:
# Read the labelled notes from the local MIMIC postgres database into pandas.
# NOTE(review): the database user name is hard-coded; consider taking it from
# the environment so the notebook runs for other users.
con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')

query = """
select * from co where class_label != -1 and length(text) between 100 and 8500
"""
# Close the connection even when the query fails, so a failed run does not
# leave an open database session behind.
try:
    df = pd.read_sql_query(query, con)
finally:
    con.close()
df.head()

Unnamed: 0,hadm_id,subject_id,icustay_id,admission_age,gender,admittime,charttime,intime,wait_period,note_wait_time,chartinterval,category,description,text,class_label
0,100012,60039,239289,67.71,M,2177-03-12 11:48:00,2177-03-12 16:40:00,2177-03-14 10:52:23,1.96,1.76,1,Radiology,CHEST (PRE-OP PA & LAT),[**2177-3-12**] 4:40 PM\n CHEST (PRE-OP PA & L...,1
1,100036,30078,296976,82.87,F,2187-07-13 13:58:00,2187-07-16 08:34:00,2187-07-17 09:09:36,3.8,1.02,1,Radiology,CHEST (PA & LAT),[**2187-7-16**] 8:34 AM\n CHEST (PA & LAT) ...,1
2,100037,58947,221136,58.4,M,2183-03-23 18:21:00,2183-03-25 07:26:00,2183-04-20 13:16:43,27.79,26.24,15,Physician,Physician Resident Progress Note,Chief Complaint:\n 24 Hour Events:\n - spo...,0
3,100037,58947,221136,58.4,M,2183-03-23 18:21:00,2183-03-25 05:28:00,2183-04-20 13:16:43,27.79,26.33,15,Nursing,Nursing Progress Note,58 y/o M presented to [**Hospital1 **] [**Loca...,0
4,100037,58947,221136,58.4,M,2183-03-23 18:21:00,2183-03-24 06:26:00,2183-04-20 13:16:43,27.79,27.29,15,Physician,Physician Resident/Attending Progress Note - MICU,Chief Complaint:\n 24 Hour Events:\n -rece...,0


The helper functions defined in the **Functions** section are described below. The test functions (`test`, `near`, `test_near`) are copied from the fastai notebook found [here](https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev_course/dl2/01_matmul.ipynb).

`fix_df` drops duplicate records, concatenates `category`, `description`, and `text` columns into one column called `note` and drops the concatenated columns.

`set_splits` adds a column called `split` and, optionally, a column called `is_sample` to mark the training, validation, and testing subsets of the dataframe according to the percentages passed to the function. In addition, the optional `sample_pct` draws a sample from the training set for initial experimentation. Finally, the function also accepts a random seed for reproducibility. This function uses `sklearn`'s `train_test_split` to get the splits.

In [8]:
# Replace the raw frame with the de-duplicated, merged-note version.
# Re-running this cell on the already-processed frame fails, because the
# `category`, `description` and `text` columns no longer exist.
df = fix_df(df)
df.head()

Unnamed: 0,hadm_id,subject_id,icustay_id,admission_age,gender,admittime,charttime,intime,wait_period,note_wait_time,chartinterval,note,class_label
0,100012,60039,239289,67.71,M,2177-03-12 11:48:00,2177-03-12 16:40:00,2177-03-14 10:52:23,1.96,1.76,1,Radiology\nCHEST (PRE-OP PA & LAT)\n[**2177-3-...,1
1,100036,30078,296976,82.87,F,2187-07-13 13:58:00,2187-07-16 08:34:00,2187-07-17 09:09:36,3.8,1.02,1,Radiology\nCHEST (PA & LAT)\n[**2187-7-16**] 8...,1
2,100037,58947,221136,58.4,M,2183-03-23 18:21:00,2183-03-25 07:26:00,2183-04-20 13:16:43,27.79,26.24,15,Physician \nPhysician Resident Progress Note\n...,0
3,100037,58947,221136,58.4,M,2183-03-23 18:21:00,2183-03-25 05:28:00,2183-04-20 13:16:43,27.79,26.33,15,Nursing\nNursing Progress Note\n58 y/o M prese...,0
4,100037,58947,221136,58.4,M,2183-03-23 18:21:00,2183-03-24 06:26:00,2183-04-20 13:16:43,27.79,27.29,15,Physician \nPhysician Resident/Attending Progr...,0


In [9]:
# Report the dataset size and the number of notes per class label.
print(f"Total number of notes: {len(df)}")
print(f"Distribution of classes:\n {df.groupby('class_label').size()}")    

Total number of notes: 50820
Distribution of classes:
 class_label
0    38815
1    12005
dtype: int64


In [23]:
# Number of distinct hadm_id values among the notes with class_label == 1.
df.loc[df['class_label'] == 1, 'hadm_id'].nunique()

5522

Iterate over the class labels and split the notes of each class into train/val/test with the ratio 0.8/0.1/0.1, flagging a 10% sample of the training set. We set a random seed of 42. Splitting each class separately keeps the class proportions the same in every split.

## Preprocess

In [None]:
from dask import dataframe as ddf
# Wrap the pandas frame in a dask DataFrame with 8 partitions.
dd = ddf.from_pandas(df, npartitions=8)

In [None]:
# Declare three derived text columns on the dask frame; `meta` states that each
# result is an object-dtype series.
#   proc_note     = process_notes(note)  (imported from remove_redacted;
#                   presumably strips redacted placeholders, TODO confirm)
#   tok_note      = tokenize_text(note)
#   tok_proc_note = tokenize_text(proc_note)
dd['proc_note'] = dd['note'].apply(process_notes, meta=('note', 'object'))
dd['tok_note'] = dd['note'].apply(tokenize_text, meta=('note', 'object'))
dd['tok_proc_note'] = dd['proc_note'].apply(tokenize_text, meta=('note', 'object'))

In [None]:
# Run the dask computation and bind the result, including the derived note
# columns, back to `df`.
df = dd.compute()

## Set Splits

In [None]:
# Split each class separately so that train/val/test (and the training sample)
# keep the same class proportions as the full dataset.
class_labels = df['class_label'].unique()

# Each per-class frame gets a fresh 0..n-1 index; reset_index returns a new
# frame, so set_splits does not modify `df` itself.
classes = [
    set_splits(
        df[df['class_label'] == label].reset_index(drop=True),
        val_pct=0.1, test_pct=0.1, sample_pct=0.1, seed=42,
    )
    for label in class_labels
]

# ignore_index avoids duplicate index labels: every per-class frame starts at 0,
# so a plain concat would repeat the labels once per class.
df = pd.concat(classes, axis=0, ignore_index=True)
df.head()

In [None]:
# Summarise the split sizes and the number of rows flagged as the training sample.
# `is_sample` exists because set_splits was called with a positive sample_pct.
print(f"Distribution of splits:\n{df.groupby('split').size()}")
print(f"Number of samples: {len(df[df['is_sample']])}")

Write the dataset to disk and verify that it is written correctly, by loading it back again.

In [None]:
path = Path('./data')
# Create the output directory if needed; to_csv does not create missing
# parent directories and would fail on a fresh checkout.
path.mkdir(parents=True, exist_ok=True)
df.to_csv(path/'data.csv', index=False)

# Load the file back to check that it can be read.
df = pd.read_csv(path/'data.csv')
df.head()