In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules before
# each cell executes so edits to imported source files take effect without a restart.
# NOTE(review): no project-local module is imported in the cells visible here —
# confirm this is still needed.
%load_ext autoreload
%autoreload 2

In [2]:
import psycopg2
import random
import math
import pandas as pd
from pathlib import Path

In [5]:
# Load the labelled notes from the local `mimic` Postgres database into `df`.
# The query keeps rows whose class_label is not -1 and whose text length is
# between 100 and 8500 (inclusive, per SQL BETWEEN).
# NOTE(review): the database user and socket directory are hard-coded; consider
# moving them to a config cell or environment variables.
con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')

query = """
select hadm_id, subject_id, icustay_id, admission_age, wait_period, category, description, text, class_label from notes where class_label != -1 and length(text) between 100 and 8500
"""
try:
    df = pd.read_sql_query(query, con)
finally:
    # Close even when the query raises, so a failed run does not leave the
    # connection open. (psycopg2's `with con:` only ends the transaction; it does
    # not close the connection, hence the explicit try/finally.)
    con.close()

In [6]:
# Sanity check: (rows, columns) of the query result; the query selects 9 columns.
df.shape

(51739, 9)

In [10]:
# Preview the first five raw rows before any cleaning is applied.
# NOTE(review): the rendered output contains clinical note text — check that it
# is acceptable to share before committing the notebook with outputs.
df.head(5)

Unnamed: 0,hadm_id,subject_id,icustay_id,admission_age,wait_period,category,description,text,class_label
0,100012,60039,239289,67.71,1.96,Radiology,CHEST (PRE-OP PA & LAT),[**2177-3-12**] 4:40 PM\n CHEST (PRE-OP PA & L...,1
1,100036,30078,296976,82.87,3.8,Radiology,CHEST (PA & LAT),[**2187-7-16**] 8:34 AM\n CHEST (PA & LAT) ...,1
2,100037,58947,221136,58.4,27.79,Nursing,Nursing Progress Note,Neutropenia\n Assessment:\n Action:\n Re...,0
3,100037,58947,221136,58.4,27.79,Radiology,CT HEAD W/O CONTRAST,"[**Last Name (LF) 5733**],[**First Name3 (LF) ...",0
4,100037,58947,221136,58.4,27.79,Physician,Physician Resident Admission Note,"Chief Complaint: Pancytopenia, ICH\n HPI:\n...",0


In [None]:
# (Cell intentionally left without code.)
# This cell used to hold an exact copy of the preprocessing cell that follows the
# helper definitions (fix_age / fix_df / set_splits) further down. In this position
# it ran before those helpers were defined, raising NameError on a fresh kernel,
# and because it overwrote `df`, the later copy would then fail with a KeyError on
# the already-dropped 'category' column. Preprocessing now happens only in the
# cell after the helper definitions.

In [None]:
def fix_age(df):
    """Replace each row's ``admission_age`` with its subject's mean age.

    The mean is taken over all rows sharing a ``subject_id`` and rounded to two
    decimals, so every row of a given subject ends up with the same age.

    Args:
        df: DataFrame with at least ``subject_id`` and ``admission_age`` columns.

    Returns:
        A new DataFrame with the original column order and a fresh 0..n-1 index
        (a side effect of the merge). The input frame is not modified.
    """
    column_order = df.columns
    mean_age_by_subject = (
        df.groupby(['subject_id'])['admission_age']
        .mean()
        .round(2)
    )
    # Drop the per-row age first so the merge brings in the per-subject value
    # under the original column name, with no _x/_y suffixes to clean up.
    return (
        df.drop(columns=['admission_age'])
        .merge(mean_age_by_subject, on='subject_id', how='left')
        .reindex(columns=column_order)
    )

def fix_df(df):
    """Clean the raw notes frame and collapse its text columns into one.

    Steps, in order:
      1. Replace ``admission_age`` with the per-subject mean via ``fix_age``.
      2. Drop rows that are exact duplicates.
      3. Build ``note`` as ``category``, ``description`` and ``text`` joined by
         newlines, then drop those three columns.
      4. Move ``note`` and ``class_label`` to the end, in that order.

    Args:
        df: DataFrame containing ``subject_id``, ``admission_age``, ``category``,
            ``description``, ``text`` and ``class_label`` columns.

    Returns:
        A new DataFrame; the input is not modified (``fix_age`` already returns
        a fresh frame).
    """
    deduped = fix_age(df).drop_duplicates()

    # NOTE(review): str.cat yields NaN for a row if any of the three parts is
    # NaN, so such a row ends up with a missing note — confirm that is intended.
    note = (
        deduped['category']
        .str.cat(deduped['description'], sep='\n')
        .str.cat(deduped['text'], sep='\n')
    )
    cleaned = (
        deduped
        .drop(columns=['category', 'description', 'text'])
        .assign(note=note)
    )

    # Reorder by name rather than overwriting the last two labels by position:
    # the positional version silently duplicated/dropped columns whenever
    # class_label was not already second-to-last in the incoming frame.
    trailing = ['note', 'class_label']
    leading = [col for col in cleaned.columns if col not in trailing]
    return cleaned.reindex(columns=leading + trailing)

def set_splits(df, val_pct, test_pct=None, seed=None):
    """Tag every row of ``df`` as 'train', 'val' or 'test' in a ``split`` column.

    Rows are picked at random: ``ceil(len(df) * val_pct)`` rows become 'val',
    then (if ``test_pct`` is truthy) ``ceil(len(df) * test_pct)`` further rows
    become 'test'; all remaining rows are 'train'.

    Rows are addressed by position, so the frame may carry any index. (The
    previous implementation used the shuffled positions as ``.loc`` labels and
    therefore only worked on a 0..n-1 index.)

    NOTE(review): assignment is per row and ignores every column, so rows
    sharing a subject_id / hadm_id can land in different splits — confirm this
    is acceptable for the downstream evaluation.

    Args:
        df: DataFrame to tag. It is modified in place (the ``split`` column is
            added or overwritten) and also returned.
        val_pct: Fraction of rows for the validation split.
        test_pct: Optional fraction of rows for the test split.
        seed: Optional seed for a private RNG, for reproducible splits. When
            None, the module-level ``random`` state is used, exactly as before.

    Returns:
        The same DataFrame object, with the ``split`` column set.
    """
    n_rows = len(df)
    positions = list(range(n_rows))
    if seed is None:
        random.shuffle(positions)
    else:
        random.Random(seed).shuffle(positions)

    labels = ['train'] * n_rows

    n_val = math.ceil(n_rows * val_pct)
    for pos in positions[:n_val]:
        labels[pos] = 'val'

    if test_pct:
        n_test = math.ceil(n_rows * test_pct)
        for pos in positions[n_val:n_val + n_test]:
            labels[pos] = 'test'

    # Assigning a plain list is positional, independent of the frame's index.
    df['split'] = labels
    return df

In [None]:
# Clean the raw notes, then assign train/val/test splits separately within each
# class so both classes get the same validation/test fractions.
#
# NOTE: fix_df drops 'category', 'description' and 'text', and this cell rebinds
# `df`, so re-run the load cell before re-running this one.
SEED = 42
VAL_PCT = 0.1
TEST_PCT = 0.1

# set_splits shuffles with the stdlib `random` module; seed it so the split
# assignment is the same on every Restart & Run All.
random.seed(SEED)

df = fix_df(df)
pos = df[df['class_label'] == 1].reset_index(drop=True)
neg = df[df['class_label'] == 0].reset_index(drop=True)

pos = set_splits(pos, VAL_PCT, TEST_PCT)
neg = set_splits(neg, VAL_PCT, TEST_PCT)

# ignore_index avoids duplicate index labels (pos and neg are both 0-based).
df = pd.concat([pos, neg], axis=0, ignore_index=True)
df.head()

In [None]:
# Persist the cleaned, split-tagged frame as ./data/data.csv (row index omitted).
path = Path('./data')
# to_csv does not create missing parent directories, so make sure the target exists.
path.mkdir(parents=True, exist_ok=True)
df.to_csv(path/'data.csv', index=False)

In [None]:
# Number of rows in the frame that was written to data.csv.
len(df)

In [None]:
# Read the saved CSV back into `df` (replacing the in-memory frame).
# Relies on `path` having been defined by the cell that wrote the file.
df = pd.read_csv(path/'data.csv')

In [None]:
# Preview the first rows of the frame reloaded from data.csv.
df.head()