In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import psycopg2
import random
import math
import pandas as pd
from pathlib import Path

In [3]:
def fix_age(df):
    """Give every row of a subject that subject's mean admission age.

    When a `subject_id` appears on several rows, each row's `admission_age`
    is replaced by the mean over all rows of that subject, rounded to two
    decimals.

    Args:
        df: DataFrame with at least `subject_id` and `admission_age` columns.

    Returns:
        A new DataFrame (the input is left untouched) with the same columns
        in the same order, the same row order, and a fresh 0..n-1 index.
    """
    # NOTE(review): the notebook output contains admission_age == 300.0;
    # presumably a de-identification placeholder for very old patients.
    # Such values are averaged like any other -- confirm this is intended.

    # transform('mean') broadcasts each group's mean back onto its own rows,
    # so no merge / suffix-rename round trip (and no risk of colliding with
    # existing `admission_age_x` / `admission_age_y` columns) is needed.
    mean_age = df.groupby('subject_id')['admission_age'].transform('mean').round(2)
    return df.assign(admission_age=mean_age).reset_index(drop=True)

def fix_df(df):
    """Clean the raw query result and collapse its text fields into one note.

    Steps:
      1. Replace `admission_age` with the per-subject mean (`fix_age`).
      2. Drop fully duplicated rows.
      3. Join `category`, `description` and `text` with newlines into a
         single `note` column and drop the three source columns. A row with
         a missing value in any of the three gets a missing `note`.
      4. Reorder the columns so that `note` and `class_label` come last,
         in that order.

    Args:
        df: DataFrame with `subject_id`, `admission_age`, `category`,
            `description`, `text` and `class_label` columns.

    Returns:
        A new DataFrame; the input is not modified.
    """
    deduped = fix_age(df).drop_duplicates()

    note = deduped['category'].str.cat(
        [deduped['description'], deduped['text']], sep='\n'
    )
    combined = deduped.drop(columns=['category', 'description', 'text']).assign(note=note)

    # Reorder by name rather than by overwriting the last two positions:
    # the positional version silently dropped a column (and duplicated
    # `class_label`) whenever `class_label` was not the last input column.
    trailing = ['note', 'class_label']
    leading = [col for col in combined.columns if col not in trailing]
    return combined.reindex(columns=leading + trailing)

def set_splits(df, val_pct, test_pct=None, seed=None):
    """Randomly label each row of `df` as 'train', 'val' or 'test'.

    Rows are assigned independently of one another and of any column values,
    so rows that share e.g. a `subject_id` may end up in different splits.

    Args:
        df: DataFrame to label. A `split` column is added (or overwritten)
            in place; any index is accepted because rows are addressed by
            position, not by label.
        val_pct: Fraction of rows labelled 'val'; the count is rounded up.
        test_pct: Optional fraction of rows labelled 'test' (rounded up).
            When falsy, no row is labelled 'test'.
        seed: Optional seed for a private random generator, giving a
            reproducible split. When None, the module-level `random` state
            is used, so a prior `random.seed(...)` still controls the result.

    Returns:
        The same `df` object, now carrying the `split` column.
    """
    n_rows = len(df)
    order = list(range(n_rows))
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    shuffle(order)

    n_val = math.ceil(n_rows * val_pct)
    n_test = math.ceil(n_rows * test_pct) if test_pct else 0

    # Build the labels by position and assign the whole column at once, so
    # the result does not depend on the frame having a 0..n-1 index.
    labels = ['train'] * n_rows
    for position in order[:n_val]:
        labels[position] = 'val'
    for position in order[n_val:n_val + n_test]:
        labels[position] = 'test'

    df['split'] = labels
    return df

In [4]:
# Pull the labelled notes (class_label <> -1) whose text is between 500 and
# 6000 characters long from the local `mimic` database.
query = """
select hadm_id, subject_id, icustay_id, admission_age, wait_period, category, description, text, class_label from data where class_label <> -1 and length(text) between 500 and 6000
"""

con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')
try:
    df = pd.read_sql_query(query, con)
finally:
    # Always release the connection, even when the query fails.
    con.close()

In [5]:
# Fixed seed so the shuffled split assignment is identical on every re-run.
SPLIT_SEED = 42
VAL_PCT = 0.1
TEST_PCT = 0.1
random.seed(SPLIT_SEED)

df = fix_df(df)

# Split each class separately so train/val/test keep roughly the same label
# ratio. Each subset gets a fresh 0..n-1 index before its rows are labelled.
pos = df[df['class_label'] == 1].copy().reset_index(drop=True)
neg = df[df['class_label'] == 0].copy().reset_index(drop=True)
pos = set_splits(pos, VAL_PCT, TEST_PCT)
neg = set_splits(neg, VAL_PCT, TEST_PCT)

# Both halves start at index 0; ignore_index avoids duplicate row labels.
df = pd.concat([pos, neg], axis=0, ignore_index=True)
df.head()

Unnamed: 0,hadm_id,subject_id,icustay_id,admission_age,wait_period,note,class_label,split
0,100012,60039,239289,67.71,47.07,Radiology\nCHEST (PRE-OP PA & LAT)\n[**2177-3-...,1,test
1,100036,30078,296976,82.87,91.19,Radiology\nCHEST (PA & LAT)\n[**2187-7-16**] 8...,1,train
2,100088,22872,238680,73.06,151.68,Radiology\nCT ABD W&W/O C\n[**2176-6-5**] 10:2...,1,train
3,100112,25418,281606,66.82,43.25,Radiology\nCHEST (PRE-OP PA & LAT)\n[**2139-2-...,1,train
4,100131,10150,282123,300.0,38.66,Radiology\nR FEMUR (AP & LAT) RIGHT\n[**2144-3...,1,train


In [6]:
path = Path('./data')
# to_csv does not create missing directories, so make sure the target exists.
path.mkdir(parents=True, exist_ok=True)
df.to_csv(path/'data.csv', index=False)

In [7]:
# Number of rows in the frame that was just written to disk.
df.shape[0]

14405

In [8]:
# Reload the persisted dataset from disk.
df = pd.read_csv(path.joinpath('data.csv'))

In [11]:
# Preview the first five rows of the reloaded frame.
df.head(n=5)

Unnamed: 0,hadm_id,subject_id,icustay_id,admission_age,wait_period,note,class_label,split
0,100012,60039,239289,67.71,47.07,Radiology\nCHEST (PRE-OP PA & LAT)\n[**2177-3-...,1,test
1,100036,30078,296976,82.87,91.19,Radiology\nCHEST (PA & LAT)\n[**2187-7-16**] 8...,1,train
2,100088,22872,238680,73.06,151.68,Radiology\nCT ABD W&W/O C\n[**2176-6-5**] 10:2...,1,train
3,100112,25418,281606,66.82,43.25,Radiology\nCHEST (PRE-OP PA & LAT)\n[**2139-2-...,1,train
4,100131,10150,282123,300.0,38.66,Radiology\nR FEMUR (AP & LAT) RIGHT\n[**2144-3...,1,train
