In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import psycopg2
import math
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split

In [3]:
con = psycopg2.connect(dbname='mimic', user='sudarshan', host='/var/run/postgresql')

query = """
select hadm_id, subject_id, icustay_id, admission_age, wait_period, chartinterval, category, description, text, class_label from notes where class_label != -1 and length(text) between 100 and 8500
"""
df = pd.read_sql_query(query, con)
con.close()
df.head()

Unnamed: 0,hadm_id,subject_id,icustay_id,admission_age,wait_period,chartinterval,category,description,text,class_label
0,100012,60039,239289,67.71,1.96,1,Radiology,CHEST (PRE-OP PA & LAT),[**2177-3-12**] 4:40 PM\n CHEST (PRE-OP PA & L...,1
1,100036,30078,296976,82.87,3.8,1,Radiology,CHEST (PA & LAT),[**2187-7-16**] 8:34 AM\n CHEST (PA & LAT) ...,1
2,100037,58947,221136,58.4,27.79,15,Nursing,Nursing Progress Note,Neutropenia\n Assessment:\n Action:\n Re...,0
3,100037,58947,221136,58.4,27.79,6,Radiology,CT HEAD W/O CONTRAST,"[**Last Name (LF) 5733**],[**First Name3 (LF) ...",0
4,100037,58947,221136,58.4,27.79,15,Physician,Physician Resident Admission Note,"Chief Complaint: Pancytopenia, ICH\n HPI:\n...",0


In [4]:
def fix_df(df):
    df.drop_duplicates(inplace=True)    
    df['note'] = df['category'].str.cat(df['description'], sep='\n')
    df['note'] = df['note'].str.cat(df['text'], sep='\n')
    df.drop(['category', 'description', 'text'], axis=1, inplace=True)
    cols = list(df.columns)
    cols[-1] = 'class_label'
    cols[-2] = 'note'
    df = df.reindex(cols, axis=1)
    return df

In [5]:
# def set_splits(df, val_pct, test_pct=None):
#     df['split'] = 'train'
#     df_len = len(df)
#     idxs = list(range(df_len))
#     random.shuffle(idxs)

#     val_idx = math.ceil(df_len * val_pct)
#     val_idxs = idxs[:val_idx]
#     df.loc[val_idxs, 'split'] = 'val'

#     if test_pct:
#         test_idx = val_idx + math.ceil(df_len * test_pct)
#         test_idxs = idxs[val_idx:test_idx]
#         df.loc[test_idxs, 'split'] = 'test'

#     return df

In [15]:
import pdb

In [86]:
def set_splits(df, val_pct, test_pct=0.0, sample_pct=0.0):    
    new_test_pct = np.around(test_pct / (val_pct + test_pct), 2)
#     new_val_pct = 1.0 - new_test_pct
#     print(new_test_pct, new_val_pct)
#     pdb.set_trace()
    

    train_pct = 1 - (val_pct + test_pct)
    train_idxs, inter = train_test_split(np.arange(len(df)), test_size=(val_pct + test_pct))
    val_idxs, test_idxs = train_test_split(inter, test_size=new_test_pct)
    
#     print(train_pct, val_pct, test_pct)
#     print(len(train_idxs), len(val_idxs), len(test_idxs))
#     print(round(len(train_idxs)/len(df), 2), round(len(val_idxs)/len(df), 2), round(len(test_idxs)/len(df), 2))
    
#     print(math.isclose(round(len(train_idxs)/len(df), 2), train_pct))
#     print(math.isclose(round(len(val_idxs)/len(df), 2), val_pct))
#     print(math.isclose(round(len(test_idxs)/len(df), 2), test_pct))

    df['split'] = None    
    df.iloc[train_idxs, df.columns.get_loc('split')] = 'train'
    df.iloc[val_idxs, df.columns.get_loc('split')] = 'val'
    df.iloc[test_idxs, df.columns.get_loc('split')] = 'test'
    
    if sample_pct > 0.0:
        df['is_sample'] = False
        _, sample_idxs = train_test_split(train_idxs, test_size=sample_pct)
#         print(sample_pct, round(sample_pct * train_pct, 2), len(sample_idxs))
#         print(math.isclose(round(len(sample_idxs)/len(df), 2), round(sample_pct * train_pct, 2)))
        df.iloc[sample_idxs, df.columns.get_loc('is_sample')] = True
#         print(round(len(df[df['is_sample']])/len(df), 2))
        print(math.isclose(round(len(df[df['is_sample']])/len(df), 2), round(sample_pct * train_pct, 2)))
#         print((round(len(df[df['is_sample']])/len(df), 2), round(sample_pct * train_pct, 2)))
    
    print(math.isclose(round(len(df[df['split'] == 'train'])/len(df), 2), train_pct))
    print(math.isclose(round(len(df[df['split'] == 'val'])/len(df), 2), val_pct))
    print(math.isclose(round(len(df[df['split'] == 'test'])/len(df), 2), test_pct))
    
#     assert(math.isclose(round(len(df[df['split'] == 'train'])/len(df), 2), train_pct))    
#     assert(math.isclose(round(len(df[df['split'] == 'val'])/len(df), 2), 0.1))
#     assert(math.isclose(round(len(df[df['split'] == 'test'])/len(df), 2), 0.1))

In [87]:
set_splits(df, 0.25,0.15, 0.1)

True
True
True
True


In [7]:
df = fix_df(df)
df.head()

Unnamed: 0,hadm_id,subject_id,icustay_id,admission_age,wait_period,chartinterval,note,class_label
0,100012,60039,239289,67.71,1.96,1,Radiology\nCHEST (PRE-OP PA & LAT)\n[**2177-3-...,1
1,100036,30078,296976,82.87,3.8,1,Radiology\nCHEST (PA & LAT)\n[**2187-7-16**] 8...,1
2,100037,58947,221136,58.4,27.79,15,Nursing\nNursing Progress Note\nNeutropenia\n ...,0
3,100037,58947,221136,58.4,27.79,6,Radiology\nCT HEAD W/O CONTRAST\n[**Last Name ...,0
4,100037,58947,221136,58.4,27.79,15,Physician \nPhysician Resident Admission Note\...,0


In [None]:
df['split'] = None
df['is_sample'] = False

In [8]:
train_idxs, test_idxs = train_test_split(np.arange(len(df)), test_size=0.0)

In [None]:
test_idxs, val_idxs = train_test_split(test_idxs, test_size=0.5)

In [None]:
len(train_idxs), len(val_idxs), len(test_idxs)

In [None]:
df.iloc[train_idxs, df.columns.get_loc('split')] = 'train'
df.iloc[val_idxs, df.columns.get_loc('split')] = 'val'
df.iloc[test_idxs, df.columns.get_loc('split')] = 'test'

In [None]:
_, samp_idxs = train_test_split(train_idxs, test_size=0.1)

In [None]:
df.iloc[samp_idxs, df.columns.get_loc('is_sample')] = True

In [None]:
round(len(df[df['is_sample']])/len(df), 2)

In [None]:
math.isclose(round(len(df[df['is_sample'] == True])/len(df), 2), 0.08)

In [None]:
len(df[(df['is_sample'] == True) & (df['split'] == 'train')]) == len(samp_idxs)

In [None]:
x = np.array([3, 5, 1])

In [None]:
df.reindex(train_idxs)['split'] = 'train'

In [None]:
x['split'] = 'train'

In [None]:
df[df['split'] == 'train']

In [None]:
df.loc[train_idxs]

In [None]:
df.loc[train_idxs,'split'] = 'train'

In [None]:
df.iloc[train_idxs]['split'] = 'train'

In [None]:
df.loc[0:10, 'split']

In [None]:
# len(train_idxs) + len(val_idxs) + len(test_idxs)
len(val_idxs) / len(df)

In [None]:
classes = [None] * df['class_label'].nunique()
for idx in range(len(classes)):
    classes[idx] = df[df['class_label'] == idx].copy()
    classes[idx].reset_index(inplace=True, drop=True)

for idx in range(len(classes)):
    classes[idx] = set_splits(classes[idx], 0.1, 0.1)

df = pd.concat(classes, axis=0)

In [None]:
df['is_sample'] = False

In [None]:
samples = df[df['split'] == 'train']

In [None]:
df_len = len(samples)
idxs = list(range(df_len))
random.shuffle(idxs)

In [None]:
samp_idx = math.ceil(df_len * 0.1)
samp_idxs = idxs[:samp_idx]
samples.loc[samp_idxs, 'is_sample'] = True

In [None]:
print(len(df[(df['class_label'] == 0) & (df['split'] == 'train')]), len(df[(df['class_label'] == 0) & (df['split'] == 'train')])/len(classes[0]))
print(len(df[(df['class_label'] == 1) & (df['split'] == 'train')]), len(df[(df['class_label'] == 1) & (df['split'] == 'train')])/len(classes[1]))

In [None]:
path = Path('./data')
df.to_csv(path/'data.csv', index=False)

In [None]:
df = pd.read_csv(path/'data.csv')
df.head()