In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os
import warnings
from tqdm import tqdm_notebook as tqdm

In [2]:
# Seed NumPy's global random number generator so any NumPy-based randomness
# executed after this cell is repeatable across kernel restarts.
np.random.seed(32)

In [3]:
# Input and output data directories, resolved relative to the notebook's
# working directory.
raw_path = Path('..', 'data', 'raw')
proc_path = Path('..', 'data', 'processed')

In [4]:
# Read the whole discharge-note dump into memory, then cut it into records on
# the literal 'ryan\n' marker.
# NOTE(review): presumably a custom record terminator was used because the
# note text itself contains newlines -- confirm against the export script.
with (raw_path/'raw_disch_full.csv').open('r') as src:
    raw_dump = src.read()
lines = raw_dump.split('ryan\n')

In [5]:
# Split every record into its '||'-delimited fields. Records with exactly
# four fields are kept as parsed rows; anything else is set aside unparsed in
# `weirds` for inspection.
rows, weirds = [], []
for record in lines:
    fields = record.split('||')
    if len(fields) != 4:
        weirds.append(record)
        continue
    rows.append(fields)

In [6]:
# The first parsed row supplies the column names; the remaining rows are data.
id_cols = ['SUBJECT_ID', 'HADM_ID']
text_df = pd.DataFrame(rows[1:], columns=rows[0])
text_df = text_df.drop(columns='CHARTTIME')  # they're all missing
text_df[id_cols] = text_df[id_cols].astype(int)

# Collapse to one row per (subject, admission): when several notes share the
# same pair, their texts are concatenated with a <NEWNOTE> separator.
text_df = (
    text_df
    .groupby(id_cols)['TEXT']
    .apply(lambda notes: '\n\n<NEWNOTE>\n\n'.join(notes))
    .reset_index()
)
text_df.sort_values('SUBJECT_ID').head(5)

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT
0,3,145834,Admission Date: [**2101-10-20**] Discharg...
1,4,185777,Admission Date: [**2191-3-16**] Discharge...
2,6,107064,Admission Date: [**2175-5-30**] Dischar...
3,9,150750,Admission Date: [**2149-11-9**] Dischar...
4,10,184167,Admission Date: [**2103-6-28**] Dischar...


In [7]:
# Load the label table from the raw data directory and preview its first rows.
df_all = pd.read_csv(raw_path.joinpath('despair_labels.csv'))
df_all.head(5)

Unnamed: 0,SUBJECT_ID,HADM_IDS,DIAGNOSIS,D_LABEL,READM_30,READM_90,READM_365,DEATH_30,DEATH_90,DEATH_365,DEATH_INHOSPITAL
0,2,[163353],"['V290', 'V3001', 'V053']",0,0,0,0,0,0,0,0
1,3,[145834],"['2639', '4254', '4275', '4280', '41071', '584...",0,0,0,0,0,0,1,0
2,4,[185777],"['5715', '7994', '042', '7907', '04111', 'V090...",1,0,0,0,0,0,0,0
3,5,[178980],"['V290', 'V3000', 'V053']",0,0,0,0,0,0,0,0
4,6,[107064],"['40391', '2766', '2753', '2767', '9972', '444...",0,0,0,0,0,0,0,0


In [8]:
# Per-row count of 90-day events (readmission flag + death flag).
tmp = df_all[['READM_90','DEATH_90']].sum(axis=1)
frac = len(tmp[tmp>0])/len(tmp) # fraction of ids with at least one readmit/death event
# Scale to a percentage *before* rounding. Rounding the fraction first and
# multiplying by 100 afterwards can reintroduce binary floating-point noise
# (long trailing digits) in the printed value.
print('% IDs with 90-day readmit or death: ',round(frac*100,2),'%')

% IDs with 90-day readmit or death:  24.81 %


In [41]:
# NOTE(review): this silences *every* warning for the rest of the kernel
# session. It is kept so that later cells behave exactly as before, but the
# explicit .copy() below removes the chained-assignment warning that adding
# columns to a column-subset of df_all would typically raise, so this filter
# is a candidate for removal.
warnings.filterwarnings('ignore')

# subset to just 90-day cases and create empty text column
# .copy() makes df an independent frame, so the column assignments below never
# touch (or appear to touch) df_all.
df = df_all[['SUBJECT_ID','HADM_IDS','READM_90','DEATH_90']].copy()
# Combined label: the row sum of the two flags, with a sum of 2 (both events)
# collapsed to 1. Assumes both flags are 0/1 -- TODO confirm.
df['labels'] = df[['READM_90','DEATH_90']].sum(axis=1).replace(2,1)
df = df.drop(['READM_90','DEATH_90'],axis=1)
df.columns = [s.lower() for s in df.columns]
# hadm_ids arrives as a bracketed list rendered as text (e.g. "[163353]").
# Drop the brackets and emit the ids separated by exactly one space. Each
# token is stripped explicitly, so the result no longer depends on the source
# text using ", " (comma + space) between ids.
df['hadm_ids'] = df['hadm_ids'].apply(lambda x: ' '.join(n.strip() for n in x[1:-1].split(',')))
df['text'] = ''
df.head()

Unnamed: 0,subject_id,hadm_ids,labels,text
0,2,163353,0,
1,3,145834,0,
2,4,185777,0,
3,5,178980,0,
4,6,107064,0,


In [42]:
# Print a summary of df: index range, column dtypes, non-null counts and
# memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58976 entries, 0 to 58975
Data columns (total 4 columns):
subject_id    58976 non-null int64
hadm_ids      58976 non-null object
labels        58976 non-null int64
text          58976 non-null object
dtypes: int64(2), object(2)
memory usage: 1.8+ MB


In [43]:
# Build a HADM_ID -> note-text lookup once, instead of scanning text_df with a
# boolean mask for every single admission id (which made this cell quadratic).
# Ids that occur on more than one text_df row are left out (keep=False); that
# matches the previous Series.item() behaviour, which skipped any id that did
# not resolve to exactly one note.
unique_notes = text_df.drop_duplicates('HADM_ID', keep=False)
note_by_hadm = dict(zip(unique_notes.HADM_ID, unique_notes.TEXT))

all_text_list = []
for hadm_ids in tqdm(df.hadm_ids):
    text_list = []
    # split() with no argument copes with empty strings and repeated spaces.
    for hadm_id in hadm_ids.split():
        try:
            key = int(hadm_id)
        except ValueError:
            # Token is not a valid integer id: skip it, as before.
            continue
        # Admissions without a discharge note contribute no text.
        if key in note_by_hadm:
            text_list.append(note_by_hadm[key])
    # All notes found for this row, joined with the <NEWNOTE> separator;
    # an empty string when none of its admissions has a note.
    all_text = '\n\n<NEWNOTE>\n\n'.join(text_list)
    all_text_list.append(all_text)
# Item assignment is the unambiguous way to set a DataFrame column.
df['text'] = all_text_list

HBox(children=(IntProgress(value=0, max=58976), HTML(value='')))




In [51]:
# Persist the assembled frame to the processed-data directory without the
# index column. The file name is kept exactly as-is, since other code may
# read it by this name.
df.to_csv(proc_path.joinpath('90_day_data_preprossed.csv'), index=False)