In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
admissions = pd.read_csv("ADMISSIONS.csv.gz")
lab_events = pd.read_csv("LABEVENTS.csv.gz")
diagnoses = pd.read_csv("DIAGNOSES_ICD.csv.gz")
diag_dict = pd.read_csv("D_ICD_DIAGNOSES.csv.gz")

In [4]:
admissions.head(3)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1


In [6]:
print("ORIGINAL SIZE:", len(lab_events))
lab_events = lab_events[lab_events["VALUENUM"].notna()][["SUBJECT_ID", "ITEMID", "VALUENUM"]]
lab_events = lab_events.groupby(["SUBJECT_ID", "ITEMID"]).mean().reset_index()
print(len(lab_events))
lab_events.head(3)

ORIGINAL SIZE: 24932835
2597712


Unnamed: 0,SUBJECT_ID,ITEMID,VALUENUM
0,2,50883,0.3
1,2,50884,9.0
2,2,50885,9.3


In [7]:
diagnoses.head(3)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281


In [8]:
diag_dict.head()

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."


In [9]:
avg_dx_per_patient = diagnoses.groupby("SUBJECT_ID").size().mean()
print("Average number of diagnoses per patient:", avg_dx_per_patient)

avg_events_per_patient = lab_events.groupby("SUBJECT_ID").size().mean()
print("Average number of lab events per patient:", avg_events_per_patient)

Average number of diagnoses per patient: 13.994991401547722
Average number of lab events per patient: 56.16674594594595


In [10]:
sepsis_icd9 = diag_dict[diag_dict["SHORT_TITLE"].str.contains("sepsis", flags=re.IGNORECASE, regex=True)==True][["ICD9_CODE", "SHORT_TITLE"]]
sepsis_icd9

Unnamed: 0,ICD9_CODE,SHORT_TITLE
9049,77181,NB septicemia [sepsis]
10304,99591,Sepsis
10305,99592,Severe sepsis
13293,67020,Puerperal sepsis-unsp
13294,67022,Puerprl sepsis-del w p/p
13295,67024,Puerperl sepsis-postpart


In [11]:
sepsis_pos = diagnoses[diagnoses["ICD9_CODE"].isin(sepsis_icd9["ICD9_CODE"])]
sepsis_subjects = sepsis_pos["SUBJECT_ID"].unique()
sepsis_subjects

array([  117,   124,    64, ..., 95803, 97143, 97158], dtype=int64)

In [12]:
# lab_events["ITEMID"].unique().shape
features = np.sort(lab_events["ITEMID"].unique()).tolist()
n_features = len(features)
print("Number of features:", n_features)
print("Features:")
print(features)

Number of features: 494
Features:
[50801, 50802, 50803, 50804, 50805, 50806, 50808, 50809, 50810, 50811, 50813, 50814, 50815, 50816, 50817, 50818, 50819, 50820, 50821, 50822, 50823, 50824, 50825, 50826, 50830, 50831, 50832, 50833, 50834, 50835, 50836, 50837, 50838, 50839, 50840, 50841, 50842, 50843, 50844, 50846, 50847, 50848, 50849, 50850, 50851, 50852, 50853, 50854, 50855, 50856, 50857, 50858, 50859, 50860, 50861, 50862, 50863, 50864, 50865, 50866, 50867, 50868, 50869, 50870, 50877, 50878, 50881, 50882, 50883, 50884, 50885, 50889, 50890, 50891, 50892, 50893, 50894, 50895, 50896, 50898, 50899, 50900, 50902, 50903, 50904, 50905, 50906, 50907, 50908, 50909, 50910, 50911, 50912, 50914, 50915, 50916, 50917, 50921, 50922, 50924, 50925, 50926, 50927, 50928, 50929, 50930, 50931, 50934, 50935, 50936, 50945, 50946, 50947, 50949, 50950, 50951, 50952, 50953, 50954, 50956, 50957, 50958, 50960, 50961, 50962, 50963, 50964, 50965, 50966, 50967, 50968, 50969, 50970, 50971, 50972, 50973, 50974, 50976,

In [13]:
#We want each patient to have n_features + 1 amount of features. The last feature is a binary indicator that specifies
#if the patient was diagnosed with sepsis
feature_dict = {}
subjects = admissions["SUBJECT_ID"].unique()
print(len(subjects))
for subject in subjects:
    feature_dict[subject] = [0]*(n_features+1)

46520


In [15]:
for subject in subjects:
    if subject == 9549:
        print("ONE FOURTH DONE")
    if subject == 21332:
        print("HALFWAY DONE")
    for f, f_val in lab_events[lab_events["SUBJECT_ID"]==subject][["ITEMID", "VALUENUM"]].itertuples(index=False):
        feature_dict[subject][features.index(f)] = f_val

ONE FOURTH DONE
HALFWAY DONE


In [20]:
print(feature_dict[21332])

[0, -1.0, 0, 23.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20.0, 0, 36.0, 0, 7.414999999999999, 45.75, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 261.0, 0, 0, 0, 0, 19.166666666666668, 0, 0, 0, 0, 0, 17.333333333333332, 0.33333333333333326, 6.888888888888889, 7.222222222222222, 0, 0, 0, 0, 10.5, 0, 0, 0, 0, 0, 0, 114.66666666666667, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7.0, 5.366666666666666, 0, 0, 0, 0, 0, 0, 0, 0, 0, 145.83333333333334, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [112]:
feature_df = pd.DataFrame.from_dict(data=feature_dict, orient="index").reset_index(level=0).rename(columns={"index": "SUBJECT_ID"})
feature_df.head(3)

Unnamed: 0,SUBJECT_ID,0,1,2,3,4,5,6,7,8,...,485,486,487,488,489,490,491,492,493,494
0,22,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,23,0.0,-1.133333,0.0,24.6,0.0,102.0,1.131429,116.666667,32.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,24,0.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,44.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [139]:
for sepsis_subject in sepsis_subjects:
    if sepsis_subject in feature_df["SUBJECT_ID"]:
        feature_df.loc[feature_df["SUBJECT_ID"]==sepsis_subject, 494] = 1
    
feature_df.to_csv("feature_matrix.csv", header=True, index=False)

In [127]:
feature_df[494].sum()

2937

In [128]:
feature_df.head(3)

Unnamed: 0,SUBJECT_ID,0,1,2,3,4,5,6,7,8,...,485,486,487,488,489,490,491,492,493,494
0,22,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,23,0.0,-1.133333,0.0,24.6,0.0,102.0,1.131429,116.666667,32.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,24,0.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,44.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [130]:
feature_df[feature_df["SUBJECT_ID"]==124]

Unnamed: 0,SUBJECT_ID,0,1,2,3,4,5,6,7,8,...,485,486,487,488,489,490,491,492,493,494
98,124,0.0,-6.428571,19.0,19.614286,0.0,111.6,1.122857,119.8,26.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [105]:
sepsis_subjects

array([  117,   124,    64, ..., 95803, 97143, 97158], dtype=int64)

In [131]:
test=[]
for s in sepsis_subjects:
    if s not in feature_df["SUBJECT_ID"]:
        test.append(s)

In [138]:
feature_df.shape

(46520, 496)