In [2]:
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [54]:
thresholds = dict()
thresholds['procedures_icd9'] = 10
thresholds['medication_ndc'] = 50
thresholds['lab_org'] = 0
thresholds['lab_ab'] = 0
thresholds['lab_chart'] = 100
thresholds['lab_chart_final'] = 0

In [55]:
def get_reindexed(df, column, threshold):
    mapping = dict()
    counts = dict()
    for i in range(len(df)):
        if(df[column][i] not in counts):
            counts[df[column][i]] = 0
        else:
            counts[df[column][i]] +=1
    for i in range(len(df)):
        if((df[column][i] not in mapping) and (counts[df[column][i]]>=threshold)):
            mapping[df[column][i]] = len(mapping)
    counts = dict(sorted(counts.items(), key=lambda item: item[1]))
    print("CURRENT LENGTH IS:", len(mapping))
    print("TOTAL LENGTH WOULD HAVE BEEN:", len(counts))
    return mapping, counts

## PATIENTS

In [11]:
patient_df = pd.read_csv("Raw/patient.csv")
patient_df

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,GENDER,EXPIRE_FLAG,HADM_ID,ICUSTAY_ID,LOS,AGE,cohort,Obesity,...,Advanced.Lung.Disease,Schizophrenia.and.other.Psychiatric.Disorders,Alcohol.Abuse,Other.Substance.Abuse,Chronic.Pain.Fibromyalgia,Chronic.Neurological.Dystrophies,Advanced.Cancer,Depression,Dementia,Unsure
0,0,9973,M,1,100020,282580,1.1011,58,1,0,...,0,0,0,0,1,1,0,0,0,0
1,1,3365,F,0,100103,200434,3.2836,72,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2,27290,M,1,100137,212691,3.7297,82,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,9882,M,0,100177,251800,6.5389,55,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,5525,M,1,100473,257484,5.7583,65,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526,1526,23549,F,1,199807,212087,35.7563,74,1,0,...,1,0,0,0,0,0,0,0,0,0
1527,1527,8772,M,1,199828,215516,4.9366,69,0,0,...,1,0,1,0,0,1,0,0,1,0
1528,1528,7029,M,0,199883,247475,1.2379,26,1,0,...,0,0,1,0,0,0,0,0,0,0
1529,1529,1931,M,1,199884,265365,2.6764,72,1,0,...,0,0,0,0,0,0,0,1,0,0


In [57]:
gender_mapping = {'M': 0, 'F': 1}
patient_df['GENDER'] = patient_df['GENDER'].map(gender_mapping)
patient_df.head()

Unnamed: 0,SUBJECT_ID,GENDER,EXPIRE_FLAG,HADM_ID,ICUSTAY_ID,AGE,Obesity,Non.Adherence,Developmental.Delay.Retardation,Advanced.Heart.Disease,Advanced.Lung.Disease,Schizophrenia.and.other.Psychiatric.Disorders,Alcohol.Abuse,Other.Substance.Abuse,Chronic.Pain.Fibromyalgia,Chronic.Neurological.Dystrophies,Advanced.Cancer,Depression,Dementia
0,9973,0,1,100020,282580,58,0,0,0,0,0,0,0,0,1,1,0,0,0
1,3365,1,0,100103,200434,72,0,0,0,0,0,0,0,0,0,0,0,0,0
2,27290,0,1,100137,212691,82,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9882,0,0,100177,251800,55,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5525,0,1,100473,257484,65,1,0,0,0,0,0,0,0,0,0,0,0,0


## PROCEDURES

In [7]:
procedure_df = pd.read_csv("Raw/procedure.csv").drop("Unnamed: 0", axis=1)
procedure_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ICD9_CODE
0,3365,100103,200434,3523
1,3365,100103,200434,3615
2,3365,100103,200434,3611
3,3365,100103,200434,3961
4,3365,100103,200434,8968
...,...,...,...,...
6548,8772,199828,215516,331
6549,8772,199828,215516,3893
6550,1931,199884,265365,4443
6551,1931,199884,265365,9634


In [59]:
icd9_codes_mapping, icd9_codes_counts = get_reindexed(procedure_df, 'ICD9_CODE', thresholds['procedures_icd9'])

CURRENT LENGTH IS: 93
TOTAL LENGTH WOULD HAVE BEEN: 591


In [60]:
print(icd9_codes_mapping)

{3615: 0, 3611: 1, 3961: 2, 66: 3, 3606: 4, 3950: 5, 45: 6, 40: 7, 3722: 8, 8856: 9, 3990: 10, 4516: 11, 3979: 12, 3893: 13, 9915: 14, 3891: 15, 3895: 16, 3995: 17, 3612: 18, 331: 19, 387: 20, 14: 21, 966: 22, 9904: 23, 9905: 24, 4513: 25, 4233: 26, 4443: 27, 5491: 28, 9604: 29, 9671: 30, 3491: 31, 4523: 32, 9672: 33, 3324: 34, 7761: 35, 4311: 36, 9907: 37, 370: 38, 3721: 39, 8872: 40, 8191: 41, 3897: 42, 3607: 43, 9920: 44, 93: 45, 3521: 46, 9925: 47, 4131: 48, 8964: 49, 8604: 50, 8605: 51, 8855: 52, 3723: 53, 4432: 54, 311: 55, 3321: 56, 5011: 57, 3404: 58, 9960: 59, 5498: 60, 9390: 61, 3613: 62, 3761: 63, 3601: 64, 9962: 65, 13: 66, 8611: 67, 46: 68, 17: 69, 8622: 70, 8842: 71, 9723: 72, 3322: 73, 8847: 74, 8628: 75, 9462: 76, 8841: 77, 5459: 78, 4639: 79, 5110: 80, 8853: 81, 9605: 82, 3726: 83, 9910: 84, 3323: 85, 5059: 86, 8754: 87, 309: 88, 9607: 89, 3949: 90, 9357: 91, 3734: 92}


In [61]:
procedure_grouped_df = procedure_df.groupby(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID'])['ICD9_CODE'].agg(list).reset_index()
procedure_grouped_df['ICD9_FEATURES'] = None

for i in range(len(procedure_grouped_df)):
    feature_vec = np.zeros(len(icd9_codes_mapping), dtype=int)
    codes = list(set(procedure_grouped_df["ICD9_CODE"][i]))
    for code in codes:
        if(icd9_codes_counts[code]>=thresholds['procedures_icd9']):
            index = icd9_codes_mapping[code]
            feature_vec[index] = 1
    procedure_grouped_df['ICD9_FEATURES'][i] = list(feature_vec)
print(len(procedure_grouped_df))
procedure_grouped_df.head()

1334


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ICD9_CODE,ICD9_FEATURES
0,78,100536,233150,[331],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,109,164029,290868,[3995],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,154,102354,201272,[8856],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,154,162891,250579,"[3614, 3615, 3961, 8872, 8856, 8853]","[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
4,188,123860,213646,"[9671, 5491, 3893]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [62]:
index = 4
lista = []
listb = []
for i in range(len(procedure_grouped_df['ICD9_FEATURES'][index])):
    if(procedure_grouped_df['ICD9_FEATURES'][index][i]==1):
        lista.append(i)

for key in procedure_grouped_df['ICD9_CODE'][index]:
    if(key in icd9_codes_mapping):
        listb.append(icd9_codes_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[13, 28, 30]
[30, 28, 13]
True


## MEDICATION

In [63]:
med_df = pd.read_csv("data/Raw/medication.csv").drop("Unnamed: 0", axis=1).fillna(0).astype(int)
med_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,NDC
0,9973,100020,282580,781710955
1,9973,100020,282580,0
2,9973,100020,282580,904565661
3,9973,100020,282580,574705050
4,9973,100020,282580,182853489


In [64]:
ndc_mapping, ndc_counts = get_reindexed(med_df, 'NDC', thresholds['medication_ndc'])
print(ndc_mapping)

CURRENT LENGTH IS: 277
TOTAL LENGTH WOULD HAVE BEEN: 2041
{0: 0, 574705050: 1, 182853489: 2, 338055318: 3, 338004902: 4, 310032520: 5, 182844789: 6, 74407532: 7, 517391025: 8, 74610204: 9, 8084199: 10, 56016975: 11, 58177000104: 12, 62584078833: 13, 51079055271: 14, 904770418: 15, 17714002001: 16, 51079001920: 17, 45050130: 18, 54829725: 19, 8418806: 20, 338004938: 21, 10019001303: 22, 51079000522: 23, 51079033530: 24, 338001702: 25, 74258702: 26, 338070341: 27, 17714001110: 28, 713016550: 29, 74148402: 30, 51079025520: 31, 517760425: 32, 338004904: 33, 409662502: 34, 338001704: 35, 409176230: 36, 517570425: 37, 2831501: 38, 46287000660: 39, 409672924: 40, 51079074520: 41, 56017275: 42, 904404073: 43, 64253033335: 44, 338055002: 45, 49502069724: 46, 63323026201: 47, 51079002420: 48, 517571025: 49, 409663734: 50, 66553000401: 51, 54001820: 52, 487980125: 53, 63739002401: 54, 10432017002: 55, 904224461: 56, 63323031110: 57, 904272561: 58, 54872425: 59, 456066270: 60, 58177020211: 61, 409

In [65]:
med_grouped_df = med_df.groupby(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID'])['NDC'].agg(list).reset_index()
med_grouped_df['NDC_FEATURES'] = None

for i in range(len(med_grouped_df)):
    feature_vec = np.zeros(len(ndc_mapping), dtype=int)
    codes = list(set(med_grouped_df["NDC"][i]))
    for code in codes:
        if(ndc_counts[code]>=thresholds['medication_ndc']):
            index = ndc_mapping[code]
            feature_vec[index] = 1
    med_grouped_df['NDC_FEATURES'][i] = list(feature_vec)
print(len(med_grouped_df))
med_grouped_df.head()

1375


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,NDC,NDC_FEATURES
0,78,100536,233150,"[63323018410, 338040360, 63323001302, 0, 74407...","[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ..."
1,109,164029,290868,"[182845389, 51079029920, 172376010, 182845389,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,154,102354,201272,"[10019017644, 51079025520, 51079098320, 0, 338...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,188,123860,213646,"[0, 338355248, 409490234, 0, 8092355, 33835524...","[1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,188,150463,255726,"[85036207, 338008904, 85036207, 74131230, 7413...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [66]:
index = 2
lista = []
listb = []
for i in range(len(med_grouped_df['NDC_FEATURES'][index])):
    if(med_grouped_df['NDC_FEATURES'][index][i]==1):
        lista.append(i)

for key in med_grouped_df['NDC'][index]:
    if(key in ndc_mapping):
        listb.append(ndc_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[0, 1, 31, 40, 47, 56, 109, 128, 171, 204]
[31, 0, 109, 40, 31, 171, 47, 128, 204, 56, 1, 204, 47, 0]
True


## LAB

In [67]:
lab_df = pd.read_csv("data/Raw/lab_final.csv").astype(int).drop_duplicates().reset_index().drop(columns=['index'],axis=1)
print(len(lab_df))
subjects = list(set(list(lab_df['HADM_ID'])))
print(len(subjects))
lab_df.head()

682928
1478


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHART_ITEMID,CHART_VALUENUM
0,154,102354,201272,220180,62
1,154,102354,201272,220181,72
2,154,102354,201272,220045,74
3,154,102354,201272,220210,23
4,154,102354,201272,220179,104


In [68]:
# unique_org = set(list(lab_df['ORG_ITEMID']))
# unique_ab = set(list(lab_df['AB_ITEMID']))
# unique_chart = set(list(lab_df['CHART_ITEMID']))
# print(len(unique_org), len(unique_ab), len(unique_chart))

In [69]:
# org_mapping, org_counts = get_reindexed(lab_df, 'ORG_ITEMID', thresholds['lab_org'])
# ab_mapping, ab_counts = get_reindexed(lab_df, 'AB_ITEMID', thresholds['lab_ab'])

#### Dropping ORG_ITEMID, AB_ITEMID

In [70]:
chart_mapping, chart_counts = get_reindexed(lab_df, 'CHART_ITEMID', thresholds['lab_chart'])

CURRENT LENGTH IS: 401
TOTAL LENGTH WOULD HAVE BEEN: 1044


In [71]:
chart_items = dict()
chart_items_mean = dict()
chart_items_std = dict()

for i in range(len(lab_df)):
    itemid = lab_df["CHART_ITEMID"][i]
    if(itemid not in chart_mapping):
        continue
    else:
        if(itemid not in chart_items):
            chart_items[itemid] = []
        chart_items[itemid].append(lab_df["CHART_VALUENUM"][i])
        
for item in chart_items:
    chart_items_mean[item] = np.mean(np.array(chart_items[item]))
    chart_items_std[item]  = np.std(np.array(chart_items[item]))
    
print(len(chart_items), len(chart_items_mean), len(chart_items_std))
print(chart_items_mean[220181], chart_items_std[220181])

401 401 401
77.61071068185095 18.625224293705106


In [72]:
lab_df['CHART'] = None
for i in range(len(lab_df)):
    itemid = lab_df["CHART_ITEMID"][i]
    if(itemid not in chart_mapping):
        continue
    else:
        value = lab_df["CHART_VALUENUM"][i]
        mean = chart_items_mean[itemid]
        std = chart_items_std[itemid]
        if(value > (mean+std)):
            lab_df['CHART'][i] = str(itemid)+":HIGH"
        elif(value < (mean-std)):
            lab_df['CHART'][i] = str(itemid)+":LOW"
        else:
            lab_df['CHART'][i] = str(itemid)+":MED"
print(len(lab_df))
lab_df.head()

682928


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHART_ITEMID,CHART_VALUENUM,CHART
0,154,102354,201272,220180,62,220180:MED
1,154,102354,201272,220181,72,220181:MED
2,154,102354,201272,220045,74,220045:MED
3,154,102354,201272,220210,23,220210:MED
4,154,102354,201272,220179,104,220179:MED


In [73]:
lab_df = lab_df.dropna(subset=['CHART']).reset_index().drop(columns=['index','CHART_ITEMID', 'CHART_VALUENUM'], axis=1)
print(len(lab_df))
lab_df.head()

671479


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHART
0,154,102354,201272,220180:MED
1,154,102354,201272,220181:MED
2,154,102354,201272,220045:MED
3,154,102354,201272,220210:MED
4,154,102354,201272,220179:MED


In [74]:
chart_final_mapping, chart_final_counts = get_reindexed(lab_df, 'CHART', thresholds['lab_chart_final'])

CURRENT LENGTH IS: 1051
TOTAL LENGTH WOULD HAVE BEEN: 1051


In [75]:
lab_grouped_df = lab_df.groupby(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID'])['CHART'].agg(list).reset_index()
lab_grouped_df['CHART_FEATURES'] = None

for i in range(len(lab_grouped_df)):
    feature_vec = np.zeros(len(chart_final_mapping), dtype=int)
    codes = list(set(lab_grouped_df["CHART"][i]))
    for code in codes:
        if(chart_final_counts[code]>=thresholds['lab_chart_final']):
            index = chart_final_mapping[code]
            feature_vec[index] = 1
    lab_grouped_df['CHART_FEATURES'][i] = list(feature_vec)
print(len(lab_grouped_df))
lab_grouped_df.head()

1478


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHART,CHART_FEATURES
0,78,100536,233150,"[455:HIGH, 455:HIGH, 455:HIGH, 455:HIGH, 455:H...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,109,164029,290868,"[455:HIGH, 5815:MED, 5817:MED, 5819:MED, 5820:...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,154,102354,201272,"[220180:MED, 220181:MED, 220045:MED, 220210:ME...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,154,162891,250579,"[51:LOW, 51:LOW, 51:MED, 51:MED, 51:LOW, 51:LO...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,188,123860,213646,"[225664:MED, 220045:MED, 220179:MED, 220180:ME...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [76]:
index = 2
lista = []
listb = []
for i in range(len(lab_grouped_df['CHART_FEATURES'][index])):
    if(lab_grouped_df['CHART_FEATURES'][index][i]==1):
        lista.append(i)

for key in lab_grouped_df['CHART'][index]:
    if(key in chart_final_mapping):
        listb.append(chart_final_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81]
[0, 1, 2, 3, 4, 0, 1, 2, 4, 0, 1, 3, 2, 3, 4, 0, 1, 5, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 2, 4, 1, 3, 5, 17, 18, 2, 3, 5, 19, 1, 2, 19, 0, 1, 2, 2, 4, 1, 5, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 18, 38, 39, 40, 2, 19, 0, 3, 2, 4, 0, 1, 41, 2, 19, 0, 1, 19, 1, 2, 19, 1, 42, 38, 39, 2, 3, 2, 4, 0, 0, 3, 2, 19, 0, 43, 2, 4, 10, 44, 31, 34, 37, 4, 0, 1, 4, 1, 2, 4, 1, 3, 3, 12, 13, 5, 45, 2, 31, 37, 4, 0, 1, 44, 31, 34, 42, 39, 2, 1, 2, 0, 43, 5, 13, 2, 0, 44, 31, 37, 4, 0, 3, 10, 0, 46, 0, 2, 0, 43, 0, 0, 43, 3, 21, 22, 23, 25, 26, 27, 47, 44, 31, 48, 36, 37, 18, 1, 49, 19, 19, 2, 2, 3, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 

## MERGING ALL 4

In [77]:
encounter_dict = dict()
procedure_found = 0
med_found = 0
lab_found = 0
deleted = 0

for i in range(len(patient_df)): 
    
    procedure_flag = 0
    med_flag = 0
    lab_flag = 0
    
    subjid = patient_df["SUBJECT_ID"][i]
    hadmid = patient_df["HADM_ID"][i]
    icustayid = patient_df["ICUSTAY_ID"][i]
    
    key = str(subjid) + "_" + str(hadmid) + "_" + str(icustayid)
    encounter_dict[key] = dict()
    
    encounter_dict[key]["AGE"] = patient_df["AGE"][i]
    encounter_dict[key]["GENDER"] = patient_df["GENDER"][i]
    encounter_dict[key]["OBESITY"] = patient_df["Obesity"][i]
    
    encounter_dict[key]['Procedures_ICD9_Features'] = np.zeros(len(icd9_codes_mapping), dtype=int)
    encounter_dict[key]['Medication_NDC_Features'] = np.zeros(len(ndc_mapping), dtype=int)
    encounter_dict[key]['Lab_Chart_Features'] = np.zeros(len(chart_final_mapping), dtype=int)
    
    encounter_dict[key]['y_Expired'] = patient_df["EXPIRE_FLAG"][i]
    encounter_dict[key]['y_Non.Adherence'] = patient_df["Non.Adherence"][i]
    encounter_dict[key]['y_Developmental.Delay.Retardation'] = patient_df["Developmental.Delay.Retardation"][i]
    encounter_dict[key]['y_Advanced.Heart.Disease'] = patient_df["Advanced.Heart.Disease"][i]
    encounter_dict[key]['y_Advanced.Lung.Disease'] = patient_df["Advanced.Lung.Disease"][i]
    encounter_dict[key]['y_Schizophrenia.and.other.Psychiatric.Disorders'] = patient_df["Schizophrenia.and.other.Psychiatric.Disorders"][i]
    encounter_dict[key]['y_Alcohol.Abuse'] = patient_df["Alcohol.Abuse"][i]
    encounter_dict[key]['y_Other.Substance.Abuse'] = patient_df["Other.Substance.Abuse"][i]
    encounter_dict[key]['y_Chronic.Pain.Fibromyalgia'] = patient_df["Chronic.Pain.Fibromyalgia"][i]
    encounter_dict[key]['y_Chronic.Neurological.Dystrophies'] = patient_df["Chronic.Neurological.Dystrophies"][i]
    encounter_dict[key]['y_Advanced.Cancer'] = patient_df["Advanced.Cancer"][i]
    encounter_dict[key]['y_Depression'] = patient_df["Depression"][i]
    encounter_dict[key]['y_Dementia'] = patient_df["Dementia"][i]
    
    for j in range(len(procedure_grouped_df)):
        if((procedure_grouped_df['SUBJECT_ID'][j]==subjid) and (procedure_grouped_df['HADM_ID'][j]==hadmid)
           and (procedure_grouped_df['ICUSTAY_ID'][j]==icustayid)):
            encounter_dict[key]['Procedures_ICD9_Features'] = procedure_grouped_df['ICD9_FEATURES'][j]
            procedure_found+=1
            procedure_flag = 1
            
    for j in range(len(med_grouped_df)):
        if((med_grouped_df['SUBJECT_ID'][j]==subjid) and (med_grouped_df['HADM_ID'][j]==hadmid)
           and (med_grouped_df['ICUSTAY_ID'][j]==icustayid)):
            encounter_dict[key]['Medication_NDC_Features'] = med_grouped_df['NDC_FEATURES'][j]
            med_found+=1
            med_flag= 1
            
    for j in range(len(lab_grouped_df)):
        if((lab_grouped_df['SUBJECT_ID'][j]==subjid) and (lab_grouped_df['HADM_ID'][j]==hadmid)
           and (lab_grouped_df['ICUSTAY_ID'][j]==icustayid)):
            encounter_dict[key]['Lab_Chart_Features'] = np.array(lab_grouped_df['CHART_FEATURES'][j], dtype=int)
            lab_found+=1
            lab_flag = 1
            
    if((procedure_flag+med_flag+lab_flag) < 2):
        deleted +=1
        del encounter_dict[key]

print(len(patient_df))
print(procedure_found, med_found, lab_found)
print(len(encounter_dict))

1531
1334 1375 1478
1483


In [78]:
mimic = pd.DataFrame(encounter_dict).T
print(len(mimic))
print(len(mimic.columns))
mimic['PATIENT_KEY'] = mimic.index
mimic = mimic[['PATIENT_KEY'] + [col for col in mimic.columns if col != 'PATIENT_KEY']]
mimic.reset_index(drop=True, inplace=True)
mimic.head()

1483
19


Unnamed: 0,PATIENT_KEY,AGE,GENDER,OBESITY,Procedures_ICD9_Features,Medication_NDC_Features,Lab_Chart_Features,y_Expired,y_Non.Adherence,y_Developmental.Delay.Retardation,y_Advanced.Heart.Disease,y_Advanced.Lung.Disease,y_Schizophrenia.and.other.Psychiatric.Disorders,y_Alcohol.Abuse,y_Other.Substance.Abuse,y_Chronic.Pain.Fibromyalgia,y_Chronic.Neurological.Dystrophies,y_Advanced.Cancer,y_Depression,y_Dementia
0,9973_100020_282580,58,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,0,0,0,0,0,0,0,1,1,0,0,0
1,3365_100103_200434,72,1,0,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,0,0,0,0,0,0,0
2,27290_100137_212691,82,0,0,"[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,0,0,0,0,0,0,0,0,0,0
3,9882_100177_251800,55,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,0,0,0,0,0,0,0
4,5525_100473_257484,65,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,0,0,0,0,0,0,0,0,0,0


In [79]:
mimic.to_csv("data/Processed/MIMIC.csv")

In [None]:
X = []
for i in range(len(mimic)):
    X_sample = []
    X_sample.append(mimic["AGE"][i])
    X_sample.append(mimic["GENDER"][i])
    X_sample.append(mimic["OBESITY"][i])
    X_sample.extend(mimic["Procedures_ICD9_Features"][i])
    X_sample.extend(mimic["Medication_NDC_Features"][i])
    X_sample.extend(mimic["Lab_Chart_Features"][i])
    X_sample = np.array(X_sample)
    X.append(X_sample)
    
X = np.array(X)
print(X.shape, X[0][0], X[1][0])
np.save('data/Processed/MIMIC_X.npy', X)

(1483, 1424) 58 72


In [None]:
y = []
labels = ["y_Expired", "y_Non.Adherence", "y_Developmental.Delay.Retardation", "y_Advanced.Heart.Disease",
          "y_Advanced.Lung.Disease", "y_Schizophrenia.and.other.Psychiatric.Disorders", "y_Alcohol.Abuse",
          "y_Other.Substance.Abuse", "y_Chronic.Pain.Fibromyalgia", "y_Chronic.Neurological.Dystrophies",
          "y_Advanced.Cancer", "y_Depression", "y_Dementia"]

for i in range(len(mimic)):
    y_sample = []
    for column in labels:
        y_sample.append(mimic[column][i])
    y_sample = np.array(y_sample)
    y.append(y_sample)
    
y = np.array(y, dtype=int)
print(y.shape, y[0][0], y[1][0])
np.save('data/Processed/MIMIC_y.npy', y)

(1483, 13) 1 0


In [81]:
print("Length Procedures Vector:", len(icd9_codes_mapping))
print("Length Medication Vector:", len(ndc_mapping))
print("Length Lab Vector:", len(chart_final_mapping))

Length Procedures Vector: 93
Length Medication Vector: 277
Length Lab Vector: 1051


## Patient Mapping

In [22]:
X = np.load('data/Processed/MIMIC_X.npy')
print(X.shape)

(1483, 1424)


In [17]:
mimic = pd.read_csv("data/Processed/MIMIC.csv").drop("Unnamed: 0", axis=1)
mimic.head()

Unnamed: 0,PATIENT_KEY,AGE,GENDER,OBESITY,Procedures_ICD9_Features,Medication_NDC_Features,Lab_Chart_Features,y_Expired,y_Non.Adherence,y_Developmental.Delay.Retardation,y_Advanced.Heart.Disease,y_Advanced.Lung.Disease,y_Schizophrenia.and.other.Psychiatric.Disorders,y_Alcohol.Abuse,y_Other.Substance.Abuse,y_Chronic.Pain.Fibromyalgia,y_Chronic.Neurological.Dystrophies,y_Advanced.Cancer,y_Depression,y_Dementia
0,9973_100020_282580,58,0,0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,"[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",[1 1 1 ... 0 0 0],1,0,0,0,0,0,0,0,1,1,0,0,0
1,3365_100103_200434,72,1,0,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...",[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,0,0,0,0,0,0
2,27290_100137_212691,82,0,0,"[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[0 0 0 ... 0 0 0],1,0,0,0,0,0,0,0,0,0,0,0,0
3,9882_100177_251800,55,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...",[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,0,0,0,0,0,0
4,5525_100473_257484,65,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...",[0 0 0 ... 0 0 0],1,0,0,0,0,0,0,0,0,0,0,0,0


### Patient Features

In [31]:
patient_features = np.array(X[:, :3], dtype=int)
print(patient_features[:2, :])
print(patient_features.shape)
np.save('data/Processed/patient_features.npy', patient_features)

[[58  0  0]
 [72  1  0]]
(1483, 3)


### Procedure Features

In [37]:
procedure_features = np.eye(93, dtype=int)
print(procedure_features.shape)
np.save('data/Processed/procedure_features.npy', procedure_features)

(93, 93)


### Medication Features

In [38]:
medication_features = np.eye(277, dtype=int)
print(medication_features.shape)
np.save('data/Processed/medication_features.npy', medication_features)

(277, 277)


### Lab Features

In [39]:
lab_features = np.eye(1051, dtype=int)
print(lab_features.shape)
np.save('data/Processed/lab_features.npy', lab_features)

(1051, 1051)


### Patient Edge Index

In [52]:
same_patient_dict = dict()
counter = 0
patient_ids = []
for i in range(len(mimic)):
    same_patient_dict[i] = []
    patient_ids.append(int(mimic["PATIENT_KEY"][i].split("_")[0]))

for i in range(len(patient_ids)):
    for j in range(i, len(patient_ids)):
        if(i!=j and patient_ids[i]==patient_ids[j]):
            same_patient_dict[i].append(j)
            counter +=1
print(counter)
# print(same_patient_dict)

patient_edge_index = [[], []]

for i in same_patient_dict:
    source = int(i)
    for j in same_patient_dict[i]:
        destination = int(j)
        patient_edge_index[0].append(source)
        patient_edge_index[1].append(destination)

patient_edge_index = np.array(patient_edge_index, dtype=int)
print(patient_edge_index.shape)
np.save('data/Processed/patient_edges.npy', patient_edge_index)

1025
(2, 1025)


### Procedure Edge Index

In [49]:
procedures_section = X[:, 3:96]
print(procedures_section.shape)
procedures_edge_index = [[], []]

for i in range(len(procedures_section)):
    source = int(i)
    for j in range(len(procedures_section[0])):
        dest = int(j)
        if(procedures_section[i][j]==1):
            procedures_edge_index[0].append(source)
            procedures_edge_index[1].append(dest)

procedures_edge_index = np.array(procedures_edge_index, dtype=int)
print(procedures_edge_index.shape)
print(procedures_edge_index[0][:5], procedures_edge_index[1][:5])
np.save('data/Processed/procedures_edges.npy', procedures_edge_index)

(1483, 93)
(2, 4837)
[1 1 1 2 2] [0 1 2 3 4]


### Medication Edge Index

In [50]:
medication_section = X[:, 96:373]
print(medication_section.shape)
medication_edge_index = [[], []]

for i in range(len(medication_section)):
    source = int(i)
    for j in range(len(medication_section[0])):
        dest = int(j)
        if(medication_section[i][j]==1):
            medication_edge_index[0].append(source)
            medication_edge_index[1].append(dest)

medication_edge_index = np.array(medication_edge_index, dtype=int)
print(medication_edge_index.shape)
print(medication_edge_index[0][:5], medication_edge_index[1][:5])
np.save('data/Processed/medication_edges.npy', medication_edge_index)

(1483, 277)
(2, 28006)
[0 0 0 0 0] [0 1 2 3 4]


### Lab Edge Index

In [51]:
lab_section = X[:, 373:]
print(lab_section.shape)
lab_edge_index = [[], []]

for i in range(len(lab_section)):
    source = int(i)
    for j in range(len(lab_section[0])):
        dest = int(j)
        if(lab_section[i][j]==1):
            lab_edge_index[0].append(source)
            lab_edge_index[1].append(dest)

lab_edge_index = np.array(lab_edge_index, dtype=int)
print(lab_edge_index.shape)
print(lab_edge_index[0][:5], lab_edge_index[1][:5])
np.save('data/Processed/lab_edges.npy', lab_edge_index)

(1483, 1051)
(2, 165019)
[0 0 0 0 0] [0 1 2 3 4]
