In [1]:
import sys,os
import numpy as np
import pandas as pd
from un_dl_models.data_loader import physionet_loader

In [2]:
outcome_file_a = 'physionet2012/Outcomes-a.txt'

df_outcome_a = pd.read_csv(outcome_file_a)

In [4]:
# load mortality data
x_train, x_val, x_test, y_train, y_val, y_test = physionet_loader(task='mortality',
                                                                  scale=False,input_path='')

x_all = np.concatenate((x_train,x_val,x_test),axis=0)

In [4]:
x_all.shape

(4000, 155, 35)

In [5]:
matching_dict = {}
for key in df_outcome_a.RecordID:
    input_file = "".join(['physionet2012/set-a/',str(key),'.txt'])
    df_input = pd.read_csv(input_file)
    ad = df_input[df_input.Time == '00:00']
    ad = ad[ad.Parameter != 'ICUType']
    ad = ad[ad.Parameter != 'RecordID']
    ad_val_list = list(set(ad.Value))
    loc_list = []
    for i in range(x_all.shape[0]):
        if all([v in x_all[i,0] for v in ad_val_list]): loc_list.append(i)
    matching_dict[key] = loc_list

In [6]:
matching_dict

{132539: [234,
  354,
  396,
  445,
  608,
  653,
  749,
  789,
  945,
  1059,
  1091,
  1355,
  1446,
  1540,
  1617,
  1657,
  1689,
  1706,
  1976,
  1983,
  1990,
  2134,
  2159,
  2161,
  2295,
  2301,
  2306,
  2309,
  2400,
  2426,
  2553,
  2638,
  2756,
  2783,
  2950,
  3133,
  3384,
  3948,
  3990],
 132540: [573, 3206, 3640, 3792],
 132541: [2017],
 132543: [714],
 132545: [154,
  172,
  321,
  390,
  441,
  564,
  567,
  599,
  638,
  650,
  661,
  1042,
  1153,
  1381,
  1428,
  1660,
  1778,
  1834,
  1872,
  1996,
  2012,
  2162,
  2286,
  2375,
  2502,
  2593,
  2725,
  2758,
  2788,
  3227,
  3360,
  3399,
  3515,
  3872],
 132547: [3014],
 132548: [1464],
 132551: [3362],
 132554: [1842],
 132555: [3226],
 132556: [367, 3615],
 132567: [1271],
 132568: [2127],
 132570: [239],
 132573: [1739],
 132575: [3686],
 132577: [2178],
 132582: [1913],
 132584: [2250],
 132585: [3530],
 132588: [2284],
 132590: [687],
 132591: [463],
 132592: [2321],
 132595: [291, 443, 755, 1

In [7]:
found, not_found = [], list(range(4000))
for key in matching_dict:
    if len(matching_dict[key]) == 1:
        matching_dict[key] = matching_dict[key][0]
        found.append(matching_dict[key])
        not_found.remove(matching_dict[key])

In [8]:
# special cases
manual_cases = {140501:3626, 140936:1107, 141264:2010, 135361:2889, 136278:2802, 137334:241,
               141807:1260, 141165:708, 133147:3394, 135757:3795, 136763:287, 137431:3538,
               139374:697, 139390:2164, 140762:3288}

for k in manual_cases:
    matching_dict[k] = manual_cases[k]
    found.append(manual_cases[k])
    not_found.remove(manual_cases[k])

In [9]:
# removing found values from matching_dict and updating the matching_dict
prev_len_found = 0
while prev_len_found < len(found):
    print(prev_len_found, len(found))
    prev_len_found = len(found)
    for key in matching_dict:
        v = matching_dict[key]
        if isinstance(v,list):
            if len(v) > 1:
                for val in v:
                    if val in found:
                        v.remove(val)
                if len(v) == 1:
                    matching_dict[key] = v[0]
                    found.append(v[0])
                    not_found.remove(v[0])
                else:
                    matching_dict[key] = v
print(prev_len_found, len(found))

0 3387
3387 3419
3419 3425
3425 3430
3430 3431
3431 3431


In [10]:
for line in range(1,5):
    print(len(found))
    for key in matching_dict:
        if isinstance(matching_dict[key],list):
            input_file = "".join(['physionet2012/set-a/',str(key),'.txt'])
            df_input = pd.read_csv(input_file)
            t1 = df_input[df_input.Time == sorted(list(set(df_input.Time)))[line]]
            t1_val_list = list(set(t1.Value))
            remainings = []
            for i in matching_dict[key]:
                if all([v in x_all[i,line] for v in t1_val_list]): remainings.append(i)
            if len(remainings) == 1:
                matching_dict[key] = remainings[0]
                found.append(remainings[0])
                not_found.remove(remainings[0])
            elif len(remainings) > 0:
                matching_dict[key] = remainings
print(len(found))

3431
3943
3992
3999
4000


In [44]:
dic_icu_type = {}
for key in df_outcome_a.RecordID:
    input_file = "".join(['physionet2012/set-a/',str(key),'.txt'])
    df_input = pd.read_csv(input_file)
    dic_icu_type[key] = int(list(df_input[df_input.Parameter=='ICUType']['Value'])[0])

print([l_icu_type.count(i) for i in range(1,5)])

arr_icu_type = np.zeros(4000)
for key in matching_dict:
    arr_icu_type[matching_dict[key]] = dic_icu_type[key]

[577, 874, 1481, 1068]


In [63]:
icu_1_idx = np.where(arr_icu_type==1)[0]
icu_1_train,icu_1_val,icu_1_test = (icu_1_idx[np.where(icu_1_idx < 3199)],
                                    icu_1_idx[np.where((icu_1_idx>=3199)&(icu_1_idx<3599))],
                                    icu_1_idx[np.where(icu_1_idx >= 3599)])
np.save('physionet_data_a/icu_1_train_idx.npy',icu_1_train)
np.save('physionet_data_a/icu_1_val_idx.npy',icu_1_val)
np.save('physionet_data_a/icu_1_test_idx.npy',icu_1_test)

In [61]:
icu_1_train,icu_1_val,icu_1_test

(array([   9,   10,   15,   18,   21,   26,   27,   40,   49,   67,   80,
          81,   92,   96,  101,  118,  180,  201,  214,  215,  220,  226,
         228,  230,  239,  258,  263,  264,  266,  280,  285,  297,  302,
         307,  308,  324,  331,  332,  333,  339,  351,  352,  353,  364,
         368,  375,  378,  390,  399,  412,  424,  428,  430,  451,  456,
         460,  467,  474,  475,  488,  490,  497,  503,  505,  509,  513,
         514,  517,  518,  523,  535,  536,  537,  540,  566,  573,  579,
         608,  609,  614,  631,  633,  634,  638,  639,  655,  657,  658,
         677,  685,  697,  699,  709,  713,  715,  729,  743,  745,  757,
         764,  768,  769,  776,  779,  786,  791,  814,  828,  830,  834,
         853,  855,  857,  858,  863,  870,  884,  889,  897,  904,  916,
         923,  924,  930,  935,  946,  959,  971,  979,  980,  983,  993,
         995, 1002, 1012, 1020, 1023, 1028, 1031, 1053, 1071, 1082, 1096,
        1120, 1122, 1125, 1126, 1132, 

In [40]:
x_train.shape[0],x_val.shape[0],x_test.shape[0]

(3199, 400, 401)

In [58]:
icu_1_train,icu_1_val,icu_1_test = icu_1_idx[np.where(icu_1_idx < 3199)],icu_1_idx[np.where((icu_1_idx>3199)&(icu_1_idx<3599))],icu_1_idx[np.where(icu_1_idx >= 3599)]

In [59]:
icu_1_train.shape,icu_1_val.shape,icu_1_test.shape

((454,), (60,), (63,))

In [37]:
df_outcome_a['los_bin'] = 1*(df_outcome_a['In-hospital_death'] == 1)
df_outcome_a['x_loc'] = df_outcome_a['RecordID'].map(matching_dict)

los_bin_dict = pd.Series(df_outcome_a.los_bin.values,index=df_outcome_a.x_loc).to_dict()

y_los_all = np.zeros(4000)
for i in range(4000):
    y_los_all[i] = los_bin_dict[i]

y_los_train, y_los_val, y_los_test = np.copy(y_los_all[:3199]), np.copy(y_los_all[3199:3599]), np.copy(y_los_all[3599:])

#np.save('physionet_los_data/1_train_y.npy',y_los_train)
#np.save('physionet_los_data/1_val_y.npy',y_los_val)
#np.save('physionet_los_data/1_test_y.npy',y_los_test)

In [31]:
x_train[0][0]

array([ 0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
       77. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. , 83.9,  0. ,  0. ,  0. ,
        0. , -1. ])

In [34]:
for k in matching_dict:
    if matching_dict[k] == 0:
        print(k)

140675


In [38]:
los_bin_dict[2]

1

In [36]:
df_outcome_a[df_outcome_a.RecordID==134826]

Unnamed: 0,RecordID,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death,los_bin,x_loc
906,134826,12,2,4,3,1,1,2


In [41]:
df_outcome_a['los_bin'] = 1*(df_outcome_a.Length_of_stay < 3)
df_outcome_a['x_loc'] = df_outcome_a['RecordID'].map(matching_dict)

los_bin_dict = pd.Series(df_outcome_a.los_bin.values,index=df_outcome_a.x_loc).to_dict()

y_los_all = np.zeros(4000)
for i in range(4000):
    y_los_all[i] = los_bin_dict[i]

y_los_train, y_los_val, y_los_test = np.copy(y_los_all[:3199]), np.copy(y_los_all[3199:3599]), np.copy(y_los_all[3599:])
np.save('physionet_data_a/physionet_los_label/1_train_y.npy',y_los_train)
np.save('physionet_data_a/physionet_los_label/1_val_y.npy',y_los_val)
np.save('physionet_data_a/physionet_los_label/1_test_y.npy',y_los_test)

In [328]:
icu_types = []
for key in df_outcome_a.RecordID:
    input_file = "".join(['physionet2012/set-a/',str(key),'.txt'])
    df_input = pd.read_csv(input_file)
    icu_types.append(np.array(df_input[df_input.Parameter=='ICUType'].Value)[0])

In [330]:
np.histogram(icu_types)

(array([ 577,    0,    0,  874,    0,    0, 1481,    0,    0, 1068]),
 array([1. , 1.3, 1.6, 1.9, 2.2, 2.5, 2.8, 3.1, 3.4, 3.7, 4. ]))

In [54]:
ttttt = [1,1,1,3,4,3,8,8,8,8]
len(set(ttttt))

4

In [79]:
matrix = [
  [1, 2, 3, 4],
  [5, 6, 7, 8],
  [9,10,11,12]
]

In [70]:
out = []
m,n = len(matrix),len(matrix[0])
for idx,sum_ in enumerate(range(m+n-1)):
    if idx%2:
        j = min(sum_, n-1)
        i = sum_ - j
        out.append(matrix[i][j])
        while j > 0 and i < m-1:
            j -= 1
            i += 1
            out.append(matrix[i][j])
    else:
        i = min(sum_, m-1)
        j = sum_ - i
        out.append(matrix[i][j])
        while i > 0 and j < n-1:
            i -= 1
            j += 1
            out.append(matrix[i][j])

In [80]:
m,n = len(matrix), len(matrix[0])
out = matrix[0] + [matrix[i][n-1] for i in range(1,m)] + matrix[m-1][:n-1].reverse() + [matrix[i][0] for i in range(1,m-1)].reverse()

TypeError: can only concatenate list (not "NoneType") to list