In [20]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sklearn
from sklearn import preprocessing, svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

from tqdm import tqdm
tqdm.pandas()

# for jupyter notebook
pd.options.mode.chained_assignment = None  # default='warn'

# NEW CAT VAL

In [21]:
# read in new cat values
df = pd.read_csv('mimic-iii-clinical-database-1.4/DIAG_ROWS.csv')
df_loc = pd.read_csv('mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz')
df_loc = df_loc[['HADM_ID', 'LOS', 'ICUSTAY_ID']]

df = pd.merge(df_loc, df, on='HADM_ID', how='left')
df = df.drop('HADM_ID', axis=1)
df = df.dropna()

df

Unnamed: 0,LOS,ICUSTAY_ID,DISEASES AND INJURIES,"SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS",SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES,SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING
7,7.1314,219649,389.0,78552.0,V667,E8889
8,7.1314,219649,389.0,78552.0,V1582,E8889
9,7.1314,219649,51881.0,78552.0,V667,E8889
10,7.1314,219649,51881.0,78552.0,V1582,E8889
11,7.1314,219649,5849.0,78552.0,V667,E8889
12,7.1314,219649,5849.0,78552.0,V1582,E8889
13,7.1314,219649,2869.0,78552.0,V667,E8889
14,7.1314,219649,2869.0,78552.0,V1582,E8889
15,7.1314,219649,85221.0,78552.0,V667,E8889
16,7.1314,219649,85221.0,78552.0,V1582,E8889


In [22]:
cat_values = ['DISEASES AND INJURIES', 
             'SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS', 
             'SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING', 
             'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES']

target = 'LOS'

In [None]:
# encoding
df = pd.read_csv('ENC.csv')
df_loc = pd.read_csv('mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz')
df_loc = df_loc[['HADM_ID', 'LOS', 'ICUSTAY_ID']]

df = pd.merge(df_loc, df, on='HADM_ID', how='left')
df = df.drop('HADM_ID', axis=1)
df = df.dropna()

df

# OLD FEATURES

In [3]:
# read in features
df = pd.read_csv('mimic-iii-clinical-database-1.4/FEATURES.csv')
df = df.dropna()

df.head()

Unnamed: 0,ICUSTAY_ID,LOS,HeartRate,sysBP,diasBP,oxygen_met,respRate,gluc,GENDER,AGE,DISEASES AND INJURIES,PROCEDURES,SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES
7,219649,7.1314,72.625,109.888889,68.111111,99.625,14.2,115.666667,M,82.166667,389.0,9604.0,E8889,V667
8,219649,7.1314,72.625,109.888889,68.111111,99.625,14.2,115.666667,M,82.166667,51881.0,9604.0,E8889,V667
9,219649,7.1314,72.625,109.888889,68.111111,99.625,14.2,115.666667,M,82.166667,78552.0,9604.0,E8889,V667
10,219649,7.1314,72.625,109.888889,68.111111,99.625,14.2,115.666667,M,82.166667,5849.0,9604.0,E8889,V667
11,219649,7.1314,72.625,109.888889,68.111111,99.625,14.2,115.666667,M,82.166667,4538.0,9604.0,E8889,V667


# Categorical values

convert categorical values into one-hot-encoded vectors

In [4]:
cat_values = ['GENDER', 'DISEASES AND INJURIES', 
             'PROCEDURES', 
             'SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING', 
             'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES']

cont_values = ['HeartRate', 
               'sysBP', 
               'diasBP', 
               'oxygen_met', 
               'respRate', 
               'gluc', 
               'AGE']

target = 'LOS'

In [None]:
# ---- Only categorical
df = df.drop(cont_values, axis = 1)

df

###  -- continue

In [23]:
# give every category a number
# df['ADMISSION_TYPE'] = df['ADMISSION_TYPE'].astype('category').cat.codes

for cat in tqdm(cat_values):
    # give each category a one-hot-encoded vector
    df = df.join(pd.get_dummies(df[cat], prefix=cat + "_"))

100%|██████████| 4/4 [00:37<00:00, 11.83s/it]


In [24]:
# remove columns
df = df.drop(cat_values, axis = 1)

df.set_index('ICUSTAY_ID', inplace=True)

In [None]:
len(df.columns)

## Modify target

In [None]:
df = df[df['LOS'] < 5]

In [25]:
def maplos(x):
    #if x < 1.0: # short stay
    #    return 0
    if x < 1.0: # middel stay
        return 0
    elif x < 200.0: # long stay
        return 1

df['LOS'] = df['LOS'].progress_map(lambda x: maplos(x))

100%|██████████| 618081/618081 [00:00<00:00, 729722.55it/s]


In [8]:
df.head()

Unnamed: 0_level_0,LOS,HeartRate,sysBP,diasBP,oxygen_met,respRate,gluc,AGE
ICUSTAY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
219649,1,72.625,109.888889,68.111111,99.625,14.2,115.666667,82.166667
219649,1,72.625,109.888889,68.111111,99.625,14.2,115.666667,82.166667
219649,1,72.625,109.888889,68.111111,99.625,14.2,115.666667,82.166667
219649,1,72.625,109.888889,68.111111,99.625,14.2,115.666667,82.166667
219649,1,72.625,109.888889,68.111111,99.625,14.2,115.666667,82.166667


# shuffle

In [26]:
# shuffle

## pandas can shuffle to by index, but sentdex says thats ugly
df = sklearn.utils.shuffle(df)

df.head()

Unnamed: 0_level_0,LOS,DISEASES AND INJURIES__35.0,DISEASES AND INJURIES__38.0,DISEASES AND INJURIES__42.0,DISEASES AND INJURIES__74.0,DISEASES AND INJURIES__75.0,DISEASES AND INJURIES__85.0,DISEASES AND INJURIES__88.0,DISEASES AND INJURIES__90.0,DISEASES AND INJURIES__91.0,...,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V860,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V861,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8709,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V872,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8741,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8801,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8812,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8821,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V902,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V9089
ICUSTAY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
264595,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
217934,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
247864,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
249899,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
208400,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# split features and targets

In [27]:
y = df[target].values
X = df.drop(target, axis=1).values

In [None]:
y=y.astype('int')

y

# Preprocessing

In [None]:
# Only do this for the numerical variables
X[cont_values] = preprocessing.scale(X[cont_values])

X.head()

In [None]:
%%time
## alle
# 33 min
X = pd.DataFrame(preprocessing.scale(X))

X.head()

In [13]:
X = X.values
X

array([[-0.44888319, -0.49189811,  0.3175648 , ..., -0.22706971,
        -0.45631475, -0.49917781],
       [-0.78521216, -0.68123489, -1.63421733, ...,  1.00525765,
         0.84860331, -0.00737962],
       [ 0.47484945,  0.51065568,  0.32807687, ..., -0.33897336,
        -0.60373227, -0.07882749],
       ...,
       [ 1.7197102 , -1.019558  , -2.09622766, ..., -0.19353269,
         2.32277853, -0.40434709],
       [ 0.57010008, -1.03362893, -1.63936206, ..., -0.3256559 ,
        -0.39079585, -1.07223876],
       [ 0.11271847,  0.42271237, -1.3769808 , ...,  1.42725463,
        -1.04598483, -0.63981302]])

# Split data into training and testing

In [14]:
%%time
# 17,38 min
X_train, X_test, y_train, y_test = train_test_split(X, y)

CPU times: user 25.9 ms, sys: 1.06 ms, total: 26.9 ms
Wall time: 28.1 ms


In [None]:
X_train.shape

## Results

* age all
* ICD9_CODES: proc, dis, fact, ext


1. Categorical and continious

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| <10  | <1.0, <5.0, <200.0        | 0.4       | CONT  | LOGI | SGD    | 10,15  | 0.67 |    Default    |
| <10  | <1.0, <3.0, <200.0        | 0.4       | CONT  | LOGI | SGD    | 10,15  | 0.45 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CONT  | LOGI | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | ALL   | LOGI | SGD    | 10,15  | 0.83 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 40,90  | 0.73 |    Default    |


2. Continiues (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CONT  | LOGI | SGD    | 10,15  | 0.72 |    Default    |


3. Categorical (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | LOGI | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.84 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 40,90  | 0.87 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CONT  | RELU | SGD    | 10,15  | 0.81 |    Default    |
| <5   | NONE                      | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.59 |    Default    |

## Post new values

* age none
* ICD9_CODES: symp, dis, fact, ext - repeated for first combination


3. Categorical (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | LOGI | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.79 |    Default    |

4. Encoded Categorical (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CAT   | RELU | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CAT   | LOGI | SGD    | 10,15  | 0.72 |    Default    |

## Post supervisor explenation - change to max 100 iterations

* age none
* ICD9_CODES: symp, dis, fact, ext - all combination for each diagnose


3. Categorical (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.80 |    Default    |

3. Categorical (only) [Encoded]

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | LOGI | SGD    | 10,15  | 0.71 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.71 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.001      |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.01       |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.1        |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    1.0        |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    2.0        |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.00001    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.000001   |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | LOGI | SGD    | 10,15  | 0.71 |    0.01       |

# NN

In [17]:
#activation 
# - sigmoid = logistic
# - relu
# - tanh

#solver
# sgd - stochastic gradiant decent
# adam

#hidden_layer_sieze
# - 45,90 take some time -- 16min
# - 10,15 lesser time -- 13min
# - 100, 150 -- 48min
# - 1000, 1500 time -- 2h 4min

# hidden_layer_sizes=(10,15)

#parameter_space = {
#    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
#    'activation': ['tanh', 'relu'],
#    'solver': ['sgd', 'adam'],
#    'alpha': [0.0001, 0.05],
#    'learning_rate': ['constant','adaptive'],
#}

#nn=MLPClassifier(activation='relu', solver='adam', verbose=1, max_iter=100, alpha=0.001)
nn=MLPClassifier(verbose=1, hidden_layer_sizes=(10,15))

In [18]:
%%time
## 1h 14, 21 min
nn.fit(X_train, y_train)

Iteration 1, loss = 0.68505621
Iteration 2, loss = 0.67155985
Iteration 3, loss = 0.66833417
Iteration 4, loss = 0.66606448
Iteration 5, loss = 0.66436004
Iteration 6, loss = 0.66283401
Iteration 7, loss = 0.66136147
Iteration 8, loss = 0.66014571
Iteration 9, loss = 0.65888180
Iteration 10, loss = 0.65779524
Iteration 11, loss = 0.65656214
Iteration 12, loss = 0.65579697
Iteration 13, loss = 0.65513883
Iteration 14, loss = 0.65475474
Iteration 15, loss = 0.65415247
Iteration 16, loss = 0.65361077
Iteration 17, loss = 0.65311752
Iteration 18, loss = 0.65287459
Iteration 19, loss = 0.65255355
Iteration 20, loss = 0.65227923
Iteration 21, loss = 0.65196014
Iteration 22, loss = 0.65163582
Iteration 23, loss = 0.65138799
Iteration 24, loss = 0.65121222
Iteration 25, loss = 0.65107379
Iteration 26, loss = 0.65086259
Iteration 27, loss = 0.65086839
Iteration 28, loss = 0.65047017
Iteration 29, loss = 0.65038303
Iteration 30, loss = 0.65012708
Iteration 31, loss = 0.65002031
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 15), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=1, warm_start=False)

In [19]:
pred=nn.predict(X_test)

count = 0

for i in range(len(y_test)):
    if pred[i]==y_test[i]:
        count += 1

count/len(pred)

0.6135571745327842

## Other algorithmes

In [None]:
clf = svm.SVR(kernel="linear")
clf.fit(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

# Neural Network

In [None]:
# normalize??



In [None]:
# split into train, valid, test

# features capital x, and lowercase y is labels