In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sklearn
from sklearn import preprocessing, svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

from tqdm import tqdm
# Register tqdm's pandas integration; this is what makes the
# `progress_map` call further down in the notebook available.
tqdm.pandas()

# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# NEW CAT VAL

In [None]:
# read in new cat values
# Join the diagnosis rows onto the ICU stays (left join on HADM_ID), then
# drop the join key and every row that still has a missing value.
df = pd.read_csv('mimic-iii-clinical-database-1.4/DIAG_ROWS.csv')
df_loc = pd.read_csv('mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz')[['HADM_ID', 'LOS', 'ICUSTAY_ID']]

df = (
    df_loc
    .merge(df, on='HADM_ID', how='left')
    .drop(columns='HADM_ID')
    .dropna()
)

df

In [None]:
# Categorical columns used together with the new diagnosis rows.
cat_values = [
    'DISEASES AND INJURIES',
    'SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS',
    'SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING',
    'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES',
]

# Name of the column that is predicted.
target = 'LOS'

In [None]:
# encoding
# Read ENC.csv (presumably the encoded categorical features -- confirm) and
# attach LOS / ICUSTAY_ID from the ICU stays via a left join on HADM_ID.
df = pd.read_csv('ENC.csv')
df_loc = pd.read_csv('mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz')
df_loc = df_loc.loc[:, ['HADM_ID', 'LOS', 'ICUSTAY_ID']]

# The join key is no longer needed after the merge; incomplete rows are removed.
df = (
    pd.merge(df_loc, df, how='left', on='HADM_ID')
    .drop(columns=['HADM_ID'])
    .dropna(axis=0)
)

df

# OLD FEATURES

In [None]:
# read in features
# Load the feature table and keep only rows without missing values.
df = pd.read_csv('mimic-iii-clinical-database-1.4/FEATURES.csv').dropna()

df.head(n=5)

# Categorical values

Convert the categorical values into one-hot-encoded vectors.

In [None]:
# Columns that are one-hot encoded further down.
cat_values = [
    'GENDER',
    'DISEASES AND INJURIES',
    'PROCEDURES',
    'SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING',
    'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES',
]

# Continuous (numeric) feature columns.
cont_values = [
    'HeartRate',
    'sysBP',
    'diasBP',
    'oxygen_met',
    'respRate',
    'gluc',
    'AGE',
]

# Name of the column that is predicted.
target = 'LOS'

In [None]:
# ---- Only categorical
# Remove every continuous column so that only the categorical features
# (plus whatever other columns the frame carries) remain.
df = df.drop(columns=cont_values)

df

###  -- continue

In [None]:
# One-hot encode each categorical column and append the indicator columns.
# (Unused alternative that was tried: integer codes, e.g.
#  df['ADMISSION_TYPE'] = df['ADMISSION_TYPE'].astype('category').cat.codes)

for column in tqdm(cat_values):
    # The prefix already ends in "_" and get_dummies adds its own separator,
    # so the new columns are named "<column>__<value>".
    indicator_cols = pd.get_dummies(df[column], prefix=f"{column}_")
    df = df.join(indicator_cols)

In [None]:
# The original categorical columns are replaced by their indicator columns,
# and the ICU stay id becomes the row index instead of a feature.
df = df.drop(columns=cat_values).set_index('ICUSTAY_ID')

In [None]:
# Number of columns after encoding.
df.shape[1]

## Modify target

In [None]:
# Keep only the stays whose LOS is strictly below 5.
df = df.loc[df['LOS'] < 5]

In [None]:
def maplos(x, short_stay_limit=1.0, max_los=200.0):
    """Map a length-of-stay value to a binary class label.

    Parameters
    ----------
    x : float
        Length of stay (the LOS column; presumably fractional days -- confirm).
    short_stay_limit : float, default 1.0
        Values strictly below this limit are labelled 0.
    max_los : float, default 200.0
        Exclusive upper bound of the accepted range; values in
        ``[short_stay_limit, max_los)`` are labelled 1.

    Returns
    -------
    int
        0 for a short stay, 1 for a long stay.

    Raises
    ------
    ValueError
        If ``x`` is NaN or not below ``max_los``.  The function used to fall
        through and return None in that case, which silently turned into a
        missing or garbage label once the column was cast to int.
    """
    if x < short_stay_limit:  # short stay
        return 0
    if x < max_los:  # long stay
        return 1
    # NaN fails both comparisons above and ends up here as well.
    raise ValueError(
        "length of stay %r is outside the supported range (< %r)" % (x, max_los)
    )

# Replace the raw LOS values by their class label (with a progress bar).
df['LOS'] = df['LOS'].progress_map(maplos)

In [None]:
# Quick look at the frame after the target has been relabelled.
df.head(n=5)

# shuffle

In [None]:
# shuffle

# Shuffle the rows with a fixed seed so that "Restart Kernel -> Run All"
# reproduces the same row order (and therefore the same results).
# pandas alternative: df.sample(frac=1, random_state=...)
df = sklearn.utils.shuffle(df, random_state=42)

df.head()

# split features and targets

In [None]:
# Separate the feature columns (X) from the label column (y).
X = df.drop(columns=[target]).values
y = df[target].values

In [None]:
# Class labels must be integers for the classifier.
y = y.astype(int)

y

# Preprocessing

In [None]:
# Only do this for the numerical variables.
# X is a plain NumPy array at this point (it was built with `.values`), so the
# continuous columns have to be addressed by position rather than by name.
# The column order of X is the column order of `df` without the target.
feature_names = df.drop(target, axis=1).columns
cont_idx = [pos for pos, name in enumerate(feature_names) if name in cont_values]

# When the continuous columns were dropped earlier (categorical-only runs)
# there is nothing to scale and X is left untouched.
if cont_idx:
    # Work on floats so the standardised values are not truncated on assignment.
    X = X.astype('float64', copy=False)
    X[:, cont_idx] = preprocessing.scale(X[:, cont_idx])

X[:5]

In [None]:
%%time
## alle
# 33 min
X = pd.DataFrame(preprocessing.scale(X))

X.head()

In [None]:
# Make sure X is a NumPy array.  np.asarray works both when X is the
# DataFrame produced by the scaling cell and when that cell was skipped and
# X is already an array (where `.values` would raise AttributeError).
X = np.asarray(X)
X

# Split data into training and testing

In [None]:
%%time
# 17,38 min
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# (rows, columns) of the training split.
np.shape(X_train)

## Results

* age all
* ICD9_CODES: proc, dis, fact, ext


1. Categorical and continuous

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| <10  | <1.0, <5.0, <200.0        | 0.4       | CONT  | LOGI | SGD    | 10,15  | 0.67 |    Default    |
| <10  | <1.0, <3.0, <200.0        | 0.4       | CONT  | LOGI | SGD    | 10,15  | 0.45 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CONT  | LOGI | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | ALL   | LOGI | SGD    | 10,15  | 0.83 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 40,90  | 0.73 |    Default    |


2. Continuous (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CONT  | LOGI | SGD    | 10,15  | 0.72 |    Default    |


3. Categorical (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | LOGI | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.84 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 40,90  | 0.87 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CONT  | RELU | SGD    | 10,15  | 0.81 |    Default    |
| <5   | NONE                      | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.59 |    Default    |

## Post new values

* age none
* ICD9_CODES: symp, dis, fact, ext - repeated for first combination


3. Categorical (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | LOGI | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.79 |    Default    |

4. Encoded Categorical (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CAT   | RELU | SGD    | 10,15  | 0.72 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | CAT   | LOGI | SGD    | 10,15  | 0.72 |    Default    |

## Post supervisor explanation - change to max 100 iterations

* age none
* ICD9_CODES: symp, dis, fact, ext - all combinations for each diagnosis


3. Categorical (only)

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.80 |    Default    |

3. Categorical (only) [Encoded]

|LOS   | MAPPING                   | TEST_SIZE | SCALE | FUNC | SOLVER | HIDDEN | RES  | Learning Rate |
|------|---------------------------|-----------|-------|------|--------|--------|------|---------------|
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | LOGI | SGD    | 10,15  | 0.71 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | SGD    | 10,15  | 0.71 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    Default    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.001      |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.01       |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.1        |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    1.0        |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    2.0        |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.00001    |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | RELU | ADAM   | 10,15  | 0.71 |    0.000001   |
| ALL  | <1.0, <10.0, <200.0       | 0.4       | NONE  | LOGI | SGD    | 10,15  | 0.71 |    0.01       |

# NN

In [None]:
# MLPClassifier options that were tried (timings are the author's notes).
#
# activation
#   - 'logistic' (sigmoid)
#   - 'relu'
#   - 'tanh'
#
# solver
#   - 'sgd'  : stochastic gradient descent
#   - 'adam'
#
# hidden_layer_sizes
#   - (45, 90)      takes some time -- 16 min
#   - (10, 15)      less time       -- 13 min
#   - (100, 150)                    -- 48 min
#   - (1000, 1500)                  -- 2 h 4 min
#
# Candidate grid for a hyper-parameter search (not used in this cell):
# parameter_space = {
#     'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
#     'activation': ['tanh', 'relu'],
#     'solver': ['sgd', 'adam'],
#     'alpha': [0.0001, 0.05],
#     'learning_rate': ['constant','adaptive'],
# }
#
# Earlier configuration, kept for reference:
# nn = MLPClassifier(activation='relu', solver='adam', verbose=1, max_iter=100, alpha=0.001)

# Current configuration: two hidden layers (10 and 15 units), all other
# settings left at the library defaults, progress printed while training.
nn = MLPClassifier(hidden_layer_sizes=(10, 15), verbose=1)

In [None]:
%%time
# Train the network on the training split.
# Wall times noted by the author on earlier runs: 1h 14, 21 min
nn.fit(X_train, y_train)

In [None]:
# Accuracy on the held-out split: share of predictions equal to the label.
pred = nn.predict(X_test)

# Vectorised element-wise comparison instead of a Python loop over all samples.
count = int(np.sum(pred == y_test))

count / len(pred)

## Other algorithms

In [None]:
# The target is an integer class label, so a support-vector *classifier* is
# fitted here.  The regressor (svm.SVR) used before reports R^2 from `score`,
# which cannot be compared with the accuracy measured for the neural network.
clf = svm.SVC(kernel="linear")
clf.fit(X_train, y_train)

In [None]:
# Evaluate the fitted SVM on the held-out split with the estimator's own
# `score` metric.
clf.score(X=X_test, y=y_test)

# Neural Network

In [None]:
# normalize??



In [None]:
# split into train, valid, test

# convention: capital X holds the features, lowercase y holds the labels