## Classifier with autoencoder

#### This notebook contains classifier with 3 different medical targets
##### Target 1 : Mordibity classification
##### Target 2 : Clinical procedures classification
##### Target 3 : Destination classification

In [57]:
import pandas as pd
import numpy as np
import keras
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasClassifier

## Target 1 : Classify major clinical category
### Read the data and split into 3 datasets
##### 10,000 inpatients records with 12290 clinical binarized descriptors

In [4]:
x = pd.read_csv("data/BPPR-x1-5-x7-10-y1-2008.txt", sep=",", header=None)
y = pd.read_csv("data/labels_y1-x1-5-x7-10-y1-2008.txt", sep=",", header=None)

In [14]:
print("X contains {} columns for {} inpatients".format(x.shape[1], x.shape[0]))
print("Y contains {} categories".format(y.shape[1]))

X contains 12290 columns for 10000 inpatients
Y contains 20 categories


In [10]:
x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size=0.2)
x_test, x_valid, y_test, y_valid = train_test_split(x_test, y_test, test_size=0.5)
input_shape = (x.train.shape[1],)
num_classes = y_train.shape[1]

In [13]:
print("Train      :: {} inpatients".format(x_train.shape[0]))
print("Test       :: {} inpatients".format(x_test.shape[0]))
print("validation :: {} inpatients".format(x_valid.shape[0]))

Train      :: 8000 inpatients
Test       :: 1000 inpatients
validation :: 1000 inpatients


In [80]:
def create_model(dropout_rate=0.4,
                 neurons=input_shape[0],
                 init_mode='uniform',
                 optimizer='adam', 
                 learn_rate=0.01, 
                 activation='relu',
                 decay_rate=0.6):
    nodes = []
    fn = lambda x: int(x*decay_rate)
    while True:
        neurons = fn(neurons)
        if neurons <= 20:
            break
        else:
            nodes.append(neurons)
    print(nodes)
    inputs = keras.layers.Input(shape=input_shape)
    x = inputs
    
    for i in range(len(nodes)):
        x = keras.layers.Dense(nodes[i], kernel_initializer=init_mode, activation=activation)(x)
        x = keras.layers.Dropout(dropout_rate)(x)

    outputs = keras.layers.Dense(num_classes, kernel_initializer=init_mode, activation='softmax')(x)
    
    model = keras.models.Model(inputs=inputs, outputs=outputs)
    
    #compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [81]:
model = KerasClassifier(build_fn=create_model, verbose=2)

#### Hyperparameters

In [85]:
# define the grid search parameters
#batch_size = [8, 16, 32, 64, 128, 256, 512, 1024]
gridSearch_batch_Size = [2]

#epochs = [10, 50, 100,500,1000]
gridSearch_epochs = [30,50]

#optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
gridSearch_optimizer = ['Adam']

#learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
gridSearch_learn_rate = [0.01]

#activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
gridSearch_activation = ['relu']

#dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
gridSearch_dropout_rate = [0.4]

gridSearch_decay_rate = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

#gridSearch_layers = np.arange(1, 10)

param_grid = dict(batch_size=gridSearch_batch_Size
                  #,hidden_layers=gridSearch_layers
                  ,decay_rate=gridSearch_decay_rate
                  ,dropout_rate=gridSearch_dropout_rate
                  #,epochs=gridSearch_epochs
                  ,optimizer=gridSearch_optimizer 
                  ,learn_rate=gridSearch_learn_rate 
                  ,activation=gridSearch_activation)

init_mode='uniform'
momentum=0

In [86]:
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_jobs=-1, cv=3)

In [None]:
grid_result = grid.fit(x_train, y_train, verbose=1)

In [99]:
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
checkpoint= keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
model = create_model(decay_rate=0.4)

In [100]:
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 12290)             0         
_________________________________________________________________
dense_21 (Dense)             (None, 4916)              60422556  
_________________________________________________________________
dropout_18 (Dropout)         (None, 4916)              0         
_________________________________________________________________
dense_22 (Dense)             (None, 1966)              9666822   
_________________________________________________________________
dropout_19 (Dropout)         (None, 1966)              0         
_________________________________________________________________
dense_23 (Dense)             (None, 786)               1546062   
_________________________________________________________________
dropout_20 (Dropout)         (None, 786)               0   

In [None]:
model.fit(x_train, y_train, 
          batch_size=128, 
          epochs=50,
          callbacks=[es, checkpoint],
          validation_data=(x_test, y_test))

Train on 8000 samples, validate on 1000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
 640/8000 [=>............................] - ETA: 47s - loss: 0.0529 - accuracy: 0.9793