## Classifier with autoencoder

#### This notebook contains a basic classifier trained on 3 different medical targets
##### Target 1 : Morbidity classification
##### Target 2 : Clinical procedures classification
##### Target 3 : Destination classification

In [1]:
import pandas as pd
import numpy as np
import keras
from keras.models import Model
from keras.optimizers import RMSprop
from keras.layers import Input,Dense,Flatten,Dropout,merge,Reshape,Conv2D,MaxPooling2D,UpSampling2D,Conv2DTranspose
from keras.layers.normalization import BatchNormalization
from keras.models import Model,Sequential
from keras.optimizers import Adadelta, RMSprop,SGD,Adam
from keras import regularizers
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasClassifier
from matplotlib import pyplot as plt
plt.style.use('ggplot')

Using TensorFlow backend.


#### Convolutional Autoencoder

In [2]:
def encoder(inputs):
    """Build the convolutional encoder half of the autoencoder.

    Four 3x3, same-padded ReLU convolutions (32, 64, 128 and 256 filters)
    are stacked, each followed by batch normalisation.  Only the first two
    stages are followed by 2x2 max-pooling.

    inputs: Keras tensor to encode.
    Returns the batch-normalised output tensor of the 256-filter stage.
    """
    def conv_bn(tensor, filters):
        # 3x3 same-padded ReLU convolution followed by batch normalisation.
        tensor = Conv2D(filters, (3, 3), activation='relu', padding='same')(tensor)
        return BatchNormalization()(tensor)

    features = inputs
    # (filters, pool afterwards?) per stage.  Layers are created in the same
    # order as before because other cells address them by position.
    for filters, pool_after in ((32, True), (64, True), (128, False), (256, False)):
        features = conv_bn(features, filters)
        if pool_after:
            features = MaxPooling2D(pool_size=(2, 2))(features)
    return features

def decoder(conv4):
    """Build the convolutional decoder half of the autoencoder.

    Applies 128- and 64-filter conv+batch-norm stages, a 2x2 upsampling, a
    32-filter conv+batch-norm stage, a second 2x2 upsampling, and finally a
    single-filter 3x3 sigmoid convolution that produces the reconstruction.

    conv4: encoder output tensor to decode.
    Returns the single-channel sigmoid output tensor.
    """
    def conv_bn(tensor, filters):
        # 3x3 same-padded ReLU convolution followed by batch normalisation.
        tensor = Conv2D(filters, (3, 3), activation='relu', padding='same')(tensor)
        return BatchNormalization()(tensor)

    hidden = conv_bn(conv4, 128)
    hidden = conv_bn(hidden, 64)
    hidden = UpSampling2D((2, 2))(hidden)
    hidden = conv_bn(hidden, 32)
    hidden = UpSampling2D((2, 2))(hidden)
    # Sigmoid keeps every reconstructed value inside [0, 1].
    return Conv2D(1, (3, 3), activation='sigmoid', padding='same')(hidden)

In [3]:
def _hidden_layer_sizes(start, floor, decay_rate):
    """Return Dense widths start*r, start*r^2, ... (int-truncated) while they exceed ``floor``."""
    sizes = []
    width = int(start * decay_rate)
    while width > floor:
        sizes.append(width)
        width = int(width * decay_rate)
    return sizes


def create_classifier(inputs, autoencoder, layer=10, decay_rate=0.3, dropout_rate=0.5, n_classes=None):
    """Build a softmax classifier: a fresh encoder followed by a shrinking Dense head.

    A new encoder is built on ``inputs``; no weights are copied from
    ``autoencoder`` here (callers do that themselves).  ``autoencoder`` is
    only consulted for the output shape of ``autoencoder.layers[layer]``,
    whose element count seeds the width of the Dense stack.  Each hidden
    width is the previous one multiplied by ``decay_rate`` and truncated to
    int; layers are added while the width stays above ``2 * n_classes``.
    Every hidden Dense layer is followed by Dropout.

    Args:
        inputs: Keras input tensor the encoder is built on.
        autoencoder: model whose ``layers[layer].output_shape`` gives the
            starting neuron count.
        layer: index into ``autoencoder.layers``.
        decay_rate: multiplicative shrink factor between Dense layers;
            must be below 1 so the widths actually shrink.
        dropout_rate: rate passed to every Dropout layer.
        n_classes: size of the softmax output.  When omitted, the
            notebook-level ``num_classes`` variable is used, as before.

    Returns:
        An uncompiled Keras ``Model`` mapping ``inputs`` to class probabilities.

    Raises:
        ValueError: if ``decay_rate >= 1`` (the width loop would never end).
    """
    # Checked first: with decay_rate >= 1 the widths never drop below the
    # floor, so the original `while True` loop would spin forever.
    if decay_rate >= 1:
        raise ValueError("decay_rate must be < 1, got {}".format(decay_rate))
    if n_classes is None:
        # Backward-compatible fallback to the global set by the split cells.
        n_classes = num_classes

    encode = encoder(inputs)
    out = Flatten()(encode)

    # Element count of the chosen autoencoder layer, excluding the batch axis.
    neurons = int(np.prod(autoencoder.layers[layer].output_shape[1:]))

    for width in _hidden_layer_sizes(neurons, n_classes * 2, decay_rate):
        out = Dense(width, activation='relu')(out)
        out = Dropout(dropout_rate)(out)

    out = Dense(n_classes, activation='softmax')(out)

    return Model(inputs, out)

In [4]:
# Stop once val_loss has failed to improve for 5 consecutive epochs, and roll
# the model back to the weights of its best epoch.  Without
# restore_best_weights the in-memory model keeps the weights of the last
# epoch run, which is what later evaluate()/save_weights() calls would see.
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5,
                                   restore_best_weights=True)

batch_size = 128
# Upper bound only; early stopping is expected to end training well before.
epochs = 1000

## Target 1 : Classify major clinical category
### Read the data and split into 3 datasets
##### 10,000 inpatient records with 12290 binarized clinical descriptors

In [5]:
# Target 1: feature matrix (x1) and label matrix (y1), both read as
# header-less comma-separated text.
x1, y1 = (
    pd.read_csv(path, sep=",", header=None)
    for path in (
        "data/y1/BPPR-KMM_x1_x2_x3_x4_x5_x7_x83_x9_x10_Y1-2008.txt",
        "data/y1/labels_Y1-KMM_x1_x2_x3_x4_x5_x7_x83_x9_x10_Y1-2008.txt",
    )
)

In [6]:
# Report the size of the feature matrix and the number of label columns.
print(
    "X1 contains {} columns for {} inpatients".format(x1.shape[1], len(x1)),
    "Y1 contains {} categories".format(len(y1.columns)),
    sep="\n",
)

X1 contains 12290 columns for 10000 inpatients
Y1 contains 20 categories


In [7]:
# Every record is laid out as a single-channel x-by-y grid for the conv net.
x, y = 108, 108
inputs = Input(shape=(x, y, 1))

# Only the first x*y = 11664 descriptor columns fit the grid; the remaining
# columns (the file has 12290) are dropped.
# np.asarray accepts both the DataFrame from read_csv and the array produced
# by a previous run of this cell, so re-running it no longer fails on
# `.values`.
x1 = np.asarray(x1)[:, :x * y].reshape(-1, x, y, 1)
x1.shape

(10000, 108, 108, 1)

In [16]:
# Encoder and decoder chained on the shared input tensor; trained to
# reproduce its input under a per-pixel binary cross-entropy loss.
autoencoder = Model(inputs=inputs, outputs=decoder(encoder(inputs)))
autoencoder.compile(optimizer=RMSprop(), loss='binary_crossentropy')
autoencoder.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 108, 108, 1)       0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 108, 108, 32)      320       
_________________________________________________________________
batch_normalization_8 (Batch (None, 108, 108, 32)      128       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 54, 54, 32)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 54, 54, 64)        18496     
_________________________________________________________________
batch_normalization_9 (Batch (None, 54, 54, 64)        256       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 27, 27, 64)        0   

In [10]:
# An autoencoder's target is its own input, so a single split is enough:
# passing x1 twice made train_test_split build a second copy of both arrays.
# The fixed seed makes the 80/20 split reproducible across kernel restarts.
train_X, valid_X = train_test_split(x1, test_size=0.2, random_state=42)
train_ground, valid_ground = train_X, valid_X

In [None]:
# Keep on disk only the epoch with the lowest validation loss.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/autoencoder_Y1_best_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Reconstruction training; early stopping (`es`) bounds the run.
autoencoder_train = autoencoder.fit(
    x=train_X,
    y=train_ground,
    validation_data=(valid_X, valid_ground),
    callbacks=[es, checkpoint],
    epochs=epochs,
    batch_size=batch_size,
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000

In [8]:
# Replace the in-memory autoencoder with the checkpoint written during training.
autoencoder = keras.models.load_model(
    filepath='models/autoencoder_Y1_best_checkpoint.h5',
)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [6]:
# Store the weights alone (no architecture or optimizer state) for later reuse.
autoencoder.save_weights(
    filepath='models/autoencoder_Y1_weights.h5',
)

In [9]:
# 80% train, then the held-out 20% is halved into two 10% sets.  Fixed seeds
# make the partition reproducible across kernel restarts.
# NOTE: later cells pass (x_test, y_test) to fit() as validation_data and
# call evaluate() on (x_valid, y_valid).
x_train, x_test, y_train, y_test = train_test_split(x1, y1.values, test_size=0.2, random_state=42)
x_test, x_valid, y_test, y_valid = train_test_split(x_test, y_test, test_size=0.5, random_state=42)
# Length of the second axis only; kept for compatibility.
input_shape = (x_train.shape[1],)
# One label column per class; read by create_classifier.
num_classes = y_train.shape[1]

In [13]:
# Number of records in each of the three partitions.
print(
    "Train      :: {} inpatients".format(len(x_train)),
    "Test       :: {} inpatients".format(len(x_test)),
    "validation :: {} inpatients".format(len(x_valid)),
    sep="\n",
)

Train      :: 8000 inpatients
Test       :: 1000 inpatients
validation :: 1000 inpatients


#### Create a classifier initialised with the pretrained encoder weights, frozen (not trainable)

In [14]:
# Index of the last encoder layer inside the autoencoder.
enco_layers = 10
# Builds a classifier with a freshly initialised encoder: this cell does not
# copy any weights from the autoencoder.
classifier = create_classifier(inputs, autoencoder, layer=enco_layers)
classifier.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)
classifier.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 108, 108, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 108, 108, 32)      320       
_________________________________________________________________
batch_normalization_1 (Batch (None, 108, 108, 32)      128       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 54, 54, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 54, 54, 64)        18496     
_________________________________________________________________
batch_normalization_2 (Batch (None, 54, 54, 64)        256       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 27, 27, 64)        0   

In [11]:
# Index of the last encoder layer inside the autoencoder.
enco_layers = 10
classifier = create_classifier(inputs, autoencoder, layer=enco_layers)

# Layers 0..enco_layers are paired position by position: each classifier
# layer takes the matching autoencoder layer's weights and is then frozen.
for target_layer, source_layer in zip(classifier.layers[:enco_layers + 1],
                                      autoencoder.layers[:enco_layers + 1]):
    target_layer.set_weights(source_layer.get_weights())
    target_layer.trainable = False

classifier.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)
classifier.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 108, 108, 1)       0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 108, 108, 32)      320       
_________________________________________________________________
batch_normalization_5 (Batch (None, 108, 108, 32)      128       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 54, 54, 32)        0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 54, 54, 64)        18496     
_________________________________________________________________
batch_normalization_6 (Batch (None, 54, 54, 64)        256       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 27, 27, 64)        0   

In [None]:
1  # leftover scratch cell: only displays the literal 1 and assigns nothing

In [None]:
# Keep on disk only the epoch with the lowest validation loss.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/autoencoder_classifier_Y1_best_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# First training run; (x_test, y_test) is the set monitored by early stopping.
history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    callbacks=[es, checkpoint],
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Mark the first enco_layers + 1 layers as trainable, then recompile so the
# change is picked up by the next fit().
for encoder_layer in classifier.layers[:enco_layers + 1]:
    encoder_layer.trainable = True

classifier.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [None]:
# NOTE(review): a new ModelCheckpoint is created for the same file as the
# previous run; presumably its best-so-far tracking starts over, so the
# earlier checkpoint can be overwritten -- confirm.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/autoencoder_classifier_Y1_best_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Second training run on the same classifier; `history` is overwritten and
# from here on describes this run only.
history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    callbacks=[es, checkpoint],
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Two side-by-side panels (accuracy, loss), each showing the training curve
# and the validation curve of the most recent fit().
plt.figure(figsize=(15, 5))
for panel, metric in enumerate(('accuracy', 'loss'), start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['training', 'validation'], loc='best')
plt.show()

In [None]:
# Final score on (x_valid, y_valid), the partition never passed to fit().
loss, accuracy = classifier.evaluate(x=x_valid, y=y_valid)
print(
    "Validation loss :: {}".format(loss),
    "Validation acc  :: {}".format(accuracy),
    sep="\n",
)

## Target 2 : Clinical procedures classification
### Read the data and split into 3 datasets
##### 10,000 inpatient records with 15384 binarized clinical descriptors

In [8]:
# Target 2: feature matrix (x2) and label matrix (y2), both read as
# header-less comma-separated text.
x2, y2 = (
    pd.read_csv(path, sep=",", header=None)
    for path in (
        "data/y2/BPPR-pre-trained-2008.txt",
        "data/y2/labels_Y2-pre-trained-2008.txt",
    )
)

In [None]:
# Report the size of the feature matrix and the number of label columns.
print(
    "X2 contains {} columns for {} inpatients".format(x2.shape[1], len(x2)),
    "Y2 contains {} categories".format(len(y2.columns)),
    sep="\n",
)

In [5]:
# Side lengths of the square, single-channel grid each record is reshaped to.
x = y = 124
inputs = Input(shape=(x, y, 1))

In [6]:
# Encoder and decoder chained on the shared input tensor; trained to
# reproduce its input under a per-pixel binary cross-entropy loss.
autoencoder = Model(inputs=inputs, outputs=decoder(encoder(inputs)))
autoencoder.compile(optimizer=RMSprop(), loss='binary_crossentropy')
autoencoder.summary()


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 124, 124, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 124, 124, 32)      320       
_________________________________________________________________
batch_normalization_1 (Batch (None, 124, 124, 32)      128       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 62, 62, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 62, 62, 64)        18496     
_________________________________________________________________
batch_normalization_2 (Batch (None, 62, 62, 64)        256       
______________________________

In [11]:
# Only the first x*y descriptor columns fit the grid; any columns beyond
# that are dropped (the section header reports 15384 descriptors against
# 124*124 = 15376 kept -- verify).
# np.asarray accepts both the DataFrame from read_csv and the array produced
# by a previous run of this cell, so re-running it no longer fails on
# `.values`.
x2 = np.asarray(x2)[:, :x * y].reshape(-1, x, y, 1)
x2.shape

(10000, 124, 124, 1)

In [None]:
# An autoencoder's target is its own input, so a single split is enough:
# passing x2 twice made train_test_split build a second copy of both arrays.
# The fixed seed makes the 80/20 split reproducible across kernel restarts.
train_X, valid_X = train_test_split(x2, test_size=0.2, random_state=42)
train_ground, valid_ground = train_X, valid_X

In [None]:
# Keep on disk only the epoch with the lowest validation loss.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/autoencoder_Y2_best_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Reconstruction training; early stopping (`es`) bounds the run.
autoencoder_train = autoencoder.fit(
    x=train_X,
    y=train_ground,
    validation_data=(valid_X, valid_ground),
    callbacks=[es, checkpoint],
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Store the weights alone (no architecture or optimizer state) for later reuse.
autoencoder.save_weights(
    filepath='models/autoencoder_y2_weights.h5',
)

In [12]:
# 80% train, then the held-out 20% is halved into two 10% sets.  Fixed seeds
# make the partition reproducible across kernel restarts.
# NOTE: later cells pass (x_test, y_test) to fit() as validation_data and
# call evaluate() on (x_valid, y_valid).
x_train, x_test, y_train, y_test = train_test_split(x2, y2.values, test_size=0.2, random_state=42)
x_test, x_valid, y_test, y_valid = train_test_split(x_test, y_test, test_size=0.5, random_state=42)
# Length of the second axis only; kept for compatibility.
input_shape = (x_train.shape[1],)
# One label column per class; read by create_classifier.
num_classes = y_train.shape[1]

#### Create a classifier initialised with the pretrained encoder weights, frozen (not trainable)

In [14]:
# Index of the last encoder layer inside the autoencoder.
enco_layers = 10
classifier = create_classifier(inputs, autoencoder, enco_layers)

# Re-enabled: the section heading and the later "trainable = True" cell both
# assume the encoder starts from the autoencoder's weights and is frozen for
# the first training run, exactly as done for Target 1.  With this step
# commented out the autoencoder training had no effect on the classifier.
for l1, l2 in zip(classifier.layers[:enco_layers+1], autoencoder.layers[:enco_layers+1]):
    l1.set_weights(l2.get_weights())
    l1.trainable = False

classifier.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(),metrics=['accuracy'])
classifier.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 124, 124, 1)       0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 124, 124, 32)      320       
_________________________________________________________________
batch_normalization_20 (Batc (None, 124, 124, 32)      128       
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 62, 62, 32)        0         
_________________________________________________________________
conv2d_22 (Conv2D)           (None, 62, 62, 64)        18496     
_________________________________________________________________
batch_normalization_21 (Batc (None, 62, 62, 64)        256       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 31, 31, 64)        0   

In [None]:
# Keep on disk only the epoch with the lowest validation loss.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/autoencoder_classifier_Y2_best_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# First training run; (x_test, y_test) is the set monitored by early stopping.
history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    callbacks=[es, checkpoint],
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Mark the first enco_layers + 1 layers as trainable, then recompile so the
# change is picked up by the next fit().
for encoder_layer in classifier.layers[:enco_layers + 1]:
    encoder_layer.trainable = True

classifier.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [None]:
# NOTE(review): a new ModelCheckpoint is created for the same file as the
# previous run; presumably its best-so-far tracking starts over, so the
# earlier checkpoint can be overwritten -- confirm.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/autoencoder_classifier_Y2_best_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Second training run on the same classifier; `history` is overwritten and
# from here on describes this run only.
history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    callbacks=[es, checkpoint],
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Two side-by-side panels (accuracy, loss), each showing the training curve
# and the validation curve of the most recent fit().
plt.figure(figsize=(15, 5))
for panel, metric in enumerate(('accuracy', 'loss'), start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['training', 'validation'], loc='best')
plt.show()

In [None]:
# Final score on (x_valid, y_valid), the partition never passed to fit().
loss, accuracy = classifier.evaluate(x=x_valid, y=y_valid)
print(
    "Validation loss :: {}".format(loss),
    "Validation acc  :: {}".format(accuracy),
    sep="\n",
)

## Target 3 : Destination classification
### Read the data and split into 3 datasets
##### 10,000 inpatient records with 12285 binarized clinical descriptors

In [15]:
# Target 3: feature matrix (x3) and label matrix (y3), both read as
# header-less comma-separated text.
x3, y3 = (
    pd.read_csv(path, sep=",", header=None)
    for path in (
        "data/y3/BPPR-KMM_x1_x2_x3_x4_x5_x7_x83_x9_x10_Y3-2008.txt",
        "data/y3/labels_Y3-KMM_x1_x2_x3_x4_x5_x7_x83_x9_x10_Y3-2008.txt",
    )
)

In [16]:
# Report the size of the feature matrix and the number of label columns.
print(
    "X3 contains {} columns for {} inpatients".format(x3.shape[1], len(x3)),
    "Y3 contains {} categories".format(len(y3.columns)),
    sep="\n",
)

X3 contains 12285 columns for 10000 inpatients
Y3 contains 5 categories


In [17]:
# Side lengths of the square, single-channel grid each record is reshaped to.
x = y = 108
inputs = Input(shape=(x, y, 1))

In [18]:
# Encoder and decoder chained on the shared input tensor; trained to
# reproduce its input under a per-pixel binary cross-entropy loss.
autoencoder = Model(inputs=inputs, outputs=decoder(encoder(inputs)))
autoencoder.compile(optimizer=RMSprop(), loss='binary_crossentropy')
autoencoder.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 108, 108, 1)       0         
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 108, 108, 32)      320       
_________________________________________________________________
batch_normalization_24 (Batc (None, 108, 108, 32)      128       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 54, 54, 32)        0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 54, 54, 64)        18496     
_________________________________________________________________
batch_normalization_25 (Batc (None, 54, 54, 64)        256       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 27, 27, 64)        0   

In [19]:
# Only the first x*y = 11664 descriptor columns fit the grid; the remaining
# columns (the file has 12285) are dropped.
# np.asarray accepts both the DataFrame from read_csv and the array produced
# by a previous run of this cell, so re-running it no longer fails on
# `.values`.
x3 = np.asarray(x3)[:, :x * y].reshape(-1, x, y, 1)
x3.shape

(10000, 108, 108, 1)

In [None]:
# An autoencoder's target is its own input, so a single split is enough:
# passing x3 twice made train_test_split build a second copy of both arrays.
# The fixed seed makes the 80/20 split reproducible across kernel restarts.
train_X, valid_X = train_test_split(x3, test_size=0.2, random_state=42)
train_ground, valid_ground = train_X, valid_X

In [None]:
# Keep on disk only the epoch with the lowest validation loss.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/autoencoder_Y3_best_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Reconstruction training; early stopping (`es`) bounds the run.
autoencoder_train = autoencoder.fit(
    x=train_X,
    y=train_ground,
    validation_data=(valid_X, valid_ground),
    callbacks=[es, checkpoint],
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Store the weights alone (no architecture or optimizer state) for later reuse.
autoencoder.save_weights(
    filepath='models/autoencoder_y3_weights.h5',
)

In [20]:
# 80% train, then the held-out 20% is halved into two 10% sets.  Fixed seeds
# make the partition reproducible across kernel restarts.
# NOTE: later cells pass (x_test, y_test) to fit() as validation_data and
# call evaluate() on (x_valid, y_valid).
x_train, x_test, y_train, y_test = train_test_split(x3, y3.values, test_size=0.2, random_state=42)
x_test, x_valid, y_test, y_valid = train_test_split(x_test, y_test, test_size=0.5, random_state=42)
# Length of the second axis only; kept for compatibility.
input_shape = (x_train.shape[1],)
# One label column per class; read by create_classifier.
num_classes = y_train.shape[1]

#### Create a classifier initialised with the pretrained encoder weights, frozen (not trainable)

In [22]:
# Index of the last encoder layer inside the autoencoder.
enco_layers = 10
classifier = create_classifier(inputs, autoencoder, enco_layers)

# Re-enabled: the section heading and the later "trainable = True" cell both
# assume the encoder starts from the autoencoder's weights and is frozen for
# the first training run, exactly as done for Target 1.  With this step
# commented out the autoencoder training had no effect on the classifier.
for l1, l2 in zip(classifier.layers[:enco_layers+1], autoencoder.layers[:enco_layers+1]):
    l1.set_weights(l2.get_weights())
    l1.trainable = False

classifier.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(),metrics=['accuracy'])
classifier.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 108, 108, 1)       0         
_________________________________________________________________
conv2d_37 (Conv2D)           (None, 108, 108, 32)      320       
_________________________________________________________________
batch_normalization_35 (Batc (None, 108, 108, 32)      128       
_________________________________________________________________
max_pooling2d_15 (MaxPooling (None, 54, 54, 32)        0         
_________________________________________________________________
conv2d_38 (Conv2D)           (None, 54, 54, 64)        18496     
_________________________________________________________________
batch_normalization_36 (Batc (None, 54, 54, 64)        256       
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 27, 27, 64)        0   

In [None]:
# Keep on disk only the epoch with the lowest validation loss.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/autoencoder_classifier_Y3_best_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# First training run; (x_test, y_test) is the set monitored by early stopping.
history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    callbacks=[es, checkpoint],
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Mark the first enco_layers + 1 layers as trainable, then recompile so the
# change is picked up by the next fit().
for encoder_layer in classifier.layers[:enco_layers + 1]:
    encoder_layer.trainable = True

classifier.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [None]:
# NOTE(review): a new ModelCheckpoint is created for the same file as the
# previous run; presumably its best-so-far tracking starts over, so the
# earlier checkpoint can be overwritten -- confirm.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/autoencoder_classifier_Y3_best_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Second training run on the same classifier; `history` is overwritten and
# from here on describes this run only.
history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    callbacks=[es, checkpoint],
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Two side-by-side panels (accuracy, loss), each showing the training curve
# and the validation curve of the most recent fit().
plt.figure(figsize=(15, 5))
for panel, metric in enumerate(('accuracy', 'loss'), start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['training', 'validation'], loc='best')
plt.show()

In [None]:
# Final score on (x_valid, y_valid), the partition never passed to fit().
loss, accuracy = classifier.evaluate(x=x_valid, y=y_valid)
print(
    "Validation loss :: {}".format(loss),
    "Validation acc  :: {}".format(accuracy),
    sep="\n",
)