In [None]:
import os
import cv2
import random
import numpy as np
from collections import Counter
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras import Sequential, Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import Activation, Flatten, Dropout, BatchNormalization, Conv2D
from tensorflow.keras.layers import AveragePooling2D, Dense, Input, Lambda, concatenate
from tensorflow.keras.optimizers import RMSprop, Adam

In [None]:
# Hold out whole folders for the test set. Folder names are parsed as
# "<name>+<count>"; the variable names suggest one folder per patient and
# <count> being that patient's number of lesions (NOTE(review): confirm).
TEST_LESION_TARGET = 62  # how many items (summed folder counts) to reserve for testing
SPLIT_SEED = 0           # fixed seed so the hold-out set is identical after a kernel restart

folders = sorted(os.listdir("data_organized"))
# A private Random instance keeps the split reproducible without touching
# the global `random` state used elsewhere.
random.Random(SPLIT_SEED).shuffle(folders)

test_patients = []
test_count = 0
for name in folders:
    _, count = name.split('+')
    count = int(count)
    # Greedy fill: take a folder only if it still fits under the target,
    # otherwise skip it and keep looking for a smaller one.
    if test_count + count <= TEST_LESION_TARGET:
        test_patients.append(name)
        test_count += count
        if test_count == TEST_LESION_TARGET:
            break

# Everything that was not held out stays in the train/validation pool,
# in the same (shuffled) order as `folders`.
held_out = set(test_patients)
train_val_patients = [name for name in folders if name not in held_out]

print(test_count)

In [None]:
# Read in files from folders, process images, T2 size, zone info, labels
# clinsig TRUE = 1, FALSE = 0
# File stems are split on '+' into: patient, fid, adc/t2, size, zone, label.

X_img_t2_clinsig = []
X_img_t2_indolent = []
y_t2_clinsig = []
y_t2_indolent = []

X_img_adc_clinsig = []
X_img_adc_indolent = []
y_adc_clinsig = []
y_adc_indolent = []

# Size and zone are recorded once per T2 file, separately per class, so they
# can be concatenated in the same clinsig-then-indolent order in which the
# image lists are joined later in the notebook.
size_t2_clinsig = []
size_t2_indolent = []
zone_clinsig = []
zone_indolent = []

for folder in train_val_patients:
    for filename in sorted(os.listdir("data_organized/{}".format(folder))):
        root, _ = os.path.splitext(filename)
        patient, fid, adc_t2, size, zone, label = root.split('+')
        if zone == 'SV': # only 2 data points have this zone
            continue
        # Context manager closes the underlying file as soon as the pixels are read.
        with Image.open("data_organized/{}/{}".format(folder, filename)) as img:
            array_gray = np.asarray(img.resize((32, 32)))
        # Replicate the single channel three times to get a 3-channel image.
        array = cv2.merge((array_gray,array_gray,array_gray))
        if adc_t2 == 't2':
            if label == 'TRUE':
                X_img_t2_clinsig.append(array)
                y_t2_clinsig.append(1)
                # assumes the size field is a plain number — TODO confirm
                size_t2_clinsig.append(float(size))
                zone_clinsig.append(zone)
            else:
                X_img_t2_indolent.append(array)
                y_t2_indolent.append(0)
                size_t2_indolent.append(float(size))
                zone_indolent.append(zone)
        else:
            if label == 'TRUE':
                X_img_adc_clinsig.append(array)
                y_adc_clinsig.append(1)
            else:
                X_img_adc_indolent.append(array)
                y_adc_indolent.append(0)

# Previously these two lists were never filled, so the transformers below
# were fit on empty input and raised.
X_size_t2 = size_t2_clinsig + size_t2_indolent
X_zone = zone_clinsig + zone_indolent

# one-hot encode zones, which are categorical features: AS=100, PZ=010, TZ=001
X_zone = LabelBinarizer().fit_transform(X_zone)

# [0,1] min-max T2 size, which are continuous features
X_size_t2 = MinMaxScaler().fit_transform(np.array(X_size_t2).reshape(-1,1))

In [None]:
# Generate images of minority group
AUG_COPIES = 2  # augmented copies produced per clinically significant image
AUG_SEED = 0    # base seed so the augmentation is reproducible

# Start from the original images/labels, clinsig first, then indolent.
X_t2 = list(X_img_t2_clinsig) + list(X_img_t2_indolent)
y_t2 = list(y_t2_clinsig) + list(y_t2_indolent)

X_adc = list(X_img_adc_clinsig) + list(X_img_adc_indolent)
y_adc = list(y_adc_clinsig) + list(y_adc_indolent)

datagen = ImageDataGenerator(shear_range=0.05,
                            rotation_range=90,
                            horizontal_flip=True,
                            vertical_flip=True,
                            fill_mode='nearest')


def augment_images(images, copies=AUG_COPIES, base_seed=AUG_SEED):
    """Return `copies` randomly transformed versions of every image in `images`.

    The seed for image `idx` depends only on `base_seed`, `idx` and `copies`,
    so calling this once for the T2 images and once for the ADC images gives
    the idx-th image of both modalities the same random transforms.
    NOTE(review): this assumes the idx-th T2 and idx-th ADC entries belong to
    the same lesion, which the rest of the notebook also relies on.

    Args:
        images: array of images, indexed along the first axis.
        copies: number of augmented images to draw per input image.
        base_seed: integer offset for the per-image seeds.

    Returns:
        List of augmented images, `copies` consecutive entries per input image.
    """
    augmented = []
    for idx, img in enumerate(images):
        batch = img.reshape((1,) + img.shape)
        # The iterator seeds each batch with seed + batches_seen, hence the
        # stride of `copies` to keep neighbouring images from sharing a seed.
        flow = datagen.flow(batch, batch_size=1, seed=base_seed + idx * copies)
        for _ in range(copies):
            augmented.append(next(flow)[0])
    return augmented


X_t2_clinsig = np.array(X_img_t2_clinsig)
X_adc_clinsig = np.array(X_img_adc_clinsig)

t2_augmented = augment_images(X_t2_clinsig)
X_t2.extend(t2_augmented)
y_t2.extend([1] * len(t2_augmented))

adc_augmented = augment_images(X_adc_clinsig)
X_adc.extend(adc_augmented)
y_adc.extend([1] * len(adc_augmented))

X_t2 = np.array(X_t2)
X_adc = np.array(X_adc)
y_t2 = np.array(y_t2)
y_adc = np.array(y_adc)

print(Counter(y_t2), Counter(y_adc))
print(len(X_t2), len(X_adc))

In [None]:
# Freeze everything into numpy arrays; labels are taken from the T2 list.
X_img_t2 = np.array(X_t2)
X_img_adc = np.array(X_adc)
X_size_t2 = np.array(X_size_t2)
X_zone = np.array(X_zone)
y = np.array(y_t2)

class_counts = Counter(y)
print('Original: indolent={}, clinsig={}'.format(class_counts[0], class_counts[1]))

# One line per array so mismatched lengths are easy to spot.
for template, arr in (
    ('X_img_t2.shape={}', X_img_t2),
    ('X_img_adc.shape={}', X_img_adc),
    ('X_size_t2.shape={}', X_size_t2),
    ('X_zone.shape={}', X_zone),
    ('y.shape={}', y),
):
    print(template.format(arr.shape))

In [None]:
# split this into training and validation set
# Only the two image arrays are split. X_size_t2 / X_zone are deliberately
# left out: zip() stops at its shortest input, so including arrays that do
# not have one row per (augmented) image silently drops samples and leaves
# X shorter than y. The model built below only takes the two image inputs.
assert len(X_img_t2) == len(X_img_adc) == len(y), \
    "T2 images, ADC images and labels must have the same length"

X = list(zip(X_img_t2, X_img_adc))
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=2)
X_img_t2_train, X_img_adc_train = zip(*X_train)
X_img_t2_val, X_img_adc_val = zip(*X_val)

# NOTE(review): augmented copies were added before this split, so copies of
# one training image can end up in the validation set.
ntrain = len(X_img_t2_train) # same for adc
nval = len(X_img_t2_val) # same for adc
print("ntrain={}, nval={}".format(ntrain, nval))

In [None]:
def create_cnn(width=32, height=32, depth=3):
    """Build one image branch: a frozen ResNet50 base followed by a conv stack.

    Args:
        width: input image width in pixels.
        height: input image height in pixels.
        depth: number of input channels.
            NOTE(review): the Keras docs state that the ImageNet weights
            need exactly 3 channels, so other values are expected to fail.

    Returns:
        A Model mapping a (height, width, depth) image to the feature map
        produced by the final average-pooling layer.
    """
    chanDim = 3  # channel axis of the (batch, h, w, c) tensors
    # The shape arguments were previously ignored in favour of a hard-coded (32, 32, 3).
    input_tensor = Input(shape=(height, width, depth))
    conv_base = ResNet50(input_tensor=input_tensor, weights='imagenet', include_top=False)
    conv_base.trainable = False  # only the layers added below are trained
    x = conv_base.output

    x = Conv2D(32, (3, 3), padding="same")(x)
    x = BatchNormalization(axis=chanDim)(x)
    x = Activation("relu")(x)

    # Nine repeats of conv-BN-ReLU-conv-BN (no activation after the second BN).
    for _ in range(9):
        x = Conv2D(32, (3, 3), padding="same")(x)
        x = BatchNormalization(axis=chanDim)(x)
        x = Activation("relu")(x)
        x = Conv2D(32, (3, 3), padding="same")(x)
        x = BatchNormalization(axis=chanDim)(x)

    x = AveragePooling2D(pool_size=(3,3), padding="same")(x)

    model = Model(input_tensor, x)
    return model

def output_size():
    """Build a pass-through model for a single scalar feature.

    Returns:
        A Model whose (1,)-shaped input is returned unchanged by an identity
        Lambda layer. Previously the model was built but never returned.
    """
    inp = Input(shape=(1,))  # (1,) is a tuple; the former (1) was just the int 1
    out = Lambda(lambda x: x, output_shape=(1,))(inp)
    return Model(inp, out)
    
def output_zone():
    """Build a pass-through model for a (1, 3)-shaped feature.

    Returns:
        A Model whose (1, 3)-shaped input is returned unchanged by an identity
        Lambda layer. Previously the model was built but never returned.
    """
    inp = Input(shape=(1,3))
    out = Lambda(lambda x: x, output_shape=(1,3))(inp)
    return Model(inp, out)

In [None]:
# One CNN branch per modality, plus the (currently unused) size/zone stubs.
cnn_t2 = create_cnn()
cnn_adc = create_cnn()
dummy_size = output_size()
dummy_zone = output_zone()

# Tag every layer with its branch; presumably this keeps layer names unique
# once both branches live in the same Model.
for branch, suffix in ((cnn_t2, "_t2"), (cnn_adc, "_adc")):
    for layer in branch.layers:
        layer._name = layer.name + suffix

# Join the two feature maps and classify with a small dense head.
combinedInput = concatenate([cnn_t2.output, cnn_adc.output])

x = Flatten()(combinedInput)
x = Dense(512, activation="relu")(x)
x = Dense(1, activation="sigmoid", name='predictions')(x)

model = Model(inputs=[cnn_t2.input, cnn_adc.input], outputs=x)
# model.summary()

In [None]:
# `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
opt = RMSprop(learning_rate=1e-5)
# Single sigmoid output -> binary cross-entropy; accuracy is logged under the key 'acc'.
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['acc'])

In [None]:
batch_size = 64

# Training images are rescaled to [0, 1] and randomly augmented;
# validation images are only rescaled.
img_datagen = ImageDataGenerator(rescale=1./255,
                                 shear_range=0.2,
                                 zoom_range=[0.7,1],
                                 horizontal_flip=True,
                                 vertical_flip=True,
                                 fill_mode='nearest')
val_datagen = ImageDataGenerator(rescale=1./255)

def generate_generator_multiple(generator, x_t2, x_adc, y, batch, shuffle=True, seed=0):
    """Yield ([t2_batch, adc_batch], labels) forever for the two-input model.

    flow() shuffles by default, so two independent flows would pair a T2
    batch with unrelated ADC images and labels. Passing the same `seed` (and
    `shuffle`) to both keeps sample order and random transforms in lockstep.

    Args:
        generator: ImageDataGenerator used for both modalities.
        x_t2: T2 images, one per sample.
        x_adc: ADC images, in the same sample order as `x_t2`.
        y: labels, in the same sample order as the images.
        batch: batch size.
        shuffle: whether to shuffle samples between epochs.
        seed: seed shared by both flows.

    Yields:
        ([t2_images, adc_images], labels) for one batch.
    """
    genX1 = generator.flow(x_t2, y, batch_size=batch, shuffle=shuffle, seed=seed)
    genX2 = generator.flow(x_adc, y, batch_size=batch, shuffle=shuffle, seed=seed)
    while True:
        X1i = next(genX1)
        X2i = next(genX2)
        yield [X1i[0], X2i[0]], X2i[1]  #Yield both images and their mutual label

X_img_t2_train = np.array(X_img_t2_train)
X_img_adc_train = np.array(X_img_adc_train)
# The reshaped float labels are now actually kept (the result used to be discarded).
y_train = np.asarray(y_train).astype('float32').reshape((-1,1))

X_img_t2_val = np.array(X_img_t2_val)
X_img_adc_val = np.array(X_img_adc_val)
y_val = np.asarray(y_val).astype('float32').reshape((-1,1))

img_train_generator = generate_generator_multiple(generator=img_datagen,
                                                  x_t2=X_img_t2_train,
                                                  x_adc=X_img_adc_train,
                                                  y=y_train,
                                                  batch=batch_size)
img_val_generator = generate_generator_multiple(generator=val_datagen,
                                                x_t2=X_img_t2_val,
                                                x_adc=X_img_adc_val,
                                                y=y_val,
                                                batch=batch_size)

In [None]:
# Both generators are endless, so the step counts define what one epoch is.
# max(1, ...) guards against 0 steps when a set is smaller than one batch.
history = model.fit_generator(img_train_generator,
                              steps_per_epoch=max(1, ntrain // batch_size),
                              epochs=70,
                              validation_data=img_val_generator,
                              validation_steps=max(1, nval // batch_size))

In [None]:
# Create the output folder first so a long training run is not lost to a
# missing directory.
os.makedirs('models', exist_ok=True)
model.save_weights('models/model_t2+adc_weights.h5')  # weights only
model.save('models/model_t2+adc_keras.h5')            # full model

In [None]:
# Plot training/validation accuracy and loss per epoch

hist = history.history
acc, val_acc = hist['acc'], hist['val_acc']
loss, val_loss = hist['loss'], hist['val_loss']

# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(acc) + 1)

# Accuracy curves go on the current figure.
plt.plot(epochs, acc, 'b', label='Training accurarcy')
plt.plot(epochs, val_acc, 'r', label='Validation accurarcy')
plt.title('Training and Validation accuracy')
plt.legend()

# Loss curves go on a fresh figure; show() then renders both.
plt.figure()
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()

In [None]:
# Plot accuracy and loss with smoother lines

def smooth_plot(points, factor=0.7):
    """Exponentially smooth a sequence of values.

    The first value is kept as is; every later value is blended with the
    previous smoothed value as `previous * factor + value * (1 - factor)`.

    Args:
        points: iterable of numbers.
        factor: weight given to the previous smoothed value.

    Returns:
        List of smoothed values, one per input value.
    """
    smoothed = []
    for value in points:
        if not smoothed:
            # Nothing to blend with yet.
            smoothed.append(value)
            continue
        smoothed.append(smoothed[-1] * factor + value * (1 - factor))
    return smoothed

# One figure per metric, each showing the smoothed training and validation curves.
for train_series, val_series, train_label, val_label, title in (
    (acc, val_acc, 'Training accurarcy', 'Validation accurarcy', 'Training and Validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and Validation loss'),
):
    plt.plot(epochs, smooth_plot(train_series), 'b', label=train_label)
    plt.plot(epochs, smooth_plot(val_series), 'r', label=val_label)
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
# Prepare test set: load every image of the held-out folders.
# Labels are recorded once per T2 file; ADC files only contribute an image.

X_t2_test, X_adc_test, y_test = [], [], []

for folder in test_patients:
    for filename in sorted(os.listdir("data_organized/{}".format(folder))):
        stem = os.path.splitext(filename)[0]
        patient, fid, adc_t2, size, zone, label = stem.split('+')
        img = Image.open("data_organized/{}/{}".format(folder, filename)).resize((32, 32))
        array_gray = np.asarray(img)
        # Replicate the single channel three times to get a 3-channel image.
        array = cv2.merge((array_gray, array_gray, array_gray))
        if adc_t2 != 't2':
            X_adc_test.append(array)
            continue
        X_t2_test.append(array)
        y_test.append(1 if label == 'TRUE' else 0)

In [None]:
x_t2 = np.array(X_t2_test)
x_adc = np.array(X_adc_test)
y = np.array(y_test)
total = len(y)
# Every label needs exactly one T2 and one ADC image.
assert len(x_t2) == len(x_adc) == total, \
    "test T2 images, ADC images and labels must have the same length"

test_datagen = ImageDataGenerator(rescale=1./255)

def generate_generator_multiple(generator, x_t2, x_adc, y, batch, shuffle=True, seed=0):
    """Yield ([t2_batch, adc_batch], labels) forever for the two-input model.

    Both flows receive the same `shuffle` and `seed`, so the T2 images, ADC
    images and labels of a batch always belong to the same samples.

    Args:
        generator: ImageDataGenerator used for both modalities.
        x_t2: T2 images, one per sample.
        x_adc: ADC images, in the same sample order as `x_t2`.
        y: labels, in the same sample order as the images.
        batch: batch size.
        shuffle: whether to shuffle samples; use False for evaluation.
        seed: seed shared by both flows.

    Yields:
        ([t2_images, adc_images], labels) for one batch.
    """
    genX1 = generator.flow(x_t2, y, batch_size=batch, shuffle=shuffle, seed=seed)
    genX2 = generator.flow(x_adc, y, batch_size=batch, shuffle=shuffle, seed=seed)
    while True:
        X1i = next(genX1)
        X2i = next(genX2)
        yield [X1i[0], X2i[0]], X2i[1]  #Yield both images and their mutual label

# shuffle=False is essential here: flow() shuffles by default, and the
# predictions are compared with `y` by position below.
test_generator = generate_generator_multiple(generator=test_datagen, x_t2=x_t2, x_adc=x_adc,
                                             y=y, batch=1, shuffle=False)

predictions = model.predict_generator(test_generator, steps=total)

# Probability of the positive class and the class at a 0.5 cut-off.
y_pred_prob = [pred[0] for pred in predictions]
y_pred_class = [1 if prob > 0.5 else 0 for prob in y_pred_prob]

# Confusion-matrix counts.
pred_arr = np.array(y_pred_class)
true_pos = int(np.sum((pred_arr == 1) & (y == 1)))
true_neg = int(np.sum((pred_arr == 0) & (y == 0)))
false_pos = int(np.sum((pred_arr == 1) & (y == 0)))
false_neg = int(np.sum((pred_arr == 0) & (y == 1)))
correct = true_pos + true_neg
clinsig_pred = int(pred_arr.sum())

print('PREDICTS CLINSIG: {}%'.format(clinsig_pred/total * 100))

In [None]:
# Headline metrics at the 0.5 cut-off.
accuracy = metrics.accuracy_score(y, y_pred_class)
print('ACCURACY:', accuracy)
sensitivity = metrics.recall_score(y, y_pred_class)
print('SENSITIVITY:', sensitivity)
specificity = true_neg / (true_neg + false_pos)
print('SPECIFICITY:', specificity)

# ROC curve from the predicted probabilities.
y_prob = np.array(y_pred_prob).reshape(-1)
fpr, tpr, thresholds = metrics.roc_curve(y, y_prob)

# Raw curve points as comma-separated lists.
print(','.join(map(str, fpr)))
print(','.join(map(str, tpr)))

plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title('ROC curve')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)

auc_score = metrics.roc_auc_score(y, y_prob)
print('AUC SCORE:', auc_score)

In [None]:
def evaluate_threshold(threshold):
    """Print sensitivity and specificity for a probability cut-off.

    Reads the module-level `fpr`, `tpr` and `thresholds` arrays and uses the
    last entry whose threshold is strictly greater than `threshold`.

    Args:
        threshold: probability cut-off to evaluate.
    """
    above = thresholds > threshold
    print('Sensitivity:', tpr[above][-1])
    print('Specificity:', 1 - fpr[above][-1])

evaluate_threshold(0.45)

In [None]:
# Precision-recall curve. Its thresholds get their own name so the ROC
# `thresholds` read by evaluate_threshold() are not overwritten.
precision, recall, pr_thresholds = metrics.precision_recall_curve(y,y_prob)
print('PRECISION:', ','.join(str(x) for x in precision))
print('RECALL', ','.join(str(x) for x in recall))

In [None]:
# Precision-recall curve drawn over iso-F1 reference curves.
plt.figure(figsize=(7, 8))
f_scores = np.linspace(0.2, 0.8, num=4)
lines = []
labels = []
for f_score in f_scores:
    # Dedicated names: the previous `x` / `y` overwrote the model output
    # tensor and the test labels used by the metric cells.
    iso_recall = np.linspace(0.01, 1)
    # Precision that gives F1 == f_score at each recall value.
    iso_precision = f_score * iso_recall / (2 * iso_recall - f_score)
    valid = iso_precision >= 0
    iso_line, = plt.plot(iso_recall[valid], iso_precision[valid], color='gray', alpha=0.2)
    plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, iso_precision[45] + 0.02))

# One legend entry stands for all iso-F1 curves.
lines.append(iso_line)
labels.append('iso-f1 curves')

pr_line, = plt.plot(recall, precision, color='turquoise', lw=2)
lines.append(pr_line)
labels.append('Precision-Recall curve')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curves')
# The legend used to receive two empty lists and therefore showed nothing.
plt.legend(lines, labels, loc=(0, -.38), prop=dict(size=14))

plt.show()