In [1]:
import numpy as np
import pandas as pd
from random import sample as ran_sample
from cv2 import *
import cv2
import os
from glob import glob
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"  # This is for using Mac GPUs
from PIL import Image
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D, Activation
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau
%matplotlib inline

Using plaidml.keras.backend backend.


In [2]:
path = glob(r"HAM10000_images/*.jpg")
#empty array for image data
data = []
#empty array for image id
image_id= []
counter=0
height, width = (28, 28)
num_classes = 7
for f1 in path:
    img = cv2.imread(f1)
    img = cv2.resize(img,(height, width))
    data.append(np.array(np.array(img).ravel()))
    image_id.append(f1[f1.find('ISIC'):-4])
#image data
image_data=np.array(data)/255

#image data with image id
images_data_with_id=pd.DataFrame(image_data,image_id)
images_data_with_id.reset_index(level=0, inplace=True)
images_data_with_id.rename(columns ={'index':'image_id'}, inplace = True)

# Loading  metadata
metadata_df = pd.read_csv(r'Data/HAM10000_metadata.csv')
#print(len(metadata_df))


# Adding a column to denote classification names and labels
metadata_df['lesion_type'] = 'x'
metadata_df['label'] = 0

lesion_types = {
                'akiec':'Actinic keratoses',
                'bcc':'Basal cell carcinoma', 
                'bkl':'Benign keratosis-like lesions', 
                'df':'Dermatofibroma', 
                'nv':'Melanocytic nevi', 
                'vasc':'Vascular lesions', 
                'mel':'Melanoma'                
               }

label_codes = {
                0:'Actinic keratoses',
                1:'Basal cell carcinoma', 
                2:'Benign keratosis-like lesions', 
                3:'Dermatofibroma', 
                4:'Melanocytic nevi', 
                5:'Vascular lesions', 
                6:'Melanoma'                
               }


labels = {
            0:'akiec',
            1:'bcc', 
            2:'bkl', 
            3:'df', 
            4:'nv', 
            5:'vasc', 
            6:'mel'                
           }

for key, value in lesion_types.items():
    metadata_df['lesion_type'].loc[metadata_df['dx'] == key] = value    

for key, value in labels.items():
    metadata_df['label'].loc[metadata_df['dx'] == value] = key      
    
    
#all data with images and metadata
overall_data = pd.merge(metadata_df,images_data_with_id,on='image_id')
X=np.array(overall_data.drop(['lesion_id','image_id','dx','dx_type','lesion_type','age','sex','localization', 'label'],axis=1))
Y=np.array(overall_data['dx'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [3]:
reverse_labels = {
            'akiec': 0,
            'bcc': 1, 
            'bkl': 2, 
            'df': 3, 
            'nv': 4, 
            'vasc': 5, 
            'mel': 6             
           }

# Set the randomizer seed so results are the same each time.
np.random.seed(123)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]

# Set some variables to hold test, dev, and training data.
test_data, test_labels = X[8000:], Y[8000:]
dev_data, dev_labels = X[7000:8000], Y[7000:8000]
train_data, train_labels = X[:7000], Y[:7000]
mini_train_data, mini_train_labels = X[:1000], Y[:1000]

# Reshape array for neural network input
test_data_2d = np.array([i.reshape(height,width,3) for i in test_data])
dev_data_2d = np.array([i.reshape(height,width,3) for i in dev_data])
train_data_2d = np.array([i.reshape(height,width,3) for i in train_data])
mini_train_data_2d = np.array([i.reshape(height,width,3) for i in mini_train_data])

# Convert classification to numeric values
test_labels = [reverse_labels[i] for i in test_labels]
dev_labels = [reverse_labels[i] for i in dev_labels]
train_labels = [reverse_labels[i] for i in train_labels]
mini_train_labels = [reverse_labels[i] for i in mini_train_labels]

# Convert numeric values to categorical values (one hot encoding)
test_labels_2d = to_categorical(test_labels, num_classes=7)
dev_labels_2d = to_categorical(dev_labels, num_classes=7)
train_labels_2d = to_categorical(train_labels, num_classes=7)
mini_train_labels_2d = to_categorical(mini_train_labels, num_classes=7)

In [4]:
# A lot of the features are left False as we will use them for the preprocessed model

datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range = 0.1, # Randomly zoom image 
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # randomly flip images
        vertical_flip=False)  # randomly flip images
datagen.fit(train_data_2d)

In [5]:
input_shape = (height, width, 3)

# CNN to include 5 Conv layers and 2 Dense Layers

model_cnn = Sequential([
    Conv2D(32, 3, padding='same', activation='relu', input_shape=input_shape),
    Conv2D(32, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Dropout(0.25),

    Conv2D(64, 3, padding='same', activation='relu'),
    Conv2D(64, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Dropout(0.4),
    
    Conv2D(128, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Dropout(0.5),
    
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.55),
    Dense(num_classes, activation='softmax')
])

INFO:plaidml:Opening device "metal_amd_radeon_pro_560x.0"


In [6]:
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [7]:
# This helps reduce learning rate if there are no improvements for 5 consecutive epochs, helps hone in better
learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=5, verbose=1, factor=0.5, min_lr=0.00001)

In [8]:
epochs = 50  # 50 is chosen as a safe middle ground between overfitting and underfitting
batch_size = 10
history = model_cnn.fit_generator(
    datagen.flow(train_data_2d, train_labels_2d, batch_size=batch_size),
    steps_per_epoch=dev_data_2d.shape[0] // batch_size,
    epochs=epochs,
    validation_data=(test_data_2d, test_labels_2d),
    validation_steps=test_data_2d.shape[0] // batch_size,
    callbacks=[learning_rate_reduction]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50

Epoch 00032: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50

Epoch 00042: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [21]:
loss, accuracy = model_cnn.evaluate(test_data_2d, test_labels_2d, verbose=1)
loss_v, accuracy_v = model_cnn.evaluate(dev_data_2d, dev_labels_2d, verbose=1)  # Dev data used for validation (data it has never seen before)
print("Validation: accuracy = %f  ;  loss_v = %f" % (accuracy_v, loss_v))
print("Test: accuracy = %f  ;  loss = %f" % (accuracy, loss))
model_cnn.save("model_cnn.h5")

Validation: accuracy = 0.925857  ;  loss_v = 0.184779
Test: accuracy = 0.924883  ;  loss = 0.185417


In [10]:
#  Getting data for malignant cancers only
test_out = model_cnn.predict(test_data_2d)
test_pred = []
for i in test_out:
    for n, item in enumerate(i):
        if i.max() == item:
            test_pred.append(n)
        else:
            continue
dd = pd.DataFrame({"Predicted": test_pred, "Truth": test_labels})
d_mel_recall = dd[dd["Truth"] == 6]
d_mel_recall["Correct"] = d_mel_recall["Predicted"] == d_mel_recall["Truth"]
recall_mel = len(d_mel_recall[d_mel_recall["Correct"] == True])/len(d_mel_recall)
d_mel_pres = dd[dd["Predicted"] == 6]
d_mel_pres["Correct"] = d_mel_pres["Predicted"] == d_mel_pres["Truth"]
precision_mel = len(d_mel_pres[d_mel_pres["Correct"] == True])/len(d_mel_pres)
print("Recall: {:.2f}% (Melanoma)".format(recall_mel*100))
print("f1: {:.2f}% (Melanoma)".format(2*((precision_mel*recall_mel)*100/(precision_mel+recall_mel))))
d_bas_recall = dd[dd["Truth"] == 1]
d_bas_recall["Correct"] = d_bas_recall["Predicted"] == d_bas_recall["Truth"]
recall_bas = len(d_bas_recall[d_bas_recall["Correct"] == True])/len(d_bas_recall)
d_bas_pres = dd[dd["Predicted"] == 1]
d_bas_pres["Correct"] = d_bas_pres["Predicted"] == d_bas_pres["Truth"]
precision_bas = len(d_bas_pres[d_bas_pres["Correct"] == True])/len(d_bas_pres)
print("Recall: {:.2f}% (Basal cell carcinoma)".format(recall_bas*100))
print("f1: {:.2f}% (Basal cell carcinoma)".format(2*((precision_bas*recall_bas)*100/(precision_bas+recall_bas))))

Recall: 7.79% (Melanoma)
f1: 13.85% (Melanoma)
Recall: 19.23% (Basal cell carcinoma)
f1: 24.39% (Basal cell carcinoma)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docume

In [11]:
# Very basic EDA to help us choose how much to reduce certain cancers that are dominantly frequent.
# We choose this method as a simple means to achieve better balance between the classes
for num in range(0, 7):
    print("{}: {}".format(num, train_labels.count(num)))

0: 231
1: 368
2: 776
3: 78
4: 4673
5: 102
6: 772


In [12]:
for num in range(0, 7):
    print("{}: {}".format(num, test_labels.count(num)))

0: 59
1: 104
2: 207
3: 24
4: 1363
5: 26
6: 231


In [13]:
train_data_prepro = []
train_labels_prepro = []
remove_count = 4673 - 776
for n, i in enumerate(train_labels):
    if remove_count == 0:
        break
    if i != 4:
        remove_count -= 1
        train_data_prepro.append(train_data_2d[n])
        train_labels_prepro.append(train_labels_2d[n])

In [14]:
test_data_prepro = []
test_labels_prepro = []
remove_count = 1363 - 231
for n, i in enumerate(test_labels):
    if remove_count == 0:
        break
    if i != 4:
        remove_count -= 1
        test_data_prepro.append(test_data_2d[n])
        test_labels_prepro.append(test_labels_2d[n])

In [15]:
# Note that vertical, horizontal flips are enabled
datagen_prepro = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range = 0.1, # Randomly zoom image 
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=True,  # randomly flip images
        vertical_flip=True)  # randomly flip images
datagen_prepro.fit(train_data_prepro)

In [16]:
# Same architecture is maintained to the base model for comparison
input_shape = (height, width, 3)
model_cnn_prepro = Sequential([
    Conv2D(32, 3, padding='same', activation='relu', input_shape=input_shape),
    Conv2D(32, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Dropout(0.25),

    Conv2D(64, 3, padding='same', activation='relu'),
    Conv2D(64, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Dropout(0.4),
    
    Conv2D(128, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Dropout(0.5),
    
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.55),
    Dense(num_classes, activation='softmax')
])

In [17]:
model_cnn_prepro.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
epochs = 50 
batch_size = 10
history = model_cnn_prepro.fit_generator(
    datagen_prepro.flow(np.array(train_data_prepro), np.array(train_labels_prepro), batch_size=batch_size),
    steps_per_epoch=dev_data_2d.shape[0] // batch_size,
    epochs=epochs,
    validation_data=(np.array(test_data_prepro), np.array(test_labels_prepro)),
    validation_steps=dev_data_2d.shape[0] // batch_size,
    callbacks=[learning_rate_reduction]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50

Epoch 00025: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50

Epoch 00039: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50

Epoch 00047: ReduceLROnPlateau reducing 

In [22]:
loss, accuracy = model_cnn_prepro.evaluate(test_data_2d, test_labels_2d, verbose=1)
loss_v, accuracy_v = model_cnn_prepro.evaluate(dev_data_2d, dev_labels_2d, verbose=1)
print("Validation: accuracy = %f  ;  loss_v = %f" % (accuracy_v, loss_v))
print("Test: accuracy = %f  ;  loss = %f" % (accuracy, loss))
model_cnn_prepro.save("model_cnn_prepro.h5")

Validation: accuracy = 0.813143  ;  loss_v = 1.282965
Test: accuracy = 0.813378  ;  loss = 1.281142


In [20]:
test_out = model_cnn_prepro.predict(test_data_2d)
test_pred = []
for i in test_out:
    for n, item in enumerate(i):
        if i.max() == item:
            test_pred.append(n)
        else:
            continue
dd = pd.DataFrame({"Predicted": test_pred, "Truth": test_labels})
d_mel_recall = dd[dd["Truth"] == 6]
d_mel_recall["Correct"] = d_mel_recall["Predicted"] == d_mel_recall["Truth"]
recall_mel = len(d_mel_recall[d_mel_recall["Correct"] == True])/len(d_mel_recall)
d_mel_pres = dd[dd["Predicted"] == 6]
d_mel_pres["Correct"] = d_mel_pres["Predicted"] == d_mel_pres["Truth"]
precision_mel = len(d_mel_pres[d_mel_pres["Correct"] == True])/len(d_mel_pres)
print("Recall: {:.2f}% (Melanoma)".format(recall_mel*100))
print("f1: {:.2f}% (Melanoma)".format(2*((precision_mel*recall_mel)*100/(precision_mel+recall_mel))))
d_bas_recall = dd[dd["Truth"] == 1]
d_bas_recall["Correct"] = d_bas_recall["Predicted"] == d_bas_recall["Truth"]
recall_bas = len(d_bas_recall[d_bas_recall["Correct"] == True])/len(d_bas_recall)
d_bas_pres = dd[dd["Predicted"] == 1]
d_bas_pres["Correct"] = d_bas_pres["Predicted"] == d_bas_pres["Truth"]
precision_bas = len(d_bas_pres[d_bas_pres["Correct"] == True])/len(d_bas_pres)
print("Recall: {:.2f}% (Basal cell carcinoma)".format(recall_bas*100))
print("f1: {:.2f}% (Basal cell carcinoma)".format(2*((precision_bas*recall_bas)*100/(precision_bas+recall_bas))))

Recall: 82.25% (Melanoma)
f1: 24.74% (Melanoma)
Recall: 56.73% (Basal cell carcinoma)
f1: 25.49% (Basal cell carcinoma)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docume