**Imagine **

You wake up and find a frightening mark on your skin so you go to the doctor’s office to get it checked up. They say it’s fine so you go home and don’t worry about it for a couple months, but then you have a throbbing pain from that spot — it looks ugly and menacing now. It has developed into a malignant tumour as a result of your doctor’s misdiagnosis. The prevalence of misdiagnosis is scary. A study has shown that over 1 in 20 American adults have been misdiagnosed in that past and over half of these are harmful. A lot of skin lesions can be pretty much harmless but others can be life-threatening. It’s super important that these tumours are discovered right away, this is when it is the easiest to treat them.

**Skin Cancer Classifier**

A predictive model that uses the HAM10000 dataset, trained on two architectures to compare the best performance ,MobileNet and Resnet50 to classify skin lesions into seven categories. We have trained the model locally using native Keras.Training process is documented and coded here 



In [0]:
import zipfile
import io
from google.colab import files

from numpy.random import seed
from tensorflow import set_random_seed
set_random_seed(101)
import pandas as pd
import numpy as np
import os
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import classification_report
import tensorflow
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint



### Loading files in Colab and creating  directory structure

In [0]:
uploaded = files.upload()

In [0]:
data = zipfile.ZipFile(io.BytesIO(uploaded['Cancer_Data.zip']), 'r')
data.extractall()

In [0]:
# Read the metadata
df_data = pd.read_csv('Cancer_Data/HAM10000_metadata.csv')
# Get a list of images in each of the two folders
HAM_1 = os.listdir('Cancer_Data/ham10000_images_part_1')
HAM_2 = os.listdir('Cancer_Data/ham10000_images_part_2')

# Skin Cancer Dataset Preprocessing

# Training file directory
os.mkdir(os.path.join('Cancer_Data', 'training'))

# Validation file directory
os.mkdir(os.path.join('Cancer_Data', 'Validation'))
# Create new folders in the training directory for each of the classes

os.mkdir(os.path.join('Cancer_Data/training', 'nv'))
os.mkdir(os.path.join('Cancer_Data/training', 'mel'))
os.mkdir(os.path.join('Cancer_Data/training', 'bkl'))
os.mkdir(os.path.join('Cancer_Data/training', 'bcc'))
os.mkdir(os.path.join('Cancer_Data/training', 'akiec'))
os.mkdir(os.path.join('Cancer_Data/training', 'vasc'))
os.mkdir(os.path.join('Cancer_Data/training', 'df'))

# Create new folders in the validation directory for each of the classes
os.mkdir(os.path.join('Cancer_Data/Validation', 'nv'))
os.mkdir(os.path.join('Cancer_Data/Validation', 'mel'))
os.mkdir(os.path.join('Cancer_Data/Validation', 'bkl'))
os.mkdir(os.path.join('Cancer_Data/Validation', 'bcc'))
os.mkdir(os.path.join('Cancer_Data/Validation', 'akiec'))
os.mkdir(os.path.join('Cancer_Data/Validation', 'vasc'))
os.mkdir(os.path.join('Cancer_Data/Validation', 'df'))


### Creating Validation stratified dataset

In [0]:
# this will tell us how many images are associated with each lesion_id and filter out lesion_id's that have only one image associated with it
df = df_data.groupby('lesion_id').count()
df = df[df['image_id'] == 1]
df.reset_index(inplace=True)
df.head()

Checking duplicates

In [0]:
def duplicates(x):
    unique_list = list(df['lesion_id'])
    if x in unique_list:
        return 'no_duplicates'
    else:
        return 'has_duplicates'
    
# create a new colum that is a copy of the lesion_id column
df_data['duplicates'] = df_data['lesion_id']
# apply the function to this new column
df_data['duplicates'] = df_data['duplicates'].apply(duplicates)
# now we filter out images that don't have duplicates
df = df_data[df_data['duplicates'] == 'no_duplicates']
_, df_val = train_test_split(df, test_size=0.17, random_state=101, stratify=df['dx'])

def Validation_subset(x):
    if str(x) in list(df_val['image_id']):
        return 'validation'
    else:
        return 'training'
df_data['train_or_val'] = df_data['image_id']
# apply the function to this new column
df_data['train_or_val'] = df_data['train_or_val'].apply(Validation_subset)
   
# filter out train rows
df_train = df_data[df_data['train_or_val'] == 'training']
print(len(df_train))
print(len(df_val))

Train  subset without validation images 

In [0]:
# Set the image_id as the index in df_data
df_data.set_index('image_id', inplace=True)

In [0]:
df_data.set_index('image_id', inplace=True)
# train and validation images 

for image in list(df_train['image_id']):
    Im_name = image + '.jpg'
    label = df_data.loc[image,'dx']
    if Im_name in HAM_1:
        src = os.path.join('Cancer_Data/ham10000_images_part_1', Im_name)
        dst = os.path.join(training, label, Im_name)
        shutil.copyfile(src, dst)

    if Im_name in HAM_2:
        src = os.path.join('Cancer_Data/ham10000_images_part_2', Im_name)
        dst = os.path.join(training, label, Im_name)
        shutil.copyfile(src, dst)



for image in list(df_val['image_id']):
    Im_name = image + '.jpg'
    label = df_data.loc[image,'dx']
    
    if Im_name in HAM_1:
        # source path to image
        src = os.path.join('Cancer_Data/ham10000_images_part_1', Im_name)
        # destination path to image
        dst = os.path.join(Validation, label, Im_name)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if Im_name in HAM_2:
        # source path to image
        src = os.path.join('Cancer_Data/ham10000_images_part_2', Im_name)
        # destination path to image
        dst = os.path.join(Validation, label, Im_name)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)
        

### Augmentation

In [0]:
class_list = ['mel','bkl','bcc','akiec','vasc','df']

for item in class_list:
    aug_dir = 'aug_dir'
    os.mkdir(aug_dir)
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)
    img_class = item
    img_list = os.listdir('Cancer_Data/training/' + img_class)
    for Im_name in img_list:
            src = os.path.join('Cancer_Data/training/' + img_class, Im_name)
            dst = os.path.join(img_dir, Im_name)
            # copy the image from the source to the destination
            shutil.copyfile(src, dst)
    path = aug_dir
    save_path = 'Cancer_Data/training/' + img_class
    datagen = ImageDataGenerator(
        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        #brightness_range=(0.9,1.1),
        fill_mode='nearest')

    batch_size = 50

    aug_datagen = datagen.flow_from_directory(path,save_to_dir=save_path,save_format='jpg',target_size=(224,224),batch_size=batch_size)
    
    num_aug_images_wanted = 6000 
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((num_aug_images_wanted-num_files)/batch_size))
    for i in range(0,num_batches):

        imgs, labels = next(aug_datagen)
    shutil.rmtree('aug_dir')

### Generators settings

In [0]:
datagen = ImageDataGenerator(
    preprocessing_function= \
    tensorflow.keras.applications.mobilenet.preprocess_input)

train_batches = datagen.flow_from_directory('Cancer_Data/training',target_size=(224,224),batch_size=10)

valid_batches = datagen.flow_from_directory('Cancer_Data/Validation',target_size=(224,224),batch_size=10)

# Note: shuffle=False causes the test dataset to not be shuffled
test_batches = datagen.flow_from_directory('Cancer_Data/Validation',target_size=(224,224),batch_size=1,shuffle=False)

Modeling based on two architectures ,modification is done on  classification layers and dense

1. Mobilenet                     2.Resnet50



Mobilenet Architecture modification

In [0]:
mobile = tensorflow.keras.applications.mobilenet.MobileNet()
x = mobile.layers[-6].output
x = Dropout(0.25)(x)
predictions = Dense(7, activation='softmax')(x)
model1 = Model(inputs=mobile.input, outputs=predictions)
# We need to choose how many layers we actually want to be trained.
for layer in model1.layers[:-23]:
    layer.trainable = False

Resnet Architecture

In [18]:
Res_model = tensorflow.keras.applications.ResNet50()
x = Res_model.layers[-3].output
x = Dropout(0.25)(x)
predictions = Dense(7, activation='softmax')(x)
model2 = Model(inputs=Res_model.input, outputs=predictions)
# We need to choose how many layers we actually want to be trained.
for layer in model2.layers[:-23]:
    layer.trainable = False

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5


### Train the Model

In [0]:
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy

def top_3_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=3)

def top_2_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=2)
  
  
model1.compile(Adam(lr=0.01), loss='categorical_crossentropy', 
              metrics=[categorical_accuracy, top_2_accuracy, top_3_accuracy])


Weights assignment based on classes

In [0]:
class_weights={
    0: 1.0, 
    1: 1.0,
    2: 1.0,
    3: 1.0,
    4: 3.0,
    5: 1.0,
    6: 1.0,
}

In [26]:

filepath = "model.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_top_3_accuracy', verbose=1, 
                             save_best_only=True, mode='max')

reduce_lr = ReduceLROnPlateau(monitor='val_top_3_accuracy', factor=0.5, patience=2, 
                                   verbose=1, mode='max', min_lr=0.00001)
                              
                              
callbacks_list = [checkpoint, reduce_lr]

history = model1.fit_generator(train_batches, steps_per_epoch=train_steps, 
                              class_weight=class_weights,
                    validation_data=valid_batches,
                    validation_steps=val_steps,
                    epochs=30, verbose=1,
                   callbacks=callbacks_list)


NameError: ignored

###Model Evaluation

In [0]:
val_loss, val_cat_acc, val_top_2_acc, val_top_3_acc = \
model.evaluate_generator(test_batches, 
                        steps=len(df_val))

print('val_loss:', val_loss)
print('val_cat_acc:', val_cat_acc)
print('val_top_2_acc:', val_top_2_acc)
print('val_top_3_acc:', val_top_3_acc)

model.load_weights('model.h5')

val_loss, val_cat_acc, val_top_2_acc, val_top_3_acc = \
model.evaluate_generator(test_batches, 
                        steps=len(df_val))

print('val_loss:', val_loss)
print('val_cat_acc:', val_cat_acc)
print('val_top_2_acc:', val_top_2_acc)
print('val_top_3_acc:', val_top_3_acc)

### Create a Confusion Matrix

In [0]:
# Get the labels of the test images.
test_labels = test_batches.classes
# make a prediction
predictions = model.predict_generator(test_batches, steps=len(df_val), verbose=1)
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# argmax returns the index of the max value in a row
cm = confusion_matrix(test_labels, predictions.argmax(axis=1))
cm_plot_labels = ['akiec', 'bcc', 'bkl', 'df', 'mel','nv', 'vasc']
plot_confusion_matrix(cm, cm_plot_labels, title='Confusion Matrix')


### Classification Report

In [0]:
y_pred = np.argmax(predictions, axis=1)
y_true = test_batches.classes
report = classification_report(y_true, y_pred, target_names=cm_plot_labels)
print(report)

model's accuracy for mobilenet score settles around 82%, this is mainly due to a small and unbalanced dataset.It can be boosted more using augmentation techniques but due to resource constraints I tried above methodology and results are found good.