In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(i) for i in range(len(tf.config.experimental.list_physical_devices('GPU'))))
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
for i in range(len(tf.config.experimental.list_physical_devices('GPU'))):
    tf.config.experimental.set_memory_growth(tf.config.experimental.list_physical_devices('GPU')[i], True)
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras import models
from keras import layers

import random

from PIL import Image

pd.options.mode.chained_assignment = None

base_path = '../src/data/external/ISIC_2019_Training_Input/'
image_prefix = '.jpg'
TARGET_SIZE = (128, 128)
CONTRAST_FACTOR = 3
DELTA = 0.3


def reset_random_seeds(seed):
    os.environ['PYTHONHASHSEED']=str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T

2022-11-19 11:39:38.230726: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-19 11:39:38.335132: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-19 11:39:38.765202: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/sudha/cuda/lib64:/usr/local/cuda-11.2/lib64:/home/sudha/miniconda3/envs/tf/lib/
2022-11-19 11:39:38.765261: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'li

In [2]:
meta_df = pd.read_csv('../src/data/external/ISIC_2019_Training_Metadata.csv')
truth_df = pd.read_csv('../src/data/external/ISIC_2019_Training_GroundTruth.csv')

In [3]:
labels = truth_df.columns
labels = labels[1:]
print(labels)
label_mapping = {i: label for i, label in enumerate(labels)}
print(label_mapping)

Index(['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK'], dtype='object')
{0: 'MEL', 1: 'NV', 2: 'BCC', 3: 'AK', 4: 'BKL', 5: 'DF', 6: 'VASC', 7: 'SCC', 8: 'UNK'}


In [4]:
dense_labels = truth_df[labels]
dense_labels = dense_labels.values
print('Dense Labels...')
display(dense_labels)
train_labels = np.argmax(dense_labels, axis=-1)
print('Train Labels...')
display(train_labels)

Dense Labels...


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Train Labels...


array([1, 1, 0, ..., 0, 1, 4])

In [5]:
truth_df['label'] = train_labels.tolist()
display(truth_df)

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
25327,ISIC_0073248,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4
25328,ISIC_0073249,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
25329,ISIC_0073251,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [6]:
truth_df[['image', 'label']]

Unnamed: 0,image,label
0,ISIC_0000000,1
1,ISIC_0000001,1
2,ISIC_0000002,0
3,ISIC_0000003,1
4,ISIC_0000004,0
...,...,...
25326,ISIC_0073247,2
25327,ISIC_0073248,4
25328,ISIC_0073249,0
25329,ISIC_0073251,1


In [7]:
merged_df = meta_df.join(truth_df.set_index('image'), on='image', how='left')
display(merged_df)

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
0,ISIC_0000000,55.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,ISIC_0000001,30.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,ISIC_0000002,60.0,upper extremity,,female,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,ISIC_0000003,30.0,upper extremity,,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,ISIC_0000004,80.0,posterior torso,,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [8]:
# Site details add more information to model and so fill with unknown if not available
print(f"Count of null values in anatom_site_general column before null fill: {sum(merged_df['anatom_site_general'].isnull())}")
merged_df['anatom_site_general'] = merged_df['anatom_site_general'].fillna('unknown')
print(f"Count of null values in anatom_site_general column after null fill: {sum(merged_df['anatom_site_general'].isnull())}")

Count of null values in anatom_site_general column before null fill: 2631
Count of null values in anatom_site_general column after null fill: 0


In [9]:
# Sex details add more information to model and so fill with unknown if not available
print(f"Count of null values in sex column before null fill: {sum(merged_df['sex'].isnull())}")
merged_df['sex'] = merged_df['sex'].fillna('unknown')
print(f"Count of null values in sex column after null fill: {sum(merged_df['sex'].isnull())}")

Count of null values in sex column before null fill: 384
Count of null values in sex column after null fill: 0


In [10]:
# Age is important for the model and so we remove rows not available
clean_df = merged_df[~merged_df['age_approx'].isnull()]
clean_df

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
0,ISIC_0000000,55.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,ISIC_0000001,30.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,ISIC_0000002,60.0,upper extremity,,female,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,ISIC_0000003,30.0,upper extremity,,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,ISIC_0000004,80.0,posterior torso,,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [11]:
# decreasing the scope of anatomy site can provide more information and so getting a more generic information
clean_df['anatomy_site'] = clean_df['anatom_site_general'].apply(lambda anatomy: anatomy.split()[-1])
display(clean_df)
clean_df.describe()

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label,anatomy_site
0,ISIC_0000000,55.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso
1,ISIC_0000001,30.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso
2,ISIC_0000002,60.0,upper extremity,,female,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,extremity
3,ISIC_0000003,30.0,upper extremity,,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity
4,ISIC_0000004,80.0,posterior torso,,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,torso
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,head/neck
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,torso
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,extremity
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,palms/soles


Unnamed: 0,age_approx,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
count,24894.0,24894.0,24894.0,24894.0,24894.0,24894.0,24894.0,24894.0,24894.0,24894.0,24894.0
mean,54.028481,0.178236,0.504459,0.133245,0.034707,0.104362,0.009601,0.010163,0.025227,0.0,1.578091
std,18.130971,0.382719,0.49999,0.339846,0.183041,0.305736,0.097514,0.100301,0.156817,0.0,1.539915
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,55.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,70.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,85.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,7.0


In [12]:
def get_sampled_indices(df, ratio):
    total = len(df)
    reset_random_seeds(1234)
    sample_size = int(min(df['label'].value_counts()) * ratio)

    indices = []
    for label in label_mapping:
        observations = df[df['label'] == label].index.values
        if len(observations) >= sample_size:
            indices.extend(random.sample(observations.tolist(), sample_size))
    return indices

In [13]:
# Pareto principle - didn't understand what about this principle!!!
# seen_idx = get_sampled_indices(clean_df, 0.8)
# train_idx = get_sampled_indices(clean_df.loc[seen_idx], 0.8)  # train - 64%
# eval_idx = list(set(seen_idx) - set(train_idx))  # eval - 16%
#
# unseen_idx = list(set(clean_df.index.values) - set(seen_idx))
# test_idx = get_sampled_indices(clean_df.loc[unseen_idx], 1)  # test - 20%
# all_test_idx = list(set(clean_df.index.values) - set(seen_idx))
#
# train_idx.sort()
# eval_idx.sort()
# test_idx.sort()
# all_test_idx.sort()
# print(f"Train: {train_idx}")
# print(f"Eval: {eval_idx}")
# print(f"Test: {test_idx}")
# print(f"All-Test: {all_test_idx}")

In [14]:
balanced_idx = get_sampled_indices(clean_df, 1)
rem_idx = list(set(clean_df.index.values) - set(balanced_idx))

In [15]:
def load_img_data(df):
    images = []
    for img_name in df['image']:
        img = load_img(f"{base_path}/{img_name}{image_prefix}", target_size=TARGET_SIZE)
        img = img_to_array(img)
        images.append(img)
    return np.stack(images)

In [16]:
images_mini = load_img_data(clean_df.loc[balanced_idx])
y_mini = np.array(clean_df.loc[balanced_idx]['label']).flatten()

In [17]:
def preprocess_img_data(images, y, split=(0.6, 0.2, 0.2)):
    reset_random_seeds(1234)
    shuffle = np.random.permutation(np.arange(images.shape[0]))
    images, y = images[shuffle], y[shuffle]

    splits = np.multiply(len(images), split).astype(int)
    x_train, x_val, x_test = np.split(images, [splits[0], splits[0] + splits[1]])
    y_train, y_val, y_test = np.split(y, [splits[0], splits[0] + splits[1]])

    # image resize
    x_train = tf.image.resize(x_train, size=TARGET_SIZE)
    x_val = tf.image.resize(x_val, size=TARGET_SIZE)
    x_test = tf.image.resize(x_test, size=TARGET_SIZE)

    # rescale image
    x_train = x_train / 255.0
    x_val = x_val / 255.0
    x_test = x_test / 255.0

    # image augmentation
    # brightness
    x_train_augm = tf.image.adjust_brightness(x_train, delta=DELTA)
    # contrast
    x_train_augm = tf.image.adjust_contrast(x_train_augm, contrast_factor=CONTRAST_FACTOR)
    # random flip
    x_train_augm = tf.image.random_flip_left_right(x_train_augm)

    # concatenate original and augmented data
    x_train = tf.concat([x_train, x_train_augm], axis=0)
    y_train = tf.concat([y_train, y_train], axis=0)

    # shuffle the data
    shuffle = tf.random.shuffle(tf.range(tf.shape(x_train)[0], dtype=tf.int32))
    x_train = tf.gather(x_train, shuffle)
    y_train = tf.gather(y_train, shuffle).numpy()

    return x_train, y_train, x_val, y_val, x_test, y_test

In [18]:
x_train, y_train, x_val, y_val, x_test, y_test = preprocess_img_data(images_mini, y_mini)

2022-11-19 11:39:52.022245: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-19 11:39:52.155918: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-19 11:39:52.156065: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-19 11:39:52.156181: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [21]:
def build_model(kernel_size=(5, 5), strides=(1, 1), pool_size=(2, 2), optimizer='Adam', learning_rate=0.001,
                conv_layers=3, hidden_layers=3, dropout_rate=0.2, **kwargs):
    tf.keras.backend.clear_session()
    reset_random_seeds(1)

    model = tf.keras.Sequential()

    for i in range(1, conv_layers + 1):
        model.add(tf.keras.layers.Conv2D(filters=16 * (2 ** i), kernel_size=kernel_size, strides=strides, padding='same', name=f'conv_{i}', activation='relu'))
        model.add(tf.keras.layers.MaxPool2D(pool_size=pool_size, name=f'pool_{i}'))

    model.add(tf.keras.layers.Flatten())

    for i in range(1, hidden_layers + 1):
        model.add(tf.keras.layers.Dropout(rate=dropout_rate))

        inv_i = hidden_layers - i
        model.add(tf.keras.layers.Dense(units=128 * (4 ** inv_i), name=f'fc_{i}', activation='relu'))

    model.add(tf.keras.layers.Dropout(rate=dropout_rate))
    model.add(tf.keras.layers.Dense(units=len(labels), name='output', activation="softmax"))
    model.build(input_shape=(None, *TARGET_SIZE, 3))

    optimizer_mapping = {
        'sgd': tf.keras.optimizers.SGD,
        'adam': tf.keras.optimizers.Adam,
    }
    optimizer = optimizer_mapping[optimizer.lower()]
    optimizer = optimizer(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy",
                  metrics=['accuracy'])
    return model

# with strategy.scope():
model = build_model()
model.summary()

history = model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv_1 (Conv2D)             (None, 128, 128, 32)      2432      
                                                                 
 pool_1 (MaxPooling2D)       (None, 64, 64, 32)        0         
                                                                 
 conv_2 (Conv2D)             (None, 64, 64, 64)        51264     
                                                                 
 pool_2 (MaxPooling2D)       (None, 32, 32, 64)        0         
                                                                 
 conv_3 (Conv2D)             (None, 32, 32, 128)       204928    
                                                                 
 pool_3 (MaxPooling2D)       (None, 16, 16, 128)       0         
                                                                 
 flatten (Flatten)           (None, 32768)             0

In [20]:
model.evaluate(x_test, y_test)



[1.8320398330688477, 0.2819843292236328]