# Load required libraries

In [None]:
# Loading of important libraries that are used throughout
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import cv2 # computer vision library
import keras # Python simplified interface to tensorflow
import matplotlib.pyplot as plt # data visualization tool
from tensorflow.python.keras import backend as K # to utilize more of keras' functionality
from keras.models import Model # the neural network model
from keras.layers import Input, Lambda, Dense, Flatten # neural network layers
from keras.applications.vgg16 import VGG16 # the transfer learning model VGG16

In [None]:
# Metadata tables shipped with the competition (read first, used below)
train = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/train.csv')
test = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/test.csv')

# Folders holding the JPEG files for the training and the test set
train_dir = '/kaggle/input/siim-isic-melanoma-classification/jpeg/train/'
test_dir = '/kaggle/input/siim-isic-melanoma-classification/jpeg/test/'

In [None]:
# preview the first five rows of the training table
train.head(n=5)

# Remove duplicate images from the training dataset

In [None]:
# As per an ongoing discussion, the training data contains some duplicate images.
# They might adversely impact the model, so they are removed here.
dup = pd.read_csv("/kaggle/input/siim-list-of-duplicates/2020_Challenge_duplicates.csv")

# Vectorised membership test instead of a nested Python loop over every
# (duplicate, training image) pair. The real index labels of the matching rows
# are collected, so the drop stays correct even if `train` does not carry a
# default 0..n-1 index, and every matching row is counted exactly once.
is_duplicate = train["image_name"].isin(dup["ISIC_id_paired"])
drop_idx_list = train.index[is_duplicate].tolist()

print("no. of duplicates in training dataset:", len(drop_idx_list))

train.drop(drop_idx_list, inplace=True)

print("updated dimensions of the training dataset:", train.shape)

In [None]:
# count benign (target 0) and malignant (target 1) images; the output shows
# that the training set is quite imbalanced
train['target'].value_counts()

# Modelling - VGG16 (Transfer Learning)

## Data Preparation

* ### Take Sample Images for training

In [None]:
# Since this is a huge dataset, only a sample of it is used for training.
# To get a more balanced dataset, at most 3000 benign images are combined with
# all malignant images.
benign = train[train['target'] == 0]
# A fixed seed makes the sample reproducible (the later train/validation split
# is seeded as well); min() avoids an error if fewer than 3000 rows are left.
df_0 = benign.sample(min(3000, len(benign)), random_state=42)
df_1 = train[train['target'] == 1]
train = pd.concat([df_0, df_1])
# drop=True discards the old row labels instead of keeping them as a stray
# 'index' column
train = train.reset_index(drop=True)

* ### Update Image Names

In [None]:
# update image names with the whole path
def append_ext(fn, base_dir=test_dir):
    """Return the full path of the JPEG file for image id ``fn``.

    fn       -- image id without directory or extension
    base_dir -- directory prefix (including the trailing slash); defaults to
                the test image directory
    """
    return base_dir + fn + ".jpg"


# One function serves both tables; extra keyword arguments given to
# Series.apply are forwarded to the function.
train["image_name"] = train["image_name"].apply(append_ext, base_dir=train_dir)
test["image_name"] = test["image_name"].apply(append_ext, base_dir=test_dir)

* ### Split into training and validation datasets

In [None]:
# 20% of the training data is set aside for the validation purpose
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. The split is stratified on the
# target so that the training and the validation part keep the same
# benign/malignant ratio, which matters because the classes are imbalanced.
X_train, X_val, y_train, y_val = train_test_split(
    train['image_name'], train['target'],
    test_size=0.2, random_state=42, stratify=train['target'],
)

# training set: image paths plus their labels (aligned on the shared index)
train = X_train.to_frame(name='image_name').assign(target=y_train)

# validation set, built the same way
validation = X_val.to_frame(name='image_name').assign(target=y_val)

* ### Resize Images

In [None]:
# import keras' image preprocessing libraries for images
from keras.preprocessing.image import load_img, img_to_array, array_to_img

# images are loaded at 128x128 for faster processing
IMG_DIM = (128, 128)


def _images_to_array(paths):
    """Load every image in ``paths`` at IMG_DIM and stack them into one array."""
    loaded = []
    for path in paths:
        # load_img resizes while loading (target_size); img_to_array converts
        # the loaded image to an array
        picture = load_img(path, target_size=IMG_DIM)
        loaded.append(img_to_array(picture))
    # turn the list of per-image arrays into a single array
    return np.array(loaded)


train_imgs = _images_to_array(train.image_name)
validation_imgs = _images_to_array(validation.image_name)

print('Train dataset shape:', train_imgs.shape,
      '\tValidation dataset shape:', validation_imgs.shape)

In [None]:
# hyper-parameters used for model training
input_shape = (128, 128, 3)  # 128x128 images with 3 channels
num_classes = 2              # two classes: benign and malignant
batch_size = 32              # number of images processed per iteration
epochs = 100                 # number of passes over the entire training set

### Define loss function

In [None]:
# focal loss as we have an imbalanced data set
def focal_loss(alpha=0.25, gamma=2.0):
    """Build a binary focal loss function for use with ``model.compile``.

    alpha -- weight applied to positive samples (negatives get ``1 - alpha``)
    gamma -- exponent of the modulating factor ``(1 - p_t) ** gamma``; larger
             values shrink the loss of well-classified samples more strongly

    Returns a function ``focal_crossentropy(y_true, y_pred)`` that computes the
    weighted binary cross-entropy averaged over the last axis.
    """
    def focal_crossentropy(y_true, y_pred):
        # labels may arrive as integers; cast them so that the arithmetic with
        # the predictions below is well-typed
        y_true = K.cast(y_true, y_pred.dtype)

        bce = K.binary_crossentropy(y_true, y_pred)

        # keep probabilities strictly inside (0, 1) before forming p_t
        y_pred = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())
        # p_t is the predicted probability of the true class
        p_t = (y_true * y_pred) + ((1 - y_true) * (1 - y_pred))

        alpha_factor = y_true * alpha + ((1 - alpha) * (1 - y_true))
        modulating_factor = K.pow((1 - p_t), gamma)

        # compute the final loss and return
        return K.mean(alpha_factor * modulating_factor * bce, axis=-1)
    return focal_crossentropy

### Optimizer & No. of Iterations

In [None]:
# import optimizers from keras
from keras.optimizers import Adam, SGD, RMSprop

# use Adam optimizer; `learning_rate` is the supported argument name, the old
# `lr` alias is deprecated and no longer accepted by recent Keras releases
opt = Adam(learning_rate=1e-5)

# steps per epoch = number of complete batches in the respective set
nb_train_steps = train.shape[0] // batch_size
nb_val_steps = validation.shape[0] // batch_size

print("Number of training and validation steps: {} and {}".format(nb_train_steps, nb_val_steps))

In [None]:
# pixel normalization and data augmentation
from keras.preprocessing.image import ImageDataGenerator # ImageDataGenerator from keras can be used to both pixel normalization (rescale) and data augmentation (e.g. zoom, rotation, width, height, shear and flip the images)

# random transformations applied to the training images only
augmentation = dict(
    zoom_range=0.3,
    rotation_range=50,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# training images: pixel values scaled by 1/255 plus the augmentation above
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation)

# validation images: pixel values scaled by 1/255 only
val_datagen = ImageDataGenerator(rescale=1./255)

# batch iterators over the in-memory image arrays and their labels
train_generator = train_datagen.flow(train_imgs, y_train, batch_size=batch_size)
val_generator = val_datagen.flow(validation_imgs, y_val, batch_size=batch_size)

In [None]:
# data augmentation example: draw five augmented variants of one training image
img_id = 100
one_sample = slice(img_id, img_id + 1)
generator_100 = train_datagen.flow(train_imgs[one_sample], train.target[one_sample],
                                   batch_size=1)
aug_img = [next(generator_100) for _ in range(5)]
fig, ax = plt.subplots(1, 5, figsize=(16, 6))
# every drawn batch is an (images, labels) pair holding a single sample
print('Labels:', [labels[0] for _, labels in aug_img])
l = [axis.imshow(images[0]) for axis, (images, _) in zip(ax, aug_img)]

In [None]:
# import python garbage collector for memory management
import gc
# The `train` DataFrame is released here: the data generators were built from
# `train_imgs` and `y_train`, and the step counts have already been computed.
del train
# run a collection pass right away; its return value is shown as cell output
gc.collect()

### Define VGG16 Model

In [None]:
from keras.models import Model
from keras.applications import vgg16

# initializing the VGG16 model with pre-trained ImageNet weights;
# include_top=False leaves out the original classification layers
vgg = vgg16.VGG16(include_top=False, weights='imagenet',
                  input_shape=input_shape)

# flatten the output of the last VGG16 layer
output = vgg.layers[-1].output
output = keras.layers.Flatten()(output)
vgg_model = Model(vgg.input, output)

# set all layers to not be trained
vgg_model.trainable = False
for layer in vgg_model.layers:
    layer.trainable = False

vgg_model.summary()

# None disables truncation of column contents. The former value -1 is
# deprecated since pandas 1.0 and raises an error in pandas 2.x.
pd.set_option('display.max_colwidth', None)
layers = [(layer, layer.name, layer.trainable) for layer in vgg_model.layers]
# kept as the last expression of the cell so that the notebook renders the table
pd.DataFrame(layers, columns=['Layer Type', 'Layer Name', 'Layer Trainable'])

In [None]:
# unfreeze the convolution layers from block4_conv1 up to the output layer
vgg_model.trainable = True

set_trainable = False
for layer in vgg_model.layers:
    # the flag flips to True at the first marker layer and stays True, so that
    # layer and every layer after it become trainable; earlier layers stay frozen
    set_trainable = set_trainable or layer.name in ['block5_conv1', 'block4_conv1']
    layer.trainable = set_trainable

# overview table of the resulting trainable flags
layers = [(layer, layer.name, layer.trainable) for layer in vgg_model.layers]
pd.DataFrame(layers, columns=['Layer Type', 'Layer Name', 'Layer Trainable'])

In [None]:
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, InputLayer
from keras.models import Sequential
import tensorflow as tf
from tensorflow.keras.models import Sequential, save_model, load_model
from keras.callbacks import ModelCheckpoint

# creating an instance of Sequential model
model = Sequential()

# add the VGG16 model (its output is already flattened)
model.add(vgg_model)

# add dense and dropout layers.
# No input size is passed to the first Dense layer: it is not the first layer
# of the stack, so its input size follows from the VGG16 output. The previous
# `input_dim=input_shape` handed a 3-tuple to an argument meant for one integer.
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.4))
# single sigmoid unit: output is the predicted probability of target == 1
model.add(Dense(1, activation='sigmoid'))

# compiling the model
model.compile(
    loss=focal_loss(),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.FalsePositives(),
        tf.keras.metrics.FalseNegatives(),
    ],
    optimizer=opt,
)

# we want to save the best model for our test predictions; save_best_only keeps
# only the weights of the epoch with the best monitored value
checkpointer = ModelCheckpoint(filepath="weights.hdf5", verbose=1, save_best_only=True)

model.summary()

In [None]:
# implementing early stopping
#from keras.callbacks import EarlyStopping
#es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

In [None]:
# training of the model
# Model.fit accepts data generators directly; fit_generator is deprecated and
# has been removed from recent Keras releases.
history = model.fit(train_generator, steps_per_epoch=nb_train_steps, epochs=epochs,
                    callbacks=[checkpointer],
                    validation_data=val_generator, validation_steps=nb_val_steps,
                    verbose=1)

In [None]:
# checking model performance
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
t = f.suptitle('VGG16 Performance', fontsize=12)
f.subplots_adjust(top=0.85, wspace=0.3)

# The x-axis is derived from the recorded history instead of a hard-coded 100,
# so the plot still works when `epochs` is changed or training stops early.
n_epochs = len(history.history['loss'])
epoch_list = list(range(1, n_epochs + 1))
# about 20 ticks across the axis (a tick every 5 epochs for 100 epochs)
epoch_ticks = np.arange(0, n_epochs + 1, max(1, n_epochs // 20))

# left panel: accuracy on the training and the validation data
ax1.plot(epoch_list, history.history['binary_accuracy'], label='Train Accuracy')
ax1.plot(epoch_list, history.history['val_binary_accuracy'], label='Validation Accuracy')
ax1.set_xticks(epoch_ticks)
ax1.set_ylabel('Accuracy Value')
ax1.set_xlabel('Epoch')
ax1.set_title('Accuracy')
l1 = ax1.legend(loc="best")

# right panel: loss on the training and the validation data
ax2.plot(epoch_list, history.history['loss'], label='Train Loss')
ax2.plot(epoch_list, history.history['val_loss'], label='Validation Loss')
ax2.set_xticks(epoch_ticks)
ax2.set_ylabel('Loss Value')
ax2.set_xlabel('Epoch')
ax2.set_title('Loss')
l2 = ax2.legend(loc="best")

In [None]:
# Load the stored test array, convert it to float16 and scale it by 1/255 in a
# single expression, so no intermediate name has to be deleted afterwards.
test_imgs_scaled = (
    np.load('../input/siimisic-melanoma-resized-images/x_test_128.npy').astype('float16') / 255
)
# run a garbage collection pass to release the temporary arrays
gc.collect()

In [None]:
# load our best saved model
model.load_weights('weights.hdf5')

# Predict the whole test set in batches rather than issuing one predict call
# (and one print) per image; verbose=1 still shows the progress.
predictions = model.predict(test_imgs_scaled, batch_size=batch_size, verbose=1)

# the model has a single sigmoid output, so column 0 holds the probability
target = list(predictions[:, 0])
print("number of predicted images:", len(target))

In [None]:
# submission file: take the sample submission as template and fill in the
# predicted targets
sub = pd.read_csv("../input/siim-isic-melanoma-classification/sample_submission.csv").assign(target=target)
sub.to_csv('submission.csv', index=False)
# preview the first rows of what was written
sub.head(n=5)