# **WORK IN PROGRESS**

The resized dataset was downloaded from [here](https://www.kaggle.com/cdeotte/jpeg-melanoma-256x256).

# Importing libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing,metrics
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dropout, Dense, Flatten, BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator,load_img, img_to_array, array_to_img
import cv2
from tqdm import tqdm
import os
from PIL import Image
import gc
from keras.callbacks import EarlyStopping
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns

import tensorflow_addons as tfa


# Print the installed TensorFlow version.
print(tf. __version__)
%config Completer.use_jedi = False # makes auto completion work in notebook

## Exploring the metadata CSV files

In [None]:
# Root folder of the dataset in use; update it when switching to another dataset.
directory = "../input/jpeg-melanoma-256x256/"
# Read the train and test metadata tables that sit next to the image folders.
train, test = (pd.read_csv(directory + csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preview the first five rows of the test metadata.
test.head(5)

In [None]:
# Kaggle users reported some duplicate images in the dataset, which might impact the model; this cell removes them.
dup = pd.read_csv("/kaggle/input/siim-list-of-duplicates/2020_Challenge_duplicates.csv")

# Vectorised membership test instead of a nested Python loop over every (duplicate, image) pair.
is_duplicate = train["image_name"].isin(dup["ISIC_id_paired"])

# Collect index *labels* (not positions) so the drop below stays correct even if `train`
# no longer has a default RangeIndex, e.g. when this cell is re-run.
drop_idx_list = train.index[is_duplicate].tolist()

print("no. of duplicates in training dataset:",len(drop_idx_list))

# Re-assign instead of mutating in place: re-running the cell is then a harmless no-op.
train = train.drop(index=drop_idx_list)

print("updated dimensions of the training dataset:",train.shape)

# Explore the data

In [None]:
# Class balance of the training set: print the raw counts, then draw them as a pie chart.
plt.rcParams['figure.figsize'] = (10,10)
compare = train["target"].value_counts()
print(compare)

labels = ['benign','malignant']
explode = (0, 0.1)  # pull the second (malignant) wedge slightly out of the pie
sizes = [compare[target_value] for target_value in (0, 1)]

fig1, ax1 = plt.subplots()
pie_style = dict(explode=explode, labels=labels, autopct='%0.1f%%',
                 shadow=False, startangle=-45)
ax1.pie(sizes, **pie_style)
plt.show()

### Heavily skewed data: $\approx 2\%$ of the samples are malignant, so we need to take this into consideration

In [None]:
# Split the metadata by class label: target 0 -> benign rows, target 1 -> malignant rows.
df_benign = train.loc[train['target'].eq(0)]
df_malignant = train.loc[train['target'].eq(1)]

In [None]:
print('Benign Cases')
benign=[]
# Preview frame with at most 30 benign rows; the loop below reads from it (and not from
# the full df_benign), so it can never index past the rows that actually exist.
df_b=df_benign.head(30)
df_b=df_b.reset_index()
for image_name in df_b['image_name']:
    image_path = directory + "train/" + image_name + '.jpg'
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None instead of raising.
        raise FileNotFoundError("could not read image: " + image_path)
    img = cv2.resize(img, (224,224))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR, matplotlib expects RGB
    img = img.astype(np.float32)/255.
    benign.append(img)
f, ax = plt.subplots(5,6, figsize=(15,10))
# Switch every axis off up front so unused grid cells stay blank when fewer than 30 images exist.
for axis in ax.ravel():
    axis.axis('off')
for i, img in enumerate(benign):
    ax[i//6, i%6].imshow(img)

plt.show()

In [None]:
print('Malignant Cases')
# Take the first 30 malignant rows and load each image as an RGB float array in [0, 1].
df_m = df_malignant.head(30).reset_index()
malignant = []
for row in range(30):
    bgr = cv2.imread(directory + "train/"+ df_m['image_name'].iloc[row]+'.jpg')
    rgb = cv2.cvtColor(cv2.resize(bgr, (224,224)), cv2.COLOR_BGR2RGB)
    malignant.append(rgb.astype(np.float32)/255.)

# Lay the images out on a 5 x 6 grid with the axes hidden.
f, ax = plt.subplots(5,6, figsize=(15,10))
for idx, picture in enumerate(malignant):
    grid_row, grid_col = divmod(idx, 6)
    cell = ax[grid_row, grid_col]
    cell.imshow(picture)
    cell.axis('off')

plt.show()

## Setting the dataset

In [None]:
# Parameters for model training. Update IMG_DIM when the dataset in use has a different image size.
epochs, batch_size = 30, 512
num_classes = 2
validation_split = 0.15  # fraction of the training data held out for validation
IMG_DIM = 256  # image side length, used by the reshape and the model input layer

Note: Accuracy is not a helpful metric for this task. With only $\approx 2\%$ malignant samples, a model that always predicts benign already reaches about 98% accuracy.
Note: The model is fit using a larger-than-default batch size. This is important to ensure that each batch has a decent chance of containing a few positive (malignant) samples; if the batch size were too small, many batches would likely contain no malignant case to learn from.

[Tensorflow tutorial on imbalanced data](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#setup)

In [None]:
# Load the images into tf.data.Dataset pipelines
file_paths = train["image_name"].values # need to add .jpg
labels = train["target"].values
#labels = to_categorical(labels,num_classes)
x_train, x_val, y_train, y_val = train_test_split(file_paths,
                                                  labels, 
                                                  test_size=validation_split,
                                                  random_state=32,
                                                  stratify = labels)

ds_train = tf.data.Dataset.from_tensor_slices(( 'train/'+ x_train + '.jpg', y_train))
ds_val = tf.data.Dataset.from_tensor_slices(( 'train/'+ x_val + '.jpg', y_val))

def read_image(image_file, label):
    """Read one image file and return it as a float32 (IMG_DIM, IMG_DIM, 3) tensor.

    Args:
        image_file: path of the image relative to `directory`.
        label: target value, passed through unchanged.

    Returns:
        The tuple (image, label).
    """
    image = tf.io.read_file(directory + image_file)
    # With dtype=tf.float32 the decoded pixels are already scaled to [0, 1].
    # NOTE(review): Keras EfficientNet documents a [0, 255] input range - confirm that the
    # model accounts for this scale.
    image = tf.image.decode_image(image, dtype=tf.float32, channels=3)
    # decode_image leaves the static shape unknown; the reshape pins it for batching.
    image = tf.reshape(image, [IMG_DIM, IMG_DIM, 3])

    return image , label


def augment(image,label):
    """Apply a small random rotation and shift to one training image.

    The previous version only constructed an ImageDataGenerator and returned the image
    untouched, so no augmentation ever happened. The transforms below run as TensorFlow
    ops inside the tf.data pipeline and use the same ranges: rotation within +/-15 degrees
    and shifts within +/-1% of the image size. The 1% shear and zoom are not reproduced.

    Args:
        image: float32 tensor of shape (IMG_DIM, IMG_DIM, 3).
        label: target value, passed through unchanged.

    Returns:
        The tuple (augmented_image, label).
    """
    max_angle = 15.0 * np.pi / 180.0  # tfa.image.rotate expects radians
    angle = tf.random.uniform([], -max_angle, max_angle)
    image = tfa.image.rotate(image, angle, interpolation='bilinear')

    max_shift = 0.01 * IMG_DIM  # in pixels
    shift = tf.random.uniform([2], -max_shift, max_shift)
    image = tfa.image.translate(image, shift, interpolation='bilinear')

    # Re-assert the static shape so batches keep a fully defined shape.
    image = tf.ensure_shape(image, [IMG_DIM, IMG_DIM, 3])
    return image, label

print("number of training images = {}".format(len(ds_train)), "number of val images = {}".format(len(ds_val)))

AUTOTUNE = tf.data.experimental.AUTOTUNE

# Shuffle the (cheap) path/label pairs before decoding so every epoch sees differently
# composed batches; Keras does not shuffle tf.data inputs itself.
ds_train = ds_train.shuffle(len(x_train), reshuffle_each_iteration=True)
ds_train = ds_train.map(read_image, num_parallel_calls = AUTOTUNE)
ds_train = ds_train.map(augment, num_parallel_calls = AUTOTUNE).batch(batch_size)
ds_train = ds_train.prefetch(AUTOTUNE)

# Validation data is neither shuffled nor augmented, so its order matches y_val.
ds_val = ds_val.map(read_image, num_parallel_calls = AUTOTUNE).batch(batch_size)
ds_val = ds_val.prefetch(AUTOTUNE)

In [None]:
# Frozen ImageNet-pretrained EfficientNetB3 backbone with a small binary classification head.
base_model = tf.keras.applications.efficientnet.EfficientNetB3(
            input_shape = (IMG_DIM, IMG_DIM, 3),
            weights = 'imagenet',
            include_top = False
        )
base_model.trainable = False

inputs = tf.keras.layers.Input(shape=(IMG_DIM, IMG_DIM, 3))
# The input pipeline decodes images with dtype=tf.float32, i.e. pixel values in [0, 1].
# Keras EfficientNet performs its own normalisation and expects raw [0, 255] pixels, so the
# inputs are scaled back up here; otherwise the pretrained backbone sees near-black images.
x = inputs * 255.0
# training=False keeps the backbone's BatchNormalization layers in inference mode.
x = base_model(x, training = False)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dense(512, activation= 'relu')(x)
x = tf.keras.layers.Dropout(0.25)(x)
# Single sigmoid unit: the output is the predicted probability of the positive class (target == 1).
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs= outputs)
model.summary()

In [None]:
#tf.keras.backend.clear_session()

In [None]:
# Compile
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)


# Precision, recall and AUC are tracked next to accuracy.
METRICS = [keras.metrics.BinaryAccuracy(name='accuracy'), 
          keras.metrics.Precision(name='precision'),
          keras.metrics.Recall(name='recall'),
          keras.metrics.AUC(name='auc')]

lossfunction = tfa.losses.SigmoidFocalCrossEntropy()
#lossfunction = tf.keras.losses.BinaryCrossentropy() 


model.compile(
    optimizer=optimizer, loss=lossfunction, metrics=METRICS)

# Per-class loss weights passed to model.fit: class 1 is weighted 40x class 0.
class_weights = {0: 0.5,
                 1: 20}

# AUC is better when higher. mode='max' states that explicitly, because the automatic mode
# of older Keras versions only recognises accuracy-like names and would minimise val_auc.
# restore_best_weights rolls the model back to the best epoch instead of keeping the
# weights from `patience` epochs later. tf.keras is used so the callback comes from the
# same Keras implementation as the model.
es = tf.keras.callbacks.EarlyStopping(monitor='val_auc',
                                      mode='max',
                                      patience=2,
                                      restore_best_weights=True,
                                      verbose=1)

model.summary()

In [None]:
# ds_train and ds_val are already batched tf.data pipelines, so `batch_size` must not be
# passed to fit(); the batch size is fixed where the datasets are built.
hist = model.fit(ds_train,
                 validation_data = ds_val,
                 epochs=epochs,
                 verbose=1,
                 class_weight = class_weights, 
                 callbacks=[es])

___
___

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(hist.history['accuracy'], color='r', label="train accuracy")
plt.plot(hist.history['val_accuracy'], color='b', label="validation accuracy")
plt.title("Train vs. validation accuracy")  # no test data is involved in these curves
plt.xlabel("Number of Epochs")
plt.ylabel("Accuracy")  # was mislabelled as "Loss"
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch.
plt.plot(hist.history['loss'], color='r', label="Train loss")
plt.plot(hist.history['val_loss'], color='b', label="validation loss")
plt.title("Train vs. validation loss")  # no test data is involved in these curves
plt.xlabel("Number of Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Training vs. validation AUC per epoch.
plt.plot(hist.history['auc'], color='r', label="Train auc")
plt.plot(hist.history['val_auc'], color='b', label="validation auc")
plt.title("Train vs. validation AUC")
plt.xlabel("Number of Epochs")
plt.ylabel("AUC")  # was mislabelled as "Loss"
plt.legend()
plt.show()

In [None]:
def plot_cm(labels, predictions, p=0.5):
    """Plot the binary confusion matrix at threshold `p` and print its four cells.

    Args:
        labels: true class labels (0 = benign, 1 = malignant).
        predictions: predicted probabilities, shape (n,) or (n, 1).
        p: decision threshold; a probability above `p` counts as a malignant prediction.
    """
    # Flatten (n, 1) model output and threshold it into 0/1 predictions.
    predicted = (np.asarray(predictions).ravel() > p).astype(int)
    # Fix the label order so the matrix is always 2x2, even if one class is never predicted.
    cm = confusion_matrix(labels, predicted, labels=[0, 1])
    plt.figure(figsize=(10,10))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    # sklearn layout: rows are actual classes, columns are predicted classes, so
    # cm[0][0] is the true negatives and cm[1][1] the true positives.
    print('(True Negatives): ', cm[0][0])
    print('(False Positives): ', cm[0][1])
    print('(False Negatives): ', cm[1][0])
    print('(True Positives): ', cm[1][1])
    print('total malignant: ', np.sum(cm[1]))

# Score the validation set, then show its confusion matrix at the default threshold.
val_prediction = model.predict(ds_val)
plot_cm(labels=y_val, predictions=val_prediction)

In [None]:
# Base path without extension: '.h5' is appended for the saved model and '.csv' for the
# submission file. Rename it to rename both outputs.
model_save_dir = './model_complete_effnetb3_50epochs'
#model.save(model_save_dir + '.h5')

In [None]:
# Display the test metadata table.
test

In [None]:
# The model.save(...) call in this notebook is commented out, so the .h5 file only exists
# if it was produced earlier. Load it when present; otherwise fall back to the model
# trained in this session so that Restart & Run All does not stop here.
saved_model_path = model_save_dir + '.h5'
if os.path.exists(saved_model_path):
    saved_model = tf.keras.models.load_model(saved_model_path)
else:
    print("no saved model found at", saved_model_path, "- using the model trained in this session")
    saved_model = model

In [None]:

file_paths_test = test["image_name"].values # need to add .jpg

ds_test = tf.data.Dataset.from_tensor_slices(('test/'+ file_paths_test + '.jpg'))

def read_image_test(image_file):
    """Read one test image with exactly the preprocessing used for the training images.

    Delegates to `read_image` so the two code paths cannot drift apart; the dummy label
    passed along is discarded.

    Args:
        image_file: path of the image relative to `directory`.

    Returns:
        The image tensor produced by `read_image`.
    """
    image, _ = read_image(image_file, 0)
    return image

# No shuffling: the prediction order must match the row order of `test`.
ds_test = ds_test.map(read_image_test, num_parallel_calls = AUTOTUNE).batch(batch_size)
ds_test = ds_test.prefetch(AUTOTUNE)

In [None]:
# Number of test images to predict.
len(file_paths_test)

In [None]:
# Predict on the test set with the in-memory `model` (not `saved_model`).
prediction=model.predict(ds_test)

In [None]:
# Inspect the shape of the raw model output.
prediction.shape

In [None]:
# The model ends in a single sigmoid unit, so `prediction` holds one probability per image
# in a single column. idxmax(axis=1) on a one-column frame returns column label 0 for
# every row, which turned every prediction into 0. Keep the probabilities instead,
# flattened to one value per image. np.asarray also accepts the Series produced by an
# earlier run of this cell, so re-running it is safe.
prediction = pd.Series(np.asarray(prediction).ravel(), name='target')
prediction.shape

In [None]:
# Read the submission template from the dataset folder configured in `directory`.
output_results_pd = pd.read_csv(directory + "sample_submission.csv")

# Flatten to one value per image; works for both an ndarray and a pandas Series.
target_values = np.asarray(prediction).ravel()
assert len(target_values) == len(output_results_pd), "prediction count does not match the submission template"
# NOTE(review): assumes the template rows are in the same order as test.csv - confirm.
output_results_pd['target'] = target_values.tolist()

In [None]:
# to_csv returns None when given a path, so keep the path itself in `submission_file`.
submission_file = model_save_dir + ".csv"
output_results_pd.to_csv(submission_file, index = False)

# **Work in progress**

**conclusion:**
The model is under-fitting the extremely under-represented class: accuracy on both the training and validation data is high, but precision and recall are quite low, even though focal loss (based on the paper found [here](https://arxiv.org/abs/1708.02002)) is used in the model.

**in progress:**
Implementation of `class_weight` for weighting the loss function (during training only). This can be useful to tell the model to "pay more attention" to samples from an under-represented class.