# Analysis of SIIM-ISIC Melanoma Classification by Images

# Introduction

## The Competition

Skin cancer is a common type of cancer. Although most cases are not malignant, the high number of cases makes it a serious disease that can become severe if it is not detected and treated in time. It is usually diagnosed first by visual examination, followed by further clinical analysis if needed. Even though melanoma is the rarest outcome, it is the deadliest one, so early detection is very important. Computer-aided diagnosis may help with these preliminary steps and with early detection. Better detection might save thousands of lives.

# Preparation

In [None]:
!pip install -q efficientnet
!pip install efficientnet tensorflow_addons
!pip install pydicom

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import pydicom as dicom
import seaborn as sns
import plotly.express as px

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import random
import re
import math
import time

from tqdm import tqdm
from tqdm.keras import TqdmCallback
from pandas_summary import DataFrameSummary

import warnings
warnings.filterwarnings('ignore') # Disabling warnings for clearer outputs

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)

In [None]:
# Import
import tensorflow as tf, re, math
import tensorflow.keras.backend as K
import efficientnet.tfkeras as efn
from kaggle_datasets import KaggleDatasets

import efficientnet.tfkeras as efn
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

tf.random.set_seed(seed_val)

In [None]:
# Set color palette.
# Seven colours of the "magma" palette, converted from 0-1 RGB floats to
# '#rrggbb' hex strings so that both matplotlib and plotly accept them.
color_palette = [
    '#%02x%02x%02x' % (int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255))
    for rgb in sns.color_palette("magma", n_colors=7)
]

# Set plot styling.
plt.style.use('ggplot')

In [None]:
# Set file paths for our notebook:
# The image folders live below the competition folder, so derive them from it.
base_path = '/kaggle/input/siim-isic-melanoma-classification'
train_img_path = base_path + '/jpeg/train/'
test_img_path = base_path + '/jpeg/test/'
img_stats_path = '/kaggle/input/melanoma2020imgtabular'

In [None]:
# Try dcm file: read a single training image stored in DICOM format with
# pydicom and show the parsed dataset as the cell output.
image_path = '../input/siim-isic-melanoma-classification/train/ISIC_0015719.dcm'
ds = dicom.dcmread(image_path)
ds

# Loading the data
The train data has 8 features and 33126 observations; the test data has 5 features and 10982 observations.

#### Train dataset consists of:

1. image_name -> the filename of the specific image in the train set
2. patient_id -> identifies the unique patient
3. sex -> gender of the patient
4. age_approx -> approx age of the patient at time of scanning
5. anatom_site_general_challenge -> location of the scan site
6. diagnosis -> information about the diagnosis
7. benign_malignant - indicates scan result if it's malignant or benign
8. target -> same as above but better for modelling since it's binary

#### Test dataset consists of:

1. image_name -> the filename of the specific image in the test set
2. patient_id -> identifies the unique patient
3. sex -> gender of the patient
4. age_approx -> approx age of the patient at time of scanning
5. anatom_site_general_challenge -> location of the scan site

In [None]:
# Load train and test data
# Read the three competition CSV files that sit directly under base_path.
train, test, sample = (
    pd.read_csv(os.path.join(base_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Observe train and test columns and amount
train.shape, test.shape, train.columns, test.columns

In [None]:
# Rename train/test columns
# Rename by original column name instead of assigning a positional list, so a
# different column order in the CSV can never silently mislabel a feature.
# Columns that are not listed (sex, diagnosis, benign_malignant, target) keep
# their names.
short_column_names = {
    'image_name': 'img_name',
    'patient_id': 'id',
    'age_approx': 'age',
    'anatom_site_general_challenge': 'location',
}
train = train.rename(columns=short_column_names)
test = test.rename(columns=short_column_names)

In [None]:
# Observe 5 random samples from the train set
train.sample(5)

In [None]:
# Observe 5 random samples from the test set
test.sample(5)

In [None]:
train['diagnosis'].unique(), train['location'].unique()

In [None]:
import cv2
import pandas as pd
import matplotlib.pyplot as plt


def _show_example_grid(img_names):
    ''' Plot up to ten training images, given by image name, on a 2 x 5 grid.

    Raises FileNotFoundError when an image cannot be read, instead of failing
    later inside the colour conversion with an unrelated error.
    '''
    plt.figure(figsize=(20, 8))
    for i, name in enumerate(img_names):
        path = '../input/siim-isic-melanoma-classification/jpeg/train/%s.jpg' % name
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError('Could not read image: %s' % path)
        # OpenCV loads images as BGR while matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(2, 5, i + 1)
        plt.axis('off')
        plt.imshow(img)
    plt.show()


print('Examples WITH Melanoma')
imgs = train.loc[train.target == 1].sample(10).img_name.values
_show_example_grid(imgs)

print('Examples WITHOUT Melanoma')
imgs = train.loc[train.target == 0].sample(10).img_name.values
_show_example_grid(imgs)

# Observe missing values

In [None]:
# Checking missing values:
def missing_percentage(df):
    ''' Summarise the missing values of a DataFrame.

    Returns a DataFrame indexed by column name with two columns: 'Total' (the
    number of missing values) and 'Percent' (their share of all rows, in %).
    Only columns with at least one missing value are listed, ordered from most
    to least missing.
    '''
    null_counts = df.isnull().sum().sort_values(ascending=False)
    null_share = null_counts / len(df) * 100

    total = null_counts[null_counts != 0]
    percent = null_share[null_share != 0]
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

missing_train = missing_percentage(train)
missing_test = missing_percentage(test)

# One bar chart per data set: percentage of missing values per column.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
panels = zip(ax,
             (missing_train, missing_test),
             ('Train set missing values', 'Test set missing values'))
for axis, missing, panel_title in panels:
    sns.barplot(x=missing.index, y='Percent',
                data=missing, palette=color_palette, ax=axis)
    axis.set_title(panel_title)

# Observation of meta features before filling missing values

In [None]:
# Create a grid
fig = plt.figure(constrained_layout=True, figsize=(20, 9))
grid = gridspec.GridSpec(ncols=4, nrows=2, figure=fig)

# Gender distribution
# The series is passed as `x=` explicitly: seaborn releases that accept only
# `data` positionally would otherwise not count the categories.
ax1 = fig.add_subplot(grid[0, :2])
ax1.set_title('Gender distribution')
sns.countplot(x=train.sex.sort_values(ignore_index=True),
              alpha=0.9, ax=ax1, color=color_palette[0], label='Train')
sns.countplot(x=test.sex.sort_values(ignore_index=True),
              alpha=0.7, ax=ax1, color=color_palette[2], label='Test')
ax1.legend()

# Scanned site distribution
# Train and test bars are overlaid on the same axes, so both layers must use
# the same category order; otherwise bars end up under the wrong tick label.
ax2 = fig.add_subplot(grid[0, 2:])
ax2.set_title('Scanned site distribution')
site_order = train['location'].value_counts().index
sns.countplot(x=train.location,
              alpha=0.9, ax=ax2, color=color_palette[0],
              label='Train', order=site_order)
sns.countplot(x=test.location,
              alpha=0.7, ax=ax2, color=color_palette[2],
              label='Test', order=site_order)
ax2.legend()

# Age distribution
ax3 = fig.add_subplot(grid[1, :])
ax3.set_title('Age distribution')
sns.distplot(train.age, ax=ax3, label='Train', color=color_palette[0])
sns.distplot(test.age, ax=ax3, label='Test', color=color_palette[2])
ax3.legend()

plt.show()

# Fill missing data

In [None]:
# Fill missing scanned site values with 'unknown' tag:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place call acts on an intermediate object and is not
# guaranteed to update the DataFrame itself.
for df in [train, test]:
    df['location'] = df['location'].fillna('unknown')

In [None]:
# Check that the train and test sets contain exactly the same location values.
ids_train = train.location.values
ids_test = test.location.values
ids_train_set = set(ids_train)
ids_test_set = set(ids_test)

# Values that occur in only one of the two sets.
location_not_overlap = list(ids_train_set.symmetric_difference(ids_test_set))
# NOTE: despite its name, n_overlap counts the values that do NOT overlap.
n_overlap = len(location_not_overlap)
# Cell output: True when both sets hold the same values.
n_overlap == 0

In [None]:
# Fill age and sex with appropriate values
# Missing sex gets the most frequent value and missing age the median of the
# train set. The result is assigned back because fillna(inplace=True) on a
# selected column is not guaranteed to update the DataFrame itself.
train['sex'] = train['sex'].fillna(train['sex'].mode()[0])
train['age'] = train['age'].fillna(train['age'].median())

In [None]:
# Check missing value counts
train.isnull().sum().sum(), test.isnull().sum().sum()

# Observe the data

## Distribution of scanned site between train set and test set

In [None]:
# Train set
# Number of scans per location as a two-column frame: 'location' and 'count'.
cntstr = (train.location
          .value_counts()
          .rename_axis('location')
          .reset_index(name='count'))

# Treemap whose tile size and colour both encode the scan count.
train_treemap_args = dict(path=['location'], values='count',
                          color='count', color_continuous_scale=color_palette,
                          title='Distribution of scanned site - Train set')
fig = px.treemap(cntstr, **train_treemap_args)
fig.update_traces(textinfo='label+percent entry')
fig.show()

In [None]:
# Test set
# Number of scans per location as a two-column frame: 'location' and 'count'.
cntste = (test.location
          .value_counts()
          .rename_axis('location')
          .reset_index(name='count'))

# Treemap whose tile size and colour both encode the scan count.
test_treemap_args = dict(path=['location'], values='count',
                         color='count', color_continuous_scale=color_palette,
                         title='Distribution of scanned site - Test set')
fig = px.treemap(cntste, **test_treemap_args)
fig.update_traces(textinfo='label+percent entry')
fig.show()

# Distribution of scanned site between genders and target

Some scanned sites may be more likely to be malignant: head/neck comes first, followed by oral/genital and upper extremity. Scanned sites are similar between males and females, with small differences in their distribution.

In [None]:
# Create a grid
fig = plt.figure(constrained_layout=True, figsize=(20, 9))
grid = gridspec.GridSpec(ncols=4, nrows=2, figure=fig)

# Both gender panels use the same category order so they can be compared.
site_order = train['location'].value_counts().index

# Scanned site - Female
# The series is passed as `x=` explicitly: seaborn releases that accept only
# `data` positionally would otherwise not count the categories.
ax1 = fig.add_subplot(grid[1, :2])
ax1.set_title('Scanned site - Female')
sns.countplot(
    x=train[train['sex'] == 'female'].location.sort_values(ignore_index=True),
    alpha=0.9, ax=ax1, color=color_palette[0], label='Female',
    order=site_order)
ax1.legend()

# Scanned site - Male
ax2 = fig.add_subplot(grid[1, 2:])
ax2.set_title('Scanned site - Male')
sns.countplot(
    x=train[train['sex'] == 'male'].location.sort_values(ignore_index=True),
    alpha=0.9, ax=ax2, color=color_palette[-1], label='Male',
    order=site_order)
ax2.legend()

# Malignant ratio per scanned site
# The mean of the binary target per location is the share of malignant scans.
# No legend is requested here: the bars carry no labels, so there would be
# nothing to show in it.
ax3 = fig.add_subplot(grid[0, :])
ax3.set_title('Malignant ratio per scanned site')
loc_freq = train.groupby('location')['target'].mean().sort_values(ascending=False)
sns.barplot(x=loc_freq.index, y=loc_freq, palette=color_palette, ax=ax3)

plt.show()

# A general look with sunburst chart

- Only 2% of our targets are malignant.
- Among malignant images, males are dominant with 62%.
- Gender-wise, benign images are more balanced, with a 52-48% male-female ratio.
- Malignant image scan locations differ based on the patient's gender:
    - The torso is the most common location in males, at almost half of the scans, whereas in females it is 39%.
    - Lower extremity is more common in female scans than in male scans (18% males vs 26% females).
    - Likewise, upper extremity malignant scans are more common in females than in males (23% vs 17%).
- Benign image scan locations are more similar between male and female patients.

In [None]:
# Plot interactive sunburst chart
# Hierarchy: benign/malignant -> sex -> scanned location. The innermost level
# uses 'location' so that the chart matches its title.
fig = px.sunburst(data_frame=train,
                  path=['benign_malignant', 'sex', 'location'],
                  color='sex', color_discrete_sequence=color_palette,
                  maxdepth=-1, title='Sunburst Chart Benign/Malignant > Sex > Location')
# Label every sector with its share of the parent sector.
fig.update_traces(textinfo='label+percent parent')
fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
fig.show()

# Distribution of target between genders and age

Age is relatively correlated with the target. A malignant result is more likely for older patients than for younger ones. There is a spike for both genders after the age of 85; however, there are few patients aged 80+, which may explain this spike. It is safe to say that a malignant scan is more likely after the age of 60.

In [None]:
# Plotting age vs sex vs target:
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
# Left panel: mean of the binary target (share of malignant scans) per age,
# one line per sex, drawn without confidence intervals.
sns.lineplot(x='age', y='target', data=train,
             ax=ax[0], hue='sex', palette=color_palette[2:4], ci=None)
# Right panel: age distribution of benign vs malignant scans, split by sex.
sns.boxplot(x='benign_malignant', y='age', data=train,
            ax=ax[1], hue='sex', palette=color_palette[2:4])
# plt.legend acts on pyplot's current axes.
# NOTE(review): presumably that is the right-hand panel here - confirm.
plt.legend(loc='lower right')
ax[0].set_title('Malignant scan ratio by age')
ax[1].set_title('Scan results by age and sex')

plt.show()

# Age Round Two

I wanted to double-check the age distributions after our previous observations. Age seems evenly distributed in both the train and test datasets; we can see small bumps at age 75+ and around 40, which are worth investigating.

We can see again that older people are more likely to get malignant scan results. One last thing about the age distributions: we see more female patients at younger ages, and this trend reverses for older patients.

In [None]:
# Create a grid
fig = plt.figure(constrained_layout=True, figsize=(20, 6))
grid = gridspec.GridSpec(ncols=4, nrows=1, figure=fig)

# Age distribution by target
# Two overlaid density estimates of age: benign (target 0) and malignant
# (target 1) scans.
# NOTE(review): `shade=` and `distplot` are reported as deprecated in recent
# seaborn releases - confirm against the installed version before upgrading.
ax1 = fig.add_subplot(grid[0, :2])
ax1.set_title('Age distribution by target')
sns.kdeplot(train[train['target'] == 0]['age'],
            shade=True, ax=ax1, color=color_palette[2],
            label='Benign')
sns.kdeplot(train[train['target'] == 1]['age'],
            shade=True, ax=ax1, color=color_palette[0],
            label='Malignant')
ax1.legend()

# Age distribution by gender
# Two overlaid age distributions: female and male patients of the train set.
ax2 = fig.add_subplot(grid[0, 2:])
ax2.set_title('Age distribution by gender')
sns.distplot(train[train.sex == 'female'].age,
             ax=ax2, label='Female', color=color_palette[0])
sns.distplot(train[train.sex == 'male'].age,
             ax=ax2, label='Male', color=color_palette[2])
ax2.legend()
plt.show()

# Diagnosis Distribution

We can't use this feature in our model, but it gives us some insight into the disease, so it is worth inspecting too.

In [None]:
# Number of train images per diagnosis.
diag = train.diagnosis.value_counts()
# Pass counts and labels explicitly instead of referring to the Series by name:
# the name given to the result of value_counts() differs between pandas
# versions, so values='diagnosis' is not portable.
fig = px.pie(values=diag.values, names=diag.index,
             color_discrete_sequence=color_palette, hole=.4)
fig.update_traces(textinfo='percent+label', pull=0.05)
fig.show()

# ML algorithm on Meta Feature

In [None]:
# Import packages
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, cross_validate
from sklearn.metrics import roc_auc_score, roc_curve

In [None]:
# Load the pre-computed tabular image features ('40Features' and 'Metrics'
# CSV files of the melanoma2020imgtabular dataset) for train and test.
train40 = pd.read_csv('../input/melanoma2020imgtabular/train40Features.csv')
trainmet = pd.read_csv('../input/melanoma2020imgtabular/trainMetrics.csv')

test40 = pd.read_csv('../input/melanoma2020imgtabular/test40Features.csv')
testmet = pd.read_csv('../input/melanoma2020imgtabular/testMetrics.csv')

In [None]:
# Pre-processing
# One-hot encode the two categorical meta features (sex and scanned location)
# and append the indicator columns to each data set.
one_hot = pd.get_dummies(train['sex'], prefix='sex')
train = pd.concat([train, one_hot], axis=1)
one_hot = pd.get_dummies(test['sex'], prefix='sex')
test = pd.concat([test, one_hot], axis=1)

one_hot = pd.get_dummies(train['location'], prefix='anatom')
train = pd.concat([train, one_hot], axis=1)
one_hot = pd.get_dummies(test['location'], prefix='anatom')
test = pd.concat([test, one_hot], axis=1)

# Drop the raw columns that were encoded above or must not reach the model.
# 'benign_malignant' and 'diagnosis' exist only in the train set, so the test
# drop ignores missing labels instead of raising a KeyError.
raw_columns = ['sex', 'img_name', 'id', 'benign_malignant', 'diagnosis', 'location']
train.drop(raw_columns, axis=1, inplace=True)
test.drop(raw_columns, axis=1, inplace=True, errors='ignore')

# Drop duplicate and useless data
train40.drop(['sex', 'age_approx', 'anatom_site_general_challenge'], axis=1, inplace=True)
test40.drop(['sex', 'age_approx', 'anatom_site_general_challenge'], axis=1, inplace=True)

# Merge data
# NOTE(review): the column-wise concat assumes all frames share the same row
# order and index - confirm against the feature files.
train = pd.concat([train, train40, trainmet], axis=1)
test = pd.concat([test, test40, testmet], axis=1)

# Divide train set and label
X = train.drop('target', axis=1)
y = train.target

# Set up cross-validation

In [None]:
# taking holdout set for validating with stratified y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# 5 fold stratify for cv
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialization
# Hyper-parameters of the gradient-boosted classifier used on the meta features.
xgb_params = {
    'n_estimators': 750,
    'min_child_weight': 0.81,
    'learning_rate': 0.025,
    'max_depth': 2,
    'subsample': 0.80,
    'colsample_bytree': 0.42,
    'gamma': 0.10,
    'random_state': 42,
    'n_jobs': -1,
}
xg = xgb.XGBClassifier(**xgb_params)

In [None]:
# Run cross-validation
# Score the xg estimator with ROC AUC on the folds produced by `cv`, using only
# the training part of the hold-out split; train scores are returned as well.
cv_results = cross_validate(xg, X_train, y_train, 
                            cv=cv, scoring='roc_auc', 
                            return_train_score=True, n_jobs=-1)

# Show the result of cross_validate as the cell output.
cv_results

In [None]:
# Evaluation on the hold-out split: X_test / y_test were carved out of the
# labelled train data by train_test_split, not the competition test set.
xg.fit(X_train, y_train)
# Keep column 1 of the predicted class probabilities.
validation = xg.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, validation)

In [None]:
def plot_roc_feat(y_trues, y_preds, est, x_max=1.0):
    ''' Plot ROC curves next to the feature importances of a fitted booster.

    y_trues / y_preds: sequences of equal length; y_trues[i] holds the true
        labels and y_preds[i] the predicted scores of the i-th curve.
    est: fitted estimator exposing get_booster() (e.g. xgb.XGBClassifier).
    x_max: upper limit of the false-positive-rate axis.

    The left panel shows one ROC curve per prediction set, each in its own
    colour and labelled with its AUC. The right panel shows the 20 features
    with the highest 'weight' importance score.
    '''
    # Local 20-colour palette; it intentionally shadows the 7-colour global one.
    color_palette = [
        '#%02x%02x%02x' % (int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255))
        for rgb in sns.color_palette("magma", n_colors=20)
    ]

    fig, ax = plt.subplots(1, 2, figsize=(16, 6))
    for i, y_pred in enumerate(y_preds):
        y_true = y_trues[i]
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        auc = roc_auc_score(y_true, y_pred)
        # Step through the palette so several curves stay distinguishable;
        # the first curve keeps the first palette colour.
        curve_color = color_palette[(3 * i) % len(color_palette)]
        ax[0].plot(fpr, tpr, label='AUC=%.3f' % auc, marker='o', markersize=1, c=curve_color)

    ax[0].legend()
    ax[0].grid()
    # Diagonal reference line of a random classifier.
    ax[0].plot(np.linspace(0, 1, 20), np.linspace(0, 1, 20), linestyle='--', c=color_palette[-10])
    ax[0].set_title('ROC curve')
    ax[0].set_xlabel('False Positive Rate')
    ax[0].set_xlim([-0.01, x_max])
    _ = ax[0].set_ylabel('True Positive Rate')

    # Feature name -> importance score, as reported by the booster.
    feature_importance = est.get_booster().get_score(importance_type='weight')

    keys = list(feature_importance.keys())
    values = list(feature_importance.values())

    importance = pd.DataFrame(data=values, index=keys,
                              columns=['score']).sort_values(by='score', ascending=False)

    sns.barplot(x=importance.score.iloc[:20], y=importance.index[:20],
                orient='h', palette=color_palette, ax=ax[1])
    ax[1].set_title('Feature Importances')

In [None]:
plot_roc_feat([y_test], [validation], xg)

# Neural Networks

For this part I was inspired by AgentAuers's 'Incredible TPUs' notebook [here](https://www.kaggle.com/agentauers/incredible-tpus-finetune-effnetb0-b6-at-once).
We start by importing the necessary packages and setting the random seed.

In [None]:
# Set TPU as main device for training, if you get warnings while working with tpu's ignore them.
# If no TPU can be found, or its initialization fails, fall back to the default
# strategy so that `strategy` is always defined afterwards.
DEVICE = 'TPU'
if DEVICE == 'TPU':
    print('connecting to TPU...')
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print('Could not connect to TPU')
        tpu = None

    if tpu:
        try:
            print('Initializing  TPU...')
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print('TPU initialized')
        except Exception as err:
            # `except _:` is not a valid exception class and would itself fail.
            # Report the problem and continue on the default strategy.
            print('Failed to initialize TPU!')
            print(repr(err))
            DEVICE = 'GPU'
    else:
        DEVICE = 'GPU'

if DEVICE != 'TPU':
    print('Using default strategy for CPU and single GPU')
    strategy = tf.distribute.get_strategy()

if DEVICE == 'GPU':
    print('Num GPUs Available: ',
          len(tf.config.experimental.list_physical_devices('GPU')))

print('REPLICAS: ', strategy.num_replicas_in_sync)
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# Configuration
cfg = {
    # Input pipeline: per-replica batch size and square image side in pixels.
    'batch_size': 32,
    'img_size': 384,

    # Learning-rate schedule (consumed by getLearnRateCallback) and run length.
    'lr_start': 0.000005,
    'lr_max': 0.00000125,
    'lr_min': 0.000001,
    'lr_rampup': 5,
    'lr_sustain': 0,
    'lr_decay': 0.8,
    'epochs': 10,

    # Random augmentation strengths (consumed by prepare_image / transform).
    'transform_prob': 1.0,
    'rot': 180.0,
    'shr': 2.0,
    'hzoom': 8.0,
    'wzoom': 8.0,
    'hshift': 8.0,
    'wshift': 8.0,

    # Optimizer and loss settings (consumed by compileModel).
    'optimizer': 'adam',
    'label_smooth_fac': 0.05,
    'tta_steps': 20,
}

In [None]:
def get_mat(rotation, shear, height_zoom, width_zoom, height_shift,
            width_shift):

    ''' Build the 3x3 matrix that combines rotation, shear, zoom and shift.

    rotation and shear are given in degrees. Every argument is expected to be
    a one-element float32 tensor, as produced by transform(). The result is
    (rotation . shear) . (zoom . shift).
    '''

    def as_3x3(entries):
        # Nine one-element tensors, row by row -> a 3x3 matrix.
        return tf.reshape(tf.concat(entries, axis=0), [3, 3])

    one = tf.constant([1], dtype='float32')
    zero = tf.constant([0], dtype='float32')

    # Degrees -> radians.
    rotation = math.pi * rotation / 180.
    shear = math.pi * shear / 180.

    # Rotation.
    cos_rot = tf.math.cos(rotation)
    sin_rot = tf.math.sin(rotation)
    rotation_matrix = as_3x3([cos_rot, sin_rot, zero,
                              -sin_rot, cos_rot, zero,
                              zero, zero, one])

    # Shear.
    cos_shear = tf.math.cos(shear)
    sin_shear = tf.math.sin(shear)
    shear_matrix = as_3x3([one, sin_shear, zero,
                           zero, cos_shear, zero,
                           zero, zero, one])

    # Zoom: the diagonal holds the reciprocal zoom factors.
    zoom_matrix = as_3x3([one / height_zoom, zero, zero,
                          zero, one / width_zoom, zero,
                          zero, zero, one])

    # Shift: the offsets sit in the last column.
    shift_matrix = as_3x3([one, zero, height_shift,
                           zero, one, width_shift,
                           zero, zero, one])

    rotate_then_shear = K.dot(rotation_matrix, shear_matrix)
    zoom_then_shift = K.dot(zoom_matrix, shift_matrix)
    return K.dot(rotate_then_shear, zoom_then_shift)


def transform(image, cfg):
    
    ''' Return a randomly rotated, sheared, zoomed and shifted copy of an image.

    image: image tensor with 3 channels; the result is reshaped to
        [img_size, img_size, 3], so the input is presumably already resized to
        cfg['img_size'] x cfg['img_size'] (verify against caller).
    cfg: configuration dict; reads 'img_size', 'rot', 'shr', 'hzoom', 'wzoom',
        'hshift' and 'wshift'.

    Every output pixel is filled with the value of the source pixel that the
    random transformation maps it to; source coordinates are clipped to a
    fixed range before the lookup.
    '''

    DIM = cfg['img_size']
    XDIM = DIM % 2  # fix for size 331

    # Draw the random parameters: each is a standard-normal sample scaled by
    # (or, for the zooms, divided by and offset from 1.0 with) its cfg value.
    rot = cfg['rot'] * tf.random.normal([1], dtype='float32')
    shr = cfg['shr'] * tf.random.normal([1], dtype='float32')
    h_zoom = 1.0 + tf.random.normal([1], dtype='float32') / cfg['hzoom']
    w_zoom = 1.0 + tf.random.normal([1], dtype='float32') / cfg['wzoom']
    h_shift = cfg['hshift'] * tf.random.normal([1], dtype='float32')
    w_shift = cfg['wshift'] * tf.random.normal([1], dtype='float32')

    # GET TRANSFORMATION MATRIX
    m = get_mat(rot, shr, h_zoom, w_zoom, h_shift, w_shift)

    # LIST DESTINATION PIXEL INDICES
    # Centred coordinates of all DIM*DIM output pixels, stacked with a row of
    # ones into a [3, DIM*DIM] array so the 3x3 matrix can be applied to them.
    x = tf.repeat(tf.range(DIM // 2, -DIM // 2, -1), DIM)
    y = tf.tile(tf.range(-DIM // 2, DIM // 2), [DIM])
    z = tf.ones([DIM * DIM], dtype='int32')
    idx = tf.stack([x, y, z])

    # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
    # Apply the matrix, cast back to integer coordinates and clip them.
    idx2 = K.dot(m, tf.cast(idx, dtype='float32'))
    idx2 = K.cast(idx2, dtype='int32')
    idx2 = K.clip(idx2, -DIM // 2 + XDIM + 1, DIM // 2)

    # FIND ORIGIN PIXEL VALUES
    # Convert the centred coordinates back to array indices and gather the
    # corresponding source pixels.
    idx3 = tf.stack([DIM // 2 - idx2[0, ], DIM // 2 - 1 + idx2[1, ]])
    d = tf.gather_nd(image, tf.transpose(idx3))

    return tf.reshape(d, [DIM, DIM, 3])

def prepare_image(img, cfg=None, augment=True):

    ''' Decode an encoded JPEG string into a model-ready float32 image.

    The image is decoded with 3 channels, resized to a square of
    cfg['img_size'] pixels and divided by 255. When augment is true, the
    random geometric transform() is applied with probability
    cfg['transform_prob'], followed by a random horizontal flip and random
    saturation, contrast and brightness changes.
    '''

    side = cfg['img_size']
    decoded = tf.image.decode_jpeg(img, channels=3)
    resized = tf.image.resize(decoded, [side, side], antialias=True)
    img = tf.cast(resized, tf.float32) / 255.0

    if augment:
        # Geometric augmentation, applied only when the random draw allows it.
        if cfg['transform_prob'] > tf.random.uniform([1], minval=0, maxval=1):
            img = transform(img, cfg)

        # Flip and colour augmentations, always applied.
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_saturation(img, 0.7, 1.3)
        img = tf.image.random_contrast(img, 0.8, 1.2)
        img = tf.image.random_brightness(img, 0.1)

    return img

In [None]:
def read_labeled_tfrecord(example):
    ''' Parse one serialized labelled example; return (encoded image, target). '''
    string_feature = tf.io.FixedLenFeature([], tf.string)
    int_feature = tf.io.FixedLenFeature([], tf.int64)

    # Scalar features stored in every labelled record.
    feature_spec = {
        'image': string_feature,
        'image_name': string_feature,
        'patient_id': int_feature,
        'sex': int_feature,
        'age_approx': int_feature,
        'anatom_site_general_challenge': int_feature,
        'diagnosis': int_feature,
        'target': int_feature,
    }

    parsed = tf.io.parse_single_example(example, feature_spec)
    return parsed['image'], parsed['target']


def read_unlabeled_tfrecord(example):
    ''' Parse one serialized unlabelled example; return (encoded image, image name). '''
    string_feature = tf.io.FixedLenFeature([], tf.string)
    int_feature = tf.io.FixedLenFeature([], tf.int64)

    # Scalar features stored in every unlabelled record.
    feature_spec = {
        'image': string_feature,
        'image_name': string_feature,
        'patient_id': int_feature,
        'sex': int_feature,
        'age_approx': int_feature,
        'anatom_site_general_challenge': int_feature,
    }

    parsed = tf.io.parse_single_example(example, feature_spec)
    return parsed['image'], parsed['image_name']

def count_data_items(filenames):
    ''' Sum the item counts encoded in the file names.

    Each name is expected to carry its count between a '-' and the following
    '.', e.g. 'train00-2071.tfrec' contributes 2071.
    '''
    count_pattern = re.compile(r'-([0-9]*)\.')
    counts = []
    for filename in filenames:
        counts.append(int(count_pattern.search(filename).group(1)))
    return np.sum(counts)

In [None]:
def getTrainDataset(files, cfg, augment=True, shuffle=True, repeat=True):
    ''' Build the tf.data pipeline of (image, label) batches from labelled TFRecord files.

    The raw records are cached, parsed, optionally repeated and shuffled
    (buffer of 2048), turned into images by prepare_image, batched with
    cfg['batch_size'] * number of replicas, and prefetched.
    '''

    def to_image_and_label(img, label):
        return prepare_image(img, augment=augment, cfg=cfg), label

    dataset = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO).cache()

    if shuffle:
        # Record order does not matter when shuffling, so allow
        # non-deterministic reads.
        read_options = tf.data.Options()
        read_options.experimental_deterministic = False
        dataset = dataset.with_options(read_options)

    dataset = dataset.map(read_labeled_tfrecord, num_parallel_calls=AUTO)
    if repeat:
        dataset = dataset.repeat()

    if shuffle:
        dataset = dataset.shuffle(2048)

    dataset = dataset.map(to_image_and_label, num_parallel_calls=AUTO)

    global_batch_size = cfg['batch_size'] * strategy.num_replicas_in_sync
    return dataset.batch(global_batch_size).prefetch(AUTO)

def getTestDataset(files, cfg, augment=False, repeat=False):
    ''' Build the tf.data pipeline of (image, image name) batches from unlabelled TFRecord files.

    The raw records are cached, optionally repeated, parsed, turned into
    images by prepare_image, batched with cfg['batch_size'] * number of
    replicas, and prefetched.
    '''

    def to_image_and_name(img, idnum):
        return prepare_image(img, augment=augment, cfg=cfg), idnum

    dataset = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO).cache()
    if repeat:
        dataset = dataset.repeat()

    dataset = dataset.map(read_unlabeled_tfrecord, num_parallel_calls=AUTO)
    dataset = dataset.map(to_image_and_name, num_parallel_calls=AUTO)

    global_batch_size = cfg['batch_size'] * strategy.num_replicas_in_sync
    return dataset.batch(global_batch_size).prefetch(AUTO)

In [None]:
def get_model(weights):
    ''' Build the classifier: an EfficientNet-B5 backbone followed by one sigmoid unit.

    weights is passed through to efn.EfficientNetB5. The input size is read
    from the module-level cfg['img_size'].
    '''

    side = cfg['img_size']
    input_shape = (side, side, 3)

    model_input = tf.keras.Input(shape=input_shape, name='img_input')

    # Backbone without its classification head, reduced by average pooling.
    backbone = efn.EfficientNetB5(include_top=False, weights=weights,
                                  input_shape=input_shape, pooling='avg')
    features = backbone(model_input)
    prediction = tf.keras.layers.Dense(1, activation='sigmoid')(features)

    return tf.keras.Model(model_input, prediction, name='EffNet')

def compileModel(cfg, weights='noisy-student'):
    ''' Create the model inside the distribution strategy scope and compile it.

    The loss is binary cross-entropy with label smoothing
    cfg['label_smooth_fac'], the optimizer is cfg['optimizer'], and AUC is
    tracked under the name 'auc'.
    '''

    with strategy.scope():
        model = get_model(weights=weights)

        smoothed_bce = tf.keras.losses.BinaryCrossentropy(
            label_smoothing=cfg['label_smooth_fac'])
        model.compile(optimizer=cfg['optimizer'],
                      loss=smoothed_bce,
                      metrics=[tf.keras.metrics.AUC(name='auc')])
    return model


def getLearnRateCallback(cfg):
    ''' Return a LearningRateScheduler callback with ramp-up, sustain and decay phases.

    The rate rises linearly from cfg['lr_start'] to the peak during the first
    cfg['lr_rampup'] epochs, stays at the peak for cfg['lr_sustain'] epochs,
    then decays towards cfg['lr_min'] by a factor of cfg['lr_decay'] per
    epoch. The peak is cfg['lr_max'] scaled by the global batch size
    (replicas * cfg['batch_size']).
    '''

    start = cfg['lr_start']
    peak = cfg['lr_max'] * strategy.num_replicas_in_sync * cfg['batch_size']
    floor = cfg['lr_min']
    rampup_epochs = cfg['lr_rampup']
    sustain_epochs = cfg['lr_sustain']
    decay = cfg['lr_decay']

    def lrfn(epoch):
        if epoch < rampup_epochs:
            return (peak - start) / rampup_epochs * epoch + start
        if epoch < rampup_epochs + sustain_epochs:
            return peak
        return (peak - floor) * decay**(epoch - rampup_epochs -
                                        sustain_epochs) + floor

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

In [None]:
def plot_history(cfg, history, network='noisy-student'):
    ''' Plot the AUC and loss curves of one training run.

    cfg: configuration dict; cfg['epochs'] sets the length of the x axis, so
        the history is expected to hold that many entries per metric.
    history: object returned by model.fit; reads history.history['auc'],
        ['val_auc'], ['loss'] and ['val_loss'].
    network: name of the pre-trained weights, used only in the plot title.

    AUC is drawn against the left y axis and loss against a twin right y axis.
    The best validation AUC and the lowest validation loss are highlighted.
    '''

    plt.figure(figsize=(15,5))

    # Train / validation AUC per epoch.
    plt.plot(np.arange(cfg['epochs']), history.history['auc'],'-o', 
             label='Train AUC', color=color_palette[0])
    plt.plot(np.arange(cfg['epochs']), history.history['val_auc'],'-o', 
             label='Val AUC', color=color_palette[-2])

    # Mark the epoch with the highest validation AUC; the text offsets are
    # expressed relative to the current axis ranges.
    x = np.argmax(history.history['val_auc'])
    y = np.max(history.history['val_auc'])
    xdist = plt.xlim()[1] - plt.xlim()[0]
    ydist = plt.ylim()[1] - plt.ylim()[0]
    plt.scatter(x, y, s=200, color=color_palette[-2])
    plt.text(x-0.03*xdist, y-0.13*ydist, 'max auc\n%.2f'%y, size=14)
    plt.ylabel('AUC', size=14)
    plt.xlabel('Epoch', size=14)
    plt.legend(loc=2)

    # Second y axis sharing the same x axis, used for the loss curves.
    plt2 = plt.gca().twinx()
    plt2.plot(np.arange(cfg['epochs']), history.history['loss'],'-o', 
              label='Train Loss', color=color_palette[1])
    plt2.plot(np.arange(cfg['epochs']), history.history['val_loss'],'-o', 
              label='Val Loss', color=color_palette[-3])
    # Mark the epoch with the lowest validation loss.
    # NOTE(review): the plt.* calls below act on pyplot's current axes,
    # presumably the twin axis created above - confirm.
    x = np.argmin(history.history['val_loss'])
    y = np.min(history.history['val_loss'])
    ydist = plt.ylim()[1] - plt.ylim()[0]
    plt.scatter(x, y, s=200, color=color_palette[-3])
    plt.text(x-0.03*xdist, y+0.05*ydist, 'min loss', size=14)
    plt.ylabel('Loss', size=14)

    plt.title('Efficientnet-B5 ' + network + ' on melanoma 384 x 384')
    plt.legend(loc=3)
    plt.show()

In [None]:
def plot_roc(y_trues, y_preds, x_max=1.0):
    ''' Plot one ROC curve per prediction set on a single axis.

    y_trues / y_preds: sequences of equal length; y_trues[i] holds the true
        labels and y_preds[i] the predicted scores of the i-th curve.
    x_max: upper limit of the false-positive-rate axis.

    Each curve gets its own colour and is labelled with its AUC. Only one
    axis is created, so the figure has no empty second panel.
    '''
    # Local 20-colour palette; it intentionally shadows the 7-colour global one.
    color_palette = [
        '#%02x%02x%02x' % (int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255))
        for rgb in sns.color_palette("magma", n_colors=20)
    ]

    fig, ax = plt.subplots(figsize=(8, 6))
    for i, y_pred in enumerate(y_preds):
        y_true = y_trues[i]
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        auc = roc_auc_score(y_true, y_pred)
        # Step through the palette so several curves stay distinguishable;
        # the first curve keeps the first palette colour.
        curve_color = color_palette[(3 * i) % len(color_palette)]
        ax.plot(fpr, tpr, label='AUC=%.3f' % auc, marker='o', markersize=1, c=curve_color)

    ax.legend()
    ax.grid()
    # Diagonal reference line of a random classifier.
    ax.plot(np.linspace(0, 1, 20), np.linspace(0, 1, 20), linestyle='--', c=color_palette[-10])
    ax.set_title('ROC curve')
    ax.set_xlabel('False Positive Rate')
    ax.set_xlim([-0.01, x_max])
    _ = ax.set_ylabel('True Positive Rate')

# Run Models

## Compare epoch

In [None]:
# Cloud storage location of the 'melanoma-384x384' dataset (.tfrec files).
GCS_PATH = KaggleDatasets().get_gcs_path('melanoma-384x384')
# NOTE: cfg is a shared dict, so this replaces the 10 epochs of the
# configuration cell for every cell that runs afterwards as well.
cfg['epochs'] = 20

# Fixed split by TFRecord shard number: shard 8 is held out for validation,
# the other fourteen shards (0-7 and 9-14) are used for training.
idx_train, idx_valid = [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14], [8]

# Shard numbers are zero-padded to two digits in the file names.
files_train = np.array(tf.io.gfile.glob([GCS_PATH + '/train%.2i*.tfrec' % x for x in idx_train]))
files_valid = np.array(tf.io.gfile.glob([GCS_PATH + '/train%.2i*.tfrec'% x for x in idx_valid]))

# print(files_train, files_valid, sep='\n')

# Default weights of compileModel: 'noisy-student'.
model = compileModel(cfg)
model.summary()

# Training data: augmented, shuffled and repeated (the defaults).
# Validation data: read once, in order, without augmentation.
ds_train = getTrainDataset(files_train, cfg)
ds_valid = getTrainDataset(files_valid, cfg, augment=False, shuffle=False, repeat=False)
# The training set repeats endlessly, so the epoch length is set explicitly:
# number of training images divided by the global batch size.
stepsTrain = count_data_items(files_train) / (cfg['batch_size'] * strategy.num_replicas_in_sync)

history = model.fit(ds_train,
                    validation_data=ds_valid,
                    steps_per_epoch=stepsTrain,
                    epochs=cfg['epochs'], verbose=1,
                    callbacks=[getLearnRateCallback(cfg)])

plot_history(cfg, history)

## Compare Pre-trained type

In [None]:
# imagenet EfficientNet test run
# Same data split and training setup as the noisy-student runs; only the
# pre-trained weights differ.
GCS_PATH = KaggleDatasets().get_gcs_path('melanoma-384x384')

# Shard 8 is held out for validation; the other fourteen shards are used for
# training.
idx_train, idx_valid = [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14], [8]

files_train = np.array(tf.io.gfile.glob([GCS_PATH + '/train%.2i*.tfrec' % x for x in idx_train]))
files_valid = np.array(tf.io.gfile.glob([GCS_PATH + '/train%.2i*.tfrec'% x for x in idx_valid]))

# print(files_train, files_valid, sep='\n')

# Start from ImageNet weights instead of the default 'noisy-student' ones.
model = compileModel(cfg, weights='imagenet')
model.summary()

# Training data: augmented, shuffled and repeated (the defaults).
# Validation data: read once, in order, without augmentation.
ds_train = getTrainDataset(files_train, cfg)
ds_valid = getTrainDataset(files_valid, cfg, augment=False, shuffle=False, repeat=False)
# Epoch length: number of training images divided by the global batch size.
stepsTrain = count_data_items(files_train) / (cfg['batch_size'] * strategy.num_replicas_in_sync)

history = model.fit(ds_train,
                    validation_data=ds_valid,
                    steps_per_epoch=stepsTrain,
                    epochs=cfg['epochs'], verbose=1,
                    callbacks=[getLearnRateCallback(cfg)])

plot_history(cfg, history, network='pre_trained imagenet')

In [None]:
# noisy-student EfficientNet test run
# Same data split and training setup as the imagenet run; only the pre-trained
# weights differ.
GCS_PATH = KaggleDatasets().get_gcs_path('melanoma-384x384')

# Shard 8 is held out for validation; the other fourteen shards are used for
# training.
idx_train, idx_valid = [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14], [8]

files_train = np.array(tf.io.gfile.glob([GCS_PATH + '/train%.2i*.tfrec' % x for x in idx_train]))
files_valid = np.array(tf.io.gfile.glob([GCS_PATH + '/train%.2i*.tfrec'% x for x in idx_valid]))

# print(files_train, files_valid, sep='\n')

# Default weights of compileModel: 'noisy-student'.
model = compileModel(cfg)
model.summary()

# Training data: augmented, shuffled and repeated (the defaults).
# Validation data: read once, in order, without augmentation.
ds_train = getTrainDataset(files_train, cfg)
ds_valid = getTrainDataset(files_valid, cfg, augment=False, shuffle=False, repeat=False)
# Epoch length: number of training images divided by the global batch size.
stepsTrain = count_data_items(files_train) / (cfg['batch_size'] * strategy.num_replicas_in_sync)

history = model.fit(ds_train,
                    validation_data=ds_valid,
                    steps_per_epoch=stepsTrain,
                    epochs=cfg['epochs'], verbose=1,
                    callbacks=[getLearnRateCallback(cfg)])

plot_history(cfg, history)

## KFold

In [None]:
# KFold run
# Train one freshly compiled model per fold and keep every training history.
histories = []
GCS_PATH = KaggleDatasets().get_gcs_path('melanoma-384x384')
# NOTE(review): files_test is not used anywhere in this cell.
files_test  = np.sort(tf.io.gfile.glob(GCS_PATH + '/test*.tfrec'))

# The folds are formed over the 15 TFRecord shard numbers, not over single
# images. This is a plain (non-stratified) KFold despite the name `skf`.
skf = KFold(n_splits=5, shuffle=True, random_state=seed_val)
for fold, (idx_train, idx_valid) in enumerate(skf.split(np.arange(15))):
    
    # Shard numbers are zero-padded to two digits in the file names.
    files_train = np.array(tf.io.gfile.glob([GCS_PATH + '/train%.2i*.tfrec' % x for x in idx_train]))
    files_valid = np.array(tf.io.gfile.glob([GCS_PATH + '/train%.2i*.tfrec'% x for x in idx_valid]))
    
    # A new model is built for every fold; the summary is printed only once.
    model = compileModel(cfg)
    if fold == 0:
        model.summary()
    
    # Training data: augmented, shuffled and repeated (the defaults).
    # Validation data: read once, in order, without augmentation.
    ds_train = getTrainDataset(files_train, cfg)
    ds_valid = getTrainDataset(files_valid, cfg, augment=False, shuffle=False, repeat=False)
    # Epoch length: number of training images divided by the global batch size.
    stepsTrain = count_data_items(files_train) / (cfg['batch_size'] * strategy.num_replicas_in_sync)
    
    print("Fold", fold + 1, '-' * 15, '\n')
    history = model.fit(ds_train,
                        validation_data=ds_valid,
                        steps_per_epoch=stepsTrain,
                        epochs=cfg['epochs'], verbose=1,
                        callbacks=[getLearnRateCallback(cfg)])
    
    plot_history(cfg, history)
    histories.append(history)