In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import math
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
from os import getcwd
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import RMSprop
import shutil
from shutil import rmtree
import keras_preprocessing
from keras_preprocessing import image
from keras_preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.applications.inception_resnet_v2 import InceptionResNetV2
import fnmatch
from kaggle_datasets import KaggleDatasets
import re
from collections import Counter
import tensorflow.keras.backend as K
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Detect the TPU, connect this runtime to its cluster and initialise the TPU system.
# The three calls below must run in this order before any strategy is created.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# Distribution strategy used by later cells (model creation happens inside tpu_strategy.scope(),
# and batch size / learning rate are scaled by tpu_strategy.num_replicas_in_sync).
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# GCS location of the competition dataset; the TFRecord globs in a later cell are built from it.
GCS_PATH = KaggleDatasets().get_gcs_path('siim-isic-melanoma-classification')

In [None]:
def build_lrfn(lr_start=0.00001, lr_max=0.000075, lr_min=0.000001, lr_rampup_epochs=20, lr_sustain_epochs=0, lr_exp_decay=.8):
    """Build an epoch -> learning-rate function (linear ramp-up, plateau, exponential decay).

    Args:
        lr_start: learning rate at epoch 0.
        lr_max: peak learning rate per replica; it is multiplied by
            ``tpu_strategy.num_replicas_in_sync`` before use.
        lr_min: floor that the decay phase approaches.
        lr_rampup_epochs: number of epochs of linear ramp from ``lr_start`` to the peak.
        lr_sustain_epochs: number of epochs the peak is held after the ramp.
        lr_exp_decay: per-epoch multiplicative decay factor applied after the plateau.

    Returns:
        A function ``lrfn(epoch)`` returning the learning rate for that epoch.
    """
    peak_lr = lr_max * tpu_strategy.num_replicas_in_sync

    def lrfn(epoch):
        # Phase 1: linear ramp from lr_start towards the peak.
        if epoch < lr_rampup_epochs:
            return (peak_lr - lr_start) / lr_rampup_epochs * epoch + lr_start
        # Phase 2: hold the peak.
        if epoch < lr_rampup_epochs + lr_sustain_epochs:
            return peak_lr
        # Phase 3: exponential decay towards lr_min.
        return (peak_lr - lr_min) * lr_exp_decay ** (epoch - lr_rampup_epochs - lr_sustain_epochs) + lr_min

    return lrfn

In [None]:
# Data leakage between training and testing data over PatientId column
def check_for_leakage(df1, df2, patient_col):
    """
    Return True if any patient appears in both df1 and df2.

    Rows whose patient ID is missing (NaN/None) are ignored: a missing ID in
    both frames does not identify a shared patient and must not be reported
    as leakage.

    Args:
        df1 (dataframe): dataframe describing first dataset
        df2 (dataframe): dataframe describing second dataset
        patient_col (str): string name of column with patient IDs

    Returns:
        leakage (bool): True if there is leakage, otherwise False
    """
    # Work on the distinct, non-missing IDs of each frame.
    df1_patients_unique = set(df1[patient_col].dropna().unique())
    df2_patients_unique = set(df2[patient_col].dropna().unique())

    # Leakage exists as soon as the two ID sets share at least one element.
    return not df1_patients_unique.isdisjoint(df2_patients_unique)


# Load the competition metadata tables and report whether any patient occurs in both splits.
train_df = pd.read_csv("/kaggle/input/siim-isic-melanoma-classification/train.csv")
test_df = pd.read_csv("/kaggle/input/siim-isic-melanoma-classification/test.csv")
has_leakage = check_for_leakage(train_df, test_df, 'patient_id')
print(f"Data Leakage: {has_leakage}")

In [None]:
# Global batch size: 16 examples per TPU replica.
BATCH_SIZE = 16 * tpu_strategy.num_replicas_in_sync
Epochs = 16
# Images are decoded to IMG_HEIGHT x IMG_WIDTH x 3 (see decode_image).
IMG_HEIGHT = 1024
IMG_WIDTH = 1024
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/tfrecords/train*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/tfrecords/test*.tfrec')
AUTO = tf.data.experimental.AUTOTUNE
# Checkpoints must be written somewhere writable: /kaggle/input is the read-only
# dataset mount, so the weights go to the notebook's working directory as one HDF5 file.
checkpoint_filepath = '/kaggle/working/best_weights.h5'


# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():
    print('Loading InceptionResNet version 2 model...')
    # NOTE(review): decode_image feeds pixels scaled to [0, 1]; the Keras
    # inception_resnet_v2.preprocess_input used for the ImageNet weights scales
    # to [-1, 1] - confirm whether the input scaling should be aligned.
    pre_trained_model = InceptionResNetV2(include_top = False,
                        weights = 'imagenet',
                        input_tensor = None,
                        input_shape = (IMG_HEIGHT,IMG_WIDTH,3))

    print('Disabling training of all layers in the pre-trained model')
    # Freeze the whole backbone (this propagates to every layer it contains).
    pre_trained_model.trainable = False

    print('Appending pre-trained model with extra layers...')
    # Classification head: pooled backbone features -> 4 dense/dropout stages -> sigmoid probability.
    model = tf.keras.Sequential([
            pre_trained_model,
            tf.keras.layers.GlobalAveragePooling2D(),
            tf.keras.layers.Dense(512, activation = 'relu'),
            tf.keras.layers.Dropout(0.1),
            tf.keras.layers.Dense(256, activation = 'relu'),
            tf.keras.layers.Dropout(0.1),
            tf.keras.layers.Dense(64, activation = 'relu'),
            tf.keras.layers.Dropout(0.1),
            tf.keras.layers.Dense(16, activation = 'relu'),
            tf.keras.layers.Dropout(0.1),
            tf.keras.layers.Dense(1, activation = 'sigmoid')])


    # The model weights (that are considered the best) are loaded into the model.
    # model.load_weights(checkpoint_filepath)

    print('Defining optimizer, loss and metrics...')
    # `metrics` is documented as a list; pass one, as the tabular model's compile() does.
    # The metric is named 'auc' so it appears under that key in the training logs/history.
    model.compile(optimizer = 'adam',
                  loss = 'binary_crossentropy',
                  metrics = [tf.keras.metrics.AUC(name='auc')])

print('\nModel Summary:')
model.summary()

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 image tensor of shape [IMG_HEIGHT, IMG_WIDTH, 3].

    Pixel values are divided by 255 so they lie in the [0, 1] range.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # explicit size needed for TPU
    return tf.reshape(scaled, [IMG_HEIGHT, IMG_WIDTH, 3])

def read_labeled_tfrecord(example):
    """Parse one serialized training example into an (image, label) pair.

    The image is decoded with decode_image; the int64 "target" feature is cast to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # JPEG bytes
        "target": tf.io.FixedLenFeature([], tf.int64),  # scalar class label
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    decoded = decode_image(parsed['image'])
    target = tf.cast(parsed['target'], tf.int32)
    return decoded, target


def read_unlabeled_tfrecord(example):
    """Parse one serialized test example into an (image, image_name) pair.

    Test records carry no "target" feature; the image name is returned as a
    string tensor so predictions can be matched to submission rows.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),       # JPEG bytes
        "image_name": tf.io.FixedLenFeature([], tf.string),  # scalar identifier
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    decoded = decode_image(parsed['image'])
    return decoded, parsed['image_name']

def data_augment(image, label):
    """Apply a random horizontal flip to the image; the label is passed through unchanged.

    Intended to be used inside a tf.data pipeline (dataset.map), so the work is
    done by the input pipeline rather than the training step.
    """
    flipped = tf.image.random_flip_left_right(image)
    return flipped, label

def get_test_dataset(ordered = False):
    """Return the batched, prefetched test dataset of (image, image_name) pairs.

    Args:
        ordered: forwarded to load_dataset; pass True to keep the record order.
    """
    return (
        load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)  # autotuned prefetch buffer
    )

def count_data_items(filenames):
    """Sum the per-file item counts encoded in TFRecord file names.

    The number of data items is written in the name of the .tfrec files,
    i.e. ``flowers00-230.tfrec`` = 230 data items. Only the final path
    component is inspected, so dashes/digits/dots in a bucket or directory
    name cannot be mistaken for the count.

    Args:
        filenames: iterable of file paths or URLs using ``/`` as separator.

    Returns:
        Total item count as a numpy integer (0 for an empty iterable).

    Raises:
        ValueError: if a file name does not contain a ``-<digits>.`` count.
    """
    count_pattern = re.compile(r"-([0-9]+)\.")  # compiled once, not per file
    counts = []
    for filename in filenames:
        basename = filename.rsplit('/', 1)[-1]
        match = count_pattern.search(basename)
        if match is None:
            raise ValueError(
                "Cannot read item count from file name: {!r}".format(filename))
        counts.append(int(match.group(1)))
    # Force an integer dtype so an empty list yields 0 rather than 0.0.
    return np.sum(counts, dtype=np.int64)

def load_dataset(filenames, labeled = True, ordered = False):
    """Read TFRecord files into a dataset of parsed examples.

    Args:
        filenames: TFRecord files to read; several files are read in parallel.
        labeled: if True yield (image, label) pairs, otherwise (image, image_name) pairs.
        ordered: if False, deterministic ordering is switched off for speed.

    Returns:
        A tf.data.Dataset of parsed (unbatched) examples.
    """
    options = tf.data.Options()
    if not ordered:
        # Allow elements to be produced as soon as they are available.
        options.experimental_deterministic = False

    parser = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord

    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    records = records.with_options(options)
    return records.map(parser, num_parallel_calls=AUTO)

print('Performing Data Augmentation...')
# Over sampling of data
#Step 1 - Get labels and their countings

def _read_target_only(example):
    """Parse only the "target" feature of a serialized training example (no image decoding)."""
    parsed = tf.io.parse_single_example(example, {"target": tf.io.FixedLenFeature([], tf.int64)})
    return tf.cast(parsed["target"], tf.int32)

# Counting labels does not need the pixels, so skip the JPEG decode that
# load_dataset would perform for every training image.
training_labels_dataset = (
    tf.data.TFRecordDataset(TRAINING_FILENAMES, num_parallel_reads=AUTO)
    .map(_read_target_only, num_parallel_calls=AUTO)
    .batch(1024)
)
label_counter = Counter()
for label_batch in training_labels_dataset:
    label_counter.update(label_batch.numpy())
del training_labels_dataset
label_counting_sorted = label_counter.most_common()
NUM_TRAINING_IMAGES = sum(label_counter.values())
print("\nNumber of examples in the original training dataset: {}".format(NUM_TRAINING_IMAGES))
print("Labels in the original training dataset, sorted by occurrence")
print(label_counting_sorted)

#Step 2 - Define the number of repetitions for each class
# Every class should end up with (approximately) at least this many examples:
# 20% of the original training set size, rounded up.
TARGET_MIN_COUNTING = math.ceil(0.2 * NUM_TRAINING_IMAGES)

def get_num_of_repetition_for_class(class_id):
    """Return how many times, on average, each example of ``class_id`` should be repeated.

    Classes that already have at least TARGET_MIN_COUNTING examples get 1.0,
    classes with no examples get 0, and every other class gets the (fractional)
    ratio TARGET_MIN_COUNTING / count.
    """
    occurrences = label_counter[class_id]
    if occurrences >= TARGET_MIN_COUNTING:
        return 1.0
    if occurrences == 0:
        return 0
    return TARGET_MIN_COUNTING / occurrences

# NOTE(review): 104 class ids are enumerated although the targets of this task look
# binary; ids never seen in label_counter simply get a repetition of 0 - confirm the range.
numbers_of_repetition_for_classes = {class_id: get_num_of_repetition_for_class(class_id) for class_id in range(104)}
# Show the classes that will actually be oversampled (repetition > 1), largest first.
# A bare expression in the middle of a cell is discarded, so print it explicitly.
print({k: v for k, v in sorted(numbers_of_repetition_for_classes.items(), key=lambda item: item[1], reverse=True) if v > 1})

#Step 3 - Define the number of repetitions for each training example
# Lookup table label -> repetition factor, usable inside the tf.data pipeline.
# Labels missing from the table map to the default value -1.
keys_tensor = tf.constant(list(numbers_of_repetition_for_classes.keys()))
vals_tensor = tf.constant(list(numbers_of_repetition_for_classes.values()))
table = tf.lookup.StaticHashTable(tf.lookup.KeyValueTensorInitializer(keys_tensor, vals_tensor), -1)

def get_num_of_repetition_for_example(training_example):
    """Return the integer number of copies to emit for one (image, label) example.

    The class's fractional repetition factor is looked up in ``table``. Its
    integral part is always used; one extra copy is added with probability
    equal to the fractional part, so the expected number of copies equals the
    factor.

    Args:
        training_example: an (image, label) tuple; only the label is used.

    Returns:
        Scalar int64 tensor with the number of repetitions.
    """
    _, label = training_example
    num_to_repeat = table.lookup(label)
    # Truncate with tensor ops instead of Python's int(): int() on a tensor
    # presumably only works here because AutoGraph rewrites it inside tf.data
    # functions, whereas tf.cast behaves the same in every execution mode.
    num_to_repeat_integral = tf.cast(tf.cast(num_to_repeat, tf.int32), tf.float32)
    residue = num_to_repeat - num_to_repeat_integral
    # tf.random.uniform samples from [0, 1); a strict '<' makes the chance of an
    # extra copy exactly `residue` (and exactly zero when there is no residue).
    extra_copy = tf.cast(tf.random.uniform(shape=()) < residue, tf.float32)
    return tf.cast(num_to_repeat_integral + extra_copy, tf.int64)

#Step 4 - Use data augmentation to avoid (exactly) same images appear too many times

def get_mat(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
    """Return the 3x3 matrix that maps destination pixel indices to source indices.

    The result is (rotation @ shear) @ (zoom @ shift). ``rotation`` and
    ``shear`` are given in degrees; all arguments are expected to be float32
    tensors of shape [1] so they can be concatenated with the constants below.
    """
    def as_3x3(entries):
        # Nine shape-[1] tensors, row-major -> one 3x3 matrix.
        return tf.reshape(tf.concat(entries, axis=0), [3, 3])

    # Degrees -> radians.
    rotation_rad = math.pi * rotation / 180.
    shear_rad = math.pi * shear / 180.

    one = tf.constant([1], dtype='float32')
    zero = tf.constant([0], dtype='float32')

    # Rotation
    cos_r = tf.math.cos(rotation_rad)
    sin_r = tf.math.sin(rotation_rad)
    rotation_matrix = as_3x3([cos_r, sin_r, zero,
                              -sin_r, cos_r, zero,
                              zero, zero, one])

    # Shear
    cos_s = tf.math.cos(shear_rad)
    sin_s = tf.math.sin(shear_rad)
    shear_matrix = as_3x3([one, sin_s, zero,
                           zero, cos_s, zero,
                           zero, zero, one])

    # Zoom (reciprocal scale on each axis)
    zoom_matrix = as_3x3([one / height_zoom, zero, zero,
                          zero, one / width_zoom, zero,
                          zero, zero, one])

    # Shift
    shift_matrix = as_3x3([one, zero, height_shift,
                           zero, one, width_shift,
                           zero, zero, one])

    rotate_then_shear = K.dot(rotation_matrix, shear_matrix)
    zoom_then_shift = K.dot(zoom_matrix, shift_matrix)
    return K.dot(rotate_then_shear, zoom_then_shift)


def transform(image, label):
    """Randomly rotate, shear, zoom and shift a single image; the label is returned unchanged.

    Args:
        image: one image of size [dim,dim,3] (not a batch of [b,dim,dim,3]);
            dim is taken from IMG_HEIGHT, so the image is treated as square.
        label: passed through untouched.

    Returns:
        (transformed_image, label), the image reshaped to [DIM, DIM, 3].
    """
    # input image - is one image of size [dim,dim,3] not a batch of [b,dim,dim,3]
    # output - image randomly rotated, sheared, zoomed, and shifted
    DIM = IMG_HEIGHT #IMAGE_SIZE[0]
    XDIM = DIM%2 #fix for size 331 (1 for odd DIM, 0 for even DIM)
    
    # Draw the six transformation parameters from scaled standard normals.
    # rot/shr are in degrees (get_mat converts them to radians); the zooms are
    # centred on 1.0; the shifts are scaled by 16 (applied to pixel indices).
    rot = 15. * tf.random.normal([1],dtype='float32')
    shr = 5. * tf.random.normal([1],dtype='float32') 
    h_zoom = 1.0 + tf.random.normal([1],dtype='float32')/10.
    w_zoom = 1.0 + tf.random.normal([1],dtype='float32')/10.
    h_shift = 16. * tf.random.normal([1],dtype='float32') 
    w_shift = 16. * tf.random.normal([1],dtype='float32') 
  
    # GET TRANSFORMATION MATRIX
    m = get_mat(rot,shr,h_zoom,w_zoom,h_shift,w_shift) 

    # LIST DESTINATION PIXEL INDICES
    # Coordinates are centred on the image middle; z is the constant 1 of the
    # homogeneous coordinates so the 3x3 matrix can also apply the shift.
    x = tf.repeat( tf.range(DIM//2,-DIM//2,-1), DIM )
    y = tf.tile( tf.range(-DIM//2,DIM//2),[DIM] )
    z = tf.ones([DIM*DIM],dtype='int32')
    idx = tf.stack( [x,y,z] )
    
    # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
    # Cast to int32 to get integer source coordinates, then clip them so the
    # gather below stays inside the image.
    idx2 = K.dot(m,tf.cast(idx,dtype='float32'))
    idx2 = K.cast(idx2,dtype='int32')
    idx2 = K.clip(idx2,-DIM//2+XDIM+1,DIM//2)
    
    # FIND ORIGIN PIXEL VALUES           
    # Convert the centred coordinates back to (row, column) array indices and gather.
    idx3 = tf.stack( [DIM//2-idx2[0,], DIM//2-1+idx2[1,]] )
    d = tf.gather_nd(image,tf.transpose(idx3))
        
    return tf.reshape(d,[DIM,DIM,3]), label


#Step 5 - A method to get oversampled training dataset
def get_training_dataset_with_oversample(repeat_dataset=True, oversample=False, augumentation=False, shuffle_buffer=20000):
    """Build the batched training dataset, optionally oversampled and augmented.

    Args:
        repeat_dataset: repeat indefinitely (needed when training for several epochs).
        oversample: repeat each example according to get_num_of_repetition_for_example.
        augumentation: apply the random geometric ``transform`` to every example,
            so repeated copies of an image are not identical.
        shuffle_buffer: number of examples held in the shuffle buffer.
            NOTE(review): every buffered example is a float32 image of
            IMG_HEIGHT x IMG_WIDTH x 3 (about 12 MiB at 1024x1024), so the default
            of 20000 needs far more host memory than a typical notebook VM has -
            confirm, and pass a smaller value if memory is exhausted.

    Returns:
        A tf.data.Dataset of (image_batch, label_batch).
    """
    dataset = load_dataset(TRAINING_FILENAMES, labeled=True)

    if oversample:
        def _repeat_example(image, label):
            # One example -> a tiny dataset containing it `copies` times.
            copies = get_num_of_repetition_for_example((image, label))
            return tf.data.Dataset.from_tensors((image, label)).repeat(copies)
        dataset = dataset.flat_map(_repeat_example)
    if augumentation:
        dataset = dataset.map(transform, num_parallel_calls=AUTO)
    if repeat_dataset:
        dataset = dataset.repeat() # the training dataset must repeat for several epochs

    # Copies produced by oversampling are adjacent, so shuffle before batching.
    dataset = dataset.shuffle(shuffle_buffer)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO) # prefetch next batch while training (autotune prefetch buffer size)
    return dataset

#Step 6 - Check oversampled dataset
# One full (non-repeating) pass over the oversampled, augmented data to count labels per class.
oversampled_training_dataset = get_training_dataset_with_oversample(repeat_dataset=False, oversample=True, augumentation=True)

label_counter_2 = Counter()
for _image_batch, label_batch in oversampled_training_dataset:
    label_counter_2.update(label_batch.numpy())

del oversampled_training_dataset

label_counting_sorted_2 = label_counter_2.most_common()

NUM_TRAINING_IMAGES_OVERSAMPLED = sum(count for _label, count in label_counting_sorted_2)
print("\nNumber of examples in the oversampled training dataset: {}".format(NUM_TRAINING_IMAGES_OVERSAMPLED))

print("Labels in the oversampled training dataset, sorted by occurrence")
print(label_counting_sorted_2)

print('\nData Augmentation completed...')

In [None]:
# NOTE(review): steps per epoch are derived from the original (not oversampled)
# example count, so one "epoch" covers less than one pass over the oversampled data.
StepsPerEpoch = count_data_items(TRAINING_FILENAMES)//BATCH_SIZE

# Balanced class weights, computed from the label counts measured on the
# oversampled dataset in Step 6 instead of hard-coded numbers from an earlier run.
weight_for_0 = (1 / label_counter_2[0])*(NUM_TRAINING_IMAGES_OVERSAMPLED)/2.0
weight_for_1 = (1 / label_counter_2[1])*(NUM_TRAINING_IMAGES_OVERSAMPLED)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}

# /kaggle/input is the read-only dataset mount; never write checkpoints there.
checkpoint_target = checkpoint_filepath
if checkpoint_target.startswith('/kaggle/input'):
    checkpoint_target = '/kaggle/working/best_weights.h5'

# fit() below gets no validation data and the model only reports 'loss' and
# 'auc', so a 'val_acc' monitor would never be available and nothing would be
# saved. Track the training AUC instead.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_target,
    save_weights_only = True,
    monitor = 'auc',
    mode = 'max',
    save_best_only = True)

lrfn = build_lrfn()

print('Training the model...')

history = model.fit(get_training_dataset_with_oversample(repeat_dataset=True, oversample=True, augumentation=True),
                    epochs = Epochs, 
                    steps_per_epoch = StepsPerEpoch,
                    callbacks = [tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=1), model_checkpoint_callback],
                    class_weight = class_weight)

In [None]:
import pickle  # no longer used in this cell; kept in case other cells rely on the name

# Saving the model
# Keras models should be persisted with the Keras save API rather than pickle:
# pickling a model is not a supported serialization path and unpickling
# executes arbitrary code. The .h5 suffix selects the single-file HDF5 format.
filename = 'saved_model.h5'
model.save(filename)

# Loading the saved model
# Re-create it inside the strategy scope so its variables live on the TPU again.
with tpu_strategy.scope():
    model = tf.keras.models.load_model(filename)

In [None]:
# model.save('./model.h5')
# reload_model = tf.keras.models.load_model('./model.h5')

In [None]:
#Plotting AUC curve
# One subplot per tracked training metric, stacked vertically.
metrics =  ['loss', 'auc']
for panel, metric in enumerate(metrics, start=1):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(2, 1, panel)
    plt.plot(history.epoch, history.history[metric], label='Train')
    plt.xlabel('Epoch')
    plt.ylabel(name)

    # Pick the y-range: loss starts at 0 and keeps its automatic upper bound,
    # AUC is zoomed to [0.8, 1], anything else uses [0, 1].
    if metric == 'loss':
        y_range = [0, plt.ylim()[1]]
    elif metric == 'auc':
        y_range = [0.8,1]
    else:
        y_range = [0,1]
    plt.ylim(y_range)

    plt.legend()

In [None]:
#Predicting test results
NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)

# ordered=True so that the two passes over the test set below (images, then
# image names) see the records in the same order.
test_ds = get_test_dataset(ordered=True)

print('Computing predictions...')
test_images_ds = test_ds.map(lambda image, idnum: image)
output = model.predict(test_images_ds)

# The log messages name the file that is actually written.
print('Generating submission_image.csv file...')
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U') # all in one batch

# model.predict returns one probability per row; flatten to a 1-D column.
output_df = pd.DataFrame({'image_name': test_ids, 'target': output.ravel()})

# Use the sample submission for the row order and replace its target column.
output_file = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv')
del output_file['target']
output_file = output_file.merge(output_df, on='image_name')
output_file.to_csv('./submission_image.csv', index=False)
print('submission_image.csv file generated...')

In [None]:
# Preparing numerical data
training_data = pd.read_csv("/kaggle/input/siim-isic-melanoma-classification/train.csv")
y = training_data[['target']]

# One-hot encode the two categorical columns (sex first, then anatomical site).
encoded_features = pd.get_dummies(
    pd.get_dummies(
        training_data[['sex', 'age_approx', 'anatom_site_general_challenge']],
        columns = ['sex']),
    columns = ['anatom_site_general_challenge'])

# Scale every feature with a min-max scaler fitted on the training rows;
# the result is wrapped in a DataFrame with positional (integer) column labels.
min_max_scaler = preprocessing.MinMaxScaler()
X = pd.DataFrame(min_max_scaler.fit_transform(encoded_features))

# Impute nulls in column 0 with 0
# (presumably age_approx, the only non-encoded column - TODO confirm).
X[0] = X[0].fillna(0)

# Oversample the minority class with SMOTE (Synthetic Minority Oversampling Technique)
sm = SMOTE(random_state=42)
X_numerical, y_numerical = sm.fit_resample(X, y)

In [None]:
# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():
    # Fully connected classifier over the 9 tabular features:
    # four Dense(relu) + Dropout(0.1) stages followed by a sigmoid output.
    numerical_model = tf.keras.Sequential()
    numerical_model.add(tf.keras.layers.Dense(512, activation='relu', input_shape = [9]))
    numerical_model.add(tf.keras.layers.Dropout(0.1))
    for hidden_units in (256, 128, 32):
        numerical_model.add(tf.keras.layers.Dense(hidden_units, activation='relu'))
        numerical_model.add(tf.keras.layers.Dropout(0.1))
    numerical_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # NOTE(review): learning_rate=0.1 is 100x Adam's documented default (0.001) - confirm it is intended.
    adam = tf.keras.optimizers.Adam(learning_rate=0.1)
    numerical_model.compile(optimizer= adam,
                    loss = 'binary_crossentropy',
                    metrics = [tf.keras.metrics.AUC()])

numerical_model.summary()

In [None]:
EPOCHS = 400
# NOTE(review): this divides the row count by the number of epochs (not by a
# batch size), so each "epoch" only covers a slice of the data - value kept
# as in the original; confirm the intent.
STEPS_PER_EPOCH = math.ceil(len(X)/EPOCHS)

# Class weights from the resampled labels. Flatten to a 1-D array first:
# masking a DataFrame with a boolean DataFrame keeps its full length, which
# would make both class counts equal to the total number of rows.
resampled_labels = np.asarray(y_numerical).ravel()
negatives = int((resampled_labels == 0).sum())
positives = int((resampled_labels == 1).sum())
weight_for_0 = (1 / negatives)*len(resampled_labels)/2.0
weight_for_1 = (1 / positives)*len(resampled_labels)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}

# Train on the SMOTE-resampled data; the class weights above describe exactly
# these samples (previously the un-resampled X, y were passed and the SMOTE
# output was never used).
history = numerical_model.fit(
    X_numerical, y_numerical,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    class_weight = class_weight
)

In [None]:
# Predicting test results
test_data = pd.read_csv("/kaggle/input/siim-isic-melanoma-classification/test.csv")

def _one_hot_metadata(frame):
    """One-hot encode sex and anatomical site in the same two steps used for the training features."""
    features = frame[['sex', 'age_approx', 'anatom_site_general_challenge']]
    features = pd.get_dummies(features, columns = ['sex'])
    return pd.get_dummies(features, columns = ['anatom_site_general_challenge'])

# Align the test columns with the training layout: a category absent from the
# test set becomes an all-zero column, so the model always receives the same
# features in the same order.
train_feature_columns = _one_hot_metadata(training_data).columns
X_test = _one_hot_metadata(test_data).reindex(columns=train_feature_columns, fill_value=0)

# Scale with the scaler fitted on the TRAINING data; fitting a new scaler on
# the test set would map the same raw value to a different model input.
X_test = pd.DataFrame(min_max_scaler.transform(X_test))

#Imputing null values with 0 (same column as for the training features)
X_test[0] = X_test[0].fillna(0)

print('Computing predictions...')
# Keep the raw probabilities (as the image submission does) instead of rounding them to 0/1.
output = numerical_model.predict(X_test)

print('Generating submission_numerical.csv file...')
# The predictions follow the row order of test.csv, so take the image names
# from the same table rather than from the TFRecord stream, whose order is
# unrelated to the CSV. Assumes test.csv has an image_name column, as the
# sample submission does - TODO confirm.
output_df = pd.DataFrame({'image_name': test_data['image_name'].values, 'target': output.ravel()})

# Use the sample submission for the row order and replace its target column.
output_file = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv')
del output_file['target']
output_file = output_file.merge(output_df, on='image_name')
output_file.to_csv('submission_numerical.csv', index=False)
print('submission_numerical.csv file generated...')