In [None]:
# This Python 3 environment is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import tensorflow as tf
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
import tensorflow as tf
import math, PIL

In [None]:
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout, LeakyReLU, Input, Convolution2D, BatchNormalization
from tensorflow.keras.models import load_model, Sequential, Model
from tensorflow.keras.optimizers import Adam, Adadelta
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.inception_v3 import InceptionV3
import keras
from kaggle_datasets import KaggleDatasets

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import re
#for dirname, _, filenames in os.walk('/kaggle/input'):
 #   for filename in filenames:
 #       print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras import constraints

from tensorflow.keras.layers import MaxPooling2D, Convolution2D, AveragePooling2D
from tensorflow.keras.layers import Input, Dropout, Dense, Flatten, Activation
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import concatenate

from keras.utils.layer_utils import convert_all_kernels_in_model
from keras.utils.data_utils import get_file
from tensorflow.keras.initializers import RandomNormal



In [None]:
# Pick a tf.distribute strategy: TPUStrategy when a TPU cluster can be
# resolved, otherwise whatever strategy TensorFlow currently considers default.
try:
    # The resolver is called without arguments; the original notebook notes that
    # the TPU_NAME environment variable is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if not tpu:
    # Fallback: the default strategy (works on CPU and a single GPU).
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU workers and initialise them before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Locations of the competition data (read-only) and of the notebook outputs.
input_path = '/kaggle/input/siim-isic-melanoma-classification'
output_path = '/kaggle/working'

# Only the image identifier and the binary target are read from the training table.
train_data = pd.read_csv(f'{input_path}/train.csv', usecols=['image_name', 'target'])
test_data = pd.read_csv(f'{input_path}/test.csv')

# Number of rows in test.csv; used later to gather all test ids in one batch.
NUM_TEST_IMAGES = len(test_data)

In [None]:
def focal_loss(gamma=2., alpha=.25):
    """Build a binary focal-loss function for use as a Keras loss.

    Args:
        gamma: Focusing exponent applied to (1 - p) for positives and to p for
            negatives.
        alpha: Weight of the positive-class term; the negative-class term is
            weighted by (1 - alpha).

    Returns:
        A callable ``focal_loss_fixed(y_true, y_pred)`` returning a scalar tensor.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep predictions strictly inside (0, 1): a saturated sigmoid output of
        # exactly 0 or 1 would otherwise feed log(0) and produce inf/NaN losses.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)
        # pt_1 carries the prediction where the label is 1 and a neutral 1 elsewhere
        # (so the positive term vanishes there); pt_0 does the same for label 0.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed

In [None]:
# Build and compile the classifier inside the distribution strategy scope so the
# model, optimizer and metric variables are all created under the same strategy.
with strategy.scope():
    # VGG16 convolutional base with ImageNet weights, fully fine-tunable.
    base = VGG16(input_shape=(256, 256, 3), weights='imagenet', include_top=False)
    base.trainable = True
    # Classification head: global pooling, three dense + LeakyReLU + dropout
    # stages, then a single sigmoid unit for the binary target.
    model_copy = tf.keras.Sequential([
        base,
        GlobalAveragePooling2D(),
        Dense(4096, name='fc1'),
        LeakyReLU(alpha=0.2),
        Dropout(0.2),
        Dense(4096, name='fc2'),
        LeakyReLU(alpha=0.2),
        Dropout(0.2),
        Dense(1024, name='fc3'),
        LeakyReLU(alpha=0.2),
        Dropout(0.2),
        Dense(1, activation='sigmoid', trainable=True),
    ])

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    adadelta = tf.keras.optimizers.Adadelta(learning_rate=0.0001)
    model_copy.compile(optimizer=adadelta, loss='binary_crossentropy', metrics=['AUC', 'Recall', 'Precision'])

model_copy.summary()

In [None]:
# Load a previously saved model from the attached 'target-model' input and copy
# its weights into the freshly built model_copy.
# NOTE(review): this presumably requires the saved model to have the same layer
# layout as model_copy — confirm against how model.h5 was produced.
model = load_model('/kaggle/input/target-model/model.h5')
model_copy.set_weights(model.get_weights())

In [None]:
# Write model_copy to 'model_copy.h5' in the current working directory.
model_copy.save('model_copy.h5')

In [None]:
# Re-load the file written by model_copy.save() inside the distribution strategy
# scope; reload_model is the object that is later passed to fit().
with strategy.scope():
    reload_model = load_model('model_copy.h5')
    
# Disabled alternative: re-compile with a different Adadelta learning rate.
#adadelta = tf.keras.optimizers.Adadelta(lr=0.001) 
#model.compile(optimizer=adadelta, loss='binary_crossentropy', metrics=['AUC', 'Recall', 'Precision'])
#model.summary()

In [None]:
# Print the layer summary of model_copy.
model_copy.summary()

# NOTE(review): this list is built but never assigned or passed anywhere; as the
# last expression of the cell it only shows up as cell output. The focal loss is
# not used by any compile() call visible in this notebook.
[focal_loss(alpha=.25, gamma=2)]


In [None]:
# Parameters of the learning-rate schedule implemented by lrfn():
# a linear ramp from LR_START to LR_MAX over LR_RAMPUP_EPOCHS epochs, an optional
# plateau of LR_SUSTAIN_EPOCHS epochs, then an exponential approach to LR_MIN.
# NOTE(review): with a single replica LR_MAX (5e-5) is below LR_START (1e-4), so
# the "ramp-up" phase actually descends — confirm the intended values.
LR_START = 0.0001
LR_MAX = 0.00005 * strategy.num_replicas_in_sync
# LR_MIN is the floor the schedule converges to, so it must lie below LR_MAX.
# The previous value (0.1) made the decay phase grow towards 0.1 instead.
LR_MIN = 0.00001
LR_RAMPUP_EPOCHS = 5
LR_SUSTAIN_EPOCHS = 0
LR_EXP_DECAY = .8

def lrfn(epoch):
    """Return the learning rate for the given epoch index.

    Three phases, driven by the module-level LR_* constants:
      * epoch < LR_RAMPUP_EPOCHS: linear interpolation starting at LR_START and
        reaching LR_MAX at epoch == LR_RAMPUP_EPOCHS;
      * the following LR_SUSTAIN_EPOCHS epochs: constant LR_MAX;
      * afterwards: LR_MIN plus the gap (LR_MAX - LR_MIN) shrunk by a factor of
        LR_EXP_DECAY per epoch.
    """
    if epoch < LR_RAMPUP_EPOCHS:
        slope = (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS
        return slope * epoch + LR_START

    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    epochs_into_decay = epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS
    return (LR_MAX - LR_MIN) * LR_EXP_DECAY**epochs_into_decay + LR_MIN

# Disabled alternative schedule: pure exponential decay with the epoch offset by 50.
#def lrfn(epoch):
#    lr = (LR_MAX - LR_MIN) * LR_EXP_DECAY**((50+epoch) - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN
#    return lr
    
# Keras callback that sets the learning rate to lrfn(epoch) each epoch.
# NOTE(review): lr_callback is not included in the callbacks list passed to fit()
# further down in this notebook, so the schedule is currently unused.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)

In [None]:
# Image size as [height, width].
# NOTE(review): the decode_* helpers in this notebook hard-code [256, 256] instead
# of reading this constant.
IMAGE_SIZE = [256, 256]

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a 256x256 RGB float32 image.

    The decoded pixels are divided by 255, given an explicit static shape of
    1024x1024x3 (needed on TPU) and then resized to 256x256.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    full_res = tf.reshape(scaled, [1024, 1024, 3])
    return tf.image.resize(full_res, [256, 256])


def decode_image_generated(image_data):
    """Decode JPEG bytes of an image that is already 256x256.

    The decoded pixels are divided by 255 and given an explicit static shape of
    256x256x3 (needed on TPU); no resizing is performed.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.reshape(scaled, [256, 256, 3])

def decode_image_test(image_data):
    """Decode JPEG bytes of a test image into a 256x256 RGB float32 image.

    Performs the same steps as decode_image: divide by 255, fix the static shape
    to 1024x1024x3 (needed on TPU), resize to 256x256.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    full_res = tf.reshape(scaled, [1024, 1024, 3])
    return tf.image.resize(full_res, [256, 256])

In [None]:
def data_augment(img, label):
    """Apply random photometric/geometric jitter to an image; pass the label through.

    Order of operations: horizontal flip, hue shift (max delta 0.01), saturation
    scaling in [0.7, 1.3], contrast scaling in [0.8, 1.2], brightness shift
    (max delta 0.1).

    Returns:
        (augmented image, label) with the label untouched.
    """
    flipped = tf.image.random_flip_left_right(img)
    hue_shifted = tf.image.random_hue(flipped, 0.01)
    saturated = tf.image.random_saturation(hue_shifted, 0.7, 1.3)
    contrasted = tf.image.random_contrast(saturated, 0.8, 1.2)
    brightened = tf.image.random_brightness(contrasted, 0.1)
    return brightened, label

In [None]:
def data_augment_test(img):
    """Apply `transform` followed by the same random jitter as data_augment.

    NOTE(review): `transform` is not defined anywhere in the visible part of this
    notebook, and this function is not called in it either — confirm where
    `transform` is supposed to come from before using this helper.
    """
    transformed = transform(img)
    flipped = tf.image.random_flip_left_right(transformed)
    hue_shifted = tf.image.random_hue(flipped, 0.01)
    saturated = tf.image.random_saturation(hue_shifted, 0.7, 1.3)
    contrasted = tf.image.random_contrast(saturated, 0.8, 1.2)
    return tf.image.random_brightness(contrasted, 0.1)

In [None]:
def load_dataset(filenames, labeled=True, ordered=False, generated=False):
    """Build a parsed tf.data pipeline from a list of TFRecord files.

    Args:
        filenames: TFRecord paths to read (several are read in parallel).
        labeled: When true, records are parsed into (image, label) pairs;
            otherwise into (image, image_name) pairs.
        ordered: When false, deterministic ordering is switched off so elements
            are emitted as soon as they are available.
        generated: Only consulted when `labeled` is true; selects the parser for
            records whose images are already 256x256.

    Returns:
        An unbatched tf.data.Dataset of parsed examples.
    """
    # Choose the record parser up front.
    if labeled and generated:
        parse_record = read_labeled_generated_tfrecord
    elif labeled:
        parse_record = read_labeled_tfrecord
    else:
        parse_record = read_unlabeled_tfrecord

    options = tf.data.Options()
    if not ordered:
        # Trade ordering for throughput.
        options.experimental_deterministic = False

    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    return records.with_options(options).map(parse_record, num_parallel_calls=AUTO)

In [None]:
def read_labeled_tfrecord(example):
    """Parse one serialized labeled record into an (image, label) pair.

    The record must carry an "image" bytestring and a scalar int64 "target".
    The image goes through decode_image; the target is cast to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),   # JPEG bytes
        "target": tf.io.FixedLenFeature([], tf.int64),   # scalar label
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['target'], tf.int32)

def read_labeled_generated_tfrecord(example):
    """Parse one serialized labeled record whose image is already 256x256.

    Same record layout as read_labeled_tfrecord ("image" bytestring, scalar
    int64 "target"), but the image goes through decode_image_generated, which
    does not resize. The target is cast to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),   # JPEG bytes
        "target": tf.io.FixedLenFeature([], tf.int64),   # scalar label
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image_generated(parsed['image']), tf.cast(parsed['target'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled record into an (image, image_name) pair.

    The record must carry an "image" bytestring and a scalar "image_name"
    string; there is no target because these are the records to predict on.
    The image goes through decode_image_test.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),        # JPEG bytes
        "image_name": tf.io.FixedLenFeature([], tf.string),   # scalar identifier
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image_test(parsed['image']), parsed['image_name']

In [None]:
# Resolve the GCS bucket of the 'mela-domain-data' dataset and list its training
# TFRecord shards (the competition dataset path is kept as a disabled alternative).
#GCS_PATH = KaggleDatasets().get_gcs_path('siim-isic-melanoma-classification')
GCS_PATH = KaggleDatasets().get_gcs_path('mela-domain-data')
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/train*.tfrec')

# Same for the 'generated' dataset and its generated*.tfrec shards.
GCS_GR_PATH = KaggleDatasets().get_gcs_path('generated')
Generated = tf.io.gfile.glob(GCS_GR_PATH + '/generated*.tfrec')

# Let tf.data pick parallelism / buffer sizes; read by load_dataset and the
# dataset builders at call time.
AUTO = tf.data.experimental.AUTOTUNE

BATCH_SIZE = 8
EPOCHS = 40

# NOTE(review): 20710 is presumably the number of generated images — confirm.
# STEPS_PER_EPOCH is not passed to the fit() call in this notebook.
STEPS_PER_EPOCH = (train_data.shape[0] + 20710) // BATCH_SIZE

def get_training_dataset(ordered=True):
    """Return the augmented, unbatched dataset built from TRAINING_FILENAMES.

    Args:
        ordered: Keep the element order deterministic (default). This notebook
            splits the returned dataset into train/validation parts with
            take()/skip(); those parts are only disjoint when every pass over
            the dataset yields elements in the same order. Pass False to trade
            that guarantee for read throughput.

    Returns:
        A tf.data.Dataset of (image, label) pairs with data_augment applied.
        Repeating, shuffling, batching and prefetching are left to the caller.
    """
    dataset = load_dataset(TRAINING_FILENAMES, labeled=True, ordered=ordered, generated=True)
    dataset = dataset.map(data_augment, num_parallel_calls=AUTO)
    return dataset

def get_training_dataset_gen(ordered=True):
    """Return the augmented, unbatched dataset built from the `Generated` files.

    Args:
        ordered: Keep the element order deterministic (default). This notebook
            splits the returned dataset into train/validation parts with
            take()/skip(); those parts are only disjoint when every pass over
            the dataset yields elements in the same order. Pass False to trade
            that guarantee for read throughput.

    Returns:
        A tf.data.Dataset of (image, label) pairs with data_augment applied.
        Repeating, shuffling, batching and prefetching are left to the caller.
    """
    dataset = load_dataset(Generated, labeled=True, ordered=ordered, generated=True)
    dataset = dataset.map(data_augment, num_parallel_calls=AUTO)
    return dataset

In [None]:
def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    Each name is expected to contain "-<digits>." (e.g. "train00-2071.tfrec"
    holds 2071 items); the first such occurrence in the name is used.

    Args:
        filenames: Iterable of file names/paths.

    Returns:
        The total item count as a built-in int (0 for an empty iterable).

    Raises:
        ValueError: If a file name does not contain a "-<digits>." count.
    """
    # Compile once instead of once per file name.
    count_pattern = re.compile(r"-([0-9]+)\.")
    total = 0
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(
                "cannot read item count from file name %r "
                "(expected '-<count>.' before the extension)" % (filename,)
            )
        total += int(match.group(1))
    return total

In [None]:
# Two augmented sources: the training shards and the generated shards.
dataset1 = get_training_dataset()
dataset2 = get_training_dataset_gen()

# Item totals, read from the shard file names.
dataset1_cnt = count_data_items(TRAINING_FILENAMES)
dataset2_cnt = count_data_items(Generated)

# Roughly 80% of each source goes to training, the remainder to validation.
train_size_1 = int(0.8 * dataset1_cnt) + 1
train_size_2 = int(0.8 * dataset2_cnt) + 1
valid_size_1 = dataset1_cnt - train_size_1
valid_size_2 = dataset2_cnt - train_size_2

# NOTE(review): take()/skip() only partition a dataset when its element order is
# the same on every pass — confirm how the source datasets configure ordering.
train_dataset_1, valid_dataset_1 = dataset1.take(train_size_1), dataset1.skip(train_size_1)
train_dataset_2, valid_dataset_2 = dataset2.take(train_size_2), dataset2.skip(train_size_2)

# Training pipeline: batches are formed first and then shuffled as whole batches.
train_dataset = (
    train_dataset_1.concatenate(train_dataset_2)
    .batch(BATCH_SIZE)
    .shuffle(2048, reshuffle_each_iteration=True)
    .prefetch(AUTO)
)

# Validation pipeline: no shuffling.
valid_dataset = (
    valid_dataset_1.concatenate(valid_dataset_2)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)


# All three callbacks watch the validation loss.

# Multiply the learning rate by 0.1 after 3 epochs without improvement (floor 1e-8).
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=3,
    verbose=1,
    min_lr=1e-8,
    mode='auto',
)

# Stop training after 8 epochs without improvement; the best weights are not
# restored here (restore_best_weights is left at its default).
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    verbose=1,
    patience=8,
    mode='auto',
)

# Save the full model to <output_path>/model.h5 whenever the validation loss improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=output_path+'/model.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
)


In [None]:
# Train the reloaded model; learning-rate reduction, early stopping and
# best-model checkpointing are all driven by the validation data.
history = reload_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[reduce_lr, early_stopping, checkpoint],
)

In [None]:
# Reload the best checkpoint written by the ModelCheckpoint callback. Loading
# inside the strategy scope (as is done for model_copy.h5 earlier) creates the
# model's variables under the distribution strategy, so predict() runs on it too.
# The path is derived from output_path so it matches the checkpoint's filepath.
with strategy.scope():
    reload_model = load_model(output_path + '/model.h5')

In [None]:
# Resolve the GCS bucket of the competition dataset and list its test TFRecord shards.
GCS_TEST_PATH = KaggleDatasets().get_gcs_path('siim-isic-melanoma-classification')
TEST_FILENAMES = tf.io.gfile.glob(GCS_TEST_PATH + '/tfrecords/test*.tfrec')
def get_test_dataset(ordered=False):
    """Return the batched, prefetched test dataset of (image, image_name) pairs.

    Args:
        ordered: Forwarded to load_dataset; pass True when the dataset will be
            iterated more than once and the passes must line up.

    NOTE(review): data_augment is mapped over the test records, so the images
    are randomly jittered at inference time (the image_name rides through the
    `label` slot unchanged) — confirm this is intended.
    """
    records = load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
    augmented = records.map(data_augment, num_parallel_calls=AUTO)
    return augmented.batch(BATCH_SIZE).prefetch(AUTO)

In [None]:
# Build the test pipeline in deterministic order: it is iterated twice below
# (once for the images, once for the ids) and both passes must line up row by row.
test_ds = get_test_dataset(ordered=True) # since we are splitting the dataset and iterating separately on images and ids, order matters.

print('Computing predictions...')
# Drop the ids so that predict() only receives image batches.
test_images_ds = test_ds.map(lambda image, idnum: image)
probabilities = reload_model.predict(test_images_ds).flatten()
print(probabilities)
    
print('Generating submission.csv file...')
# Second pass: keep only the ids, undo the batching, then gather them in a single
# batch of NUM_TEST_IMAGES (the row count of test.csv) and convert to str.
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U') # all in one batch
# Write a two-column CSV with header "image_name,target".
np.savetxt('submission.csv', np.rec.fromarrays([test_ids, probabilities]), fmt=['%s', '%f'], delimiter=',', header='image_name,target', comments='')
!head submission.csv

In [None]:
def show_dataset(thumb_size, cols, rows, ds):
    """Display the elements of `ds` as a grid of square thumbnails.

    Args:
        thumb_size: Edge length of each thumbnail in pixels.
        cols: Number of thumbnails per row.
        rows: Number of rows the canvas is sized for.
        ds: Iterable of (image, second_item) pairs; the image must offer
            .numpy() and is multiplied by 255 before conversion to uint8. The
            second item (target or image id) is ignored.

    Thumbnails are laid out row by row with a one-pixel gap between them.
    """
    canvas_size = (thumb_size*cols + (cols-1), thumb_size*rows + (rows-1))
    mosaic = PIL.Image.new(mode='RGB', size=canvas_size)

    # One grid cell = thumbnail plus the one-pixel gap.
    cell = thumb_size + 1
    for idx, (img, _target_or_imgid) in enumerate(ds):
        iy, ix = divmod(idx, cols)
        pixels = np.clip(img.numpy() * 255, 0, 255).astype(np.uint8)
        thumb = PIL.Image.fromarray(pixels).resize(
            (thumb_size, thumb_size), resample=PIL.Image.BILINEAR
        )
        mosaic.paste(thumb, (ix*cell, iy*cell))

    display(mosaic)

# Preview the first 12*5 augmented training images as a 12-column, 5-row grid of
# 64-pixel thumbnails.
ds = dataset1.take(12*5)   
show_dataset(64, 12, 5, ds)