In [None]:
# Preferred accelerator. The strategy-setup cell below switches this to "GPU"
# when no TPU can be reached.
DEVICE = "TPU"
# NOTE(review): BASEPATH points at the melanoma competition input and is not
# read anywhere in the visible notebook — confirm it is still needed.
BASEPATH = "../input/siim-isic-melanoma-classification"

In [None]:
!pip install -q efficientnet
import efficientnet.tfkeras as efn


In [None]:
import numpy as np
import pandas as pd
import os
import random, re, math, time
random.seed(a=128)

from os.path import join 

import tensorflow as tf
import tensorflow.keras.backend as K
#import tensorflow_addons as tfa
import efficientnet.tfkeras as efn

from tqdm.keras import TqdmCallback

from PIL import Image
import PIL

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold

from sklearn.utils.class_weight import compute_class_weight

import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from pandas_summary import DataFrameSummary

from kaggle_datasets import KaggleDatasets

from tqdm import tqdm

In [None]:
# Select the tf.distribute strategy used for the rest of the notebook.
# Try the TPU first; on any failure fall back to the default (CPU / single GPU)
# strategy so that `strategy` is always defined when this cell finishes.
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        # TPU detection. No parameters necessary if the TPU_NAME environment
        # variable is set. On Kaggle this is always the case.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # The original `except _:` was not a valid handler. Report the
            # cause and fall back, otherwise `strategy` would stay undefined.
            print("failed to initialize TPU")
            print(repr(err))
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

print("REPLICAS: ", strategy.num_replicas_in_sync)
# Let tf.data pick parallelism / prefetch buffer sizes.
AUTO = tf.data.experimental.AUTOTUNE


In [None]:
# Configuration
# NOTE(review): EPOCHS is not referenced by the visible training cell, which
# passes epochs=200 directly — confirm which value is intended.
EPOCHS = 6
# Global batch size: 8 samples for each replica of the strategy.
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
# Spatial size that decode_image reshapes every decoded image to.
IMAGE_SIZE = [512,512]

In [None]:
# Locate the training TFRecord shards and hold out 20% of the *files*
# (not of the individual images) for validation.
from sklearn.model_selection import train_test_split

GCS_PATH = KaggleDatasets().get_gcs_path('oc-d-512512')
TRAINING_FILENAMES, VALIDATION_FILENAMES = train_test_split(
    np.array(tf.io.gfile.glob(GCS_PATH + '/train*.tfrec')),
    test_size=0.20,
    random_state=42,
)

# Label ids 0..7.
CLASSES = list(range(8))

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB tensor scaled to [0, 1].

    The result is reshaped to ``[*IMAGE_SIZE, 3]`` because TPUs need an
    explicit static shape.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.reshape(scaled, list(IMAGE_SIZE) + [3])

def read_labeled_tfrecord(example):
    """Parse one serialized labeled record into an ``(image, label)`` pair.

    The record must carry an ``image`` bytestring and an int64 ``target``;
    the label is returned as int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),   # bytestring
        "target": tf.io.FixedLenFeature([], tf.int64),   # single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['target'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled record into an ``(image, filename)`` pair.

    Unlabeled records have no ``target``; the ``filename`` string is returned
    as the identifier of the image.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),     # bytestring
        "filename": tf.io.FixedLenFeature([], tf.string),  # single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['filename']

def load_dataset(filenames, labeled = True, ordered = False):
    """Build an unbatched tf.data pipeline from TFRecord files.

    Args:
        filenames: TFRecord paths to read; several files are read in parallel.
        labeled: when True, yield ``(image, label)`` pairs restricted to the
            kept labels; when False, yield ``(image, filename)`` pairs.
        ordered: when False, deterministic ordering is disabled for speed.

    Returns:
        A ``tf.data.Dataset`` of decoded examples.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False  # disable order, increase speed

    # Automatically interleaves reads from multiple files.
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads = AUTO)
    # Use data as soon as it streams in, rather than in its original order.
    dataset = dataset.with_options(options)
    parse_fn = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    dataset = dataset.map(parse_fn, num_parallel_calls = AUTO)

    if labeled:
        # Keep only examples whose label is one of these ids (1 and 7 are
        # dropped). Only labeled data is filtered: for unlabeled data the
        # second element is the filename string, which cannot be compared to
        # integer labels.
        kept_labels = [0, 2, 3, 4, 5, 6]
        dataset = dataset.filter(
            lambda image, label: tf.reduce_any(tf.equal(label, kept_labels))
        )

    return dataset

def data_augment(image, label):
    """Apply random flips and small colour jitter to one training image.

    The label passes through unchanged. This runs inside the tf.data
    pipeline, so it overlaps with training thanks to prefetching.
    """
    augmented = tf.image.random_flip_left_right(image)
    augmented = tf.image.random_flip_up_down(augmented)
    augmented = tf.image.random_hue(augmented, 0.01)
    augmented = tf.image.random_saturation(augmented, 0.7, 1.3)
    augmented = tf.image.random_contrast(augmented, 0.8, 1.2)
    augmented = tf.image.random_brightness(augmented, 0.1)
    return augmented, label

def get_training_dataset():
    """Return the augmented, endlessly repeating, shuffled and batched training set.

    Note: ``repeat()`` comes before ``shuffle(2048)``, so the shuffle buffer
    can mix examples across epoch boundaries.
    """
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .map(data_augment, num_parallel_calls=AUTO)
        .repeat()            # the training dataset must repeat for several epochs
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)      # prefetch next batch while training
    )

def get_validation_dataset(ordered=False):
    """Return the batched validation set, cached after the first pass."""
    return (
        load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)      # prefetch next batch while the model is busy
    )

def get_test_dataset(ordered=False):
    """Return the batched, unlabeled test set of ``(image, filename)`` pairs.

    NOTE(review): TEST_FILENAMES is not defined in the visible part of the
    notebook — confirm it is set before this function is called.
    """
    return (
        load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)      # prefetch next batch while the model is busy
    )

def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    The number of data items is written in the name of each .tfrec file,
    e.g. ``flowers00-230.tfrec`` holds 230 items.

    Args:
        filenames: iterable of file names / paths.

    Returns:
        Total number of items as a plain ``int`` (0 for no files).

    Raises:
        ValueError: if a file name does not contain a ``-<digits>.`` count.
    """
    count_pattern = re.compile(r"-([0-9]+)\.")
    total = 0
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            # Fail with the offending name instead of an opaque AttributeError.
            raise ValueError(
                "cannot read item count from file name: {!r}".format(filename)
            )
        total += int(match.group(1))
    return total

# Derive the length of one epoch from the item counts encoded in the training
# file names. These counts come from the names only, so they do not reflect
# the label filter applied inside load_dataset.
NUM_TRAINING_IMAGES = count_data_items(TRAINING_FILENAMES)
# NOTE(review): left disabled; TEST_FILENAMES is not defined in the visible
# notebook.
#NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
print('Dataset: {} training images'.format(NUM_TRAINING_IMAGES))

In [None]:
def lrfn(epoch):
    """Learning rate for ``epoch``: linear ramp-up, optional plateau, exponential decay.

    The rate climbs linearly from the start value to the peak over the
    ramp-up epochs, stays at the peak for the sustain epochs (none here),
    then decays geometrically towards the floor value.
    """
    start_lr = 0.000001
    peak_lr = 0.000005
    floor_lr = 0.000001
    rampup_epochs = 5
    sustain_epochs = 0
    decay_rate = .8

    if epoch < rampup_epochs:
        return (peak_lr - start_lr) / rampup_epochs * epoch + start_lr
    if epoch < rampup_epochs + sustain_epochs:
        return peak_lr
    epochs_into_decay = epoch - rampup_epochs - sustain_epochs
    return (peak_lr - floor_lr) * decay_rate**epochs_into_decay + floor_lr




In [None]:
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, BatchNormalization, Activation, Dropout
from tensorflow.keras.regularizers import l2

In [None]:
# EfficientNet-B7 backbone (noisy-student weights) with a small dense head.
# Use tf.keras rather than the standalone `keras` package: the backbone comes
# from efficientnet.tfkeras, and the two implementations must not be mixed.
from tensorflow import keras

with strategy.scope():
    base_network = efn.EfficientNetB7(input_shape=(*IMAGE_SIZE, 3),weights='noisy-student',include_top=False)
    network = keras.Sequential()
    network.add(base_network)

    # Extra conv block on top of the backbone features, then global pooling.
    network.add(keras.layers.MaxPooling2D())
    network.add(keras.layers.Conv2D(2560,3,padding='same'))
    network.add(keras.layers.BatchNormalization())
    network.add(keras.layers.ReLU())
    network.add(keras.layers.GlobalAveragePooling2D())

    # Three Dense -> BatchNorm -> LeakyReLU stages of decreasing width.
    network.add(keras.layers.Dense(1024))
    network.add(keras.layers.BatchNormalization())
    network.add(keras.layers.LeakyReLU())

    network.add(keras.layers.Dense(512))
    network.add(keras.layers.BatchNormalization())
    network.add(keras.layers.LeakyReLU())

    network.add(keras.layers.Dense(256))
    network.add(keras.layers.BatchNormalization())
    network.add(keras.layers.LeakyReLU())

    # One softmax output per entry of CLASSES (8); labels are integer ids,
    # hence the sparse loss and metric.
    network.add(keras.layers.Dense(len(CLASSES),activation='softmax'))
    # `learning_rate` replaces the deprecated `lr` argument.
    network.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),loss=keras.losses.SparseCategoricalCrossentropy(),metrics=[keras.metrics.SparseCategoricalAccuracy()])
    network.summary()




In [None]:
# Train with the per-epoch learning-rate schedule defined by lrfn.
# NOTE(review): epochs is hard-coded to 200; the EPOCHS constant from the
# configuration cell is not used here — confirm which is intended.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=1)
history7 = network.fit(
    get_training_dataset(),
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=get_validation_dataset(),
    epochs=200,
    callbacks=[lr_schedule],
)

In [None]:
# ordered=True keeps the record order deterministic (load_dataset only
# disables determinism when ordered is False).
test_ds = get_test_dataset(ordered=True)

print('Computing predictions...')
# Keep only the images; the ids are not an input to predict().
test_images_ds = test_ds.map(lambda image, idnum: image)

# NOTE(review): modelRN, model7 and modelIV3 are not defined in the visible
# notebook (the only model built above is `network`) — confirm where these
# models come from before running this cell.
probabilitiesRN = modelRN.predict(test_images_ds)
probabilities7 = model7.predict(test_images_ds)
probabilitiesV3 = modelIV3.predict(test_images_ds)

In [None]:
print('Generating submission.csv files...')
# Collect every test id as a unicode string array. Iterating the unbatched
# dataset avoids needing the total test-set size up front (NUM_TEST_IMAGES is
# not defined in this notebook).
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
test_ids = np.array(list(test_ids_ds.as_numpy_iterator())).astype('U')

In [None]:
def _prediction_frame(probabilities):
    """Pair the test ids with the concatenated predictions of one model."""
    return pd.DataFrame({'image_name': test_ids, 'target': np.concatenate(probabilities)})


# One prediction table per model.
pred_dfRN = _prediction_frame(probabilitiesRN)
pred_df7 = _prediction_frame(probabilities7)
pred_dfV3 = _prediction_frame(probabilitiesV3)

pred_df7.head()

In [None]:
# Independent copies of the submission template, one per model.
# NOTE(review): `sub` is not defined in the visible notebook — confirm where
# it is loaded.
subRN, sub7, subV3 = sub.copy(), sub.copy(), sub.copy()
subRN.head()

In [None]:

# For each model: replace the template's `target` column with that model's
# predictions (joined on image_name) and write the result to its own CSV.
subRN = subRN.drop(columns='target').merge(pred_dfRN, on='image_name')
subRN.to_csv('submissionRN.csv', index=False)

subV3 = subV3.drop(columns='target').merge(pred_dfV3, on='image_name')
subV3.to_csv('submissionV3.csv', index=False)

sub7 = sub7.drop(columns='target').merge(pred_df7, on='image_name')
sub7.to_csv('submission7.csv', index=False)

subRN.head()

In [None]:
#average

# Average the three models' targets row by row (the frames share the row
# order of the submission template they were merged from).
ensemble1 = (subRN['target'] + sub7['target'] + subV3['target']) / 3
ensemble_img1 = subRN['image_name']
ensemble_sub1 = pd.concat([ensemble_img1, ensemble1], axis=1)
# Write to a dedicated file: the original target 'submissionV3.csv' overwrote
# the single-model V3 submission saved earlier.
ensemble_sub1.to_csv('submissionEnsemble.csv', index=False)
ensemble_sub1.head()