In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
import cv2
import matplotlib.pyplot as plt
from functools import partial


import tensorflow as tf, re, math
import tensorflow.keras.backend as K
# import efficientnet.tfkeras as efn
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score


## Set up

In [None]:
# Detect a TPU and build the matching distribution strategy; when no TPU can be
# set up, fall back to TensorFlow's default strategy so the notebook still runs.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # Catch `Exception` rather than using a bare `except` so KeyboardInterrupt and
    # SystemExit still propagate, and report why the fallback was taken.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

### Load Data:
**TPUs** read their input data from **Google Cloud Storage** (GCS), so we need to specify the GCS path of the training data.

In [None]:
# GCS path of the dataset, as returned by KaggleDatasets; used below to locate the TFRecord files.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
!gsutil ls $GCS_DS_PATH # list the bucket

**The global batch size scales with the number of replicas (8 images per replica, i.e. 64 on an 8-core TPU); a larger batch helps because the data is imbalanced**

In [None]:
# Input-pipeline configuration used by the dataset builders in this notebook.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Number of devices the distribution strategy keeps in sync.
REPLICAS = strategy.num_replicas_in_sync
# Global batch size: 8 examples per replica.
BATCH_SIZE = 8 * REPLICAS

# Size the decoded images are reshaped to, and the size they are resized to for the models.
IMAGE_SIZE = [1024, 1024]
IMAGE_RESIZE = [256, 256]

print(f'REPLICAS: {REPLICAS}')
print(BATCH_SIZE)

In [None]:
# Hold out 20% of the training TFRecord files for validation.
# The listing is sorted first: `random_state` only makes the split repeatable
# when the input order is itself stable, which a raw glob does not promise.
TRAINING_FILENAMES, VALIDATION_FILENAMES = train_test_split(
    sorted(tf.io.gfile.glob(GCS_DS_PATH + '/tfrecords/train*.tfrec')),
    test_size=0.2, random_state=42
)

print(len(TRAINING_FILENAMES))
print(len(VALIDATION_FILENAMES))

In [None]:
# Collect every test TFRecord file under the dataset's GCS path and report how many were found.
TEST_FILENAMES = tf.io.gfile.glob(GCS_DS_PATH + '/tfrecords/test*.tfrec')
print(len(TEST_FILENAMES))

In [None]:
def decode_image(image):
    """Decode JPEG bytes into a float32 3-channel tensor with values divided by 255.

    The result is reshaped to ``[*IMAGE_SIZE, 3]`` so later ops see a fixed shape.
    """
    pixels = tf.image.decode_jpeg(image, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])

In [None]:
def read_tfrecord(example, labeled):
    """Parse one serialized TFRecord example into an image plus its label or name.

    Args:
        example: a serialized example as yielded by ``tf.data.TFRecordDataset``.
        labeled: when truthy the record is read with a ``target`` field,
            otherwise with an ``image_name`` field.

    Returns:
        ``(image, label)`` with the label cast to int32 when ``labeled``,
        else ``(image, image_name)``.
    """
    if labeled:
        extra_key, extra_dtype = "target", tf.int64
    else:
        extra_key, extra_dtype = "image_name", tf.string

    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),
        extra_key: tf.io.FixedLenFeature([], extra_dtype),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    decoded = decode_image(parsed['image'])

    if not labeled:
        return decoded, parsed['image_name']
    return decoded, tf.cast(parsed['target'], tf.int32)

In [None]:
def dropout(image, DIM=IMAGE_RESIZE[0], PROBABILITY = 0.5, CT = 8, SZ = 0.2):
    """Coarse dropout: randomly zero out square patches of a single image.

    Args:
        image: one image of size [DIM, DIM, 3] (not a batch of [b, DIM, DIM, 3]).
        DIM: side length of the image.
        PROBABILITY: chance that dropout is applied to this image at all.
        CT: number of squares to blank out.
        SZ: side of each square as a fraction of DIM.

    Returns:
        The image with CT squares of side about SZ*DIM set to zero, or the input
        image unchanged when the random draw, CT or SZ disables dropout.
    """
    # Draw once per image: P is 1 when dropout should be applied, else 0.
    P = tf.cast( tf.random.uniform([],0,1) < PROBABILITY, tf.int32)
    # Early exit leaves the image untouched (and skips the final reshape).
    if (P == 0)|(CT == 0)|(SZ == 0): return image

    for k in range( CT ):
        # Pick a random centre (x, y) for the square inside the image.
        x = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        y = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        # Side length in pixels (multiplied by P), then the square's bounds
        # clamped to the image so squares near an edge are cropped.
        WIDTH = tf.cast( SZ*DIM,tf.int32) * P
        ya = tf.math.maximum(0,y-WIDTH//2)
        yb = tf.math.minimum(DIM,y+WIDTH//2)
        xa = tf.math.maximum(0,x-WIDTH//2)
        xb = tf.math.minimum(DIM,x+WIDTH//2)
        # Rebuild the rows ya:yb as [left part | zeros | right part], then stack
        # them between the untouched rows above and below.
        one = image[ya:yb,0:xa,:]
        two = tf.zeros([yb-ya,xb-xa,3])
        three = image[ya:yb,xb:DIM,:]
        middle = tf.concat([one,two,three],axis=1)
        image = tf.concat([image[0:ya,:,:],middle,image[yb:DIM,:,:]],axis=0)

    # Reshape hack so the TPU compiler knows the shape of the output tensor.
    image = tf.reshape(image,[DIM,DIM,3])
    return image

In [None]:
def augmentation_pipeline(image, label):
    """Training augmentation: random left/right flip, resize to IMAGE_RESIZE, then coarse dropout.

    The label is passed through unchanged.
    """
    flipped = tf.image.random_flip_left_right(image)
    resized = tf.image.resize(flipped, IMAGE_RESIZE)
    augmented = dropout(resized, DIM=IMAGE_RESIZE[0], PROBABILITY=0.5, CT=8, SZ=0.2)
    return augmented, label

In [None]:
def augmentation_pipeline_val(image, label):
    """Validation preprocessing: resize to IMAGE_RESIZE only; the label is passed through."""
    return tf.image.resize(image, IMAGE_RESIZE), label

In [None]:
def augmentation_pipeline_test(image, name):
    """Test preprocessing: resize to IMAGE_RESIZE only; the image name is passed through."""
    return tf.image.resize(image, IMAGE_RESIZE), name
# def augmentation_pipeline_test(image):
#     image = tf.image.resize(image, IMAGE_RESIZE)
#     return image

In [None]:
def load_dataset(filenames, labeled=True, ordered=False):
    """Read TFRecord files and parse every record with ``read_tfrecord``.

    Args:
        filenames: TFRecord file paths to read.
        labeled: forwarded to ``read_tfrecord``; selects (image, label) vs (image, id) pairs.
        ordered: when falsy, deterministic ordering is switched off to increase speed.

    Returns:
        A ``tf.data`` dataset of (image, label) pairs if ``labeled`` else (image, id) pairs.
    """
    options = tf.data.Options()
    if not ordered:
        # Use records as soon as they stream in rather than in their original order.
        options.experimental_deterministic = False

    # num_parallel_reads interleaves reads from multiple files.
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
    records = records.with_options(options)

    parse_record = partial(read_tfrecord, labeled=labeled)
    return records.map(parse_record, num_parallel_calls=AUTOTUNE)

In [None]:
# def get_dataset(file):
#     dataset = load_dataset(file, labeled=True)
#     dataset = dataset.map(augmentation_pipeline, num_parallel_calls=AUTOTUNE)
#     dataset = dataset.repeat()
#     dataset = dataset.shuffle(2048)
#     dataset = dataset.batch(BATCH_SIZE)
#     dataset = dataset.prefetch(AUTOTUNE)
#     return dataset


# def get_test_dataset(file):
#     dataset = load_dataset(file)
#     dataset = dataset.map(augmentation_pipeline_test, num_parallel_calls=AUTOTUNE)
# #     dataset = dataset.repeat()
# #     dataset = dataset.shuffle(2048)
# #     dataset = dataset.batch(BATCH_SIZE)
# #     dataset = dataset.prefetch(AUTOTUNE)
#     return dataset



In [None]:
def get_training_dataset():
    """Build the training pipeline: augmented, repeated, shuffled, batched and prefetched."""
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .map(augmentation_pipeline, num_parallel_calls=AUTOTUNE)
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

def get_validation_dataset(ordered=False):
    """Build the validation pipeline: resized, batched, cached and prefetched.

    Args:
        ordered: forwarded to ``load_dataset`` to control deterministic read order.
    """
    return (
        load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
        .map(augmentation_pipeline_val, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTOTUNE)
    )

def get_test_dataset(ordered=False):
    """Build the test pipeline of (image, image_name) batches: resized, batched and prefetched.

    Args:
        ordered: forwarded to ``load_dataset`` to control deterministic read order.
    """
    return (
        load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
        .map(augmentation_pipeline_test, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

In [None]:
# Instantiate the training and validation pipelines used by the fit calls.
train_dataset = get_training_dataset()
valid_dataset = get_validation_dataset()

In [None]:
# Display the dataset object to inspect its element structure.
train_dataset

In [None]:
# Pull a single batch from the training pipeline for visual inspection.
image_batch, label_batch = next(iter(train_dataset))

In [None]:
def show_batch(image_batch, label_batch):
    """Plot up to 16 images of a batch in a 4x4 grid, titled by their label.

    Args:
        image_batch: indexable collection of images accepted by ``plt.imshow``.
        label_batch: indexable collection of labels; a truthy label is titled
            "MALIGNANT", a falsy one "BENIGN".
    """
    plt.figure(figsize=(10,10))
    # Cap at the batch length: BATCH_SIZE is 8 per replica, so a single-replica
    # run yields fewer than 16 images and indexing 0..15 would raise IndexError.
    for n in range(min(16, len(image_batch))):
        plt.subplot(4,4,n+1)
        plt.imshow(image_batch[n])
        plt.title("MALIGNANT" if label_batch[n] else "BENIGN")
        plt.axis("off")

show_batch(image_batch.numpy(), label_batch.numpy())

In [None]:
import re
# Record count embedded in a filename: the digits between a hyphen and the following dot.
_RECORD_COUNT_PATTERN = re.compile(r"-([0-9]+)\.")


def count_data_items(filenames):
    """Sum the record counts encoded in TFRecord filenames.

    Each filename is expected to carry its record count as the digits between a
    hyphen and the following dot (e.g. ``train00-2071.tfrec`` counts as 2071).

    Args:
        filenames: iterable of filename/path strings.

    Returns:
        The total count as a NumPy int64; 0 for an empty input.

    Raises:
        ValueError: if a filename does not contain a ``-<digits>.`` count.
    """
    counts = []
    for filename in filenames:
        match = _RECORD_COUNT_PATTERN.search(filename)
        if match is None:
            raise ValueError(f"cannot read a record count from filename: {filename!r}")
        counts.append(int(match.group(1)))
    # Explicit dtype keeps the result an integer even when `counts` is empty
    # (a plain np.sum([]) would return the float 0.0).
    return np.sum(counts, dtype=np.int64)

In [None]:
# Derive the size of each split from the counts embedded in the TFRecord filenames.
NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES = (
    count_data_items(split_files)
    for split_files in (TRAINING_FILENAMES, VALIDATION_FILENAMES, TEST_FILENAMES)
)
# Whole batches per pass over the training data.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
print(
    'Dataset: {} training images, {} validation images, {} unlabeled test images'.format(
        NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES
    )
)

## ================= Build Model ======================


In [None]:
from tensorflow.keras.layers.experimental import preprocessing
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
# from focal_loss import BinaryFocalLoss

In [None]:
def Training_Model(model_name, IMG_SIZE, NUM_CHANNELS, Dropout_rate):
    """Build and compile a frozen ImageNet backbone with a binary-classification head.

    The head is the same for every backbone: global average pooling, a
    1000-unit ReLU dense layer, dropout, and a single sigmoid output.

    Args:
        model_name: name of the backbone; must be one of ``supported_backbones`` below.
        IMG_SIZE: side length of the square input images.
        NUM_CHANNELS: number of input channels.
        Dropout_rate: rate of the dropout layer in the head.

    Returns:
        A Keras model compiled with the Adam optimizer, binary cross-entropy
        loss and the AUC metric.

    Raises:
        ValueError: if ``model_name`` is not a supported backbone.
    """
    supported_backbones = (
        'VGG19', 'ResNet152V2', 'InceptionV3', 'Xception',
        'EfficientNetB2', 'EfficientNetB3', 'EfficientNetB4',
        'InceptionResNetV2', 'DenseNet201', 'MobileNetV2', 'ResNet101V2',
    )
    # Fail early with a clear message instead of an unbound `base_model` further down.
    if model_name not in supported_backbones:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of: {', '.join(supported_backbones)}"
        )

    # Each supported name is also the constructor's name in tf.keras.applications.
    # Looking it up lazily means only the requested backbone has to exist in the
    # installed TensorFlow version.
    backbone_constructor = getattr(tf.keras.applications, model_name)
    base_model = backbone_constructor(
        input_shape=(IMG_SIZE, IMG_SIZE, NUM_CHANNELS), include_top=False, weights='imagenet'
    )
    # Freeze the pretrained weights; only the head below is trained.
    base_model.trainable = False

    # NOTE(review): no backbone-specific `preprocess_input` is applied here —
    # confirm the upstream image scaling matches what each backbone expects.
    x = base_model.output
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(1000, activation='relu')(x)
    x = layers.Dropout(Dropout_rate)(x)
    x = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=base_model.input, outputs=x)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model

## ==================== Training =========================

In [None]:
# Build the three ensemble members inside the strategy scope so their variables
# are created under the distribution strategy.
with strategy.scope():
    model1, model2, model3 = (
        Training_Model(
            model_name=backbone_name,
            IMG_SIZE=IMAGE_RESIZE[0],
            NUM_CHANNELS=3,
            Dropout_rate=0.4,
        )
        for backbone_name in ('ResNet101V2', 'DenseNet201', 'MobileNetV2')
    )

In [None]:
epochs = 5
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
# Round up so the final partial validation batch is evaluated too. With floor
# division the validation set is never read to its end, so the images in the last
# batch are ignored and the `cache()` in the validation pipeline is never finalised.
VALID_STEPS = math.ceil(NUM_VALIDATION_IMAGES / BATCH_SIZE)

# checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("ResNet_model.h5", save_best_only=True)
# early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# Per-class loss weights: class 1 is weighted far more heavily than class 0.
class_weight = {0: 0.5, 1: 28.0}

# NOTE(review): model1 is fit without `class_weight`, unlike model2 and model3 —
# confirm this difference is intentional.
history1 = model1.fit(
    train_dataset,
    epochs=epochs,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=valid_dataset,
    validation_steps=VALID_STEPS
#     callbacks=[checkpoint_cb, early_stopping_cb],
#     class_weight=class_weight

)
model1.save('model1.hdf5')

In [None]:
# Train the second ensemble member with class weighting, then save it to disk.
history2 = model2.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=epochs,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VALID_STEPS,
    class_weight=class_weight,
)

model2.save('model2.hdf5')

In [None]:
# Train the third ensemble member with class weighting, then save it to disk.
history3 = model3.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=epochs,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VALID_STEPS,
    class_weight=class_weight,
)

model3.save('model3.hdf5')

## =================== Prediction ===================

In [None]:
# ordered=True: predictions and image names are read in separate passes over
# test_ds, so the read order must be deterministic for the two to line up.
# (get_test_dataset takes only the `ordered` flag; it reads TEST_FILENAMES itself.)
test_ds = get_test_dataset(ordered=True)

print('Computing predictions...')
# Drop the image names so the models receive images only.
test_images_ds = test_ds.map(lambda image, idnum: image)

model_list = [model1, model2, model3]

# One prediction array per model, in the same order as model_list.
ens_probabilities = [model.predict(test_images_ds) for model in model_list]


print("========================  Done  ============================")

In [None]:
# Average the predictions of models

# Unweighted ensemble: element-wise sum over the models divided by the model count.
average_prob = np.asarray(ens_probabilities).sum(axis=0) / len(model_list)

In [None]:
# weight the prediction of models


# One weight per model, in the same order as the predictions in ens_probabilities.
weights = [0.4, 0.25, 0.35]

# Contract the model axis (axis 0) of the predictions against the weights,
# i.e. sum each model's predictions multiplied by its weight.
weighted_prob = np.tensordot(ens_probabilities, weights, axes=(0, 0))


In [None]:
print('Generating submission.csv file...')
# Keep only the image names and flatten the batches back into single elements.
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
# Re-batch all names into one batch, take it, and convert it to a NumPy array of unicode strings.
# NOTE(review): these ids are paired with predictions from a separate pass over
# test_ds, so both passes must yield records in the same order — confirm.
test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U')
print("========================  Done  ============================")

In [None]:
# Assemble one submission frame per ensembling scheme; the per-image
# predictions are flattened into a single 1-D `target` column.
pred_df_weighted = pd.DataFrame({
    'image_name': test_ids,
    'target': np.ravel(weighted_prob),
})
pred_df_av = pd.DataFrame({
    'image_name': test_ids,
    'target': np.ravel(average_prob),
})

pred_df_weighted.head()

In [None]:
# Write both submission variants to the working directory, without the index column.
pred_df_weighted.to_csv('submission_weighted.csv', index=False)
pred_df_av.to_csv('submission_av.csv', index=False)