In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.applications import DenseNet169
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.applications import ResNet101
from tensorflow.keras.applications import ResNet152
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPool2D
import math
import re
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import tensorflow.keras.backend as K

# Record the TensorFlow version this notebook ran against.
print(f"Tensorflow version {tf.__version__}")

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

from kaggle_datasets import KaggleDatasets

# Ask kaggle_datasets for the GCS path of the 'tpu-getting-started' dataset;
# every TFRecord path used later in the notebook is derived from this value.
GCS_DS_PATH = KaggleDatasets().get_gcs_path('tpu-getting-started')
print(GCS_DS_PATH) # what do gcs paths look like?

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

# Detect TPU, return appropriate distribution strategy

# Try to locate a TPU cluster. A ValueError from the resolver is treated as
# "no TPU available" and the notebook falls back to the default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() 
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): this is the experimental alias; newer TensorFlow releases also expose
    # tf.distribute.TPUStrategy - confirm which one the installed version prefers.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # No TPU: use whatever strategy is currently active.
    strategy = tf.distribute.get_strategy() 

# The replica count scales the global batch size defined later.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

# Define the global batch size: 32 samples per replica, i.e. 32 when a single
# replica is in sync and e.g. 256 (= 32 * 8) when 8 replicas are in sync.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

from kaggle_datasets import KaggleDatasets

# NOTE(review): this repeats the GCS lookup already performed in an earlier cell
# and yields the same value; one of the two cells is redundant.
GCS_DS_PATH = KaggleDatasets().get_gcs_path('tpu-getting-started')

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

# Image resolution; it also selects which TFRecord folder is read
# (the folder name is built as 'tfrecords-jpeg-<size0>x<size1>').
IMAGE_SIZE = [224, 224]
GCS_PATH = GCS_DS_PATH + '/tfrecords-jpeg-' + str(IMAGE_SIZE[0]) + 'x' + str(IMAGE_SIZE[1])
# Lets tf.data choose parallelism and prefetch buffer sizes.
AUTO = tf.data.experimental.AUTOTUNE

# TFRecord shards for each split, discovered by globbing the dataset folders.
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/train/*.tfrec')
VALIDATION_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/val/*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/test/*.tfrec') 

# Flower class names indexed by integer label id; the trailing comment on each
# row gives the id range that row covers. The list length is also used as the
# size of the model's output layer.
# NOTE(review): 'cyclamen ' and 'hippeastrum ' carry a trailing space - confirm
# whether that is intentional before using these names as lookup keys.
CLASSES = ['pink primrose',    'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea',     'wild geranium',     'tiger lily',           'moon orchid',              'bird of paradise', 'monkshood',        'globe thistle',         # 00 - 09
           'snapdragon',       "colt's foot",               'king protea',      'spear thistle', 'yellow iris',       'globe-flower',         'purple coneflower',        'peruvian lily',    'balloon flower',   'giant white arum lily', # 10 - 19
           'fire lily',        'pincushion flower',         'fritillary',       'red ginger',    'grape hyacinth',    'corn poppy',           'prince of wales feathers', 'stemless gentian', 'artichoke',        'sweet william',         # 20 - 29
           'carnation',        'garden phlox',              'love in the mist', 'cosmos',        'alpine sea holly',  'ruby-lipped cattleya', 'cape flower',              'great masterwort', 'siam tulip',       'lenten rose',           # 30 - 39
           'barberton daisy',  'daffodil',                  'sword lily',       'poinsettia',    'bolero deep blue',  'wallflower',           'marigold',                 'buttercup',        'daisy',            'common dandelion',      # 40 - 49
           'petunia',          'wild pansy',                'primula',          'sunflower',     'lilac hibiscus',    'bishop of llandaff',   'gaura',                    'geranium',         'orange dahlia',    'pink-yellow dahlia',    # 50 - 59
           'cautleya spicata', 'japanese anemone',          'black-eyed susan', 'silverbush',    'californian poppy', 'osteospermum',         'spring crocus',            'iris',             'windflower',       'tree poppy',            # 60 - 69
           'gazania',          'azalea',                    'water lily',       'rose',          'thorn apple',       'morning glory',        'passion flower',           'lotus',            'toad lily',        'anthurium',             # 70 - 79
           'frangipani',       'clematis',                  'hibiscus',         'columbine',     'desert-rose',       'tree mallow',          'magnolia',                 'cyclamen ',        'watercress',       'canna lily',            # 80 - 89
           'hippeastrum ',     'bee balm',                  'pink quill',       'foxglove',      'bougainvillea',     'camellia',             'mallow',                   'mexican petunia',  'bromelia',         'blanket flower',        # 90 - 99
           'trumpet creeper',  'blackberry lily',           'common tulip',     'wild rose']                                                                                                                                               # 100 - 102




In [None]:
import tensorflow as tf
import numpy as np
import random
import tensorflow.keras.backend as K


# One-element float32 tensors used as building blocks of the 3x3 augmentation
# matrices: they are concatenated nine at a time and reshaped to `mat_shape`.
deg_180 = tf.constant([180], dtype='float32')  # divisor for the degrees -> radians conversion
_1 = tf.constant([1], dtype='float32')
_0 = tf.constant([0], dtype='float32')
mat_shape = (3, 3)


def get_augment_matrix(rot=0.0, shift_x=0.0, shift_y=0.0, zoom_x=0.0, zoom_y=0.0):
    """Build the 3x3 homogeneous matrix combining rotation, shift and zoom.

    The result is ``rotation @ shift @ zoom``: when it multiplies a column of
    ``[c0, c1, 1]`` coordinates, the zoom is applied first, then the shift,
    then the rotation.

    Args:
        rot: Rotation angle in degrees (Python number or one-element tensor).
        shift_x: Offset added to the second coordinate (second matrix row).
        shift_y: Offset added to the first coordinate (first matrix row).
        zoom_x: Relative zoom of the second coordinate; it is scaled by
            ``1 / (1 + zoom_x)``.
        zoom_y: Relative zoom of the first coordinate; it is scaled by
            ``1 / (1 + zoom_y)``.

    Returns:
        A float32 tensor of shape ``(3, 3)``.
    """
    def _as_vec(value):
        # tf.concat needs every piece to be rank 1, so plain Python numbers
        # (including the defaults) are promoted to one-element float32 tensors.
        return tf.reshape(tf.cast(value, tf.float32), [1])

    rot = _as_vec(rot)
    shift_x = _as_vec(shift_x)
    shift_y = _as_vec(shift_y)
    zoom_x = _as_vec(zoom_x)
    zoom_y = _as_vec(zoom_y)

    # Degrees to rad
    rot = np.pi * (rot / deg_180)

    # Rotation matrix
    sin = tf.math.sin(rot)
    cos = tf.math.cos(rot)
    rot_mat = tf.concat([cos, sin, _0,
                         -sin, cos, _0,
                         _0, _0, _1], axis=0)
    rot_mat = tf.reshape(rot_mat, mat_shape)

    # Shift matrix
    shift_mat = tf.concat([_1, _0, shift_y,
                           _0, _1, shift_x,
                           _0, _0, _1], axis=0)
    shift_mat = tf.reshape(shift_mat, mat_shape)

    # Zoom matrix. The bottom-right entry must be 1: it carries the homogeneous
    # coordinate through to the shift matrix. With 0 there, the third component
    # of every transformed point becomes 0 and the shift is silently cancelled.
    zoom_mat = tf.concat([_1 / (_1 + zoom_y), _0, _0,
                          _0, _1 / (_1 + zoom_x), _0,
                          _0, _0, _1], axis=0)
    zoom_mat = tf.reshape(zoom_mat, mat_shape)

    # Combine matrices by simple multiplication
    return tf.matmul(tf.matmul(rot_mat, shift_mat), zoom_mat)


def augment_data(image, label):
    """Apply a random rotation, shift and zoom to one image.

    Every output pixel is filled from the input pixel found by transforming
    its centred coordinates with a random matrix from `get_augment_matrix`.
    Coordinates that land outside the image are clamped to the nearest edge.

    Args:
        image: Tensor of shape ``[IMAGE_SIZE[0], IMAGE_SIZE[1], 3]``.
        label: Passed through unchanged.

    Returns:
        The ``(augmented_image, label)`` pair.
    """
    height, width = IMAGE_SIZE[0], IMAGE_SIZE[1]
    half_h, half_w = height // 2, width // 2

    # Generate transformation parameters
    rot = tf.random.uniform(shape=(1,), minval=-45, maxval=45, dtype='float32')
    shift_x = tf.random.uniform(shape=(1,), minval=-50, maxval=50, dtype='float32')
    shift_y = tf.random.uniform(shape=(1,), minval=-50, maxval=50, dtype='float32')
    zoom = tf.random.normal([1], 0, 0.15, dtype='float32')

    augment_mat = get_augment_matrix(rot, shift_x, shift_y, zoom, zoom)

    # Centred pixel coordinates, one column per output pixel in row-major order:
    # - x is the vertical coordinate: half_h on the top row, decreasing downwards;
    #   each value is repeated once per column of the image
    # - y is the horizontal coordinate: -half_w on the left column, increasing
    #   rightwards; the whole run is tiled once per row of the image
    # - intercept is the homogeneous 1 that lets the matrix express the shift
    x = tf.repeat(tf.range(half_h, half_h - height, -1), width)
    y = tf.tile(tf.range(-half_w, width - half_w), [height])
    intercept = tf.ones([height * width], dtype='int32')
    coordinates = tf.stack([x, y, intercept])

    # Transform the coordinates and truncate them back to whole pixels
    coordinates = K.cast(K.dot(augment_mat, K.cast(coordinates, dtype='float32')), dtype='int32')

    # Clamp each axis to the coordinate range it was generated from, then convert
    # back to array indices. Row and column use their own bounds so that an
    # identity transform maps every pixel onto itself (rows 0..height-1 and
    # columns 0..width-1 are all reachable).
    rows = half_h - K.clip(coordinates[0], half_h - height + 1, half_h)
    cols = half_w + K.clip(coordinates[1], -half_w, width - half_w - 1)

    # Map the image to the new reference frame
    image = tf.gather_nd(image, tf.stack([rows, cols], axis=-1))
    image = tf.reshape(image, [height, width, 3])

    return image, label


In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

import math
import re

import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt


def decode_image(image_data):
    """Decode JPEG bytes into a float32 image of shape ``[*IMAGE_SIZE, 3]`` scaled to [0, 1]."""
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # The shape is stated explicitly because TPU execution needs static sizes.
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])


def read_labeled_tfrecord(example):
    """Parse one serialized labelled example into an ``(image, label)`` pair.

    The record holds the JPEG bytes under "image" and the class id under
    "class"; the label is returned as an int32 scalar.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # scalar byte string
        "class": tf.io.FixedLenFeature([], tf.int64),  # scalar integer
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)


def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabelled example into an ``(image, id)`` pair.

    The record holds the JPEG bytes under "image" and the image id string
    under "id"; no class is stored for these records.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # scalar byte string
        "id": tf.io.FixedLenFeature([], tf.string),  # scalar byte string
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']


def load_dataset(filenames, labeled=True, ordered=False):
    """Create a dataset of parsed examples from TFRecord files.

    Args:
        filenames: TFRecord paths to read; several files are read in parallel.
        labeled: When true, elements are ``(image, label)`` pairs; otherwise
            they are ``(image, id)`` pairs.
        ordered: When false, deterministic ordering is switched off so that
            records can be consumed as soon as they arrive.

    Returns:
        The parsed, unbatched `tf.data.Dataset`.
    """
    parse_fn = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord

    options = tf.data.Options()
    if not ordered:
        # Trade a stable element order for throughput.
        options.experimental_deterministic = False

    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    return records.with_options(options).map(parse_fn, num_parallel_calls=AUTO)


def get_training_dataset():
    """Build the training pipeline: augment, repeat forever, shuffle, batch, prefetch."""
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .map(augment_data, num_parallel_calls=AUTO)  # random rotation/shift/zoom per image
        .repeat()  # endless stream; the epoch length is set by the caller
        .shuffle(2048, reshuffle_each_iteration=True)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)  # overlap input preparation with training
    )


def get_validation_dataset(ordered=False):
    """Build the validation pipeline: batched, cached after the first pass, prefetched."""
    return (
        load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )


def get_test_dataset(ordered=False):
    """Build the test pipeline of batched ``(image, id)`` pairs with prefetching."""
    return (
        load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )


def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    Each name carries its item count between a dash and the following dot,
    e.g. ``flowers00-230.tfrec`` holds 230 items.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for name in filenames:
        counts.append(int(count_pattern.search(name).group(1)))
    return np.sum(counts)


def batch_to_numpy_images_and_labels(data):
    """Convert an ``(images, labels)`` batch of tensors to NumPy.

    When the second element has object dtype (byte-string image ids rather
    than numeric labels), it is replaced by a list of ``None``, one per image.
    """
    images, labels = data
    numpy_images, numpy_labels = images.numpy(), labels.numpy()
    if numpy_labels.dtype == object:
        # Ids carry no class information, so report "no label" for every image.
        numpy_labels = [None] * len(numpy_images)
    return numpy_images, numpy_labels


def title_from_label_and_target(label, correct_label):
    """Build a plot title for a prediction and report whether it is correct.

    Returns ``(title, correct)``. Without a known target the title is just
    the predicted class name and the prediction counts as correct. With one,
    the title is marked OK, or NO followed by an arrow and the expected class name.
    """
    predicted_name = CLASSES[label]
    if correct_label is None:
        return predicted_name, True

    correct = (label == correct_label)
    if correct:
        verdict, arrow, expected_name = 'OK', '', ''
    else:
        verdict, arrow, expected_name = 'NO', u"\u2192", CLASSES[correct_label]
    return "{} [{}{}{}]".format(predicted_name, verdict, arrow, expected_name), correct


def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw one image in the given subplot slot and return the next slot.

    `subplot` is a ``(rows, cols, index)`` triple. A non-empty title is drawn
    in black, or in red with a slightly smaller font when `red` is set.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    if len(title) > 0:
        if red:
            font_size, colour = int(titlesize / 1.2), 'red'
        else:
            font_size, colour = int(titlesize), 'black'
        plt.title(title, fontsize=font_size, color=colour,
                  fontdict={'verticalalignment': 'center'}, pad=int(titlesize / 1.5))
    # Same grid, next position.
    return (*subplot[:2], subplot[2] + 1)


def display_batch_of_images(databatch, predictions=None):
    """Display a batch of images in a square-ish grid.

    Args:
        databatch: A two-element batch ``(images, labels)`` or ``(images, ids)``
            whose elements expose ``.numpy()``. It is unpacked as a pair, so a
            bare batch of images is not accepted. Image ids are shown as "no label".
        predictions: Optional predicted class ids, one per image; when given,
            titles show the prediction and whether it matches the label.
    """
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None] * len(images)

    # The grid drawing is shared with display_set_of_images instead of
    # keeping a second copy of it here.
    display_set_of_images(images, labels, predictions)


def display_training_curves(training, validation, title, subplot):
    """Plot the training and validation series of one metric in a subplot.

    `subplot` is a three-digit matplotlib subplot code. When its last digit
    is 1, the figure is created first.
    """
    is_first_panel = (subplot % 10 == 1)
    if is_first_panel:
        plt.subplots(figsize=(10, 10), facecolor='#F0F0F0')
        plt.tight_layout()

    ax = plt.subplot(subplot)
    ax.set_facecolor('#F8F8F8')
    for series in (training, validation):
        ax.plot(series)
    ax.set_title('model ' + title)
    ax.set_xlabel('epoch')
    ax.set_ylabel(title)
    ax.legend(['train', 'valid.'])
    plt.grid()
    
def display_set_of_images(images, labels, predictions=None):
    """Display images in a square-ish grid, titled with their class names.

    Args:
        images: Sequence or array of images accepted by ``plt.imshow``.
        labels: One class id (or ``None`` for "no label") per image.
        predictions: Optional predicted class ids, one per image; when given,
            titles show the prediction and whether it matches the label.

    Images that do not fit into the ``rows x cols`` grid are dropped. An
    empty `images` draws nothing.
    """
    if len(images) == 0:
        # Nothing to draw; this also avoids dividing by a zero row count below.
        return

    # auto-squaring: this will drop data that does not fit into square
    # or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images) // rows
    shown_images = images[:rows * cols]
    shown_labels = labels[:rows * cols]

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE / cols * rows))
    else:
        plt.figure(figsize=(FIGSIZE / rows * cols, FIGSIZE))

    # The title size depends only on the grid, so compute it once.
    # magic formula tested to work from 1x1 to 10x10 images
    dynamic_titlesize = FIGSIZE * SPACING / max(rows, cols) * 40 + 3

    # display
    for i, (image, label) in enumerate(zip(shown_images, shown_labels)):
        title = '' if label is None else CLASSES[label]
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        subplot = display_one_flower(image, title, subplot, not correct, titlesize=dynamic_titlesize)

    # layout: pack the images tightly only when no image carries a title.
    # This looks at every displayed label rather than at whichever label the
    # loop happened to see last.
    untitled = predictions is None and all(label is None for label in shown_labels)
    plt.tight_layout()
    if untitled:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()


In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

# Load datasets. Validation and test data keep their order so that predictions
# can later be matched index-for-index with the labels and image ids.
ds_train = get_training_dataset()
ds_valid = get_validation_dataset(ordered=True)
ds_test = get_test_dataset(ordered=True)

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

# Item counts of each split, taken from the counts encoded in the file names.
NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES = (
    count_data_items(filenames)
    for filenames in (TRAINING_FILENAMES, VALIDATION_FILENAMES, TEST_FILENAMES)
)

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

# Show examples of augmented images
# The training batches are re-batched to 20 images, which the display helper
# lays out as a 4 x 5 grid.
ds_iter = iter(ds_train.unbatch().batch(20))
one_batch = next(ds_iter)
display_batch_of_images(one_batch)

In [None]:
# MODELS

def pretrainded_model(type: str, trainable=False):
    """Build a classifier on top of an ImageNet-pretrained backbone.

    The backbone (without its top) is followed by a flatten layer, four
    ``Dense(512, relu)`` + ``Dropout(0.2)`` pairs and a bias-free softmax
    layer with one unit per entry of `CLASSES`. The model is created inside
    ``strategy.scope()`` and is returned uncompiled.

    Args:
        type: Backbone name: 'VGG16', 'VGG19', 'DenseNet121', 'DenseNet169',
            'DenseNet201', 'ResNet50', 'ResNet101' or 'ResNet152'.
        trainable: Whether the backbone weights are updated during training.

    Returns:
        The assembled `Sequential` model.

    Raises:
        ValueError: If `type` is not one of the supported backbone names.
    """
    backbones = {
        'VGG16': VGG16,
        'VGG19': VGG19,
        'DenseNet121': DenseNet121,
        'DenseNet169': DenseNet169,
        'DenseNet201': DenseNet201,
        'ResNet101': ResNet101,
        'ResNet152': ResNet152,
        'ResNet50': ResNet50,
    }
    # Fail with a clear message instead of reaching an unassigned `backbone`.
    if type not in backbones:
        raise ValueError(
            "Unknown backbone {!r}; expected one of: {}".format(type, ', '.join(sorted(backbones))))

    with strategy.scope():
        backbone = backbones[type](weights='imagenet', include_top=False, input_shape=[*IMAGE_SIZE, 3])
        backbone.trainable = trainable

        layers = [backbone, Flatten()]
        for _ in range(4):
            layers += [Dense(512, activation='relu'), Dropout(0.2)]
        layers.append(Dense(len(CLASSES), activation='softmax', use_bias=False))

        model = Sequential(layers)

    return model


def vgg16(trainable=False):
    """Build the classifier with a VGG16 backbone."""
    return pretrainded_model(type='VGG16', trainable=trainable)


def vgg19(trainable=False):
    """Build the classifier with a VGG19 backbone."""
    return pretrainded_model(type='VGG19', trainable=trainable)


def densenet121(trainable=False):
    """Build the classifier with a DenseNet121 backbone."""
    return pretrainded_model(type='DenseNet121', trainable=trainable)


def densenet169(trainable=False):
    """Build the classifier with a DenseNet169 backbone."""
    return pretrainded_model(type='DenseNet169', trainable=trainable)


def densenet201(trainable=False):
    """Build the classifier with a DenseNet201 backbone."""
    return pretrainded_model(type='DenseNet201', trainable=trainable)


def resNet101(trainable=False):
    """Build the classifier with a ResNet101 backbone."""
    return pretrainded_model(type='ResNet101', trainable=trainable)


def resNet152(trainable=False):
    """Build the classifier with a ResNet152 backbone."""
    return pretrainded_model(type='ResNet152', trainable=trainable)


def resNet50(trainable=False):
    """Build the classifier with a ResNet50 backbone."""
    return pretrainded_model(type='ResNet50', trainable=trainable)


In [None]:
# Build the DenseNet169-based classifier; the backbone stays frozen because
# `trainable` is left at its default of False.
model = densenet169()

# The "sparse" loss and metric are used because the labels are integer class
# ids rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy']
)

model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Define training epochs
# EPOCHS is an upper bound; early stopping below normally ends training sooner.
EPOCHS = 200
# The training dataset repeats indefinitely, so the length of an epoch has to be
# given explicitly: roughly one pass over the training images.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE


# Stop once the validation loss has not improved for 10 consecutive epochs.
# NOTE(review): with restore_best_weights=False the model keeps the weights of the
# last epoch run, not those of the best validation epoch - confirm this is intended,
# since the later prediction cells use `model` as-is.
early_stopping = EarlyStopping(monitor='val_loss',
                               min_delta=0.0,
                               patience=10,
                               restore_best_weights=False)


history = model.fit(
        ds_train,
        validation_data=ds_valid,
        epochs=EPOCHS,
        steps_per_epoch=STEPS_PER_EPOCH,
        callbacks=[early_stopping],
    )

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

# Loss curves go in the upper panel (subplot code 211: 2 rows, 1 column, slot 1).
# This first call also creates the figure.
display_training_curves(
        history.history['loss'],
        history.history['val_loss'],
        'loss',
        211,
    )
# Accuracy curves go in the lower panel (subplot code 212).
display_training_curves(
    history.history['sparse_categorical_accuracy'],
    history.history['val_sparse_categorical_accuracy'],
    'accuracy',
    212,
)

In [None]:
# Statistics: confusion matrix and per-class error rates on the validation set

# Collect every validation label and image in dataset order. ds_valid was built
# with ordered=True, so index i here corresponds to row i of model.predict(ds_valid).
# Labels are stored as integers (they are class ids) and images as float32, which
# is the dtype the pipeline produces and halves the memory of the float64 default.
real_values = np.zeros((NUM_VALIDATION_IMAGES,), dtype=np.int32)
np_images = np.zeros((NUM_VALIDATION_IMAGES, IMAGE_SIZE[0], IMAGE_SIZE[1], 3), dtype=np.float32)

# Iterate over the dataset's own batches so that a final partial batch is
# included instead of being dropped by an integer division.
filled = 0
for images, lbl in ds_valid:
    lbl = lbl.numpy()
    images = images.numpy()
    real_values[filled:filled + len(lbl)] = lbl
    np_images[filled:filled + len(lbl)] = images
    filled += len(lbl)
assert filled == NUM_VALIDATION_IMAGES, "validation set size differs from the count encoded in the file names"

est_values = np.argmax(model.predict(ds_valid), axis=-1)

# tf.math.confusion_matrix takes (labels, predictions): rows are true classes,
# columns are predicted classes. num_classes keeps the matrix square even if
# some class never occurs.
conf_mat = tf.math.confusion_matrix(real_values, est_values, num_classes=len(CLASSES))

plt.matshow(conf_mat)

# True class of every misclassified sample (class 0 included).
err = real_values[est_values != real_values]

err_count = np.bincount(err, minlength=len(CLASSES)).astype(float)
class_count = np.bincount(real_values, minlength=len(CLASSES)).astype(float)

# Per-class error rate; classes absent from the validation set get 0 rather than 0/0.
relative_err = np.divide(err_count, class_count, out=np.zeros_like(err_count), where=class_count > 0)

# Classes sorted from highest to lowest error rate.
leaderboard = np.argsort(-relative_err)

WORST_K = 16
worst_classes = leaderboard[:WORST_K]
worst_names = np.array(CLASSES)[worst_classes]
wors_accuraci = 1.0 - relative_err[worst_classes]

print(relative_err)
print(worst_names)

# Pick one representative validation image per worst class: the first
# occurrence of that class in dataset order (not necessarily a misclassified one).
pending = {int(c) for c in worst_classes}
wrong_examples = []
for i, cls in enumerate(real_values):
    if not pending:
        break
    if int(cls) in pending:
        wrong_examples.append(i)
        pending.discard(int(cls))

wrong_images = np_images[wrong_examples]
wrong_labels = real_values[wrong_examples]

display_set_of_images(wrong_images, wrong_labels)




In [None]:
# Drop the reference to the validation image array built for the statistics,
# presumably so that its memory can be reclaimed before predicting on the test set.
np_images = None

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

print('Computing predictions...')
# Keep only the images; the ids are read separately when the submission is written.
test_images_ds = ds_test.map(lambda image, idnum: image)
probabilities = model.predict(test_images_ds)
# The predicted class of each image is the index of its highest probability.
predictions = np.argmax(probabilities, axis=-1)
print(predictions)

In [None]:
# Copied from https://www.kaggle.com/ryanholbrook/create-your-first-submission

print('Generating submission.csv file...')

# Get image ids from test set and convert to unicode
# All ids are pulled in a single batch of NUM_TEST_IMAGES. ds_test was built with
# ordered=True, so the ids line up with the rows of `predictions`.
test_ids_ds = ds_test.map(lambda image, idnum: idnum).unbatch()
test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U')

# Write the submission file
# One "id,label" row per test image, preceded by a header line.
np.savetxt(
    'submission.csv',
    np.rec.fromarrays([test_ids, predictions]),
    fmt=['%s', '%d'],
    delimiter=',',
    header='id,label',
    comments='',
)

# Look at the first few predictions
!head submission.csv