In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from kaggle_datasets import KaggleDatasets
import numpy as np
import tensorflow as tf
import re
from tensorflow import keras

# Report the TensorFlow version in use so the run can be reproduced.
print(f'Tensorflow version: {tf.__version__}')

In [None]:
# Pick a distribution strategy: TPU when one can be resolved, default otherwise.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable
    # is set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError: carry on without a TPU.
    tpu = None

if not tpu:
    # Default strategy that works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the
    # strategy used to distribute training.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print('Replicas ',strategy.num_replicas_in_sync)

In [None]:
# Run configuration shared by the cells below.

# Sentinel passed to tf.data calls (parallel reads/maps, prefetch).
AUTO = tf.data.experimental.AUTOTUNE

# Image height and width; must match the resolution of the TFRecord set loaded below.
IMAGE_SIZE = [224, 224]
EPOCHS = 20
FOLDS = 3
# Global batch size: 16 examples for each replica of the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 16

In [None]:

# Resolve the GCS location of the competition dataset.
GCS_DS_PATH = KaggleDatasets().get_gcs_path('tpu-getting-started')
print(GCS_DS_PATH)

# Sub-folder holding the 224x224 JPEG TFRecords.
GCS_PATH = GCS_DS_PATH + '/tfrecords-jpeg-224x224'
AUTO = tf.data.experimental.AUTOTUNE

# One list of TFRecord file names per split, in train / val / test order.
TRAINING_FILENAMES, VALIDATION_FILENAMES, TEST_FILENAMES = (
    tf.io.gfile.glob(GCS_PATH + split_pattern)
    for split_pattern in ('/train/*.tfrec', '/val/*.tfrec', '/test/*.tfrec')
)

In [None]:
# Flower class names. The list is indexed with the integer label of an example
# (see the sample-grid cell, which titles each image with CLASSES[label]), and
# len(CLASSES) sets the width of the classifier's output layer.
# The trailing comment on each row gives the index range of that row.
# NOTE(review): 'cyclamen ' and 'hippeastrum ' carry a trailing space - confirm whether that is intentional.
CLASSES = ['pink primrose',    'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea',     'wild geranium',     'tiger lily',           'moon orchid',              'bird of paradise', 'monkshood',        'globe thistle',         # 00 - 09
           'snapdragon',       "colt's foot",               'king protea',      'spear thistle', 'yellow iris',       'globe-flower',         'purple coneflower',        'peruvian lily',    'balloon flower',   'giant white arum lily', # 10 - 19
           'fire lily',        'pincushion flower',         'fritillary',       'red ginger',    'grape hyacinth',    'corn poppy',           'prince of wales feathers', 'stemless gentian', 'artichoke',        'sweet william',         # 20 - 29
           'carnation',        'garden phlox',              'love in the mist', 'cosmos',        'alpine sea holly',  'ruby-lipped cattleya', 'cape flower',              'great masterwort', 'siam tulip',       'lenten rose',           # 30 - 39
           'barberton daisy',  'daffodil',                  'sword lily',       'poinsettia',    'bolero deep blue',  'wallflower',           'marigold',                 'buttercup',        'daisy',            'common dandelion',      # 40 - 49
           'petunia',          'wild pansy',                'primula',          'sunflower',     'lilac hibiscus',    'bishop of llandaff',   'gaura',                    'geranium',         'orange dahlia',    'pink-yellow dahlia',    # 50 - 59
           'cautleya spicata', 'japanese anemone',          'black-eyed susan', 'silverbush',    'californian poppy', 'osteospermum',         'spring crocus',            'iris',             'windflower',       'tree poppy',            # 60 - 69
           'gazania',          'azalea',                    'water lily',       'rose',          'thorn apple',       'morning glory',        'passion flower',           'lotus',            'toad lily',        'anthurium',             # 70 - 79
           'frangipani',       'clematis',                  'hibiscus',         'columbine',     'desert-rose',       'tree mallow',          'magnolia',                 'cyclamen ',        'watercress',       'canna lily',            # 80 - 89
           'hippeastrum ',     'bee balm',                  'pink quill',       'foxglove',      'bougainvillea',     'camellia',             'mallow',                   'mexican petunia',  'bromelia',         'blanket flower',        # 90 - 99
           'trumpet creeper',  'blackberry lily',           'common tulip',     'wild rose']                                                                                                                                               # 100 - 103

In [None]:
def decode_image(image_data):
    """Turn encoded JPEG bytes into a float32 RGB image tensor.

    Args:
        image_data: the encoded JPEG content, as accepted by
            ``tf.image.decode_jpeg``.

    Returns:
        The decoded 3-channel image, cast to float32, divided by 255 and
        reshaped to ``[*IMAGE_SIZE, 3]``.
    """
    pixels = tf.image.decode_jpeg(image_data, channels = 3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # The explicit reshape pins the static shape of the tensor.
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])

In [None]:
def read_labeled_tfrecord(example):
    """Parse one serialized labeled example into an ``(image, label)`` pair.

    Args:
        example: a serialized example holding an ``image`` string feature and
            a ``class`` int64 feature.

    Returns:
        Tuple of the image decoded by ``decode_image`` and the class cast to
        int32.
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'class': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

In [None]:
def read_unlabeled_tfrecord(test_example):
    """Parse one serialized unlabeled example into an ``(image, id)`` pair.

    Args:
        test_example: a serialized example holding an ``image`` string feature
            and an ``id`` string feature.

    Returns:
        Tuple of the image decoded by ``decode_image`` and the raw ``id``
        string tensor.
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'id': tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(test_example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

In [None]:
def data_augment(image, label):
    """Apply a random horizontal flip to the image; the label passes through unchanged.

    Args:
        image: image tensor to augment.
        label: label belonging to the image.

    Returns:
        Tuple ``(augmented_image, label)``.
    """
    flipped = tf.image.random_flip_left_right(image)
    return flipped, label

In [None]:
def load_dataset(filenames, labeled = True, ordered = False):
    """Build a ``tf.data`` pipeline of parsed examples from TFRecord files.

    Args:
        filenames: TFRecord file names to read.
        labeled: when true, records are parsed with ``read_labeled_tfrecord``;
            otherwise with ``read_unlabeled_tfrecord``.
        ordered: when false, the dataset options turn off deterministic
            ordering.

    Returns:
        A dataset yielding the pairs produced by the selected parser.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False

    parser = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads = AUTO)
    return records.with_options(options).map(parser, num_parallel_calls = AUTO)

In [None]:
def get_training_dataset():
    """Return the training pipeline: augmented, repeated, shuffled and batched.

    The dataset repeats indefinitely, so the caller must bound each epoch
    (for example through ``steps_per_epoch``).
    """
    return (
        load_dataset(TRAINING_FILENAMES, labeled = True, ordered = False)
        .map(data_augment, num_parallel_calls = AUTO)
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [None]:
def get_validation_dataset():
    """Return the validation pipeline: batched, cached after batching, prefetched."""
    return (
        load_dataset(VALIDATION_FILENAMES, labeled = True, ordered = False)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )

In [None]:
def get_test_dataset():
    """Return the unlabeled test pipeline of ``(image, id)`` batches, read in order."""
    return (
        load_dataset(TEST_FILENAMES, labeled = False, ordered = True)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [None]:
def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    Each name is expected to contain ``-<digits>.``; the digits are read as
    the number of items in that file (``00-224x224-798.tfrec`` counts as 798).

    Args:
        filenames: iterable of file names or paths.

    Returns:
        The integer total of all counts; 0 when ``filenames`` is empty.

    Raises:
        ValueError: if a name contains no ``-<digits>.`` segment.
    """
    # Compile once instead of once per file name; require at least one digit
    # so an empty capture can never reach int().
    count_pattern = re.compile(r"-([0-9]+)\.")
    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(f"cannot read item count from file name: {filename!r}")
        counts.append(int(match.group(1)))
    # An explicit integer dtype keeps the total integral for an empty list
    # (np.sum([]) would otherwise return the float 0.0).
    return np.sum(counts, dtype = int)

In [None]:
# Item counts per split, read from the TFRecord file names.
NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES = map(
    count_data_items,
    (TRAINING_FILENAMES, VALIDATION_FILENAMES, TEST_FILENAMES),
)
# Number of full batches in one pass over the training set.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
print('Dataset: {} training images, {} validation images, {} unlabeled test images'.format(NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES))

In [None]:
# Build the pipelines once; the training cells below all reuse them.
training_dataset, validation_dataset = get_training_dataset(), get_validation_dataset()

In [None]:
# Pull a single training batch and keep its first 16 images and labels as
# NumPy arrays: data[0] holds the images, data[1] the labels.
for img, label in training_dataset.take(1):
    data = [tensor[:16].numpy() for tensor in (img, label)]

In [None]:
# Show the shapes of the sampled images and labels.
tuple(part.shape for part in data)

In [None]:
import matplotlib.pyplot as plt

# Show the 16 sampled training images in a 4 x 4 grid, each titled with its class name.
rows, cols = 4, 4
fig = plt.figure(figsize = (10, 10))
for position in range(rows * cols):
    # Subplot positions are 1-based, the sample arrays 0-based.
    ax = fig.add_subplot(rows, cols, position + 1)
    img = data[0][position]
    label = data[1][position]
    ax.axis('off')
    ax.imshow(img)
    ax.set_title(CLASSES[label])
plt.tight_layout()
plt.show()

In [None]:
# Learning-rate schedule parameters consumed by lrfun.
LR_START = 1e-5            # rate at epoch 0
LR_MIN = 1e-5              # value the decay approaches
LR_MAX = 5e-5 * strategy.num_replicas_in_sync  # peak rate, scaled by replica count
LR_RAMPUP_EPOCHS = 5       # epochs spent ramping linearly from LR_START to LR_MAX
LR_SUSTAIN_EPOCHS = 0      # epochs held at LR_MAX before decaying
LR_EXP_DECAY = 0.8         # per-epoch decay factor applied after the sustain phase


def lrfun(epoch):
    """Return the learning rate for ``epoch`` (0-based).

    The schedule has three phases driven by the ``LR_*`` module constants:
    a linear ramp from ``LR_START`` to ``LR_MAX`` over ``LR_RAMPUP_EPOCHS``,
    a hold at ``LR_MAX`` for ``LR_SUSTAIN_EPOCHS``, then an exponential decay
    towards ``LR_MIN`` with factor ``LR_EXP_DECAY`` per epoch.
    """
    # Phase 1: linear ramp-up.
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS*epoch + LR_START

    # Phase 2: hold at the peak.
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    # Phase 3: exponential decay towards LR_MIN.
    return (LR_MAX - LR_MIN)*LR_EXP_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN

In [None]:
# Keras callback that sets the learning rate from lrfun at each epoch.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfun, verbose = True)

# Preview the schedule over at least 25 epochs.
rng = list(range(max(25, EPOCHS)))
y = list(map(lrfun, rng))
plt.plot(rng, y)
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))

# Build, compile and train a classifier on top of a frozen DenseNet201 backbone.
with strategy.scope():
    # ImageNet-pretrained backbone without its classification head; frozen so
    # that only the new Dense layer is trained.
    # NOTE(review): decode_image feeds pixels scaled by 1/255 - confirm that
    # matches the preprocessing these ImageNet weights expect.
    pretrained_model = tf.keras.applications.DenseNet201(weights = 'imagenet', include_top = False, input_shape = [*IMAGE_SIZE, 3])
    pretrained_model.trainable = False

    DN201 = tf.keras.Sequential([
        pretrained_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(len(CLASSES), activation = 'softmax', dtype = 'float32')
    ])

    # Compile inside the strategy scope, next to model construction, so the
    # optimizer and metric state are created under the same strategy.
    DN201.compile(
        optimizer = 'adam',
        loss = 'sparse_categorical_crossentropy',
        metrics = ['sparse_categorical_accuracy'])

# training_dataset repeats forever, so steps_per_epoch bounds each epoch.
# The epoch count comes from EPOCHS rather than a second hard-coded copy.
historicalDN201 = DN201.fit(
    training_dataset,
    steps_per_epoch = STEPS_PER_EPOCH,
    epochs = EPOCHS,
    callbacks = [lr_callback],
    validation_data = validation_dataset)


# Plot the DenseNet201 training curves: loss on the left, accuracy on the right.
training_loss = historicalDN201.history['loss']
training_sparse_categorical_accuracy = historicalDN201.history['sparse_categorical_accuracy']

validation_loss = historicalDN201.history['val_loss']
validation_sparse_categorical_accuracy = historicalDN201.history['val_sparse_categorical_accuracy']

# Take the x axis from the recorded history, so the plot still works when the
# number of epochs actually trained differs from EPOCHS.
epochs = np.arange(len(training_loss))

# Create the figure and both axes once and draw on them explicitly.
history_fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2)

loss_ax.plot(epochs, training_loss, label = 'Training Loss')
loss_ax.plot(epochs, validation_loss, label = 'Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.legend()

accuracy_ax.plot(epochs, training_sparse_categorical_accuracy, label = 'Training Accuracy')
accuracy_ax.plot(epochs, validation_sparse_categorical_accuracy, label = 'Validation Accuracy')
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.legend()

plt.show()

In [None]:
from tensorflow.keras.applications import InceptionV3

# Build, compile and train a classifier on top of a frozen InceptionV3 backbone.
with strategy.scope():
    # ImageNet-pretrained backbone without its classification head; frozen so
    # that only the new Dense layer is trained.
    # NOTE(review): decode_image feeds pixels scaled by 1/255 - confirm that
    # matches the preprocessing these ImageNet weights expect.
    pretrained_model = InceptionV3(weights = 'imagenet', include_top = False, input_shape = [*IMAGE_SIZE, 3])
    pretrained_model.trainable = False

    incv3 = tf.keras.Sequential([
        pretrained_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(len(CLASSES), activation = 'softmax', dtype = 'float32')
    ])

    # Compile inside the strategy scope, next to model construction, so the
    # optimizer and metric state are created under the same strategy.
    incv3.compile(
        optimizer = 'adam',
        loss = 'sparse_categorical_crossentropy',
        metrics = ['sparse_categorical_accuracy'])

# training_dataset repeats forever, so steps_per_epoch bounds each epoch.
# The epoch count comes from EPOCHS rather than a second hard-coded copy.
historicalincv3 = incv3.fit(
    training_dataset,
    steps_per_epoch = STEPS_PER_EPOCH,
    epochs = EPOCHS,
    callbacks = [lr_callback],
    validation_data = validation_dataset)


In [None]:
# Plot the InceptionV3 training curves: loss on the left, accuracy on the right.
training_loss = historicalincv3.history['loss']
training_sparse_categorical_accuracy = historicalincv3.history['sparse_categorical_accuracy']

validation_loss = historicalincv3.history['val_loss']
validation_sparse_categorical_accuracy = historicalincv3.history['val_sparse_categorical_accuracy']

# Take the x axis from the recorded history, so the plot still works when the
# number of epochs actually trained differs from EPOCHS.
epochs = np.arange(len(training_loss))

# Create the figure and both axes once and draw on them explicitly.
history_fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2)

loss_ax.plot(epochs, training_loss, label = 'Training Loss')
loss_ax.plot(epochs, validation_loss, label = 'Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.legend()

accuracy_ax.plot(epochs, training_sparse_categorical_accuracy, label = 'Training Accuracy')
accuracy_ax.plot(epochs, validation_sparse_categorical_accuracy, label = 'Validation Accuracy')
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.legend()

plt.show()

In [None]:
# Ordered test pipeline; the submission cell iterates it again to read the ids.
test_ds = get_test_dataset()

print('Computing predictions...')
# Drop the ids so the model only receives image batches.
test_images_ds = test_ds.map(lambda image, _idnum: image)
probabilities = incv3.predict(test_images_ds)
# Predicted class = index of the highest probability for each image.
predictions = probabilities.argmax(axis=-1)
print(predictions)

In [None]:
# Collect every test id with a second pass over the test dataset.
test_ids_ds = test_ds.map(lambda _image, idnum: idnum).unbatch()
# A single batch of NUM_TEST_IMAGES elements holds all ids at once.
all_ids_batch = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES)))
test_ids = all_ids_batch.numpy().astype('U')
# Pair each id with its predicted label and write them as CSV rows.
submission_rows = np.rec.fromarrays([test_ids, predictions])
np.savetxt('submission.csv', submission_rows, fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')