In [None]:
# Replace whatever TensorFlow build the runtime ships with by a pinned 2.7.0 install.
!pip uninstall tensorflow -y
!pip install tensorflow==2.7.0
# Provides the `cloud_tpu_client.Client` class imported further down in this cell.
!pip install cloud-tpu-client
# Provides the `PlotLossesKerasTF` callback imported further down in this cell.
!pip install livelossplot

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import os
import re
import glob

import tensorflow as tf
print("Tensorflow version " + tf.__version__)
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, GlobalAveragePooling2D, Dropout, AveragePooling2D, CenterCrop
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.regularizers import L2, L1, L1L2
from tensorflow.keras.metrics import TopKCategoricalAccuracy, top_k_categorical_accuracy, SparseTopKCategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import EfficientNetB0, EfficientNetB7
from tensorflow.keras.applications.efficientnet import preprocess_input
from kaggle_datasets import KaggleDatasets
from cloud_tpu_client import Client
# Ask the TPU client to configure the TPU with the TensorFlow version of this
# kernel (tf.version.VERSION); presumably restart_type='ifNeeded' restarts the
# TPU only when the versions differ — confirm against cloud-tpu-client docs.
Client().configure_tpu_version(tf.version.VERSION, restart_type='ifNeeded')
# Shorthand for tf.data's AUTOTUNE constant; passed below as the
# num_parallel_reads / num_parallel_calls / prefetch argument.
AUTO = tf.data.experimental.AUTOTUNE

from livelossplot import PlotLossesKerasTF
from livelossplot.outputs import MatplotlibPlot
from skimage.io import imread, imshow
from sklearn.metrics import accuracy_score

In [None]:
# Pick a distribution strategy: TPU when one is reachable, otherwise fall back
# to the local GPU(s) or CPU. The previous version raised BaseException in the
# fallback branch (despite announcing a switch to a GPU strategy), which left
# every fallback assignment below the raise unreachable.
try:  # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Non-experimental endpoint; tf.distribute.experimental.TPUStrategy is deprecated.
    strategy = tf.distribute.TPUStrategy(tpu)

except ValueError:  # no TPU available: detect GPUs
    print('Not connected to a TPU runtime; switching to GPU/CPU strategy')
    if len(tf.config.list_physical_devices('GPU')) > 1:
        # multi-GPU machine: replicate the model on every local GPU
        strategy = tf.distribute.MirroredStrategy()
    else:
        # default strategy that works on CPU and single GPU
        strategy = tf.distribute.get_strategy()

print("Number of accelerators: ", strategy.num_replicas_in_sync)

In [None]:
# Resolution of the TFRecord images to load; must be one of the sizes listed
# in GCS_PATH_SELECT below.
IMAGE_SIZE = [512, 512]

# Global batch size: 16 images for each replica of the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 16

GCS_DS_PATH = KaggleDatasets().get_gcs_path('tpu-getting-started')
# available image sizes -> folder holding the TFRecords of that size
GCS_PATH_SELECT = {
    side: f'{GCS_DS_PATH}/tfrecords-jpeg-{side}x{side}'
    for side in (192, 224, 331, 512)
}
GCS_PATH = GCS_PATH_SELECT[IMAGE_SIZE[0]]

# Shard lists for the three splits; predictions on the test split should be
# submitted for the competition.
TRAINING_FILENAMES, VALIDATION_FILENAMES, TEST_FILENAMES = (
    tf.io.gfile.glob(f'{GCS_PATH}/{split}/*.tfrec')
    for split in ('train', 'val', 'test')
)

# Flower class names, 104 entries in total; the trailing comment on each row
# gives the list indices that row covers. Presumably index i is the name of
# integer label i in the TFRecords — confirm against the dataset description.
# Note: 'cyclamen ' and 'hippeastrum ' carry a trailing space in this list.
CLASSES = ['pink primrose',    'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea',     'wild geranium',     'tiger lily',           'moon orchid',              'bird of paradise', 'monkshood',        'globe thistle',         # 00 - 09
           'snapdragon',       "colt's foot",               'king protea',      'spear thistle', 'yellow iris',       'globe-flower',         'purple coneflower',        'peruvian lily',    'balloon flower',   'giant white arum lily',  # 10 - 19
           'fire lily',        'pincushion flower',         'fritillary',       'red ginger',    'grape hyacinth',    'corn poppy',           'prince of wales feathers', 'stemless gentian', 'artichoke',        'sweet william',         # 20 - 29
           'carnation',        'garden phlox',              'love in the mist', 'cosmos',        'alpine sea holly',  'ruby-lipped cattleya', 'cape flower',              'great masterwort', 'siam tulip',       'lenten rose',           # 30 - 39
           'barberton daisy',  'daffodil',                  'sword lily',       'poinsettia',    'bolero deep blue',  'wallflower',           'marigold',                 'buttercup',        'daisy',            'common dandelion',      # 40 - 49
           'petunia',          'wild pansy',                'primula',          'sunflower',     'lilac hibiscus',    'bishop of llandaff',   'gaura',                    'geranium',         'orange dahlia',    'pink-yellow dahlia',    # 50 - 59
           'cautleya spicata', 'japanese anemone',          'black-eyed susan', 'silverbush',    'californian poppy', 'osteospermum',         'spring crocus',            'iris',             'windflower',       'tree poppy',            # 60 - 69
           'gazania',          'azalea',                    'water lily',       'rose',          'thorn apple',       'morning glory',        'passion flower',           'lotus',            'toad lily',        'anthurium',             # 70 - 79
           'frangipani',       'clematis',                  'hibiscus',         'columbine',     'desert-rose',       'tree mallow',          'magnolia',                 'cyclamen ',        'watercress',       'canna lily',            # 80 - 89
           'hippeastrum ',     'bee balm',                  'pink quill',       'foxglove',      'bougainvillea',     'camellia',             'mallow',                   'mexican petunia',  'bromelia',         'blanket flower',        # 90 - 99
           'trumpet creeper',  'blackberry lily',           'common tulip',     'wild rose']                                                                                                                                               # 100 - 103

In [None]:
def count_data_items(filenames):
    """Return the total number of records announced by a list of .tfrec filenames.

    The number of data items is written in the name of each .tfrec file,
    i.e. ``flowers00-230.tfrec`` = 230 data items: the count is the run of
    digits between a ``-`` and the following ``.``.

    Args:
        filenames: iterable of file names or paths.

    Returns:
        The sum of the per-file counts as a Python ``int`` (``0`` for an empty
        iterable, rather than the float ``0.0`` that ``np.sum([])`` produces).

    Raises:
        ValueError: if a filename carries no ``-<digits>.`` count.
    """
    # Compiled once instead of once per filename; `+` requires at least one
    # digit so that an empty match is never handed to int().
    count_pattern = re.compile(r"-([0-9]+)\.")
    total = 0
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(
                "no '-<count>.' item count found in filename: {!r}".format(filename))
        total += int(match.group(1))
    return total


# Size of each split, read from the counts embedded in the shard filenames.
NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES = map(
    count_data_items,
    (TRAINING_FILENAMES, VALIDATION_FILENAMES, TEST_FILENAMES),
)

# Floor division: one training epoch is made of whole batches only.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
# Validation and test must see every image, so round up instead:
# the "-(-//)" trick is a ceiling division.
VALIDATION_STEPS = -(-NUM_VALIDATION_IMAGES // BATCH_SIZE)
TEST_STEPS = -(-NUM_TEST_IMAGES // BATCH_SIZE)
print('Dataset: {} training images, {} validation images, {} unlabeled test images'.format(
    NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES))

In [None]:
# code from https://www.kaggle.com/zephyr44/getting-started-with-100-flowers-on-tpu/edit

def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel image tensor of shape [*IMAGE_SIZE, 3].

    The decoded image is in uint8 format, range [0, 255].
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    # explicit size needed for TPU
    static_shape = list(IMAGE_SIZE) + [3]
    return tf.reshape(pixels, static_shape)


def read_labeled_tfrecord(example):
    """Parse one serialized labeled example into an (image, label) pair.

    The label is read from the int64 "class" feature and cast to int32.
    """
    feature_spec = {
        # bytestring holding the encoded image
        "image": tf.io.FixedLenFeature([], tf.string),
        # shape [] means a single element
        "class": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)


def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled example into an (image, id) pair.

    The id is returned as stored in the "id" feature, i.e. as a string tensor.
    """
    feature_spec = {
        # bytestring holding the encoded image
        "image": tf.io.FixedLenFeature([], tf.string),
        # shape [] means a single element
        "id": tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

In [None]:
# Reference: https://github.com/facebookresearch/FixRes/blob/main/transforms_v2.py
def _resize_and_center_crop(image, size, ratio):
    """Resize `image` to int(ratio * size) per side, then center-crop it to size x size."""
    resized_side = int(ratio * size)
    image = tf.image.resize(image, [resized_side, resized_side])
    # The crop layer is applied to a batch of one: add a leading axis, crop,
    # then take the single image back out.
    return layers.CenterCrop(size, size)(image[None, ...])[0]


def processing(image, label, smaller_size, bigger_size, train=True, finetuning=False, fixres=True):
    """Apply the FixRes image transforms to one (image, label) pair.

    First stage (``finetuning=False``):
        * training: random crop -> resize to the training resolution ->
          random horizontal flip -> random saturation / brightness;
        * evaluation: resize to ``int(256 / 224 * size)`` and center-crop to
          ``size``.
      The resolution is ``smaller_size`` when ``fixres`` is true and
      ``bigger_size`` otherwise.
    Second stage (``finetuning=True``), training and evaluation alike:
        resize to ``int(256 / 224 * bigger_size)`` and center-crop to
        ``bigger_size``.

    Args:
        image: image tensor with 3 channels in the last axis.
        label: passed through unchanged (a class label, or an image id for
            unlabeled data).
        smaller_size: side length used for the first FixRes stage.
        bigger_size: side length used for fine-tuning / testing.
        train: whether to apply the random training augmentations.
        finetuning: whether to apply the second-stage transform.
        fixres: first stage only; selects ``smaller_size`` over ``bigger_size``.

    Returns:
        The ``(image, label)`` pair with the transformed image.
    """
    # Keep the ratio as a float and truncate only after multiplying by the
    # size. The previous `int(256 / 224)` evaluated to 1, which turned every
    # "resize by ratio, then center crop" below into a plain resize.
    ratio = 256 / 224

    if finetuning:
        return _resize_and_center_crop(image, bigger_size, ratio), label

    target_size = smaller_size if fixres else bigger_size
    if not train:
        return _resize_and_center_crop(image, target_size, ratio), label

    # Training branch: the random crop is resized straight to the training
    # resolution. The test-time resize + center crop is deliberately not
    # stacked on top of it (with the old ratio of 1 it had no effect here).
    channels = 3
    begin, size, _ = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        tf.zeros([0, 0, 4], tf.float32),  # no bounding boxes supplied
        area_range=(0.2, 1.0),
        min_object_covered=0,
        use_image_if_no_bounding_boxes=True,
    )
    image = tf.slice(image, begin, size)
    image.set_shape([None, None, channels])
    image = tf.image.resize(image, [target_size, target_size])
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_saturation(image, 0.7, 1.3)
    image = tf.image.random_brightness(image, 0.3)
    return image, label


def generate_augmented_dataset(filenames, smaller_size, bigger_size, train=True, finetuning=False, fixres=True, labeled=True):
    """Build a batched, prefetched tf.data pipeline over TFRecord files.

    Records are parsed with the labeled or unlabeled reader (per `labeled`),
    then passed through `processing` with the given sizes and flags. Training
    datasets additionally repeat and are shuffled with a 2048-element buffer.

    Returns:
        A dataset of batches of BATCH_SIZE (image, label-or-id) pairs.
    """
    read_options = tf.data.Options()
    if labeled:
        # disable order, increase speed
        read_options.experimental_deterministic = False

    parse_record = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord

    def transform(image, target):
        return processing(
            image, target, smaller_size, bigger_size, train, finetuning, fixres)

    pipeline = (
        # automatically interleaves reads from multiple files
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
        # uses data as soon as it streams in, rather than in its original order
        .with_options(read_options)
        .map(parse_record, num_parallel_calls=AUTO)
        .map(transform, num_parallel_calls=AUTO)
    )
    if train:
        # the training dataset must repeat for several epochs
        pipeline = pipeline.repeat().shuffle(2048)

    # prefetch next batch while training (autotune prefetch buffer size)
    return pipeline.batch(BATCH_SIZE).prefetch(AUTO)

In [None]:
# The first stage works at half the side length of the stored images; the
# fine-tuning stage works at the full side length.
bigger_size = IMAGE_SIZE[0]
smaller_size = bigger_size // 2

# First stage datasets.
fixres_train_data = generate_augmented_dataset(
    filenames=TRAINING_FILENAMES,
    smaller_size=smaller_size,
    bigger_size=bigger_size,
    train=True,
    finetuning=False,
)
fixres_val_data = generate_augmented_dataset(
    filenames=VALIDATION_FILENAMES,
    smaller_size=smaller_size,
    bigger_size=bigger_size,
    train=False,
    finetuning=False,
)

# Fine-tuning stage datasets.
finetune_train_data = generate_augmented_dataset(
    filenames=TRAINING_FILENAMES,
    smaller_size=smaller_size,
    bigger_size=bigger_size,
    train=True,
    finetuning=True,
)
finetune_val_data = generate_augmented_dataset(
    filenames=VALIDATION_FILENAMES,
    smaller_size=smaller_size,
    bigger_size=bigger_size,
    train=False,
    finetuning=True,
)

In [None]:
class _SparseF1Score(tfa.metrics.F1Score):
    """F1 score that accepts integer class ids instead of one-hot labels."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        # The model is compiled with a sparse loss, so y_true holds class ids,
        # whereas tfa's F1Score expects one row of num_classes indicators per
        # sample. Convert before delegating to the parent implementation.
        y_true = tf.one_hot(
            tf.cast(tf.reshape(y_true, [-1]), tf.int32), self.num_classes)
        return super().update_state(y_true, y_pred, sample_weight)


def build_model(input_shape, num_classes=len(CLASSES), lr=1e-4):
    """Build and compile an EfficientNetB7-based classifier under `strategy`.

    Args:
        input_shape: side length of the square RGB input images.
        num_classes: number of output classes (size of the softmax layer and
            of the F1 metric).
        lr: learning rate of the Adam optimizer.

    Returns:
        The compiled Keras model. Only the last 11 layers of the EfficientNet
        backbone and the new classification head are trainable.
    """
    with strategy.scope():
        inputs = Input((input_shape, input_shape, 3))
        # Get the layers of the EfficientNet model
        efficientnet = EfficientNetB7(weights='imagenet', include_top=False)

        # Freeze the pretrained layers except the last 11
        print(efficientnet.layers[-11:])
        for layer in efficientnet.layers[:-11]:
            layer.trainable = False
        for layer in efficientnet.layers[-11:]:
            layer.trainable = True

        # L2 regularization
        # NOTE(review): the attribute is assigned on layers that are already
        # built; Keras may only register regularization losses when a layer's
        # weights are created, in which case this has no effect — confirm.
        regularizer = L2()
        for layer in efficientnet.layers:
            if hasattr(layer, 'kernel_regularizer'):
                layer.kernel_regularizer = regularizer

        x = preprocess_input(inputs)
        x = efficientnet(x)

        # Add the final (classification) layers
        top_layers = Dropout(0.5)(x)
        top_layers = GlobalAveragePooling2D()(top_layers)
        top_layers = Dropout(0.5)(top_layers)
        output_top_layers = Dense(
            num_classes, activation='softmax')(top_layers)

        model = Model(inputs=inputs, outputs=output_top_layers)

        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=lr),
            loss="sparse_categorical_crossentropy",
            metrics=[
                # Sized by num_classes (not len(CLASSES)) so it always matches
                # the Dense layer; the sparse variant matches the integer labels.
                _SparseF1Score(num_classes, average='macro'),
                "sparse_categorical_accuracy",
                SparseTopKCategoricalAccuracy(k=3),
            ],
            # steps_per_execution=16
        )

        return model

In [None]:
def train_and_evaluate(model, name, train_data, val_data,  epochs):
    """Fit `model` on `train_data`, validating on `val_data`, and return it.

    Training runs for at most `epochs` epochs of STEPS_PER_EPOCH steps, with
    VALIDATION_STEPS validation steps per epoch. Four callbacks are attached:
    learning-rate reduction and early stopping (both watching ``val_loss``),
    a live loss plot, and a best-only checkpoint written to
    ``{name}.weights.best.hdf5``. No separate evaluation call is made; the
    only evaluation is the per-epoch validation inside ``fit``.
    """
    lr_on_plateau = ReduceLROnPlateau(
        monitor='val_loss', factor=0.1, patience=3, min_lr=0.00001, min_delta=0.001)
    stop_early = EarlyStopping(
        monitor='val_loss', patience=6, min_delta=0, verbose=1, mode='auto')
    best_checkpoint = ModelCheckpoint(
        filepath=f'{name}.weights.best.hdf5', save_best_only=True)
    live_plot = PlotLossesKerasTF()

    fit_options = dict(
        validation_data=val_data,
        epochs=epochs,
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_steps=VALIDATION_STEPS,
        callbacks=[lr_on_plateau, stop_early, live_plot, best_checkpoint],
    )
    model.fit(train_data, **fit_options)

    return model

# FixRes Model

In [None]:
# First FixRes stage: the model is built for the smaller input resolution.
model_name = 'Flower_Classification_wFixRes'
model = build_model(smaller_size)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# First stage: train on the smaller-resolution datasets for up to 50 epochs.
model = train_and_evaluate(
    model=model,
    name=model_name,
    train_data=fixres_train_data,
    val_data=fixres_val_data,
    epochs=50,
)

In [None]:
# Second FixRes stage: rebuild the model for the bigger input resolution with
# a lower learning rate, starting from the best first-stage checkpoint.
model_name = 'Flower_Classification_wFixRes_finetune'
finetune_model = build_model(bigger_size, lr=1e-5)
# Plain string: the previous f-string had nothing to interpolate.
finetune_model.load_weights('Flower_Classification_wFixRes.weights.best.hdf5')

# Freeze everything in layers[1] (presumably the EfficientNet backbone —
# verify against the summary) except its last two layers, and keep the layers
# after it trainable.
# NOTE(review): these flags change after build_model() has already compiled
# the model; the Keras documentation recommends calling compile() again after
# changing `trainable` — confirm the flags are honored here.
for layer in finetune_model.layers[1].layers[:-2]:
    layer.trainable = False
for layer in finetune_model.layers[1].layers[-2:]:
    layer.trainable = True
for layer in finetune_model.layers[2:7]:
    layer.trainable = True

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
finetune_model.summary()

In [None]:
# Second stage: fine-tune on the bigger-resolution datasets for up to 100
# epochs. The lower learning rate used during fine-tuning was set when
# finetune_model was built.
finetune_model = train_and_evaluate(
    model=finetune_model,
    name=model_name,
    train_data=finetune_train_data,
    val_data=finetune_val_data,
    epochs=100,
)

In [None]:
# Unlabeled test split, transformed the same way as the fine-tuning data.
test_data = generate_augmented_dataset(
    filenames=TEST_FILENAMES,
    smaller_size=smaller_size,
    bigger_size=bigger_size,
    train=False,
    finetuning=True,
    fixres=True,
    labeled=False,
)

print('Computing predictions...')
# Keep only the images for prediction; the ids are read again when the
# submission file is written.
test_images_ds = test_data.map(lambda img, _img_id: img)
probabilities = finetune_model.predict(test_images_ds)
# Predicted class = index of the highest probability in each row.
predictions = probabilities.argmax(axis=-1)
print(predictions)

In [None]:
print('Generating submission.csv file...')

# Get image ids from test set and convert to unicode.
# This is a second pass over test_data: the ids line up with `predictions`
# only if this pass yields the records in the same order as the prediction
# pass. test_data was built with labeled=False, which leaves the dataset's
# ordering option at its default.
test_ids_ds = test_data.map(lambda image, idnum: idnum).unbatch()
# Re-batch all NUM_TEST_IMAGES ids into one batch and take it as a numpy array.
test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U')

# Write the submission file: one "id,label" row per test image, with a header
# line and no comment prefix in front of it.
np.savetxt(
    'submission.csv',
    np.rec.fromarrays([test_ids, predictions]),
    fmt=['%s', '%d'],
    delimiter=',',
    header='id,label',
    comments='',
)

# Look at the first few predictions
!head submission.csv