In [None]:
from kaggle_datasets import KaggleDatasets
import math, re, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from functools import partial

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split, StratifiedKFold

import cv2
print("Tensorflow version " + tf.__version__)

Nu ser vi, om vi kan få forbindelse til vores TPU'er:

In [None]:
# Try to attach to a TPU; fall back to the default distribution strategy
# when no TPU can be reached.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report the reason
    # instead of silently training without the TPU.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

# Setup

Nu sætter vi vores konfigurationsvariabler og bestemmer, hvordan vi skal håndtere billederne i datasættet:

TPU'er læser data fra Google Cloud Storage (GCS) buckets.

In [None]:
# Notebook-wide configuration used by the input pipeline, the model and training.
AUTOTUNE = tf.data.experimental.AUTOTUNE  # passed to the tf.data calls below (parallel reads/calls, prefetch)
GCS_PATH = KaggleDatasets().get_gcs_path() # Google Cloud Storage (GCS) bucket
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 examples per replica
IMAGE_SIZE = [512, 512]  # spatial size used by the reshape in decode_image
CLASSES = ['0', '1', '2', '3', '4']  # its length sets the number of output units of the model
EPOCHS = 15  # upper bound on training epochs passed to model.fit

In [None]:
def decode_image(image):
    """Decode a JPEG byte string into a float32 image tensor.

    The decoded 3-channel pixels are divided by 255.0 and the result is
    reshaped to ``[*IMAGE_SIZE, 3]``.
    """
    pixels = tf.image.decode_jpeg(image, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0  # 0-255 RGB values -> floats in the [0, 1] range
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])

Nu sætter vi vores "features" som "image", og vores label som "target".

Endnu vigtigere er det, at datasættet allerede er blevet formateret til TFRecords for os (fra Kaggles side).
TFRecords er en serialisering af datasættet, hvor dataene gemmes som bytes. Derfor skal vores data omdannes til byte-strings, før de kan gemmes i en TFRecord.

Efterfølgende kan man konvertere bytestrings tilbage til tensors

In [None]:
def read_tfrecord(example, labeled):
    """Parse one serialized TFRecord example into a 2-tuple.

    Args:
        example: serialized example as read from a TFRecord file.
        labeled: when truthy the record is parsed with a ``target`` feature,
            otherwise with an ``image_name`` feature.

    Returns:
        ``(image, label)`` with the label cast to int32 when ``labeled`` is
        truthy, otherwise ``(image, image_name)``.
    """
    # tf.string means the feature is stored as a byte string
    feature_spec = {"image": tf.io.FixedLenFeature([], tf.string)}
    if labeled:
        feature_spec["target"] = tf.io.FixedLenFeature([], tf.int64)
    else:
        feature_spec["image_name"] = tf.io.FixedLenFeature([], tf.string)

    parsed = tf.io.parse_single_example(example, feature_spec)
    image = decode_image(parsed['image'])
    if not labeled:
        return image, parsed['image_name']
    return image, tf.cast(parsed['target'], tf.int32)

In [None]:
#train_df = tf.io.gfile.glob(GCS_PATH + 'train.csv')
#display(train_df.head())

In [None]:
def read_dataset(tfrecords, labeled = True, ordered = False):
    """Create a dataset of parsed examples from TFRecord files.

    Args:
        tfrecords: TFRecord file name(s) handed to ``tf.data.TFRecordDataset``.
        labeled: forwarded to ``read_tfrecord`` to select the record format.
        ordered: when falsy, deterministic ordering is switched off on the
            dataset options, trading order for speed.

    Returns:
        The dataset with ``read_tfrecord`` mapped over every record.
    """
    options = tf.data.Options()
    if not ordered:
        # Use the data as soon as it arrives instead of keeping it in order.
        options.experimental_deterministic = False

    parse_example = partial(read_tfrecord, labeled=labeled)

    # Several files are read in parallel.
    records = tf.data.TFRecordDataset(tfrecords, num_parallel_reads=AUTOTUNE)
    return records.with_options(options).map(parse_example, num_parallel_calls=AUTOTUNE)
        

## Training & Validation datasæt

Nu gør jeg brug af scikit-learns "train_test_split"-funktion til at opdele TFRecord-filerne i et træningssæt og et valideringssæt.

Hvis jeg havde tid, ville jeg have gjort brug af K-fold Cross Validation i stedet, men det er noget mere kompliceret.

In [None]:
# Split the list of training TFRecord *files* (not individual images) into
# a training part and a validation part; 30% of the files go to validation.
train_tfrecords, valid_tfrecords = train_test_split(
    tf.io.gfile.glob(GCS_PATH + '/train_tfrecords/ld_train*.tfrec'),
    test_size=0.30, random_state = 42  # fixed seed so the split is the same on every run
)


# Folder of the training images in the bucket (only referenced by commented-out code in this notebook)
train_images = GCS_PATH + '/train_images/'

# TFRecord files of the test set (read with labeled = False in get_test_dataset)
test_tfrecords = tf.io.gfile.glob(GCS_PATH + '/test_tfrecords/ld_test*.tfrec')

## Data Augmentation

Hvis man selv udførte data augmentation på billederne ned til mindste detalje, ville man bruge keras ImageDataGenerator.

Jeg bruger dog en simpel metode først.

In [None]:
#ImageDataGenerator(
                  #  rotation_range = 30,
                  #  width_shift_range = 0.2,
                  #  height_shift_range = 0.2,
                  #  shear_range = 0.2,
                  #  zoom_range = 0.2,
                  #  brightness_range = [0.5,1.5],
                   # horizontal_flip = True,
                   # vertical_flip = True,
                   # fill_mode = 'nearest'
#)

In [None]:
#def data_augmentation(image, label):
   # image = tf.image.random_brightness(image, 0.2)
    #image = tf.image.random_contrast(image, 0.2, 0.4)
    #image = tf.image.random_flip_left_right(image)
   # image = tf.image.random_flip_up_down(image)
   # return image, label

In [None]:
def data_augmentation(image, label):
    """Placeholder augmentation step: hands back ``image`` and ``label`` untouched."""
    sample = (image, label)
    return sample

Men vi er så heldige, at "dataset.prefetch(AUTOTUNE)" i den følgende funktion forbereder de næste batches (inkl. augmentation) i baggrunden, mens modellen træner på TPU'en, så augmentationen er næsten "gratis".

In [None]:
def get_train_dataset():
    """Build the training pipeline from ``train_tfrecords``.

    Labeled records are read unordered, passed through ``data_augmentation``,
    repeated indefinitely, shuffled with a buffer of 2048, batched with
    ``BATCH_SIZE`` and prefetched.
    """
    return (
        read_dataset(train_tfrecords, labeled = True)
        .map(data_augmentation, num_parallel_calls = AUTOTUNE)
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

In [None]:
def get_valid_dataset():
    """Build the validation pipeline from ``valid_tfrecords``.

    Labeled records are read in order, batched with ``BATCH_SIZE``, cached
    and prefetched.
    """
    return (
        read_dataset(valid_tfrecords, labeled = True, ordered = True)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTOTUNE)
    )

In [None]:
def get_test_dataset(ordered = True):
    """Build the test pipeline from ``test_tfrecords``.

    Args:
        ordered: forwarded to ``read_dataset``. The argument used to be
            ignored (the records were always read in order); it is now
            honoured. The default is ``True`` so that a call without
            arguments keeps reading the records in order, which keeps images
            and image names aligned when the dataset is iterated more than once.

    Returns:
        A batched, prefetched dataset of ``(image, image_name)`` pairs.
    """
    dataset = read_dataset(test_tfrecords, labeled = False, ordered = ordered)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

## Lad os få antal på dataen:

In [None]:
def count_data(tfrecords):
    """Return the total number of examples encoded in TFRecord file names.

    Every name must contain a ``-<digits>.`` marker (for instance a name
    shaped like ``ld_train00-1338.tfrec``); the digits of the first such
    marker are taken as the number of examples in that file.

    Args:
        tfrecords: iterable of TFRecord file names / paths.

    Returns:
        The sum of all counts as a Python ``int`` (``0`` for an empty input).

    Raises:
        ValueError: if a name carries no ``-<digits>.`` marker.
    """
    count_pattern = re.compile(r"-([0-9]+)\.")  # compiled once, not once per file
    total = 0
    for tfrecord in tfrecords:
        match = count_pattern.search(tfrecord)
        if match is None:
            raise ValueError(
                "No '-<count>.' marker found in TFRecord name: {!r}".format(tfrecord))
        total += int(match.group(1))
    return total

In [None]:
# Count the images of every split from the TFRecord file names
num_train_images, num_valid_images, num_test_images = (
    count_data(files)
    for files in (train_tfrecords, valid_tfrecords, test_tfrecords)
)

print(
    "Datasæt: {} billeder til træning, {} billeder til validering, {} (unlabeled) test billeder".format(
        num_train_images, num_valid_images, num_test_images
    )
)

# EDA?

In [None]:
#plt.figure(figsize = (20,20))
#for i in range(20):
   # plt.subplot(4,5,i+1)
   # img = cv2.imread(train_images + images[i])
    #img = cv2.imread(tf.io.gfile.glob(GCS_PATH + '/train_images/10*.jpg'))
   # img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    #plt.imshow(img)
   # plt.title(data[str(labels[i])])

# Valg af model

In [None]:
# Build and compile the classifier inside the distribution strategy scope so
# that its variables are created on the strategy's devices.
with strategy.scope():

    def _to_resnet_input(images):
        """Convert the [0, 1] floats produced by decode_image to ResNet50 input."""
        # resnet50.preprocess_input is documented for pixel values in [0, 255],
        # so undo the /255.0 scaling before applying it.
        return tf.keras.applications.resnet50.preprocess_input(images * 255.0)

    # First layer of the model: it carries the input shape. IMAGE_SIZE is a
    # list, so it has to be unpacked -- [IMAGE_SIZE, IMAGE_SIZE, 3] would be a
    # nested list rather than a (height, width, channels) shape.
    adjust_img_layer = tf.keras.layers.Lambda(
        _to_resnet_input,
        input_shape=[*IMAGE_SIZE, 3])

    # Pretrained convolutional backbone, kept frozen.
    base_model = tf.keras.applications.ResNet50(
        weights = "imagenet",
        include_top = False)
    base_model.trainable = False

    model = tf.keras.Sequential([
        adjust_img_layer,
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        # Normalisation moved behind the backbone: placed in front of
        # preprocess_input it re-scaled the pixels, so the pretrained network
        # no longer received the input statistics it was trained on.
        tf.keras.layers.BatchNormalization(renorm = True),
        tf.keras.layers.Dense(8, activation = "relu"),
        tf.keras.layers.Dense(len(CLASSES), activation = 'softmax')
    ])
    model.compile(
        optimizer = tf.keras.optimizers.Adam(
            learning_rate = 0.001),
        loss = "sparse_categorical_crossentropy",
        metrics = ["sparse_categorical_accuracy"])

# Træning af model

In [None]:
# Instantiate the training and validation input pipelines
train_dataset, valid_dataset = get_train_dataset(), get_valid_dataset()

In [None]:
# Stops training once the validation loss has not decreased for 5 epochs
# and restores the weights of the best epoch.
early_stopping = EarlyStopping(monitor = 'val_loss',
                               patience = 5,
                               mode = 'min',
                               restore_best_weights = True)

# Saves the model with the highest validation accuracy to best_model.hdf5.
# NOTE(review): the original author noted "does not work?" -- a callback only
# runs when it is included in the `callbacks` list given to model.fit();
# confirm that it is passed there.
checkpoint = ModelCheckpoint('best_model.hdf5', 
                             monitor = 'val_sparse_categorical_accuracy',
                             verbose = 1,
                             mode = 'max', 
                             save_best_only = True)
# Multiplies the learning rate by 0.2 when the validation loss has not
# decreased for 2 epochs.
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss',
                              factor = 0.2,
                              patience = 2,
                              mode = 'min',
                              verbose = 1)

In [None]:
# Number of full batches in each split (integer division drops a final partial batch)
epoch_steps = num_train_images // BATCH_SIZE
valid_steps = num_valid_images // BATCH_SIZE

In [None]:
# Train the model. `checkpoint` has to be part of the callbacks list:
# a callback that is not passed to fit() never runs, so best_model.hdf5
# would never be written.
history = model.fit(train_dataset,
                    validation_data = valid_dataset,
                    epochs = EPOCHS,
                    steps_per_epoch = epoch_steps,
                    validation_steps = valid_steps,
                    callbacks = [early_stopping, checkpoint, reduce_lr]
                   )

In [None]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
# List the names of the metrics recorded in the training history
print(history.history.keys())

Nu viser vi læringskurven og loss-kurven for at evaluere vores model:

In [None]:
# Per-epoch metrics recorded by model.fit
acc = history.history['sparse_categorical_accuracy']
val_acc = history.history['val_sparse_categorical_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

# Accuracy curves on their own figure, drawn through an explicit Axes object
# and with labelled axes so the figure can be read on its own.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(epochs, acc, 'c-', label='Training sparse categorical accuracy')
acc_ax.plot(epochs, val_acc, 'y-', label='Validation sparse categorical accuracy')
acc_ax.set(title='Training and validation accuracy',
           xlabel='Epoch',
           ylabel='Sparse categorical accuracy')
acc_ax.legend()

# Loss curves on a second figure
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(epochs, loss, 'c-', label='Training Loss')
loss_ax.plot(epochs, val_loss, 'y-', label='Validation Loss')
loss_ax.set(title='Training and validation loss',
            xlabel='Epoch',
            ylabel='Loss')
loss_ax.legend()

plt.show()

In [None]:
# Draw the model's layer graph, including the tensor shapes
plot_model(model, show_shapes = True)

# Resultathistorik:

* ResNet50 - image_size = (512, 512), batch_size = 128 - 0.6901


### Med Callbacks/flere augmentationer:


* ResNet50 - image_size = (512, 512), batch_size = 128 - 0.7169
* EfficientNetB3 - image_size = (512, 512), batch_size = 128 - 0.61584
* ResNet50 - Ekstra augmentation (ændring af contrast + brightness) - image_size = (512, 512), batch_size = 128 - 0.6486 - endnu værre.


### Skiftet train/test split til 15% + Accuracy metric i stedet for sparse categorical accuracy: 

* EfficientNetB3 - image_size = (512, 512), batch_size = 128 - 0.61968