# PREPROCESSING

# SET UP
**Importing necessary modules**

In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
import os
import re
from tensorflow import keras
import tensorflow_datasets as tfds
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Switch TensorFlow to TF1-style graph mode. BatchGen() further down reads
# records through a tf.compat.v1.Session, which needs eager execution off.
tf.compat.v1.disable_eager_execution()

In [None]:
# Sanity check: shows (as the cell output) whether eager execution is enabled.
tf.executing_eagerly()

# XLA GPU
Use the current `tf.distribute` strategy and enable XLA JIT compilation.

In [None]:
# Fetch the current tf.distribute strategy; get_model() builds the network
# inside strategy.scope().
strategy = tf.distribute.get_strategy()
# Turn on the JIT setting of the graph optimizer (the XLA part of this section).
tf.config.optimizer.set_jit(True)
    
print(strategy)

# FOLDERS AND CLASS LABELS

In [None]:
# Human-readable flower names; the position in the list is the integer class id
# stored in the TFRecords (104 classes). Names carry no surrounding whitespace so
# they render cleanly when used as report labels.
CLASSES = ['pink primrose',    'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea',     'wild geranium',     'tiger lily',           'moon orchid',              'bird of paradise', 'monkshood',        'globe thistle',         # 00 - 09
           'snapdragon',       "colt's foot",               'king protea',      'spear thistle', 'yellow iris',       'globe-flower',         'purple coneflower',        'peruvian lily',    'balloon flower',   'giant white arum lily', # 10 - 19
           'fire lily',        'pincushion flower',         'fritillary',       'red ginger',    'grape hyacinth',    'corn poppy',           'prince of wales feathers', 'stemless gentian', 'artichoke',        'sweet william',         # 20 - 29
           'carnation',        'garden phlox',              'love in the mist', 'cosmos',        'alpine sea holly',  'ruby-lipped cattleya', 'cape flower',              'great masterwort', 'siam tulip',       'lenten rose',           # 30 - 39
           'barberton daisy',  'daffodil',                  'sword lily',       'poinsettia',    'bolero deep blue',  'wallflower',           'marigold',                 'buttercup',        'daisy',            'common dandelion',      # 40 - 49
           'petunia',          'wild pansy',                'primula',          'sunflower',     'lilac hibiscus',    'bishop of llandaff',   'gaura',                    'geranium',         'orange dahlia',    'pink-yellow dahlia',    # 50 - 59
           'cautleya spicata', 'japanese anemone',          'black-eyed susan', 'silverbush',    'californian poppy', 'osteospermum',         'spring crocus',            'iris',             'windflower',       'tree poppy',            # 60 - 69
           'gazania',          'azalea',                    'water lily',       'rose',          'thorn apple',       'morning glory',        'passion flower',           'lotus',            'toad lily',        'anthurium',             # 70 - 79
           'frangipani',       'clematis',                  'hibiscus',         'columbine',     'desert-rose',       'tree mallow',          'magnolia',                 'cyclamen',         'watercress',       'canna lily',            # 80 - 89
           'hippeastrum',      'bee balm',                  'pink quill',       'foxglove',      'bougainvillea',     'camellia',             'mallow',                   'mexican petunia',  'bromelia',         'blanket flower',        # 90 - 99
           'trumpet creeper',  'blackberry lily',           'common tulip',     'wild rose']                                                                                                                                             # 100 - 103

In [None]:
# Image resolutions to choose from; IMAGE_SIZE picks one of them by index.
IMAGE_SIZES = [(192, 192), (224, 224), (331, 331), (512, 512)]

In [None]:
# Working resolution: entry 1 of IMAGE_SIZES, i.e. (224, 224).
IMAGE_SIZE = IMAGE_SIZES[1]

In [None]:
# Dataset root. Folder names are appended to it by plain string concatenation,
# so the trailing "/" is required.
stem = "/kaggle/input/tpu-getting-started/"

In [None]:
# One path per entry of the dataset root, skipping entries whose name includes "contains".
FOLDER_PATHS = [stem + str(item) for item in os.listdir("/kaggle/input/tpu-getting-started") if "contains" not in str(item)]

# Remember the folder whose path mentions "224" (if several match, the last one wins).
# NOTE(review): PATH stays undefined when nothing matches, and no visible cell reads it.
for i, p in enumerate(FOLDER_PATHS):
    if "224" in str(p):
        PATH = FOLDER_PATHS[i]

Return all the files so they can be used in the dataset

In [None]:
def return_files(f):
    """Return the entries of directory ``f``, each prefixed with ``f`` and "/".

    Entries are returned in the order ``os.listdir`` yields them.
    """
    paths = []
    for entry in os.listdir(f):
        paths.append(f + "/" + entry)
    return paths

# Gather the file paths of every split from each dataset folder,
# skipping folders whose path mentions "sample".
train_files = []
test_files = []
val_files = []

for p in FOLDER_PATHS:
    if "sample" in str(p):
        continue

    train_folder = p + "/train"
    test_folder = p + "/test"
    val_folder = p + "/val"

    print(train_folder)
    train_files.extend(return_files(train_folder))
    test_files.extend(return_files(test_folder))
    val_files.extend(return_files(val_folder))


In [None]:
# Show the collected training file paths as the cell output.
train_files

**Define Image Reading Dictionaries**

In [None]:
# Feature specs handed to tf.io.parse_single_example.
# In a FixedLenFeature, shape [] means a single element and tf.string a bytestring.

# Labeled records: encoded image bytes plus an integer class id.
LABELED_TFREC_FORMAT = {
    "image": tf.io.FixedLenFeature([], tf.string),
    "class": tf.io.FixedLenFeature([], tf.int64),
}

# Unlabeled records: encoded image bytes plus a string id.
UNLABELED_TFREC_FORMAT = {
    "image": tf.io.FixedLenFeature([], tf.string),
    "id": tf.io.FixedLenFeature([], tf.string),
}

# DATA PROCESSING

Image Decoding and Reading Functions

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB image with values scaled to [0, 1].

    Args:
        image_data: scalar string tensor holding the encoded JPEG.

    Returns:
        A float32 tensor with 3 channels, resized (nearest neighbour, aspect
        ratio preserved) to fit within the notebook-level ``IMAGE_SIZE``.
    """
    image = tf.image.decode_jpeg(image_data, channels=3)
    # Resize to the configured IMAGE_SIZE instead of a hard-coded 224x224, so the
    # resolution constant at the top of the notebook actually takes effect.
    image = tf.image.resize(image, list(IMAGE_SIZE), method='nearest', preserve_aspect_ratio=True)
    # 'nearest' keeps the decoded integer dtype, so convert and rescale afterwards.
    image = tf.cast(image, tf.float32) / 255.0
    return image

def read_labeled_record(example_proto):
    """Parse one serialized labeled example into an ``(image, class id)`` pair.

    The image goes through ``decode_image``; the class id is returned as int64.
    """
    parsed = tf.io.parse_single_example(example_proto, LABELED_TFREC_FORMAT)
    return decode_image(parsed["image"]), tf.cast(parsed["class"], tf.int64)

def read_unlabeled_record(example_proto):
    """Parse one serialized unlabeled example into an ``(image, id)`` pair.

    Unlabeled records follow ``UNLABELED_TFREC_FORMAT``: they carry an "id"
    string instead of a "class" label.

    Args:
        example_proto: scalar string tensor with the serialized example.

    Returns:
        Tuple of the decoded image (see ``decode_image``) and the record's id
        as a string tensor.
    """
    ex = tf.io.parse_single_example(example_proto, UNLABELED_TFREC_FORMAT)
    # Decode the image bytes, not the whole parsed dict.
    img = decode_image(ex["image"])
    # The unlabeled schema has no "class" key; the identifier lives under "id".
    idnum = ex["id"]
    return img, idnum

Data augmentation function

In [None]:
def data_augment(image, label):
    """Randomly perturb one training image; the label passes through unchanged.

    Applies, in order: random horizontal flip, random vertical flip, random
    saturation scaling in [0, 2], and a random crop to the notebook-level
    ``IMAGE_SIZE``.

    Args:
        image: 3-channel image tensor, e.g. as produced by ``decode_image``.
        label: label tensor belonging to the image.

    Returns:
        Tuple ``(augmented image, label)``.
    """
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    image = tf.image.random_saturation(image, lower=0, upper=2)
    # Crop to the configured IMAGE_SIZE rather than a hard-coded 224; when the
    # image already has that size the crop leaves it unchanged.
    image = tf.image.random_crop(image, size=[*IMAGE_SIZE, 3])

    return image, label

Counts number of data items in files

In [None]:
def count_data_items(filenames):
    """Sum the record counts encoded in TFRecord file names.

    Each name is expected to contain ``-<count>.`` (for example
    ``00-224x224-798.tfrec`` holds 798 records).

    Args:
        filenames: iterable of file names or paths.

    Returns:
        Total number of records as a NumPy int64 (0 for an empty iterable).

    Raises:
        ValueError: if a file name does not contain a ``-<count>.`` part.
    """
    # Compile once per call; '+' (not '*') skips a stray "-." that has no digits.
    count_pattern = re.compile(r"-([0-9]+)\.")
    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(f"Cannot read a record count from file name: {filename!r}")
        counts.append(int(match.group(1)))
    # An explicit integer dtype keeps the empty case at 0 instead of 0.0.
    return np.sum(counts, dtype=np.int64)

Loads a dataset of labeled records with `tf.data` (`TFRecordDataset`)

In [None]:
def load_dataset(filenames, augment):
    """Build a tf.data pipeline of decoded ``(image, label)`` pairs.

    Args:
        filenames: TFRecord file name(s) passed to ``tf.data.TFRecordDataset``.
        augment: when truthy, ``data_augment`` is mapped over the decoded records.

    Returns:
        The resulting ``tf.data.Dataset`` (unbatched).
    """
    # NOTE(review): the Options object is attached with all defaults; nothing
    # on it is changed, so it does not relax ordering as the name
    # "ignore order" would suggest.
    default_options = tf.data.Options()

    records = tf.data.TFRecordDataset(filenames).with_options(default_options)
    examples = records.map(read_labeled_record)
    if not augment:
        return examples

    return examples.map(data_augment, num_parallel_calls=AUTO)

In [None]:
# Dataset sizes, derived from the record counts embedded in the file names.
TRAIN_LEN = count_data_items(train_files)
VAL_LEN = count_data_items(val_files)
print(f"There are {TRAIN_LEN} training pictures.")
print(f"There are {VAL_LEN} validation pictures.")

# TRAINING

Define Batch Fetching function

In [None]:
def BatchGen(files, augment):
    """Read every record of the given TFRecord file(s) into Python lists.

    Builds the dataset with ``load_dataset`` and drains it through a TF1-style
    session, so it must run with eager execution disabled.

    Args:
        files: TFRecord file name(s) forwarded to ``load_dataset``.
        augment: whether ``load_dataset`` should apply ``data_augment``.

    Returns:
        Tuple ``(images, labels)`` of lists holding one NumPy value per record.

    Raises:
        Any TensorFlow error other than end-of-data (for example a corrupt
        record) now propagates instead of silently truncating the result.
    """
    data = load_dataset(files, augment)
    iterator = tf.compat.v1.data.make_one_shot_iterator(data)
    next_element = iterator.get_next()

    images = []
    labels = []

    with tf.compat.v1.Session() as s:
        while True:
            try:
                image, label = s.run(next_element)
            except tf.errors.OutOfRangeError:
                # Raised once the one-shot iterator is exhausted: normal end of data.
                break
            images.append(image)
            labels.append(label)

    return images, labels

Some more constants

In [None]:
# CLASS_WEIGHT: placeholder only, never assigned.
# NOTE(review): EPOCHS is not read by any visible cell; the training loop passes epochs=25 directly.
EPOCHS=5
# Lets tf.data pick the parallelism level; load_dataset() uses it for the augmentation map.
AUTO = tf.data.experimental.AUTOTUNE
# Learning rate given to Adam; scheduler() also derives its rate for epochs < 10 from it.
LEARNING_RATE = 0.000051

## Defining the Model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

In [None]:
from tensorflow.keras import layers

In [None]:
def get_model():
    """Build the DenseNet201-based flower classifier inside ``strategy.scope()``.

    The ImageNet-pretrained backbone (without its classification top) is
    followed by global average pooling, a ReLU dense layer of
    ``10 * len(CLASSES)`` units and a softmax over ``len(CLASSES)`` classes,
    with light dropout before each dense layer.

    Returns:
        The uncompiled ``tf.keras.Sequential`` model.
    """
    with strategy.scope():
        base_model = tf.keras.applications.DenseNet201(
            include_top=False,
            weights="imagenet",
            input_shape=[None, None, 3],
        )

        # Freeze the whole backbone, then flag only its last two layers as trainable.
        # NOTE(review): base_model.trainable itself stays False, and a Keras
        # container whose own flag is False reports no trainable weights, so
        # this partial unfreeze may have no effect on training; confirm the intent.
        base_model.trainable = False
        for layer in base_model.layers[-2:]:
            layer.trainable = True

        n_classes = len(CLASSES)

        return tf.keras.Sequential([
            base_model,
            layers.GlobalAveragePooling2D(),
            layers.Dropout(0.075),
            layers.Dense(n_classes * 10, activation='relu'),
            layers.Dropout(0.075),
            layers.Dense(n_classes, activation='softmax')
        ])


# The model is bound here by the caller rather than through a `global`
# statement hidden inside the builder.
model = get_model()

## Compiling the model and initializing callbacks

In [None]:
# Sparse loss/metric: labels are integer class ids, not one-hot vectors.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss='sparse_categorical_crossentropy',
    metrics=["sparse_categorical_accuracy"],
    run_eagerly=False,
)

In [None]:
# NOTE: despite its name this is an EarlyStopping callback, not TensorBoard.
# The name is kept because the training loop refers to it.
# It stops a fit() call once the training accuracy has not improved for 2 epochs.
tensorboard_callback = tf.keras.callbacks.EarlyStopping(
    monitor="sparse_categorical_accuracy",
    mode="auto",
    patience=2,
    min_delta=0,
    baseline=None,
    restore_best_weights=False,
    verbose=1,
)

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule for ``LearningRateScheduler``.

    Epochs 0-9 use a fixed rate of 1.1 times ``LEARNING_RATE``; after that the
    current rate ``lr`` is multiplied by 0.9 (epochs 10-19) or 0.8 (epoch 20+).
    """
    if epoch < 10:
        return LEARNING_RATE*1.1

    decay = 0.9 if epoch < 20 else 0.8
    return lr * decay

# Keras callback that asks scheduler() for the learning rate at the start of each epoch.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=0)

## Training Loop

In [None]:
#To run the 64 files multiple times
for t in range(10):
    print("Iteration: ", t)
    for i, file in enumerate(train_files):
        train_x, train_y = BatchGen(file, augment=True)
        print("File Number: ", i, "Number of Records: ", len(train_x))
        train_x, train_y = np.asarray(train_x), np.asarray(train_y)
                
        model.fit(train_x, train_y, epochs=25, 
                  batch_size=24, 
                  verbose=1, shuffle=False,
              callbacks=[tensorboard_callback, lr_scheduler])

print("Model fit completed!!")

Train the model

# VALIDATION

Validation loop that predicts a class for every image in each validation file

In [None]:
# Predict each validation file separately (no augmentation) and accumulate
# the predicted class ids alongside the true labels.
pred_arr = []
label_arr = []
for file in val_files:
    val_x, val_y = BatchGen(file, False)
    val_x = np.asarray(val_x)

    # Index of the highest softmax score is the predicted class id.
    pred = np.argmax(model.predict(val_x), axis=-1)

    pred_arr.extend(pred)
    label_arr.extend(val_y)

## Accuracy Analysis

In [None]:
from sklearn.metrics import confusion_matrix, precision_score

In [None]:
# Number of predictions collected; should equal the number of labels printed next.
print(len(pred_arr))

In [None]:
# Number of true labels collected from the validation files.
print(len(label_arr))

In [None]:
# Confusion matrix with true labels first and predictions second; shown as the cell output.
confusion_matrix(label_arr, pred_arr)

In [None]:
# Pass the full range of class ids explicitly so the rows always line up with
# CLASSES. Without `labels`, a class that appears in neither the labels nor the
# predictions makes target_names mismatch and the call fail.
arr = classification_report(
    label_arr,
    pred_arr,
    labels=list(range(len(CLASSES))),
    target_names=CLASSES,
)

In [None]:
# The report is a preformatted string, so print() is needed to keep its line breaks.
print(arr)