In [None]:
import numpy as np 
import pandas as pd 

import os
# Traverse the Kaggle input mount. The per-file listing is disabled, so the
# loop body does nothing and no output is produced.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        pass


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout 
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam

from sklearn.metrics import classification_report,confusion_matrix
from kaggle_datasets import KaggleDatasets
import tensorflow as tf
import pathlib

import cv2
import os
import math
import re

In [None]:
# Detect and initialise the TPU; fall back to TensorFlow's default strategy
# when no TPU can be set up.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # Best-effort setup: any failure above means "run without a TPU".
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    # `tpu` is reset so the report below never touches an unbound or
    # half-initialised resolver.
    tpu = None
    tpu_strategy = tf.distribute.get_strategy()
    print("TPU not available, using the default strategy:", tpu_error)

# Only query the master address when a TPU resolver actually exists.
print("Device:", tpu.master() if tpu is not None else "default strategy (no TPU)")
tpu_strategy.num_replicas_in_sync

In [None]:
# Let tf.data pick parallelism / prefetch depth at runtime (two aliases of the same constant).
AUTOTUNE = tf.data.experimental.AUTOTUNE
AUTO = AUTOTUNE

# Target image size used when decoding; matches the 512x512 TFRecord variant selected below.
IMAGE_SIZE = [512, 512]

# GCS location of the 'tpu-getting-started' dataset as reported by KaggleDatasets.
GCS_DS_PATH = KaggleDatasets().get_gcs_path('tpu-getting-started')
GCS_PATH = f"{GCS_DS_PATH}/tfrecords-jpeg-512x512"

# TFRecord shards for each split.
TRAINING_FILENAMES = tf.io.gfile.glob(f"{GCS_PATH}/train/*.tfrec")
VALIDATION_FILENAMES = tf.io.gfile.glob(f"{GCS_PATH}/val/*.tfrec")
TEST_FILENAMES = tf.io.gfile.glob(f"{GCS_PATH}/test/*.tfrec")

# Global batch size: 16 examples per replica.
BATCH_SIZE = tpu_strategy.num_replicas_in_sync * 16

In [None]:
def decode_image(image_data):
    """Decode a JPEG byte string into a float32 RGB image of size ``IMAGE_SIZE``.

    Args:
        image_data: scalar string tensor holding the encoded JPEG bytes.

    Returns:
        A float32 tensor of shape ``[*IMAGE_SIZE, 3]`` with pixel values
        divided by 255.
    """
    target_size = list(IMAGE_SIZE)
    decoded = tf.image.decode_jpeg(image_data, channels=3)
    # Resize to the dimensions needed by the downstream model.
    resized = tf.image.resize(decoded, target_size)
    scaled = tf.cast(resized, tf.float32) / 255.0
    # Explicit reshape fixes the static shape of the returned tensor.
    return tf.reshape(scaled, target_size + [3])

def read_labeled_tfrecord(example):
    """Parse one serialized labeled example into an ``(image, label)`` pair.

    Args:
        example: serialized ``tf.train.Example`` with an ``"image"`` byte-string
            feature and an int64 ``"class"`` feature.

    Returns:
        Tuple of the decoded image (see ``decode_image``) and the class id
        cast to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string means bytestring
        "class": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed["image"]), tf.cast(parsed["class"], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled example into an ``(image, id)`` pair.

    Args:
        example: serialized ``tf.train.Example`` with ``"image"`` and ``"id"``
            byte-string features.

    Returns:
        Tuple of the decoded image (see ``decode_image``) and the raw id
        string tensor.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "id": tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed["image"]), parsed["id"]

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a parsed ``tf.data`` dataset from TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labeled: parse with ``read_labeled_tfrecord`` when True, otherwise
            with ``read_unlabeled_tfrecord``.
        ordered: when False, deterministic ordering is switched off so records
            may be returned in any order.

    Returns:
        A dataset of ``(image, label)`` or ``(image, id)`` pairs.
    """
    parse_fn = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord

    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False

    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
        .with_options(read_options)
        .map(parse_fn, num_parallel_calls=AUTO)
    )

In [None]:

def get_training_dataset():
    """Return the infinite, shuffled, batched training dataset.

    The stream repeats forever, so callers must bound an epoch themselves
    (e.g. with ``steps_per_epoch``).
    """
    # NOTE: an augmentation step (`dataset.map(data_augment, ...)`) is currently disabled.
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .repeat()                    # repeats for several epochs
        .shuffle(buffer_size=2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)              # prefetch next batch while training
    )

def get_validation_dataset(ordered=False):
    """Return the batched, cached validation dataset.

    Args:
        ordered: forwarded to ``load_dataset``; True keeps records in
            deterministic order.
    """
    return (
        load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )

def get_test_dataset(ordered = False):
    """Return the batched test dataset of ``(image, id)`` pairs.

    Args:
        ordered: forwarded to ``load_dataset``; True keeps records in
            deterministic order.
    """
    return (
        load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

def count_data_items(filenames):
    """Sum the item counts embedded in TFRecord file names.

    Each name is expected to contain ``-<digits>.``; the digits of the first
    such occurrence are taken as that file's item count.

    Args:
        filenames: iterable of file name strings.

    Returns:
        ``np.sum`` over the per-file counts.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    per_file_counts = []
    for path in filenames:
        per_file_counts.append(int(count_pattern.search(path).group(1)))
    return np.sum(per_file_counts)

# Item counts for each split, parsed from the TFRecord file names.
fTrainImages, fValidationImages, fTestImages = (
    count_data_items(split_files)
    for split_files in (TRAINING_FILENAMES, VALIDATION_FILENAMES, TEST_FILENAMES)
)
print(f"{fTrainImages} training images, {fValidationImages} validation images, {fTestImages} test images ")


In [None]:
# Global batch size: 16 examples per replica.
BATCH_SIZE = tpu_strategy.num_replicas_in_sync * 16

# Instantiate the three input pipelines (built in train, validation, test order).
train_ds, val_ds, test_ds = (
    get_training_dataset(),
    get_validation_dataset(),
    get_test_dataset(),
)

# Show each dataset's repr as a quick sanity check.
print("Training: ", train_ds)
print("Validation: ", val_ds)
print("Test: ", test_ds)

In [None]:
def batch_to_numpy_images_and_labels(data):
    """Convert an ``(images, labels)`` batch of tensors to NumPy values.

    Args:
        data: pair of objects exposing ``.numpy()``.

    Returns:
        ``(numpy_images, numpy_labels)``. When the labels array has object
        dtype (presumably string ids from unlabeled data -- confirm), the
        labels are replaced by a list of ``None`` with one entry per image.
    """
    image_batch, label_batch = data
    numpy_images, numpy_labels = image_batch.numpy(), label_batch.numpy()
    if numpy_labels.dtype != object:
        return numpy_images, numpy_labels
    return numpy_images, [None] * len(numpy_images)

def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw one image into the current figure and advance the subplot position.

    Args:
        image: image array accepted by ``plt.imshow``.
        title: title text; nothing is drawn when it is empty.
        subplot: ``(rows, cols, index)`` triple passed to ``plt.subplot``.
        red: draw the title in red at a slightly smaller size when True.
        titlesize: base font size of the title.

    Returns:
        The ``(rows, cols, index + 1)`` triple for the next image.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    if len(title) > 0:
        if red:
            font_size, title_color = int(titlesize / 1.2), 'red'
        else:
            font_size, title_color = int(titlesize), 'black'
        plt.title(
            title,
            fontsize=font_size,
            color=title_color,
            fontdict={'verticalalignment': 'center'},
            pad=int(titlesize / 1.5),
        )
    n_rows, n_cols, position = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, position + 1)

def display_batch_of_images(databatch, predictions=None):
    """Plot a batch of images in a near-square grid, titled with their labels.

    Args:
        databatch: ``(images, labels)`` batch accepted by
            ``batch_to_numpy_images_and_labels``.
        predictions: optional per-image predictions. When given, titles and
            correctness come from ``title_from_label_and_target``, which is
            not defined in the visible part of this notebook and must be
            supplied elsewhere.

    Returns:
        None. The figure is shown with ``plt.show()``. An empty batch is a
        no-op.
    """
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None] * len(images)

    # An empty batch would give rows == 0 and a ZeroDivisionError below
    # (and leave `label` unbound after the loop), so there is nothing to draw.
    if len(images) == 0:
        return

    # Largest grid with rows ~ sqrt(n); leftover images beyond rows*cols are dropped.
    rows = int(math.sqrt(len(images)))
    cols = len(images) // rows

    FIGSIZE = 13.0
    SPACING = 0.1
    subplot = (rows, cols, 1)
    # Keep the longer side of the figure at FIGSIZE and scale the other side.
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE / cols * rows))
    else:
        plt.figure(figsize=(FIGSIZE / rows * cols, FIGSIZE))

    # Title size shrinks as the grid gets denser.
    dynamic_titlesize = FIGSIZE * SPACING / max(rows, cols) * 40 + 3

    # display
    for i, (image, label) in enumerate(zip(images[:rows * cols], labels[:rows * cols])):
        title = '{}'.format(label)
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        subplot = display_one_flower(image, title, subplot, not correct,
                                     titlesize=dynamic_titlesize)

    plt.tight_layout()
    # `label` is the label of the last image drawn; the guard above guarantees
    # the loop ran at least once.
    if label is None and predictions is None:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()
               

In [None]:
# Re-batch the training stream into groups of 20 examples and keep an iterator over it.
ds_iter = iter(train_ds.unbatch().batch(20))

In [None]:
# Pull the next batch from the iterator and plot it; re-running this cell advances to another batch.
one_batch = next(ds_iter)
display_batch_of_images(one_batch)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Stop when validation loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Halve the learning rate on a validation-loss plateau, never going below 1e-5.
# NOTE(review): patience matches early_stopping's, so the reduction likely fires on the
# same epoch training stops -- confirm this is intended.
lrr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=1,
)

# The training dataset repeats forever, so an epoch is defined as one pass over the counted images.
STEPS_PER_EPOCH = fTrainImages // BATCH_SIZE

In [None]:
# Xception backbone (ImageNet weights, fully trainable) with a 104-way softmax head.
with tpu_strategy.scope():
    # decode_image() already scales pixels to [0, 1], whereas Xception's
    # preprocess_input expects values in [0, 255] and maps them to [-1, 1].
    # Scale back up before calling it so the backbone sees its expected range.
    img_adjust_layer = tf.keras.layers.Lambda(
        lambda data: tf.keras.applications.xception.preprocess_input(
            tf.cast(data, tf.float32) * 255.0
        ),
        input_shape=[*IMAGE_SIZE, 3],
    )
    xce_pretrained_model = tf.keras.applications.Xception(weights='imagenet', include_top=False)
    xce_pretrained_model.trainable = True  # fine-tune the whole backbone
    model = tf.keras.Sequential([
        img_adjust_layer,  # was defined but never wired into the model
        xce_pretrained_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(104, activation='softmax')
    ])

In [None]:
# The model's final Dense layer applies softmax, so it outputs probabilities,
# not logits. from_logits must therefore be False; with True the loss would
# apply a second softmax on top of the probabilities.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the Xception model; steps_per_epoch bounds each epoch because train_ds repeats forever.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
    steps_per_epoch=STEPS_PER_EPOCH,
    callbacks=[early_stopping, lrr],
)

In [None]:
# Small from-scratch CNN: three 3x3 conv layers (two followed by 2x2 max-pooling),
# then a flattened 104-way softmax output.
with tpu_strategy.scope():
    model2 = tf.keras.Sequential([
        tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu',
                               input_shape=(512, 512, 3)),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(units=104, activation='softmax'),
    ])
   

In [None]:
# model2 ends in a softmax Dense layer, so it outputs probabilities, not
# logits. from_logits must therefore be False; with True the loss would
# apply a second softmax on top of the probabilities.
model2.compile(optimizer='adam',
               loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
               metrics=['accuracy'])

model2.summary()

In [None]:
# Train the small CNN with the same callbacks and epoch length as the first model.
model2.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    steps_per_epoch=STEPS_PER_EPOCH,
    callbacks=[early_stopping, lrr],
)

In [None]:
# Read the test set in deterministic order so that the two passes below
# (images for prediction, ids for the CSV) line up row by row.
test_ds = get_test_dataset(ordered=True)

# Pass 1: class probabilities from `model`, reduced to the most likely class id.
test_images_ds = test_ds.map(lambda image, idnum: image)
proba = model.predict(test_images_ds)
predictions = proba.argmax(axis=-1)

# Pass 2: collect every id in a single batch and convert the byte strings to unicode.
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
test_ids = next(iter(test_ids_ds.batch(fTestImages))).numpy().astype('U')

# Write the two-column submission file.
np.savetxt(
    'submission.csv',
    np.rec.fromarrays([test_ids, predictions]),
    fmt=['%s', '%d'],
    delimiter=',',
    header='id,label',
    comments='',
)