In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree. The loop body is currently a no-op:
# the print below is disabled, so nothing is listed. Re-enable it to print
# the full path of every available input file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from kaggle_datasets import KaggleDatasets

In [None]:
# Let tf.data pick parallelism levels on its own.
AUTO = tf.data.experimental.AUTOTUNE

# Options object with deterministic ordering switched off. It is only
# created here; nothing in this cell attaches it to a dataset.
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False

# GCS path of the attached dataset, as reported by KaggleDatasets.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Resolve the TFRecord shards of each split. The `*` inside `512*512` is
# interpreted by glob as a wildcard.
train_filenames, val_filenames, test_filenames = (
    tf.io.gfile.glob(GCS_DS_PATH + pattern)
    for pattern in (
        '/tfrecords-jpeg-512*512/train/*.tfrec',
        '/tfrecords-jpeg-512*512/val/*.tfrec',
        '/tfrecords-jpeg-512*512/test/*.tfrec',
    )
)

# Open every split as a stream of raw (still serialized) records.
raw_train_dataset, raw_val_dataset, raw_test_dataset = (
    tf.data.TFRecordDataset(shards, num_parallel_reads=AUTO)
    for shards in (train_filenames, val_filenames, test_filenames)
)

In [None]:

# Decode the first raw test record into a tf.train.Example so that its
# feature names and types can be inspected; uncomment the print to dump it.
for raw_record in raw_test_dataset.take(1):
  example = tf.train.Example()
  example.ParseFromString(raw_record.numpy())
#   print(example)

In [None]:
# Turn off jedi for the IPython completer in this kernel.
%config Completer.use_jedi = False

#detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# instantiate a distribution strategy
# NOTE(review): this is the `experimental` TPUStrategy entry point; newer
# TensorFlow releases also expose tf.distribute.TPUStrategy - confirm which
# one the target TF version expects.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)



# Cell output: how many replicas the strategy keeps in sync. The batch size
# and max_lr defined later are multiplied by this value.
tpu_strategy.num_replicas_in_sync

In [None]:
# Height and width that decode_image reshapes every decoded image to.
IMAGE_SIZE = (512, 512)
# Same value that the path-setup cell assigns to AUTO.
AUTO = tf.data.experimental.AUTOTUNE
# Global batch size: 16 examples for each replica in the TPU strategy.
batch_size = 16*tpu_strategy.num_replicas_in_sync


def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel image tensor of shape (*IMAGE_SIZE, 3).

    Pixel values are returned exactly as `tf.image.decode_jpeg` produces
    them: no float conversion, rescaling or flipping happens here.
    """
    decoded = tf.image.decode_jpeg(image_data, channels=3)
    # Reshaping to the known size gives the tensor a fully defined shape.
    target_shape = (*IMAGE_SIZE, 3)
    return tf.reshape(decoded, target_shape)


def parse_tfrecords(example):
    """Parse one serialized labeled record into an (image, label) pair.

    Reads the scalar features 'image' (JPEG bytes) and 'class' (int64),
    decodes the image with `decode_image` and casts the class to int32.
    """
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'class': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, schema)
    label = tf.cast(parsed['class'], tf.int32)
    return decode_image(parsed['image']), label

def parse_tfrecords_test(example):
    """Parse one serialized unlabeled record into an (image, id) pair.

    Reads the scalar string features 'image' (JPEG bytes) and 'id'; the
    image is decoded with `decode_image`, the id is returned unchanged.
    """
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'id': tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, schema)
    image_id = parsed['id']
    return decode_image(parsed['image']), image_id


def augment(image, label):
    """Randomly mirror `image` left-to-right; `label` passes through untouched."""
    flipped = tf.image.random_flip_left_right(image)
    return flipped, label


def load_dataset(filenames, labeled=True, as_supervised=True, augmented=True):
    """Build an unbatched dataset of decoded examples from TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labeled: set to False to parse records with `parse_tfrecords_test`,
            yielding (image, id) instead of (image, label).
        as_supervised: set to False to parse records with
            `parse_tfrecords_test`, same effect as `labeled=False`.
        augmented: whether to apply `augment` (random horizontal flip) to
            every example. Defaults to True; pass False for validation and
            test data so evaluation and inference see unmodified images.

    Returns:
        A `tf.data.Dataset` of (image, label) pairs when both `labeled` and
        `as_supervised` are True, otherwise of (image, id) pairs.
    """
    # Keep element order deterministic: the submission code reads the test
    # set in two separate passes (images for prediction, ids for the CSV)
    # and relies on both passes yielding the same order.
    order = tf.data.Options()
    order.experimental_deterministic = True
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    dataset = dataset.with_options(order)

    parse = parse_tfrecords if (labeled and as_supervised) else parse_tfrecords_test
    dataset = dataset.map(parse, num_parallel_calls=AUTO)
    if augmented:
        dataset = dataset.map(augment, num_parallel_calls=AUTO)
    return dataset


# Only the training split is augmented.
train_dataset = load_dataset(train_filenames)
train_dataset = train_dataset.batch(batch_size).prefetch(AUTO)

# Validation images stay unflipped so the metric is reproducible.
val_dataset = load_dataset(val_filenames, augmented=False)
val_dataset = val_dataset.batch(batch_size).prefetch(AUTO)

# Test examples are (image, id) pairs and are never augmented.
test_dataset = load_dataset(test_filenames, as_supervised=False, augmented=False)
test_dataset = test_dataset.batch(batch_size).prefetch(AUTO)

In [None]:
def display_one_flower(image, title):
    """Show `image` on the current matplotlib axes with `title` above it.

    The axis lines and ticks are hidden; the title is drawn at font size 16.
    """
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.imshow(image)

    
# Pull the first single example out of the batched training set (as NumPy
# values) and display it with its label as the title.
first_example = next(train_dataset.unbatch().as_numpy_iterator())
image, label = first_example
display_one_flower(image, title=label)

In [None]:
# Sanity check: print the image-batch shape, then the label-batch shape,
# for the first two training batches.
for image, label in train_dataset.take(2):
    for tensor in (image, label):
        print(tensor.numpy().shape)

In [None]:
# Layers, model and optimizer are all created inside the strategy scope so
# that the model is instantiated on the TPU.
with tpu_strategy.scope():
    # Alternative backbone (disabled):
    #     pretrained_model = tf.keras.applications.Xception(input_shape=(*IMAGE_SIZE, 3), include_top=False)

    # Input stage: cast the incoming pixels to float32 and apply the VGG16
    # preprocessing function, so the data pipeline can feed raw images.
    img_adjust_layer = tf.keras.layers.Lambda(
        lambda data: tf.keras.applications.vgg16.preprocess_input(tf.cast(data, tf.float32)),
        input_shape=(*IMAGE_SIZE, 3),
    )

    # ImageNet-pretrained VGG16 without its classifier head, kept frozen.
    pretrained_model = tf.keras.applications.VGG16(weights='imagenet', include_top=False)
    pretrained_model.trainable = False

    # Trainable head: global average pooling followed by a 104-way softmax.
    pooling = tf.keras.layers.GlobalAveragePooling2D()
    classifier = tf.keras.layers.Dense(104, activation='softmax')

    model = tf.keras.Sequential([img_adjust_layer, pretrained_model, pooling, classifier])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )

model.summary()

In [None]:
# Same completer setting as in the TPU-initialisation cell (jedi turned off).
%config Completer.use_jedi = False

In [None]:
def count_data_items(filenames):
    """Sum the example counts encoded in TFRecord file names.

    Each file name is expected to end in `-<count>.<extension>`: the text
    after the last '-' and before the first following '.' is read as the
    integer number of examples stored in that file.
    """
    per_file_counts = []
    for path in filenames:
        tail = os.path.basename(path).split('-')[-1]
        per_file_counts.append(int(tail.split('.')[0]))
    return np.sum(per_file_counts)
    

# Number of examples in each split, read from the shard file names.
n_train = count_data_items(train_filenames)
n_val = count_data_items(val_filenames)
n_test = count_data_items(test_filenames)

# Number of full batches in one pass over the training set.
# NOTE(review): no later cell visible here references train_steps.
train_steps = n_train // batch_size

# Cell output: the three split sizes.
n_train, n_val, n_test

In [None]:
# Number of training epochs passed to model.fit; also the range over which
# the schedule is plotted.
EPOCHS = 12

# Parameters read by lrfn:
# start_lr       - learning rate at epoch 0
# min_lr         - value the exponential decay approaches
# max_lr         - rate reached at the end of the ramp-up, scaled by the replica count
# rampup_epochs  - number of epochs of linear ramp from start_lr to max_lr
# sustain_epochs - shifts the point at which the decay starts
# exp_decay      - base of the per-epoch exponential decay
start_lr = 0.00001
min_lr = 0.00001
max_lr = 0.00005*tpu_strategy.num_replicas_in_sync
rampup_epochs = 5
sustain_epochs = 0
exp_decay = .8

def lrfn(epoch):
    """Return the learning rate for a (0-based) epoch index.

    The schedule has three phases, driven by the module-level settings
    `start_lr`, `max_lr`, `min_lr`, `rampup_epochs`, `sustain_epochs` and
    `exp_decay`:

    1. linear ramp from `start_lr` to `max_lr` over `rampup_epochs` epochs;
    2. `max_lr` held for `sustain_epochs` epochs;
    3. exponential decay from `max_lr` towards `min_lr`.
    """
    if epoch < rampup_epochs:
        return (max_lr - start_lr)/rampup_epochs * epoch + start_lr
    # Hold the peak rate during the sustain window. Without this branch the
    # decay formula would be evaluated with a negative exponent there and
    # return a rate above max_lr.
    if epoch < rampup_epochs + sustain_epochs:
        return max_lr
    return (max_lr - min_lr)*exp_decay**(epoch-rampup_epochs-sustain_epochs) + min_lr

# Preview the schedule: evaluate lrfn at every epoch index and plot it.
rang = np.arange(EPOCHS)
y = list(map(lrfn, rang))
plt.plot(rang, y)

# Keras callback that applies lrfn at the start of each epoch and logs the
# rate it sets.
lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule=lrfn, verbose=True)


In [None]:
# Print the labels of one real training batch. Iterating with take(1)
# fetches actual values; a print inside Dataset.map would only run while
# the function is traced and show a symbolic tensor, not label data.
for _images, labels in train_dataset.take(1):
    print(labels)

In [None]:
# Train the model. lr_callback is passed so that the lrfn schedule actually
# drives the optimizer's learning rate; without it the optimizer keeps its
# default rate for the whole run.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=[lr_callback],
)

In [None]:
# history.history

In [None]:
# Cell output: mean validation accuracy over the last five recorded epochs.
np.mean(history.history['val_sparse_categorical_accuracy'][-5:])

In [None]:
# Predict class ids for the test set. The ids are stripped so the model only
# receives images. Sequential.predict_classes is deprecated and absent from
# newer TensorFlow releases, so the class id is taken as the argmax over the
# softmax outputs returned by predict().
class_probabilities = model.predict(test_dataset.map(lambda img, lab: img))
prediction = np.argmax(class_probabilities, axis=-1)

In [None]:
# Keep only the id of every test example, one id per element.
img_ids = test_dataset.map(lambda img, lab: lab).unbatch()
# Gather the ids in a single batch of n_test elements and convert the byte
# strings to a NumPy array of unicode strings.
all_ids_batch = next(iter(img_ids.batch(n_test)))
test_ids = all_ids_batch.numpy().astype('U')

In [None]:
# Pair every test id with its predicted class and write the result to
# submission.csv without the index column.
submission = pd.DataFrame({'id': test_ids, 'label': prediction})
submission.to_csv('submission.csv', index=False)