In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
 #   for filename in filenames:
  #      print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import re
import json
from collections import Counter

from sklearn.model_selection import train_test_split

import tensorflow as tf
from functools import partial
from kaggle_datasets import KaggleDatasets
# Report the TensorFlow release this notebook is running against.
print(f"Tensorflow version {tf.__version__}")

In [None]:
# Pick a distribution strategy: use the TPU when one can be reached, otherwise
# fall back to whatever strategy TensorFlow currently provides by default.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("Device:", tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated in favour of this class.
    strategy = tf.distribute.TPUStrategy(tpu)
except Exception as tpu_error:  # a bare `except:` would also swallow KeyboardInterrupt
    # Still a best-effort fallback, but report the reason so that a
    # misconfigured TPU does not go unnoticed.
    print("TPU not available, using the default strategy:", tpu_error)
    strategy = tf.distribute.get_strategy()
print("Number of replicas:", strategy.num_replicas_in_sync)

In [None]:
# Directory of the competition data; its listing is shown as the cell output.
path = '/kaggle/input/hpa-single-cell-image-classification/'
os.listdir(path)

In [None]:
# Look up the GCS path of the attached dataset via kaggle_datasets; the
# TFRecord file globs further down are built from this path.
path_gcs = KaggleDatasets().get_gcs_path()
print(path_gcs)

In [None]:
# Sentinel that lets tf.data pick parallelism / buffer sizes itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Global batch size: 16 examples for each replica of the strategy.
BATCH_SIZE = 16*strategy.num_replicas_in_sync
# Size passed to tf.image.resize in decode_image.
IMAGE_SIZE = [256, 256]

In [None]:
# Split the training TFRecord files 80/20 into train and validation files
# (fixed seed), and collect the test TFRecord files.
train_filenames, val_filenames = train_test_split(
    tf.io.gfile.glob(f'{path_gcs}/train_tfrecords/*.tfrec'),
    random_state=2020,
    test_size=0.20,
)
test_filenames = tf.io.gfile.glob(f'{path_gcs}/test_tfrecords/*.tfrec')

In [None]:
# Sanity check: take the first raw record of the training files and parse it
# into a tf.train.Example proto, left in `example` for manual inspection.
raw_dataset = tf.data.TFRecordDataset(train_filenames)
for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())

In [None]:
def number_of_files(filenames):
    """Sum the counts encoded in the given file names.

    Each file name is expected to carry a count as the digits between a ``-``
    and the following ``.`` (e.g. ``train00-1000.tfrec`` contributes 1000).

    Args:
        filenames: iterable of file paths (local or ``gs://``).

    Returns:
        The total as a NumPy integer; 0 for an empty iterable.

    Raises:
        ValueError: if a file name contains no ``-<digits>.`` count.
    """
    # Compiled once instead of once per file; `+` rejects an empty digit run,
    # which would otherwise end up as int('').
    count_pattern = re.compile(r"-([0-9]+)\.")

    counts = []
    for filename in filenames:
        # Only inspect the last path component so that a directory name such
        # as "run-3.1/" cannot be mistaken for the count.
        match = count_pattern.search(os.path.basename(filename))
        if match is None:
            raise ValueError(f"No '-<count>.' pattern in file name: {filename!r}")
        counts.append(int(match.group(1)))
    # Explicit dtype keeps the result an integer even when `counts` is empty.
    return np.sum(counts, dtype=np.int64)

def decode_image(image):
    """Decode JPEG bytes into a float32 image tensor scaled by 1/255.

    The image is decoded with 3 channels, resized to ``IMAGE_SIZE`` and
    reshaped to the static shape ``[*IMAGE_SIZE, 3]``.
    """
    height_width = [*IMAGE_SIZE]
    pixels = tf.image.decode_jpeg(image, channels=3)
    pixels = tf.image.resize(pixels, height_width)
    scaled = tf.cast(pixels, tf.float32) / 255.
    return tf.reshape(scaled, height_width + [3])

def read_tfrecord(example, labeled):
    """Parse one serialized example into an (image, second element) pair.

    Args:
        example: serialized example holding an "image" string feature plus
            either a "target" (labeled) or an "image_name" (unlabeled) feature.
        labeled: selects which second feature is parsed and returned.

    Returns:
        ``(image, label)`` when ``labeled`` is true, where ``label`` is the sum
        of the depth-19 one-hot encodings of the '|'-separated class ids in
        "target"; otherwise ``(image, image_name)``.
    """
    string_feature = tf.io.FixedLenFeature([], tf.string)
    if labeled:
        feature_spec = {"image": string_feature, "target": string_feature}
    else:
        feature_spec = {"image": string_feature, "image_name": string_feature}

    parsed = tf.io.parse_single_example(example, feature_spec)
    image = decode_image(parsed["image"])

    if not labeled:
        return image, parsed['image_name']

    class_ids = tf.strings.to_number(
        tf.strings.split(parsed["target"], '|'), tf.int32
    )
    # One row per class id, then collapse the rows into a single vector.
    encoded = tf.reduce_sum(tf.one_hot(class_ids, depth=19), axis=0)
    return image, encoded


def load_dataset(filenames, labeled=True, ordered=False):
    """Build an unbatched dataset of decoded examples from TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labeled: forwarded to ``read_tfrecord``; elements are (image, label)
            pairs when true and (image, image_name) pairs otherwise.
        ordered: when false, deterministic ordering is switched off and the
            files are read in parallel for throughput; when true, the files
            are read one after another.

    Returns:
        A ``tf.data.Dataset`` with one parsed element per record.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False  # disable order, increase speed

    # Without `num_parallel_reads` the files are read sequentially. Parallel
    # reading interleaves records of different files, so it is only enabled
    # when the caller does not ask for ordered output.
    dataset = tf.data.TFRecordDataset(
        filenames, num_parallel_reads=None if ordered else AUTOTUNE
    )
    dataset = dataset.with_options(options)
    dataset = dataset.map(
        partial(read_tfrecord, labeled=labeled), num_parallel_calls=AUTOTUNE
    )
    return dataset


def get_train_dataset(filenames, labeled=True, ordered=False):
    """Return the repeating, shuffled and batched training dataset.

    The dataset repeats indefinitely, so a consumer such as ``model.fit``
    has to bound each epoch itself (e.g. with ``steps_per_epoch``).

    Args:
        filenames: TFRecord file paths to read.
        labeled: forwarded to ``load_dataset``.
        ordered: forwarded to ``load_dataset``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` elements.
    """
    dataset = load_dataset(filenames, labeled=labeled, ordered=ordered)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(2021)  # shuffle buffer of 2021 elements
    dataset = dataset.batch(BATCH_SIZE)
    # Prefetch as the last step so that complete batches, not single
    # elements, are prepared while the previous batch is being consumed.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    return dataset

def get_val_dataset(filenames, labeled=True, ordered=False):
    """Return the batched validation dataset (single pass, no shuffling).

    Args:
        filenames: TFRecord file paths to read.
        labeled: forwarded to ``load_dataset``.
        ordered: forwarded to ``load_dataset``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of up to ``BATCH_SIZE`` elements.
    """
    dataset = load_dataset(filenames, labeled=labeled, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE)
    # Prefetch as the last step so that complete batches, not single
    # elements, are prepared while the previous batch is being consumed.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    return dataset

def get_test_dataset(filenames, labeled=False, ordered=True):
    """Return the batched test dataset (single pass, ordered by default).

    With the default ``labeled=False`` the elements are (image, image_name)
    pairs.

    Args:
        filenames: TFRecord file paths to read.
        labeled: forwarded to ``load_dataset``.
        ordered: forwarded to ``load_dataset``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of up to ``BATCH_SIZE`` elements.
    """
    dataset = load_dataset(filenames, labeled=labeled, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE)
    # Prefetch as the last step so that complete batches, not single
    # elements, are prepared while the previous batch is being consumed.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    return dataset

def show_batch(image_batch, label_batch):
    """Plot up to 16 images of a batch on a 4x4 grid.

    Args:
        image_batch: sized, indexable batch of images that ``plt.imshow``
            can draw.
        label_batch: labels of the batch; kept in the signature for callers
            but not drawn.
    """
    plt.figure(figsize=(20, 20))
    # A batch may hold fewer than 16 images (e.g. a final partial batch), so
    # never index past its end.
    for n in range(min(16, len(image_batch))):
        plt.subplot(4, 4, n + 1)
        plt.imshow(image_batch[n])
        plt.axis("off")

In [None]:
# How many TFRecord files ended up in each split.
print(f'Number of train tfrec files: {len(train_filenames)}')
print(f'Number of val tfrec files: {len(val_filenames)}')
print(f'Number of test tfrec files: {len(test_filenames)}')

In [None]:
# Totals of the counts encoded in the file names of each split.
print(f'Number Files train: {number_of_files(train_filenames)}')
print(f'Number Files val: {number_of_files(val_filenames)}')
print(f'Number Files test: {number_of_files(test_filenames)}')

In [None]:
# Build the three input pipelines; the keyword arguments spell out the
# defaults of each helper.
train_dataset = get_train_dataset(train_filenames, labeled=True, ordered=False)
val_dataset = get_val_dataset(val_filenames, labeled=True, ordered=False)
test_dataset = get_test_dataset(test_filenames, labeled=False, ordered=True)

In [None]:
# Show the textual description of each dataset, one per line.
print(train_dataset, val_dataset, test_dataset, sep='\n')

In [None]:
# Pull the first batch of the validation dataset and plot its images.
image_batch, label_batch = next(iter(val_dataset))
show_batch(image_batch, label_batch)

In [None]:
# Multi-label AUC, reported under the name 'auc'. Both globals below are read
# by make_model() when it compiles the model.
metrics = [tf.keras.metrics.AUC(name='auc', multi_label=True)]
learning_rate = 1e-3

In [None]:
def make_model():
    """Build and compile a ResNet50-based classifier with 19 sigmoid outputs.

    Reads the module-level ``IMAGE_SIZE``, ``learning_rate`` and ``metrics``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    backbone = tf.keras.applications.ResNet50(
        include_top=False,
        weights='imagenet',
        input_shape=[*IMAGE_SIZE, 3],
    )
    backbone.trainable = True  # the backbone weights are trained as well

    layers = [
        backbone,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(19, activation='sigmoid'),
    ]
    classifier = tf.keras.Sequential(layers)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    classifier.compile(
        loss="binary_crossentropy",
        optimizer=optimizer,
        metrics=metrics,
        steps_per_execution=32,
    )
    return classifier

In [None]:
# Build and compile the model inside the scope of the selected distribution
# strategy, then print its layer summary.
with strategy.scope():
    model = make_model()

model.summary()


In [None]:
# Train for 5 epochs. get_train_dataset() repeats its data indefinitely, so
# steps_per_epoch bounds each epoch: total count from the train file names,
# integer-divided by the batch size.
history = model.fit(train_dataset,
                    epochs=5,
                    validation_data = val_dataset,
                    steps_per_epoch = number_of_files(train_filenames)//BATCH_SIZE)

In [None]:
# Save the trained model to ./model, with the I/O device set to
# '/job:localhost'.
# NOTE(review): presumably needed so a TPU-trained model can be written to the
# notebook's local disk - confirm.
save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
model.save('./model', options=save_locally)

In [None]:
# Reload the model saved under ./model, again with the I/O device set to
# '/job:localhost'. This rebinds the global `model`.
import tensorflow as tf
load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
model = tf.keras.models.load_model('./model', options=load_locally) # loading in Tensorflow's "SavedModel" format

In [None]:
# Training curves per epoch: loss on the left panel, AUC on the right panel.
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
fig.subplots_adjust(hspace = .2, wspace=.2)
axs = axs.ravel()

loss = history.history['loss']
loss_val = history.history['val_loss']
epochs = range(1, len(loss)+1)  # epochs are numbered from 1 on the x axis
axs[0].plot(epochs, loss, 'bo', label='loss_train')
axs[0].plot(epochs, loss_val, 'ro', label='loss_val')
axs[0].set_title('Value of the loss function')
axs[0].set_xlabel('epochs')
axs[0].set_ylabel('value of the loss function')
axs[0].legend()
axs[0].grid()

# The recorded metric is AUC (history keys 'auc' / 'val_auc'), not accuracy,
# so the panel is labelled as AUC. The variable names are left unchanged.
acc = history.history['auc']
acc_val = history.history['val_auc']
axs[1].plot(epochs, acc, 'bo', label='auc_train')
axs[1].plot(epochs, acc_val, 'ro', label='auc_val')
axs[1].set_title('AUC')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Value of AUC')
axs[1].legend()
axs[1].grid()
plt.show()