# Intro
Welcome to the [Human Protein Atlas - Single Cell Classification](https://www.kaggle.com/c/hpa-single-cell-image-classification).

![](https://storage.googleapis.com/kaggle-competitions/kaggle/23823/logos/header.png)

The goal of this notebook is to give a short tutorial for the usage of TFRecords. We don't focus on optimization of the prediction model.

For a more general tutorial we recommend this notebook.

<span style="color: royalblue;">Please vote the notebook up if it helps you. Feel free to leave a comment above the notebook. Thank you. </span>

# Motivation
TFRecord files (.tfrec) use a binary format for storing sequences of values. The TFRecord format was developed by TensorFlow. One motivation for it is to feed Tensor Processing Units (TPUs) efficiently and thereby accelerate machine learning applications.

To take advantage of a TPU you have to enable it in your notebook:
1. Click on the notebook settings (upper right corner of the notebook).
2. Click on "Accelerator".
3. Choose TPU v3-8.
![](https://i.ibb.co/mHFPHpN/setting.png)

# Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import re
import json
from collections import Counter

from sklearn.model_selection import train_test_split

import tensorflow as tf
from functools import partial
from kaggle_datasets import KaggleDatasets
print("Tensorflow version " + tf.__version__)

# Set Up

In [None]:
# Detect a TPU and build the matching distribution strategy; fall back to the
# default strategy when no TPU can be reached.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("Device:", tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as err:
    # Deliberately best-effort: any TPU detection/initialisation failure falls
    # back to the default strategy. Unlike a bare ``except:`` this does not
    # swallow KeyboardInterrupt/SystemExit, and the reason is reported.
    print("TPU not available, using default strategy:", err)
    strategy = tf.distribute.get_strategy()
print("Number of replicas:", strategy.num_replicas_in_sync)

# Path

In [None]:
# Root directory of the competition data.
path = '/kaggle/input/hpa-single-cell-image-classification/'
# List the entries of that directory.
os.listdir(path)

There are 2 folders with tfrec-files: train_tfrecords, test_tfrecords.

Create the GCS path:

In [None]:
# Look up the GCS path of the competition dataset and show it.
path_gcs = KaggleDatasets().get_gcs_path('hpa-single-cell-image-classification')
print(path_gcs)

# Parameter

In [None]:
# Sentinel that lets tf.data pick parallelism / buffer sizes itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Global batch size: 16 samples for each replica of the active strategy.
BATCH_SIZE = 16*strategy.num_replicas_in_sync
# Target size that decode_image resizes every image to.
IMAGE_SIZE = [256, 256]

# Load Data

In [None]:
# Load the sample submission file from the competition directory.
samp_subm = pd.read_csv(path+'sample_submission.csv')

Define train, validation and test filenames:

In [None]:
# Collect every training shard, then hold out 20% of the shards for validation.
all_train_shards = tf.io.gfile.glob(path_gcs + '/train_tfrecords/*.tfrec')
train_filenames, val_filenames = train_test_split(
    all_train_shards,
    test_size=0.20,
    random_state=2020,
)

# The test shards are used as they are found.
test_filenames = tf.io.gfile.glob(path_gcs+'/test_tfrecords/*.tfrec')

# Key Names
First we have to extract the feature keys. To see the feature keys we have to execute the following code.

There are 3 feature keys for this dataset. We only show a section of the output here.
![](https://i.ibb.co/M7JKpsC/features-1.png)
![](https://i.ibb.co/YDk1Qkj/features-2.png)

In [None]:
# Parse the first raw record of the training shards into a tf.train.Example
# so that its feature keys can be inspected.
raw_dataset = tf.data.TFRecordDataset(train_filenames)
for raw_record in raw_dataset.take(1):
  example = tf.train.Example()
  example.ParseFromString(raw_record.numpy())
  # Uncomment to print the features of the record.
  #print(example.features)

# Functions

In [None]:
def number_of_files(filenames):
    """Return the total record count encoded in the given file names.

    Each file name is expected to contain its count as ``-<count>.``
    (a name shaped like ``name-1308.tfrec``); the first such occurrence
    in the name is used.

    Args:
        filenames: Iterable of file names or paths.

    Returns:
        The sum of all per-file counts as a NumPy integer (0 for no files).

    Raises:
        ValueError: If a file name contains no ``-<count>.`` part.
    """
    # Compile once instead of once per file name.
    count_pattern = re.compile(r"-([0-9]+)\.")

    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(
                f"Cannot read the record count from file name: {filename!r}"
            )
        counts.append(int(match.group(1)))

    # An explicit integer dtype keeps the result integral for an empty input
    # (np.sum([]) would otherwise return the float 0.0).
    return np.sum(counts, dtype=np.int64)

def decode_image(image):
    """Decode a JPEG byte string into a resized 3-channel float image.

    The image is decoded with 3 channels, resized to ``IMAGE_SIZE``,
    cast to float32 and divided by 255.

    Args:
        image: Scalar string tensor holding the JPEG-encoded image.

    Returns:
        A float32 tensor of shape ``[*IMAGE_SIZE, 3]``.
    """
    target_size = [*IMAGE_SIZE]
    pixels = tf.image.decode_jpeg(image, channels=3)
    pixels = tf.image.resize(pixels, target_size)
    pixels = tf.cast(pixels, tf.float32) / 255.
    # Pin the static shape explicitly.
    return tf.reshape(pixels, target_size + [3])

def read_tfrecord(example, labeled):
    """Parse one serialized example into an image plus its label or name.

    Args:
        example: Serialized example to parse.
        labeled: If true, the record is read with a "target" feature;
            otherwise with an "image_name" feature.

    Returns:
        ``(image, label)`` when ``labeled`` is true, where ``label`` is the
        sum of the depth-19 one-hot vectors of the '|'-separated class
        indices in "target"; otherwise ``(image, image_name)``.
    """
    # Every record carries the encoded image; the second feature depends on
    # whether the record is labeled.
    extra_key = "target" if labeled else "image_name"
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),
        extra_key: tf.io.FixedLenFeature([], tf.string),
    }

    parsed = tf.io.parse_single_example(example, feature_spec)
    image = decode_image(parsed["image"])

    if not labeled:
        return image, parsed['image_name']

    # Split the '|'-separated class indices and fold their one-hot vectors
    # into a single label vector.
    class_ids = tf.strings.to_number(
        tf.strings.split(parsed["target"], '|'), tf.int32
    )
    label = tf.reduce_sum(tf.one_hot(class_ids, depth=19), axis=0)
    return image, label


def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed records from TFRecord files.

    Args:
        filenames: TFRecord file names to read.
        labeled: Forwarded to ``read_tfrecord``; selects whether each
            element is ``(image, label)`` or ``(image, image_name)``.
        ordered: If true, files are read one after another and the
            pipeline stays deterministic. If false, several files are
            read in parallel and elements may arrive in any order.

    Returns:
        A ``tf.data.Dataset`` of ``(image, label)`` pairs if ``labeled`` is
        true, otherwise of ``(image, image_name)`` pairs.
    """
    options = tf.data.Options()
    if not ordered:
        # Allow elements to be used as soon as they stream in.
        options.experimental_deterministic = False

    # Without num_parallel_reads the files are read sequentially; parallel
    # (interleaved) reads are only enabled when the order does not matter.
    dataset = tf.data.TFRecordDataset(
        filenames,
        num_parallel_reads=None if ordered else AUTOTUNE,
    )
    dataset = dataset.with_options(options)
    dataset = dataset.map(
        partial(read_tfrecord, labeled=labeled), num_parallel_calls=AUTOTUNE
    )
    return dataset


def get_train_dataset(filenames, labeled=True, ordered=False):
    """Build the repeating, shuffled and batched training dataset.

    Args:
        filenames: TFRecord file names to read.
        labeled: Forwarded to ``load_dataset``.
        ordered: Forwarded to ``load_dataset``.

    Returns:
        An endlessly repeating dataset of batches of ``BATCH_SIZE`` elements.
    """
    dataset = load_dataset(filenames, labeled=labeled, ordered=ordered)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(2021)
    dataset = dataset.batch(BATCH_SIZE)
    # Prefetch last so that whole batches (not single samples) are prepared
    # while the previous batch is being consumed.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    return dataset

def get_val_dataset(filenames, labeled=True, ordered=False):
    """Build the batched validation dataset.

    Args:
        filenames: TFRecord file names to read.
        labeled: Forwarded to ``load_dataset``.
        ordered: Forwarded to ``load_dataset``.

    Returns:
        A dataset of batches of up to ``BATCH_SIZE`` elements.
    """
    dataset = load_dataset(filenames, labeled=labeled, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE)
    # Prefetch last so that whole batches (not single samples) are prepared
    # while the previous batch is being consumed.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    return dataset

def get_test_dataset(filenames, labeled=False, ordered=True):
    """Build the batched test dataset.

    Args:
        filenames: TFRecord file names to read.
        labeled: Forwarded to ``load_dataset``; defaults to unlabeled data.
        ordered: Forwarded to ``load_dataset``; defaults to ordered reads.

    Returns:
        A dataset of batches of up to ``BATCH_SIZE`` elements.
    """
    dataset = load_dataset(filenames, labeled=labeled, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE)
    # Prefetch last so that whole batches (not single samples) are prepared
    # while the previous batch is being consumed.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    return dataset

def show_batch(image_batch, label_batch):
    """Plot up to 16 images of a batch on a 4x4 grid.

    Args:
        image_batch: Indexable batch of images accepted by ``plt.imshow``.
        label_batch: Labels of the batch; currently unused because the
            title line is disabled.
    """
    plt.figure(figsize=(20, 20))
    # A batch can hold fewer than 16 images; never index past its end.
    num_shown = min(16, len(image_batch))
    for n in range(num_shown):
        plt.subplot(4, 4, n + 1)
        plt.imshow(image_batch[n])
        #plt.title(str(label_batch[n].numpy()))
        plt.axis("off")

# Overview

In [None]:
# Number of rows in the sample submission file.
print('Number samples of submission file:', len(samp_subm))

In [None]:
# Report how many tfrec files ended up in each split.
for caption, split_files in (
    ('Number of train tfrec files:', train_filenames),
    ('Number of val tfrec files:', val_filenames),
    ('Number of test tfrec files:', test_filenames),
):
    print(caption, len(split_files))

In [None]:
# Report the total count encoded in the file names of each split.
for caption, split_files in (
    ('Number Files train:', train_filenames),
    ('Number Files val:', val_filenames),
    ('Number Files test:', test_filenames),
):
    print(caption, number_of_files(split_files))

# Get Data

In [None]:
# Build the three input pipelines with the default arguments of each helper.
train_dataset = get_train_dataset(train_filenames)
val_dataset = get_val_dataset(val_filenames)
test_dataset = get_test_dataset(test_filenames)

Check the shape of the data:

In [None]:
# Printing a dataset shows its element specification (shapes and dtypes).
print(train_dataset)
print(val_dataset)
print(test_dataset)

In [None]:
# Take the first validation batch and plot its images.
image_batch, label_batch = next(iter(val_dataset))
show_batch(image_batch, label_batch)

# Model

In [None]:
# Metrics handed to model.compile in make_model: a multi-label AUC named 'auc'.
metrics = [tf.keras.metrics.AUC(name='auc', multi_label=True)]
# Learning rate handed to the Adam optimizer in make_model.
learning_rate = 1e-3

In [None]:
def make_model():
    """Build and compile the classification model.

    The model stacks an ImageNet-initialised, fully trainable ResNet50
    (without its top), global average pooling and a 19-unit sigmoid layer.
    It is compiled with Adam, binary cross-entropy and the module-level
    ``metrics`` and ``learning_rate``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    backbone = tf.keras.applications.ResNet50(
        include_top=False,
        weights='imagenet',
        input_shape=[*IMAGE_SIZE, 3],
    )
    backbone.trainable = True

    classifier = tf.keras.Sequential()
    classifier.add(backbone)
    classifier.add(tf.keras.layers.GlobalAveragePooling2D())
    classifier.add(tf.keras.layers.Dense(19, activation='sigmoid'))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    classifier.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=metrics,
    )
    return classifier

In [None]:
# Create and compile the model inside the distribution strategy scope.
with strategy.scope():
    model = make_model()

# Print the layer overview of the model.
model.summary()

In [None]:
# The training dataset repeats endlessly, so the epoch length is set
# explicitly from the count encoded in the training file names.
steps_per_epoch = number_of_files(train_filenames) // BATCH_SIZE
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,
    steps_per_epoch=steps_per_epoch,
)

# Analyse Training

In [None]:
# Plot the training history: loss on the left panel, AUC on the right panel.
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
fig.subplots_adjust(hspace = .2, wspace=.2)
axs = axs.ravel()

# Left panel: training and validation loss per epoch.
loss = history.history['loss']
loss_val = history.history['val_loss']
epochs = range(1, len(loss)+1)
axs[0].plot(epochs, loss, 'bo', label='loss_train')
axs[0].plot(epochs, loss_val, 'ro', label='loss_val')
axs[0].set_title('Value of the loss function')
axs[0].set_xlabel('epochs')
axs[0].set_ylabel('value of the loss function')
axs[0].legend()
axs[0].grid()

# Right panel: the tracked metric is the AUC (history keys 'auc'/'val_auc'),
# so it is labelled as AUC rather than accuracy. The variable names are kept
# as they were.
acc = history.history['auc']
acc_val = history.history['val_auc']
axs[1].plot(epochs, acc, 'bo', label='auc_train')
axs[1].plot(epochs, acc_val, 'ro', label='auc_val')
axs[1].set_title('AUC')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Value of AUC')
axs[1].legend()
axs[1].grid()
plt.show()