# Intro
Welcome to the [Human Protein Atlas - Single Cell Classification](https://www.kaggle.com/c/hpa-single-cell-image-classification).

![](https://storage.googleapis.com/kaggle-competitions/kaggle/23823/logos/header.png)

For a TPU tutorial for this competition, we recommend this [notebook](https://www.kaggle.com/drcapa/human-protein-atlas-tpu-tutorial/).

<span style="color: royalblue;">Please vote the notebook up if it helps you. Feel free to leave a comment above the notebook. Thank you. </span>

# Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import cv2

from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras.utils import to_categorical, Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from keras.optimizers import RMSprop,Adam
from keras.applications import ResNet50

# Path

In [None]:
# Root directory of the competition dataset. The trailing slash matters because
# later cells build file paths by plain string concatenation (e.g. path+'train.csv').
path = '/kaggle/input/hpa-single-cell-image-classification/'
# List the top-level entries of the dataset directory.
os.listdir(path)

# Load Data

In [None]:
# Training table; later cells read its 'ID' and 'Label' columns.
train_data = pd.read_csv(path+'train.csv')
# Sample submission table; it is copied unchanged into the output at the end.
samp_subm = pd.read_csv(path+'sample_submission.csv')

# Parameter

In [None]:
# Global settings shared by the data generator and the model definition.
img_size = 64  # images are resized to img_size x img_size
img_channel = 3  # channel count of the model input
num_classes = 19  # width of the multi-hot label vector and of the output layer
batch_size = 64  # samples per batch produced by the data generators

# Overview

In [None]:
# Compare the table sizes with the number of entries in the image folders.
# The folder figures are raw directory-entry counts, not sample counts.
print('Number train samples:', train_data.shape[0])
print('Number train images:', len(os.listdir(os.path.join(path, 'train'))))
print('Number submission samples:', samp_subm.shape[0])
print('Number submission images:', len(os.listdir(os.path.join(path, 'test'))))

In [None]:
# Preview the first rows of the training table.
train_data.head()

# Load Images
Each sample consists of four image files, one per color channel: blue, green, red, and yellow. The colors are:
* red for [microtubule channels](https://en.wikipedia.org/wiki/Microtubule)
* blue for nuclei channels
* yellow for [Endoplasmic Reticulum (ER)](https://en.wikipedia.org/wiki/Endoplasmic_reticulum) channels
* green for protein

In [None]:
# Maps the color suffix used in the image file names to the structure it shows;
# the values are used as subplot titles below.
colors_dict = {'red': 'microtubule', 'blue': 'nuclei', 'yellow': 'Endoplasmic Reticulum', 'green': 'protein'}

Load the first image of the train dataset:

In [None]:
# Read the blue-channel file of the first training sample and show its array shape.
image_id = train_data.at[0, 'ID']
image_file = cv2.imread(f'{path}train/{image_id}_blue.png')
image_file.shape

Show the 4 images for the first sample of the train dataset:

In [None]:
# Draw the four channel images of the first training sample side by side,
# one subplot per entry of colors_dict, titled with the structure name.
image_id = train_data.at[0, 'ID']
fig, axs = plt.subplots(1, 4, figsize=(20, 5))
fig.subplots_adjust(hspace=.2, wspace=.1)
axs = axs.ravel()
for ax, (color, structure) in zip(axs, colors_dict.items()):
    filename = f'{image_id}_{color}.png'
    image_file = cv2.imread(path + 'train/' + filename)
    ax.imshow(image_file)
    ax.set_title(structure)
    # Hide the tick labels; pixel coordinates are not of interest here.
    ax.set_xticklabels([])
    ax.set_yticklabels([])

# Encoding Labels
This is a multilabel classification task. The labels are separated by `|` in the train dataset.

In [None]:
# Walk through the label encoding step by step for the first training sample.
raw_label = train_data.loc[0, 'Label']
# original label
print('input :', raw_label)
# label as list of strings
class_tokens = raw_label.split('|')
print('step 1:', class_tokens)
# label as list of integers
class_ids = list(map(int, class_tokens))
print('step 2:', class_ids)
# label to binary class matrix: one one-hot row per class id. The width comes
# from the shared num_classes setting so this demo stays in sync with the
# data generator and the model output layer.
label = to_categorical(class_ids, num_classes=num_classes)
print('step 3:', label)
# sum the rows into a single multi-hot vector
label = label.sum(axis=0)
print('step 4:', label)

# Data Generator
We define a data generator to load the data on demand.

In [None]:
class DataGenerator(Sequence):
    """Batch loader that reads blue-channel images and multi-hot labels on demand.

    For every sample ID the file ``<path><ID>_blue.png`` is read with OpenCV,
    resized to ``img_size x img_size`` and divided by 255. Each label string
    holds '|'-separated class ids and is encoded as a multi-hot row whose
    width is the module-level ``num_classes`` setting.
    """

    def __init__(self, path, list_IDs, labels, batch_size, img_size, img_channel):
        """Store the sample list and the batch geometry.

        Args:
            path: Directory prefix of the image files; it is concatenated
                directly with the ID, so it must end with a path separator.
            list_IDs: Sample IDs from which the file names are derived.
            labels: '|'-separated class-id strings, aligned with ``list_IDs``.
            batch_size: Maximum number of samples per batch.
            img_size: Edge length the images are resized to.
            img_channel: Number of channels of the returned image array.
                NOTE(review): cv2.imread is called with default flags, so
                this is expected to be 3 - confirm before changing it.

        Raises:
            ValueError: If ``list_IDs`` and ``labels`` differ in length.
        """
        super().__init__()
        if len(list_IDs) != len(labels):
            raise ValueError('list_IDs and labels must have the same length')
        self.path = path
        # Plain lists make the positional lookups in __getitem__ independent of
        # whatever index a pandas Series argument carries.
        self.list_IDs = list(list_IDs)
        self.labels = list(labels)
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_channel = img_channel
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return (len(self.list_IDs) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays of batch number ``index``."""
        batch_positions = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in batch_positions]
        # Labels must be selected with the same global positions as the IDs;
        # indexing them by the position inside the batch would hand every
        # batch the labels of the first batch.
        labels_temp = [self.labels[k] for k in batch_positions]
        return self.__data_generation(list_IDs_temp, labels_temp)

    def __data_generation(self, list_IDs_temp, labels_temp):
        """Load the images and encode the labels of one batch.

        Args:
            list_IDs_temp: IDs of the samples in this batch.
            labels_temp: Label strings aligned with ``list_IDs_temp``.

        Returns:
            Tuple ``(X, y)``: ``X`` has shape
            ``(n, img_size, img_size, img_channel)`` and ``y`` has shape
            ``(n, num_classes)``, where ``n`` is the number of IDs passed in.

        Raises:
            FileNotFoundError: If an image file cannot be read.
        """
        # Size the arrays by the actual number of samples so that the last,
        # possibly shorter, batch is not padded with all-zero fake samples.
        n_samples = len(list_IDs_temp)
        X = np.zeros((n_samples, self.img_size, self.img_size, self.img_channel),
                     dtype=np.float32)
        y = np.zeros((n_samples, num_classes), dtype=int)
        for i, (ID, label) in enumerate(zip(list_IDs_temp, labels_temp)):
            file_name = self.path + ID + '_blue.png'
            data_file = cv2.imread(file_name)
            # cv2.imread signals an unreadable file by returning None.
            if data_file is None:
                raise FileNotFoundError('Could not read image: ' + file_name)
            img = cv2.resize(data_file, (self.img_size, self.img_size))
            X[i, ] = img/255.
            # Multi-hot encoding: set the column of every class id in the label.
            class_ids = [int(token) for token in label.split('|')]
            y[i, class_ids] = 1
        return X, y

# Define Model

In [None]:
# Path to a local ResNet50 weights file; it is passed to ResNet50(weights=...) below.
weights='../input/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'

In [None]:
# Multi-label AUC tracked during training; the analysis cell reads it from the
# history under the keys 'auc' and 'val_auc'.
metrics = [tf.keras.metrics.AUC(name='auc', multi_label=True)]
# Learning rate handed to the Adam optimizer at compile time.
learning_rate = 1e-3

In [None]:
# ResNet50 backbone without its classification top, initialised from the local
# weights file and sized for the resized input images.
conv_base = ResNet50(include_top=False,
                     weights=weights,
                     input_shape=(img_size, img_size, img_channel))
# Leave the backbone trainable so its weights are updated during training.
conv_base.trainable = True

In [None]:
# Stack a small classification head on top of the ResNet50 backbone.
model = Sequential([
    conv_base,
    Flatten(),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    # One sigmoid unit per class: the classes are not mutually exclusive.
    Dense(num_classes, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy treats each class as an independent yes/no decision.
# Adam takes its step size via `learning_rate`; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(optimizer=Adam(learning_rate=learning_rate), loss="binary_crossentropy", metrics=metrics)

In [None]:
# Print the layer overview of the assembled model.
model.summary()

# Train Model

In [None]:
# Number of passes over the training generator.
epochs = 3

In [None]:
# Hold out 33% of the samples for validation with a fixed seed, then renumber
# every resulting Series from 0 so that each one can be indexed by position.
train_IDs, val_IDs, y_train, y_val = (
    part.reset_index(drop=True)
    for part in train_test_split(train_data['ID'], train_data['Label'],
                                 test_size=0.33, random_state=2021)
)

In [None]:
# Both generators read from the train image folder; the validation samples are
# a held-out part of the training table.
train_generator = DataGenerator(path+'train/', train_IDs, y_train, batch_size, img_size, img_channel)
val_generator = DataGenerator(path+'train/', val_IDs, y_val, batch_size, img_size, img_channel)

In [None]:
# Train on the generators. Model.fit accepts Sequence objects directly;
# fit_generator is deprecated and has been removed from newer Keras releases.
history = model.fit(train_generator,
                    validation_data=val_generator,
                    epochs=epochs)

# Analyse Training

In [None]:
# Plot the per-epoch training and validation curves: loss on the left,
# the multi-label AUC metric on the right.
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
fig.subplots_adjust(hspace=.2, wspace=.2)
axs = axs.ravel()
loss = history.history['loss']
loss_val = history.history['val_loss']
# Use a dedicated name for the x-axis values so that the `epochs` training
# setting is not overwritten; otherwise re-running the training cell would
# receive a range instead of an int.
epoch_axis = range(1, len(loss)+1)
axs[0].plot(epoch_axis, loss, 'bo', label='loss_train')
axs[0].plot(epoch_axis, loss_val, 'ro', label='loss_val')
axs[0].set_title('Value of the loss function')
axs[0].set_xlabel('epochs')
axs[0].set_ylabel('value of the loss function')
axs[0].legend()
axs[0].grid()
# The tracked metric is AUC (history keys 'auc' / 'val_auc'), not accuracy,
# so the plot is labelled accordingly.
auc = history.history['auc']
auc_val = history.history['val_auc']
axs[1].plot(epoch_axis, auc, 'bo', label='auc_train')
axs[1].plot(epoch_axis, auc_val, 'ro', label='auc_val')
axs[1].set_title('AUC')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Value of AUC')
axs[1].legend()
axs[1].grid()
plt.show()

# Write Output

In [None]:
# Start the submission from a copy of the sample submission.
# NOTE(review): no model predictions are written into it in the cells shown here.
output = samp_subm.copy()

In [None]:
# Write the submission file to the working directory, without the index column.
output.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission table.
output.head()

# Next Steps
* Extend the data generator to all colors (blue, red, yellow, green). Currently only blue is used.
* Predict test data.