# Step 1. Explore the data

1. Let's look at how the data is structured,
2. and then write helper functions to inspect it more easily.

import some modules

In [None]:
%config Completer.use_jedi = False
import os
import numpy as np # linear algebra
import pandas as pd # data processing
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image, ImageOps
from matplotlib.pyplot import imread
import tensorflow as tf

setting some constants

In [None]:
# Root folder of the competition data and the files / folders used below.
D_ = r'/kaggle/input/hpa-single-cell-image-classification'
D_TRAIN, D_TEST = (os.path.join(D_, folder) for folder in ('train', 'test'))
D_TRAIN_CSV, D_SUBMI_CSV = (
    os.path.join(D_, csv_name)
    for csv_name in ('train.csv', 'sample_submission.csv')
)

What does the train set look like?

In [None]:
df_train = pd.read_csv(D_TRAIN_CSV)
df_train.head()

In [None]:
print(f'Train Sample Numbers - {df_train.shape[0]}')

**Test Samples**

In [None]:
df_test = pd.read_csv(D_SUBMI_CSV)
df_test.head()

In [None]:
print(f'Test Sample Numbers - {df_test.shape[0]}')

## **Labels**

In [None]:
# Integer class index (as used in the `Label` column) -> location name.
DICT_LABEL_NAME = {
    0: 'Nucleoplasm',
    1: 'Nuclear membrane',
    2: 'Nucleoli',
    3: 'Nucleoli fibrillar center',
    4: 'Nuclear speckles',
    5: 'Nuclear bodies',
    6: 'Endoplasmic reticulum',
    7: 'Golgi apparatus',
    8: 'Intermediate filaments',
    9: 'Actin filaments',
    10: 'Microtubules',
    11: 'Mitotic spindle',
    12: 'Centrosome',
    13: 'Plasma membrane',
    14: 'Mitochondria',
    15: 'Aggresome',
    16: 'Cytosol',
    17: 'Vesicles and punctate cytosolic patterns',
    18: 'Negative'
}
# Reverse lookup: location name -> integer class index.
DICT_NAME_LABEL = {name: label for label, name in DICT_LABEL_NAME.items()}

def fill_targets(row):
    """Expand the pipe-separated ``Label`` of one row into per-class flags.

    The labels are parsed into an integer array stored under
    ``'split_Label'``, and the entry named after every listed class is set
    to 1.  The row is modified in place and also returned, so the function
    can be passed to ``DataFrame.apply(..., axis=1)``.

    Raises ``KeyError`` if a label is not a key of ``DICT_LABEL_NAME``.
    """
    # `np.int` was only an alias of the builtin `int` and no longer exists
    # in current NumPy releases, so the builtin is used directly.
    row['split_Label'] = np.array(row.Label.split('|')).astype(int)
    for num in row['split_Label']:
        row.loc[DICT_LABEL_NAME[int(num)]] = 1
    return row

fill_targets(df_train.iloc[0])

**Reformat Train Targets**

In [None]:
# Start every location column at 0, then let fill_targets switch on the
# columns named in each row's Label.
for name in DICT_NAME_LABEL:
    df_train[name] = 0
df_train = df_train.apply(fill_targets, axis=1)
df_train.head()

**Reformat Test Targets**

## 1. Distribution of the data

In [None]:
# Number of training images per class, most frequent class first.
df_train_targets = df_train.drop(columns=['ID', 'Label', 'split_Label'])
df_train_value_counts = df_train_targets.sum().sort_values(ascending=False)
df_train_value_counts.head()
plt.figure(figsize=(10, 10))
sns.barplot(x=df_train_value_counts.values, y=df_train_value_counts.index)

## 2. Target numbers distribution

In [None]:
# How many images carry 1, 2, 3, ... labels at once.
df_numTarget_counts = df_train_targets.sum(axis=1).value_counts()
plt.figure(figsize=(10, 4))
sns.barplot(
    x=df_numTarget_counts.index,
    y=df_numTarget_counts.values,
    palette='Reds',
)
plt.xlabel('Number of targets per image')
plt.ylabel('Number of images')

## 3. Relations

In [None]:
# Pairwise correlation between the per-class label columns.
plt.figure(figsize=(15, 15))
sns.heatmap(
    df_train_targets.corr(),
    cmap='YlGnBu',
    vmin=-1,
    vmax=1,
)

## 4. Looks of the image

**check input quality**

In [None]:
from os import listdir

# Count, for every training image ID, how many files exist per colour channel.
list_file = listdir(D_TRAIN)
list_color = ['red', 'yellow', 'green', 'blue']
df_train = df_train.set_index('ID')
for color in list_color:
    df_train[color] = 0
for file in list_file:
    # The name is split as "<ID>_<color>.png".
    name_parts = file.split('_')
    ID = name_parts[0]
    color = name_parts[1].split('.png')[0]
    df_train.loc[ID, color] += 1

In [None]:
df_color = df_train[list_color]
df_color.value_counts()

check test quality

In [None]:
df_test = df_test.set_index('ID')

In [None]:
list_file = listdir(D_TEST)
len(list_file)
for i in range(3):
    print(list_file[i])

In [None]:
# Count, for every test image ID, how many files exist per colour channel.
for color in list_color:
    df_test[color] = 0
for file in list_file:
    # The name is split as "<ID>_<color>.png".
    name_parts = file.split('_')
    ID = name_parts[0]
    color = name_parts[1].split('.png')[0]
    df_test.loc[ID, color] += 1
df_color = df_test[list_color]
df_color.value_counts()

**Let's see the image**

In [None]:
def load_image(image_id, basepath = D_TRAIN):
    """Read the red, green and blue channel PNGs of one image and stack them.

    Files are looked up as ``<basepath>/<image_id>_<color>.png``.  The yellow
    channel is not loaded.  The three channels are stacked with ``np.dstack``
    in red, green, blue order.
    """
    # Files are read in red, blue, green order; only the stacking order is RGB.
    planes = {
        color: imread(os.path.join(basepath, image_id + '_' + color + '.png'))
        for color in ('red', 'blue', 'green')
    }
    image = np.dstack((planes['red'], planes['green'], planes['blue']))
    # NOTE(review): presumably meant to squeeze 16-bit data into 8 bits;
    # dividing by 255 would not bring 65535 below 256 - confirm the intent.
    if image.max() > 255:
        image = (image / 255).astype('uint8')
    return image

sample_image = load_image(df_train.index[1])
plt.imshow(sample_image)
plt.axis('off')
plt.show()


**separate to individual cells**

I found HPACellSeg 

* https://www.kaggle.com/lnhtrang/hpa-public-data-download-and-hpacellseg
* https://github.com/CellProfiling/HPA-Cell-Segmentation

I tried to use this segmentation tool, but it was too slow.
* https://www.kaggle.com/yushinjung/human-protein-atlas-single-cell-classification

I coincidentally found masks that had already been computed:
https://www.kaggle.com/its7171/hpa-mask

Let's check whether mask works.

In [None]:
D_CELL_MASK = '/kaggle/input/hpa-mask/hpa_cell_mask'
D_NUCLEI_MASK = '/kaggle/input/hpa-mask/hpa_nuclei_mask'

# Set to True to show one training image next to its cell and nuclei masks.
flag_demo_mask = False
if flag_demo_mask:
    ID_sample = '000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0'
    # Each mask is stored as "<ID>.npz" in its own directory.  The two paths
    # were previously swapped, so the panels showed the wrong mask.
    d_cell = os.path.join(D_CELL_MASK, ID_sample)
    d_nucl = os.path.join(D_NUCLEI_MASK, ID_sample)

    cell_mask = np.load(d_cell + '.npz')['arr_0']
    nucl_mask = np.load(d_nucl + '.npz')['arr_0']

    image_sample = load_image(ID_sample, D_TRAIN)
    fig, ax = plt.subplots(1, 3, figsize=(30, 10))
    ax[0].imshow(image_sample)
    ax[1].imshow(cell_mask)
    ax[2].imshow(nucl_mask)

    for axis in ax:
        axis.axis('off')

**Important Thing**

We must use only images with a single (unique) label for training.

In [None]:
flag_saveUsingIndex = True # to save using index list

import random
# Keep only the per-class columns, then only the images with exactly one label.
df_train_targets = df_train.drop(['Label', 'split_Label'] + list_color, axis = 1)
df_train_unique_targets = df_train_targets[df_train_targets.sum(axis = 'columns') == 1]
n_max_image_per_label = 10000
# Number of images kept for a class that reaches n_max_image_per_label.
# For ease, the first ones are taken instead of a random sample.
n_kept_when_capped = 100
list_using_index = []

d_saveUsingIndex = os.path.join('/kaggle/working', 'save_using_Index')
if not flag_saveUsingIndex:
    # Reuse the ID list written by an earlier run.
    list_using_index = np.load(d_saveUsingIndex + '.npz')['arr_0']
else:
    for name in DICT_NAME_LABEL:
        ids_with_label = df_train_unique_targets[df_train_unique_targets[name] == 1].index
        if len(ids_with_label) >= n_max_image_per_label:
            ids_with_label = ids_with_label[:n_kept_when_capped]
        list_using_index += list(ids_with_label)
        print(f'{name} - unique image number is {len(ids_with_label)}')
    # Written once after the loop; it used to be rewritten for every class.
    np.savez(d_saveUsingIndex, list_using_index)    # save for future work

Images of cell by cell

In [None]:
flag_seeMasks = False
import cv2
def get_single_cellImage(contours, original_image, cell_mask):
    """Cut one masked crop per contour out of ``original_image``.

    Parameters:
        contours: iterable of OpenCV contours, one per object to extract.
        original_image: image the crops are taken from.
        cell_mask: array whose shape gives the size of the per-contour mask.

    Returns:
        A list with one array per contour: pixels outside the filled contour
        are zeroed and the result is cropped to the contour's bounding box
        (first three channels only).
    """
    list_singleImage = []
    for contour in contours:
        x, y, width, height = cv2.boundingRect(contour)
        # Mask that is 255 inside this contour and 0 everywhere else.
        instance_contour = np.zeros(cell_mask.shape, dtype=np.uint8)
        cv2.drawContours(instance_contour, [contour], 0, 255, thickness=cv2.FILLED)

        # Use the image passed in, not whatever global `image` happens to
        # hold when the function is called.
        isolated_cell_image = cv2.bitwise_and(
            original_image, original_image, mask=instance_contour)
        list_singleImage.append(isolated_cell_image[y:y + height, x:x + width, :3])
    return list_singleImage

For every selected image, each single-cell image is saved to a folder.

In [None]:
d_saveFolderSingleCell = os.path.join('/kaggle/working/', 'SingleCellTrain')
os.makedirs(d_saveFolderSingleCell, exist_ok=True)

list_existingfile = os.listdir(d_saveFolderSingleCell)
# Set copy for constant-time "already processed?" checks inside the loop.
set_existingfile = set(list_existingfile)
# NOTE(review): every crop is saved with label suffix 0, whatever the class
# of its source image - confirm this is intended.
label = 0
# Contours with a smaller area than this are discarded as too small.
min_contour_area = 50
total_index = len(list_using_index)
for num, ID in enumerate(list_using_index):
    if num % 100 == 0 :
        print('{:.2f} % is finished'.format(num/total_index * 100))
    # Skip images whose first crop was already written by an earlier run.
    if f'{ID}_0_0.npz' in set_existingfile:
        continue
    image = load_image(ID)
    # Load the cell mask from the cell-mask directory (the nuclei directory
    # was used here before, although the result is treated as a cell mask).
    d_cell = os.path.join(D_CELL_MASK, ID)
    cell_mask = np.load(d_cell + '.npz')['arr_0']
    contours, hierarchy = cv2.findContours(cell_mask.astype(np.uint8), mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_SIMPLE)
    # Recent OpenCV returns the contours as a tuple, so build a filtered
    # list instead of deleting entries in place.
    contours = [
        contour for contour in contours
        if cv2.contourArea(contour) >= min_contour_area
    ]
    if not contours: # if there is no cell found go next
        continue
    list_singleCellImage = get_single_cellImage(contours, image, cell_mask)
    for i, sc_image in enumerate(list_singleCellImage):
        d_saveFile = os.path.join(d_saveFolderSingleCell, f'{ID}_{i}_{label}')
        np.savez_compressed(d_saveFile, sc_image)
#     plt.imshow(image)
#     fig, ax = plt.subplots(1, len(list_singleCellImage), figsize=(len(list_singleCellImage*10), 10) )
#     for i, sc_image in enumerate(list_singleCellImage):
#         ax[i].imshow(sc_image)
#         ax[i].axis('off')
#         ax[i].set_title(f'{i}', fontdict={'fontsize': 50})
    

**Let's segment the images before making predictions**

In [None]:
# functions to get path for required image
def get_imagepath_of_RYB(image_ID, basepath):
    """Return the (red, yellow, blue) channel PNG paths of one image.

    Each path is ``<basepath>/<image_ID>_<color>.png``.
    """
    return tuple(
        os.path.join(basepath, f'{image_ID}_{channel}.png')
        for channel in ('red', 'yellow', 'blue')
    )

**encode binary mask**

https://www.kaggle.com/thedrcat/hpa-baseline-cell-segmentation#kln-39

In [None]:
!pip install -q "/kaggle/input/pycocotools/pycocotools-2.0-cp37-cp37m-linux_x86_64.whl"

In [None]:
import base64
from pycocotools import _mask as coco_mask

from pycocotools import _mask 
import zlib
def encode_binary_mask(mask):
    """Encode a binary mask as a base64 string of zlib-compressed COCO RLE.

    Parameters:
        mask: array-like mask; non-boolean input is cast to bool.  After
            removing length-1 axes it must be two-dimensional.

    Returns:
        ASCII ``str`` holding the base64 form of the compressed RLE counts.

    Raises:
        ValueError: if the squeezed mask is not two-dimensional.
    """
    # `np.bool` no longer exists in current NumPy; the builtin `bool` is the
    # same dtype.  Non-boolean masks are converted rather than rejected.
    mask = np.squeeze(np.asarray(mask).astype(bool))
    if mask.ndim != 2:
        # The shape is wrapped in a 1-tuple: formatting the bare tuple with
        # `%` would raise TypeError instead of this ValueError.
        raise ValueError(
            "encode_binary_mask expects a 2d mask, received shape == %s" %
            (mask.shape,))
    # convert input mask to expected COCO API input --
    mask_to_encode = mask.reshape(mask.shape[0], mask.shape[1], 1)
    mask_to_encode = mask_to_encode.astype(np.uint8)
    mask_to_encode = np.asfortranarray(mask_to_encode)

    # RLE encode mask --
    encoded_mask = _mask.encode(mask_to_encode)[0]["counts"]

    # compress and base64 encoding --
    binary_str = zlib.compress(encoded_mask, zlib.Z_BEST_COMPRESSION)
    base64_str = base64.b64encode(binary_str)
    return base64_str.decode('ascii')

In [None]:
from itertools import groupby

# encode
def coco_rle_encode(mask):
    """Run-length encode ``mask`` in column-major order.

    Returns ``{'counts': [...], 'size': list(mask.shape)}`` where ``counts``
    holds the lengths of consecutive runs of equal values.  A leading 0 is
    inserted when the very first value equals 1.
    """
    run_lengths = []
    flat = mask.ravel(order='F')
    for position, (pixel, run) in enumerate(groupby(flat)):
        starts_with_one = position == 0 and pixel == 1
        if starts_with_one:
            run_lengths.append(0)
        run_lengths.append(sum(1 for _ in run))
    return {'counts': run_lengths, 'size': list(mask.shape)}

**Before machine learning, we have to extract all the single-cell images from the test images**

Using the segmentation above, we will save the single-cell crops of the test images.

# Step 2. Building a baseline model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the training IDs for validation (fixed seed).
series_ID = df_train.index
series_trainID, series_validID = train_test_split(
    series_ID,
    test_size=0.25,
    random_state=0,
)
series_testID = df_test.index

# One column per class name, indexed like df_train.
labels = df_train[list(DICT_NAME_LABEL)]
print(f'train ID, length-{len(series_trainID)}\n{series_trainID[:2]}')
print(f'valid ID, length-{len(series_validID)}\n{series_validID[:2]}')
print(f'labels, length-{labels.shape}\n{labels.iloc[:2, :2]}')

## **Shared Parameters**

In [None]:
class ModelParameter:
    """Plain container for the settings passed to ImagePreprocessor,
    DataGenerator and BaseLineModel.

    Every constructor argument is stored unchanged as an attribute of the
    same name:
        basepath: folder the images are loaded from.
        num_classes: number of target classes.
        image_row_dim, image_col_dim: image size after resizing.
        shuffle: whether DataGenerator reshuffles its samples.
        batch_size: number of samples per batch.
        n_epochs: number of training epochs.
        n_channels: number of image channels.
    """
    def __init__(self, 
                 basepath=D_TRAIN, 
                 num_classes=len(DICT_NAME_LABEL.keys()),
                 image_row_dim=32, 
                 image_col_dim=32, 
                 shuffle=False, 
                 batch_size=200, 
                 n_epochs=1, 
                 n_channels=4):
        self.basepath = basepath
        self.num_classes = num_classes
        self.image_row_dim = image_row_dim
        self.image_col_dim = image_col_dim
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.n_channels = n_channels

# load_image stacks three channels (red, green, blue), so the shared
# parameters must ask for 3 channels instead of the default of 4.
parameter_base = ModelParameter(basepath=D_TRAIN, n_channels=3)

## **Image Preprocessor**

In [None]:
from skimage.transform import resize

class ImagePreprocessor:
    """Loads images from ``basepath`` and resizes them to the configured size."""

    def __init__(self, parameter):
        # Keep the full parameter object plus the fields used directly here.
        self.parameter = parameter
        self.image_row_dim = parameter.image_row_dim
        self.image_col_dim = parameter.image_col_dim
        self.basepath = parameter.basepath

    def resize(self, image):
        """Resize ``image`` to (image_row_dim, image_col_dim)."""
        target_shape = (self.image_row_dim, self.image_col_dim)
        # `resize` here is the module-level function imported from skimage.
        return resize(image, target_shape)

    def load_image(self, image_id):
        """Load ``image_id`` from this preprocessor's ``basepath``."""
        return load_image(image_id, basepath=self.basepath)

    def preprocess(self, image):
        """Apply all preprocessing steps (currently only resizing)."""
        return self.resize(image)
    
preprocessor = ImagePreprocessor(parameter_base)

In [None]:
# Disabled visual check: the first four training images, raw (left) and
# preprocessed (right).
if False:
    for i, ID in enumerate(series_trainID[:4]):
        print(i)
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        image = preprocessor.load_image(ID)
        ax[0].imshow(image)
        print(image.shape)
        image = preprocessor.preprocess(image)
        ax[1].imshow(image)
        print(image.shape)

In [None]:
import keras
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of preprocessed images and labels.

    Parameters:
        list_IDs: indexable collection of image IDs served by this generator.
        labels: DataFrame indexed by image ID with one column per class name.
        parameter: object providing ``image_row_dim``, ``image_col_dim``,
            ``batch_size``, ``num_classes``, ``shuffle`` and ``n_channels``.
        preprocessor: object providing ``load_image(image_id)`` and
            ``preprocess(image)``.
    """
    def __init__(self, list_IDs, labels, parameter, preprocessor):
        super().__init__()
        self.current_epoch = 0 
        self.params = parameter
        self.labels = labels
        self.list_IDs = list_IDs
        self.preprocessor = preprocessor
        
        self.dim = (parameter.image_row_dim, parameter.image_col_dim)
        self.batch_size = parameter.batch_size
        self.num_classes = parameter.num_classes
        self.shuffle = parameter.shuffle
        self.n_channels = parameter.n_channels
        
        self.on_epoch_end()
        
    def on_epoch_end(self):
        """Reset the sample order and, if enabled, reshuffle it."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            # np.random.shuffle accepts no `random_state`; a RandomState
            # seeded with the epoch counter gives a reproducible order.
            np.random.RandomState(self.current_epoch).shuffle(self.indexes)
            self.current_epoch += 1
        
    def get_targets_per_image(self, identifier):
        """Return the per-class label values of one image."""
        return self.labels.loc[identifier][list(DICT_NAME_LABEL)]
        
    def __data_generation(self, list_IDs_temp):
        """Load and preprocess the given IDs into (data, label) arrays."""
        # Initialize
        n_samples = len(list_IDs_temp)
        data = np.empty((n_samples, *self.dim, self.n_channels))
        label = np.empty((n_samples, self.num_classes), dtype = int)
        # Generate Data
        for i, identifier in enumerate(list_IDs_temp):
            # store dataset
            image = self.preprocessor.load_image(identifier)
            image = self.preprocessor.preprocess(image)
            if image.shape != data.shape[1:]:
                # Fail with a clear message instead of a broadcasting error.
                raise ValueError(
                    f'image {identifier} has shape {image.shape}, '
                    f'expected {data.shape[1:]}')
            data[i] = image
            # store label
            label[i] = self.get_targets_per_image(identifier)
        return data, label
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))
    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index+1) * self.batch_size]
        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        data, label = self.__data_generation(list_IDs_temp)
        return data, label
    
trainDataGenerator = DataGenerator(series_trainID, labels, parameter_base, preprocessor)
validDataGenerator = DataGenerator(series_validID, labels, parameter_base, preprocessor)

In [None]:
class PredictGenerator:
    """Runs a model over a list of image IDs loaded from ``predict_path``.

    Parameters:
        predict_IDs: indexable collection of image IDs to predict.
        preprocessor: object providing ``load_image``, ``preprocess`` and a
            ``parameter.num_classes`` attribute.
        predict_path: folder the images are loaded from.
    """
    def __init__(self, predict_IDs, preprocessor, predict_path):
        import copy
        # Work on a shallow copy: changing `basepath` on the caller's
        # preprocessor would redirect every other user of that object too.
        self.preprocessor = copy.copy(preprocessor)
        self.preprocessor.basepath = predict_path
        self.identifiers = predict_IDs
        
    def predict(self, model):
        """Return an array of shape (number of IDs, num_classes) of predictions."""
        num_classes = self.preprocessor.parameter.num_classes
        y = np.empty(shape=(len(self.identifiers), num_classes))
        for n, identifier in enumerate(self.identifiers):
            image = self.preprocessor.load_image(identifier)
            image = self.preprocessor.preprocess(image)
            # The model expects a batch axis; feed a batch of one image and
            # keep its single output row.
            batch = np.expand_dims(image, axis=0)
            y[n] = np.asarray(model.predict(batch))[0]
        return y

testDataGenerator = PredictGenerator(series_testID, preprocessor, D_TEST)

## CNN Baseline model using KERAS

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.losses import binary_crossentropy
from keras.optimizers import Adadelta
from keras.initializers import VarianceScaling

class BaseLineModel:
    def __init__(self, parameter):
        self.params = parameter
        self.num_classes = parameter.num_classes
        self.img_rows = parameter.image_row_dim
        self.img_cols = parameter.image_col_dim
        self.n_channels = parameter.n_channels
        self.input_shape = (self.img_rows, self.img_cols, self.n_channels)
        self.my_metrics = ['accuracy']
        
    def build_model(self):
        self.model = Sequential()
        
        