In [None]:
#
# Attempt to use ResNet50
# 
#

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from PIL import Image  
from IPython.display import display 

import tensorflow as tf
from tensorflow.keras.utils import to_categorical, Sequence
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from tensorflow.keras.applications.resnet50 import ResNet50

import os
import datetime


In [None]:
root_path = '/kaggle/input/hpa-single-cell-image-classification/'

CHANNELS = np.array(['blue', 'green', 'red', 'yellow'])

CLASSES = np.array([
    'Nucleoplasm',
    'Nuclear membrane',
    'Nucleoli',
    'Nucleoli fibrillar center',
    'Nuclear speckles',
    'Nuclear bodies',
    'Endoplasmic reticulum',
    'Golgi apparatus',
    'Intermediate filaments',
    'Actin filaments',
    'Microtubules',
    'Mitotic spindle',
    'Centrosome',
    'Plasma membrane',
    'Mitochondria',
    'Aggresome',
    'Cytosol',
    'Vesicles and punctate cytosolic patterns',
    'Negative'
])


## Build Labels

Take a training-set dataframe and build its label matrix. The result is a NumPy array with one row per sample and one column per class, where each value is zero or one.

In [None]:
def build_labels(df, num_classes=19):
    """Build a multi-hot label matrix from the ``Label`` column of ``df``.

    Each ``Label`` value is a ``|``-separated string of integer class
    indices (for example ``"0|16"``).

    Args:
        df: DataFrame with a ``Label`` column of ``|``-separated class indices.
        num_classes: Number of columns in the result. Defaults to 19, the
            number of entries in ``CLASSES``.

    Returns:
        Integer ndarray of shape ``(len(df), num_classes)`` holding 1 where a
        sample carries the class and 0 elsewhere. An empty ``df`` yields an
        array of shape ``(0, num_classes)``.

    Raises:
        IndexError: If a class index does not fit in ``num_classes`` columns.
        ValueError: If a label token is not an integer.
    """
    # Pre-allocating the matrix also covers the empty-frame case, where
    # stacking an empty list of rows would raise.
    labels = np.zeros((len(df), num_classes), dtype=int)

    for row, label_string in enumerate(df['Label']):
        # for each class found in the training sample, flip its value to one
        for clazz in label_string.split('|'):
            labels[row, int(clazz)] = 1

    return labels

## Create Image Data For Each Channel

Read the image for every channel of a sample and stack them into a single 3D array.

In [None]:
def read_image_data(id):
    """Load, resize and stack every channel image of one training sample.

    For each entry in ``CHANNELS`` the file ``train/<id>_<channel>.png`` under
    ``root_path`` is opened and resized to ``IMAGE_SIZE`` x ``IMAGE_SIZE``.

    Args:
        id: Sample identifier used as the file-name prefix.

    Returns:
        The result of ``correlate_channels`` applied to the per-channel
        arrays: an integer array with the channels on the last axis. For
        single-band images this is (IMAGE_SIZE, IMAGE_SIZE, len(CHANNELS)).
    """
    channel_arrays = []

    for channel in CHANNELS:
        path = root_path + 'train/{}_{}.png'.format(id, channel)
        # Image.open keeps the file handle open until the image is closed;
        # the context manager releases it once the resized copy exists.
        with Image.open(path) as image:
            resized = image.resize((IMAGE_SIZE, IMAGE_SIZE))
        # Collect in a list: np.append would re-copy everything each iteration.
        channel_arrays.append(np.array(resized))

    return correlate_channels(channel_arrays)

## Plot Training Data With Images

In [None]:
def plot_samples(df_samples, df_labels):
    """Plot every channel image of each sample, one figure per sample.

    Args:
        df_samples: DataFrame with an ``ID`` column; one figure is created
            per row.
        df_labels: Label container indexed by the row labels of
            ``df_samples``; each entry is passed to ``label_description``.
    """
    for sample_index, sample in df_samples.iterrows():
        fig = plt.figure(figsize=(25, 25))
        # The original referenced an undefined ``train_index`` here, which
        # raised NameError on the first subplot.
        description = label_description(df_labels[sample_index])

        for position, channel in enumerate(CHANNELS, start=1):
            ax = fig.add_subplot(1, CHANNEL_SIZE, position)
            path = root_path + 'train/{}_{}.png'.format(sample['ID'], channel)
            image = mpimg.imread(path)
            ax.imshow(image)
            ax.set_title("{}\n{}\n{}".format(description, channel, image.shape))

        
        
        

In [None]:
def correlate_channels(channels):
    """Stack per-channel images into one integer array, channels on axis 2.

    For example, input of shape (3, 2048, 2048) becomes (2048, 2048, 3).

    Args:
        channels: Sequence (or array) of equally shaped 2-D image arrays,
            one per channel.

    Returns:
        Integer ndarray in which ``result[:, :, i]`` equals ``channels[i]``.

    Raises:
        ValueError: If ``channels`` is empty or the shapes differ.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy, so the builtin is used directly.
    return np.stack(list(channels), axis=2).astype(int)

In [None]:
def label_description(label):
    """Return the comma-separated names of the classes set to 1 in ``label``.

    ``label`` is indexed by class position; names come from ``CLASSES`` and
    are joined with ", ". An empty string is returned when no class is set.
    """
    text = ""
    for position, class_name in enumerate(CLASSES):
        if label[position] != 1:
            continue
        # Separator only between names, never before the first one.
        if text:
            text += ", "
        text += class_name
    return text


## Timer Functions

Functions to support tracking elapsed times.

In [None]:
# Reference timestamps read by time_mark(): the notebook start, and the point
# from which the per-mark elapsed time is measured.
time_start = datetime.datetime.now()
previous_mark = datetime.datetime.now()
# Maps a mark name to a (since previous_mark, since time_start) pair.
times = {}

# Mark names are used as keys of ``times`` and therefore must be distinct;
# MARK_TEST previously duplicated MARK_TRAIN's value, so recording the test
# mark overwrote the training entry.
MARK_PREP = "Finished Preparations"
MARK_TRAIN = "Finished Trainning"
MARK_TEST = "Finished Testing"

In [None]:

def time_mark(position):
    """Record and print the elapsed time at a named point of the notebook.

    Stores ``(time since the previous mark, time since time_start)`` in
    ``times[position]`` and prints both values.

    Args:
        position: Name of the mark; used as the key in ``times``.
    """
    # Rebinding the module-level name is what makes the next call measure
    # from this mark; without it every "mark" value equalled the total.
    global previous_mark

    mark = datetime.datetime.now()
    since_previous = mark - previous_mark
    since_start = mark - time_start
    times[position] = (since_previous, since_start)
    previous_mark = mark

    print("Time elapsed: mark={},  total={}".format(str(since_previous), str(since_start)))
    


## Parameters

In [None]:
# Edge length, in pixels, that every channel image is resized to.
IMAGE_SIZE = 1024
# Number of image channels per sample (one per entry in CHANNELS).
CHANNEL_SIZE = len(CHANNELS)
# Number of target classes (one per entry in CLASSES).
CLASS_SIZE = len(CLASSES)
# Number of rows randomly drawn from the training data; -1 keeps every row.
# Note: Will get OOM on kaggle with full dataset
SAMPLE_SIZE = 2000
# Number of samples per batch produced by the data generators.
BATCH_SIZE = 2


## Read Full Training Data

Read full set of training data from ```train.csv```

In [None]:
# Load the complete training manifest, report its row count and preview it.
df_train = pd.read_csv(f"{root_path}train.csv")
print(f"Trainning data length: {len(df_train)}")
df_train.head()

In [None]:
# Fixed seed so the sampled subset, and therefore the split, is reproducible.
SPLIT_SEED = 42

# if sample size is set then reduce the training set accordingly
if SAMPLE_SIZE > -1:
    # Clamp the request: sampling more rows than exist raises ValueError.
    df_train = df_train.sample(min(SAMPLE_SIZE, len(df_train)), random_state=SPLIT_SEED)

# split 80, 10, 10 by position
train_end = int(.8 * len(df_train))
validation_end = int(.9 * len(df_train))

# drop=True discards the old index instead of inserting it as an extra
# 'index' / 'level_0' column. df_train is reassigned last because the other
# two slices are taken from it.
df_validation = df_train.iloc[train_end:validation_end].reset_index(drop=True)
df_test = df_train.iloc[validation_end:].reset_index(drop=True)
df_train = df_train.iloc[:train_end].reset_index(drop=True)

# build labels
df_train_labels = build_labels( df_train )
df_validation_labels = build_labels( df_validation )
df_test_labels = build_labels( df_test )

print("Trainning sample size:  {}".format(len(df_train)))
print("Trainning labels size:  {}".format(len(df_train_labels)))
print("")
print("Validation sample size: {}".format(len(df_validation)))
print("Validation labels size: {}".format(len(df_validation_labels)))
print("")
print("Test sample size:       {}".format(len(df_test)))
print("Test labels size:       {}".format(len(df_test_labels)))


In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that reads image batches from disk on demand.

    Args:
        list_ids: pandas Series of sample IDs.
        labels: 2-D label array; row ``i`` belongs to the ``i``-th entry of
            ``list_ids``.
        batch_size: Number of samples per batch.
        image_size: Stored on the instance; not read by the generator itself.
        channel_size: Stored on the instance; not read by the generator itself.
    """

    def __init__(self, list_ids, labels, batch_size, image_size, channel_size):
        # Newer Keras versions expect subclasses to call the base initialiser.
        super().__init__()
        self.list_ids = list_ids
        self.labels = labels
        self.batch_size = batch_size
        self.image_size = image_size
        self.channel_size = channel_size
        self.indexes = list(list_ids.index.values)

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return len(self.list_ids) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` pair.

        ``X`` stacks the arrays returned by ``read_image_data`` for the
        batch's IDs; ``y`` holds the matching rows of ``labels``.
        """
        start = index * self.batch_size
        stop = start + self.batch_size

        # Select IDs and labels by position. The previous implementation
        # searched the whole Series for each ID, which was linear per sample
        # and returned several label rows when an ID occurred more than once.
        batch_ids = self.list_ids.iloc[start:stop]
        X = np.array([read_image_data(sample_id) for sample_id in batch_ids])
        y = np.asarray(self.labels)[start:stop]

        return X, y


## Build Model

In [None]:

# ResNet50 backbone without its classification head. No pre-trained weights
# are loaded (weights=None); the input takes all image channels.
#weights = '../input/tf-keras-resnet/resnet50_notop.h5'
model_resnet = ResNet50(
    include_top=False,
    weights=None,
    input_tensor=Input(shape=(IMAGE_SIZE, IMAGE_SIZE, CHANNEL_SIZE)),
)
#model_resnet.summary()

In [None]:

# Use the output of the backbone's second-to-last layer as the feature map.
# NOTE(review): layers[-2] skips the backbone's final layer; confirm this is
# intended rather than model_resnet.output.
model_base = model_resnet.layers[-2].output
# Collapse the spatial dimensions by taking the maximum of each feature map.
connected_model = tf.keras.layers.GlobalMaxPooling2D()(model_base)
# One sigmoid unit per class, so every class gets its own 0..1 score.
connected_model = Dense(CLASS_SIZE, activation='sigmoid')(connected_model)


# Full model: backbone input through to the per-class scores.
model = Model(model_resnet.input, connected_model)


In [None]:
# Set parameters in pre-train model to False
#for layer in model_resnet.layers:
#    layer.trainable = False

In [None]:

# Multi-label setup: binary cross-entropy over the per-class sigmoid outputs.
model.compile(
    # ``lr`` is a deprecated alias that newer Keras releases reject.
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[
        # ``Accuracy`` counts exact equality between predictions and labels,
        # which raw sigmoid outputs almost never satisfy. ``BinaryAccuracy``
        # thresholds the predictions first. The name keeps the history key
        # unchanged.
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(),
        tf.keras.metrics.Precision(),
    ],
)

In [None]:
# Print the summary of the assembled model (backbone plus classification head).
model.summary()

In [None]:
# Batch generators for the training and validation splits; both share the
# same batch and image settings.
train_generator = DataGenerator(
    list_ids=df_train['ID'],
    labels=df_train_labels,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    channel_size=CHANNEL_SIZE,
)
validation_generator = DataGenerator(
    list_ids=df_validation['ID'],
    labels=df_validation_labels,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    channel_size=CHANNEL_SIZE,
)


In [None]:
# Record the elapsed time at the end of the preparation steps.
time_mark(MARK_PREP)

## Train Model

In [None]:


# Fit on the training generator and evaluate on the validation generator
# after every epoch; the returned History object is kept in ``history``.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=5,
)

In [None]:
# Record the elapsed time once training has finished.
time_mark(MARK_TRAIN)

## Test Predictions

Use the portion of the training data set aside for testing to see how the predictions hold up.

In [None]:
def clean_prediction(prediction):
    """Threshold predicted scores into 0/1 labels.

    Args:
        prediction: Array-like of scores, e.g. of shape (batch, classes).

    Returns:
        Integer ndarray of the same shape holding 1 where the score is
        greater than or equal to 0.50 and 0 elsewhere. The input is not
        modified.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy, so the builtin is used directly.
    return (np.asarray(prediction) >= 0.50).astype(int)

# Exact-match evaluation: a sample counts as correct only when every class
# of its thresholded prediction equals the label row.
correct = 0
total = 0

# Labels are addressed by row position, so the loop counts positions rather
# than relying on the frame's index labels.
for index, (_, test) in enumerate(df_test.iterrows()):
    channels = read_image_data(test['ID'])
    # Add the leading batch axis. CHANNEL_SIZE replaces a hard-coded 4, and
    # verbose=0 stops Keras progress output from clobbering the status line.
    prediction = model.predict(
        channels.reshape(-1, IMAGE_SIZE, IMAGE_SIZE, CHANNEL_SIZE),
        verbose=0,
    )
    prediction = clean_prediction(prediction)

    # validate prediction
    is_correct = np.all(df_test_labels[index] == prediction[0])

    # accumulate
    if is_correct:
        correct = correct + 1
    total = total + 1

    print ("\rPrecition details: (total={},correct={},percentage_correct={}%)    ".format(total, correct, int(correct / total * 100)), end="")

# Guard against an empty test split, which would otherwise divide by zero.
percentage_correct = int(correct / total * 100) if total else 0

print ("")
print ("")
print ("Total predictions:   {}".format(total))
print ("Correct predictions: {}".format(correct))
print ("Precentage correct:  {}%".format(percentage_correct))

In [None]:
# Record the elapsed time once the test predictions have finished.
time_mark(MARK_TEST)