In [None]:
import numpy as np 
import pandas as pd

import os
# Show what is available under ../input so the paths used below can be checked in the cell output.
print("Input Directory:")
print(os.listdir("../input"))

import warnings
# Install a global "ignore" filter: warnings raised from here on (including by the imports
# that follow) are suppressed.
warnings.filterwarnings("ignore")

import sys
import matplotlib.pyplot as plt
import skimage.io
from skimage.transform import resize
from imgaug import augmenters as iaa
from tqdm import tqdm


'''--------------- Load Training Data --------------'''

# Label table; the columns used below are 'Id' and 'Target'.
data = pd.read_csv('../input/train.csv')
print("\nFew samples from train.csv:")
print(data[:3])

path_to_train = '../input/train/'
print("\nPath to training images:" + path_to_train)

# One record per row of train.csv:
#   'path'   - image path prefix (directory + Id; the channel suffix is added when loading)
#   'labels' - integer class indices parsed from the space-separated 'Target' string
#   'id'     - 'orig' marks an un-duplicated, original sample
train_dataset_info = [
    {
        'path': os.path.join(path_to_train, image_id),
        'labels': np.array([int(token) for token in target_tokens]),
        'id': 'orig',
    }
    for image_id, target_tokens in zip(data['Id'], data['Target'].str.split(' '))
]

print("\nFew samples from train dataset info:")
print(train_dataset_info[:3])

print("\nNo. of training images:")
nTrainImages = len(train_dataset_info)
print(nTrainImages)

In [None]:
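# Target distribution in the training data.
# NOTE: the code below is disabled by wrapping it in a string literal. Before re-enabling it,
# replace the removed NumPy alias `np.int` with `int`, and apply `fill_targets` to a copy of
# `data`: it overwrites the 'Target' column with arrays, which breaks the later
# `data['Target'].str.split(' ')` call.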
'''
label_names = {
    0:  "Nucleoplasm",  
    1:  "Nuclear membrane",   
    2:  "Nucleoli",   
    3:  "Nucleoli fibrillar center",   
    4:  "Nuclear speckles",
    5:  "Nuclear bodies",   
    6:  "Endoplasmic reticulum",   
    7:  "Golgi apparatus",   
    8:  "Peroxisomes",   
    9:  "Endosomes",   
    10:  "Lysosomes",   
    11:  "Intermediate filaments",   
    12:  "Actin filaments",   
    13:  "Focal adhesion sites",   
    14:  "Microtubules",   
    15:  "Microtubule ends",   
    16:  "Cytokinetic bridge",   
    17:  "Mitotic spindle",   
    18:  "Microtubule organizing center",   
    19:  "Centrosome",   
    20:  "Lipid droplets",   
    21:  "Plasma membrane",   
    22:  "Cell junctions",   
    23:  "Mitochondria",   
    24:  "Aggresome",   
    25:  "Cytosol",   
    26:  "Cytoplasmic bodies",   
    27:  "Rods & rings"
}

for key in label_names.keys():
    data[label_names[key]] = 0

def fill_targets(row):
    row.Target = np.array(row.Target.split(" ")).astype(np.int)
    for num in row.Target:
        name = label_names[int(num)]
        row.loc[name] = 1
    return row

data = data.apply(fill_targets, axis=1) 

#plot
import seaborn as sns
sns.set()

target_counts = data.drop(["Id", "Target"],axis=1).sum(axis=0).sort_values(ascending=False)
plt.figure(figsize=(15,15))
sns.barplot(y=target_counts.index.values, x=target_counts.values, order=target_counts.index)
'''

In [None]:
'''--------------- Generate Dataset for Augmentation --------------'''
from random import randint

# Class labels (kept as strings, matching the tokens of the split 'Target' column) that
# qualify a sample for duplication. The tiers are nested: tier 3 is the smallest group,
# tier 2 adds 18/6/14/11, and tier 1 adds 1/19/3/4 on top of that.
_AUG_LABELS_TIER3 = frozenset({"27", "15", "10", "9", "8", "20", "17", "24", "26",
                               "16", "13", "12", "22"})
_AUG_LABELS_TIER2 = _AUG_LABELS_TIER3 | {"18", "6", "14", "11"}
_AUG_LABELS_TIER1 = _AUG_LABELS_TIER2 | {"1", "19", "3", "4"}
# Samples carrying either of these labels are only ever duplicated once (into set0).
_AUG_EXCLUDED_LABELS = frozenset({"0", "25"})


def _make_aug_record(name, labels, set_id):
    """Build one dataset record tagged with the augmentation set it belongs to."""
    return {
        'path': os.path.join(path_to_train, name),
        'labels': np.array([int(label) for label in labels]),
        'id': set_id}


train_dataset_info_aug0 = []
train_dataset_info_aug1 = []
train_dataset_info_aug2 = []
train_dataset_info_aug3 = []

# Resulting number of copies per sample:
#   tier-3 label, no 0/25        -> set0 + set1 + set2 + set3
#   tier-2 only label, no 0/25   -> set0 + set1 + set2
#   tier-1 only label, no 0/25   -> set0 + set1
#   any tier-1 label with 0/25   -> set0
#   no tier-1 label at all       -> not duplicated
# NOTE(review): the pool is built from every row of `data`, i.e. before the train/validation
# split done in the training cell, so augmented copies of validation images can reach the
# training batches. Confirm whether that is intended.
for name, labels in zip(data['Id'], data['Target'].str.split(' ')):
    label_set = set(labels)
    is_excluded = not label_set.isdisjoint(_AUG_EXCLUDED_LABELS)

    if not label_set.isdisjoint(_AUG_LABELS_TIER1):
        train_dataset_info_aug0.append(_make_aug_record(name, labels, "set0"))
        if not is_excluded:
            train_dataset_info_aug1.append(_make_aug_record(name, labels, "set1"))
    if not is_excluded and not label_set.isdisjoint(_AUG_LABELS_TIER2):
        train_dataset_info_aug2.append(_make_aug_record(name, labels, "set2"))
    if not is_excluded and not label_set.isdisjoint(_AUG_LABELS_TIER3):
        train_dataset_info_aug3.append(_make_aug_record(name, labels, "set3"))

# Duplicated data for augmentation: all of set0, then set1, set2, set3, shuffled afterwards.
train_dataset_info_aug = (
    train_dataset_info_aug0
    + train_dataset_info_aug1
    + train_dataset_info_aug2
    + train_dataset_info_aug3)

np.random.shuffle(train_dataset_info_aug)

nSamplesOrig = len(train_dataset_info)
nSamplesAug = len(train_dataset_info_aug)

# Object array of records, so it can be indexed with index arrays when splitting later.
train_dataset_info = np.array(train_dataset_info)
print("\nOrig train dataset samples:")
print(nSamplesOrig)
print("Duplicated train dataset samples(for Aug):")
print(nSamplesAug)

# Pad the augmentation pool with randomly drawn original samples until it is at least as
# large as the original dataset. Padded records keep the id 'orig'.
if nSamplesAug < nSamplesOrig:
    for _ in range(nSamplesOrig - nSamplesAug):
        randIdx = randint(0, nSamplesOrig - 1)
        train_dataset_info_aug.append(train_dataset_info[randIdx])

    nSamplesOrig = len(train_dataset_info)
    nSamplesAug = len(train_dataset_info_aug)

    print("Augmentation set extended...")
    print("Orig train dataset samples:")
    print(nSamplesOrig)
    print("Duplicated train dataset samples(for Aug):")
    print(nSamplesAug)

IMG_SIZE = 512
CHANNELS = 3

In [None]:
''' Data generator '''
from sys import getsizeof
class data_generator:
    """Namespace for the batch generator and its image helpers.

    Every member is a static method; the class is never instantiated and is always
    used as ``data_generator.<method>(...)``.
    """

    # (rotation in degrees, per-pixel multiplier range) for each augmentation set id.
    _AUG_SETTINGS = {
        "set0": (180, (0.95, 1.05)),
        "set1": (270, (0.9, 1.1)),
        "set2": (90, (0.8, 1.2)),
        "set3": (180, (0.7, 1.3)),
    }
    # Used for any other id (e.g. 'orig' records padded into the augmentation pool).
    _DEFAULT_AUG_SETTINGS = (90, (0.8, 1.2))

    # Note: If using augument = True, batch size should be an even number
    @staticmethod
    def create_train(dataset_info, batch_size, shape, augument=True):
        """Yield ``(batch_images, batch_labels)`` tuples forever.

        Parameters
        ----------
        dataset_info : sequence of dict
            Records with a 'path' and a 'labels' (integer index array) entry.
        batch_size : int
            Total number of rows per batch. With ``augument=True`` it is halved: the first
            half of each batch comes from ``dataset_info``, the second half from the
            module-level ``train_dataset_info_aug`` pool after augmentation.
        shape : tuple
            ``(height, width, channels)``; ``channels`` must equal ``CHANNELS``.
        augument : bool
            Whether to append the augmented half to every batch.

        Yields
        ------
        batch_images : float ndarray, ``(rows, shape[0], shape[1], shape[2])``
        batch_labels : float ndarray, ``(rows, 28)`` multi-hot targets

        Raises
        ------
        ValueError
            If ``dataset_info`` is empty, or if ``augument`` is set while the augmentation
            pool is empty (the augmented rows could not be filled).
        """
        assert shape[2] == CHANNELS
        n_samples = len(dataset_info)
        if n_samples == 0:
            raise ValueError("dataset_info must contain at least one sample")

        AUG_BATCH_SIZE = 0
        n_aug_samples = 0
        if augument:
            batch_size = int(batch_size / 2)
            AUG_BATCH_SIZE = batch_size
            n_aug_samples = len(train_dataset_info_aug)
            if n_aug_samples == 0:
                raise ValueError("train_dataset_info_aug is empty; cannot build augmented batches")

        start = 0
        # The augmentation pool is consumed from its end towards its start, one chunk per batch.
        rev_start = (n_aug_samples - AUG_BATCH_SIZE) % n_aug_samples if augument else 0
        while True:
            batch_images = np.empty((batch_size + AUG_BATCH_SIZE, shape[0], shape[1], shape[2]))
            batch_labels = np.zeros((batch_size + AUG_BATCH_SIZE, 28))

            # Modulo indexing lets the cursor wrap around instead of running off the end.
            indexes = np.arange(start, start + batch_size) % n_samples
            for i, idx in enumerate(indexes):
                sample = dataset_info[idx]
                batch_images[i] = data_generator.load_image(sample['path'], shape)
                batch_labels[i][sample['labels']] = 1
            start = (start + batch_size) % n_samples

            if augument:
                # Always fill the augmented rows: batch_images comes from np.empty, so a
                # skipped row would hold uninitialised memory paired with all-zero labels.
                rev_indexes = np.arange(rev_start, rev_start + AUG_BATCH_SIZE) % n_aug_samples
                for i, idx in enumerate(rev_indexes):
                    aug_sample = train_dataset_info_aug[idx]
                    image = data_generator.load_image(
                        aug_sample['path'], shape, bNormalize=False)
                    image = data_generator.augment(image, aug_sample['id'])
                    batch_images[batch_size + i] = image
                    batch_labels[batch_size + i][aug_sample['labels']] = 1
                rev_start = (rev_start - AUG_BATCH_SIZE) % n_aug_samples

            yield batch_images, batch_labels

    @staticmethod
    def load_image(path, shape, bNormalize=True):
        """Read the red, green and blue channel files of one sample into an (H, W, 3) array.

        ``path`` is the path prefix; '_red.png', '_green.png' and '_blue.png' are appended.
        The yellow channel file is not read.

        NOTE(review): ``shape`` and ``bNormalize`` are accepted but currently unused - the
        image is returned exactly as read (no resize, no scaling).
        """
        image_red_ch = skimage.io.imread(path + '_red.png')
        #image_yellow_ch = skimage.io.imread(path+'_yellow.png')
        image_green_ch = skimage.io.imread(path + '_green.png')
        image_blue_ch = skimage.io.imread(path + '_blue.png')

        image = np.stack((
            image_red_ch,    # red: microtubules
            image_green_ch,  # green: target protein
            image_blue_ch),  # blue: nucleus
            -1)
        # yellow (endoplasmic reticulum) is left out

        return image

    @staticmethod
    def augment(image, strId):
        """Rotate ``image`` and randomly scale its pixel values.

        The rotation angle and multiplier range depend on ``strId`` ('set0'..'set3');
        any other id uses the default setting. ``SomeOf(2, ...)`` over two augmenters
        selects both of them.
        """
        rotate_deg, multiply_range = data_generator._AUG_SETTINGS.get(
            strId, data_generator._DEFAULT_AUG_SETTINGS)
        # Only the pipeline that is actually needed is built.
        pipeline = iaa.Sequential([
            iaa.SomeOf(2, [
                iaa.Affine(rotate=rotate_deg),
                iaa.MultiplyElementwise(multiply_range)
            ])], random_order=True)
        return pipeline.augment_image(image)

In [None]:
# Create a train data generator and show a few training samples with their labels.
train_datagen = data_generator.create_train(
    train_dataset_info, 4, (IMG_SIZE, IMG_SIZE, CHANNELS), augument=False)

images, labels = next(train_datagen)

# The batch array is float-valued. matplotlib clips float RGB input to [0, 1], so a batch in
# the 0-255 range has to be rescaled for display, otherwise the preview is washed out.
display_scale = 255.0 if images.max() > 1.0 else 1.0

fig, ax = plt.subplots(1, 4, figsize=(25, 5))
for i in range(4):
    ax[i].imshow(images[i] / display_scale)
    # Indices of the classes set in this sample's multi-hot label row.
    print([class_idx for class_idx, flag in enumerate(labels[i]) if flag == 1.0])
print('min: {0}, max: {1}'.format(images.min(), images.max()))

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, load_model
from keras.layers import Input, Activation, Dropout, Flatten, Dense, Conv2D, SeparableConv2D, GlobalAveragePooling2D, MaxPooling2D, AveragePooling2D, BatchNormalization, concatenate
from keras.initializers import TruncatedNormal, lecun_normal
from keras.regularizers import l2
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.resnet50 import ResNet50
from keras.applications.inception_v3 import InceptionV3
from keras.applications.xception import Xception
from keras.callbacks import ModelCheckpoint
from keras.models import Model
from keras import metrics
from keras.optimizers import Adam, SGD
from keras import backend as K
import keras


def create_model(input_shape, n_out):
    """Build the classifier: BatchNormalization -> Xception -> global average pool -> sigmoid.

    Parameters
    ----------
    input_shape : tuple
        ``(height, width, channels)`` of the input images.
    n_out : int
        Number of output classes; each gets an independent sigmoid unit.

    Returns
    -------
    keras.models.Sequential
        The uncompiled model.
    """
    # The backbone takes its spatial size from `input_shape` (not from a global), so the
    # function also works for sizes other than IMG_SIZE. ImageNet weights need 3 channels.
    pretrain_model = Xception(
        include_top=False,
        weights='imagenet',
        input_shape=(input_shape[0], input_shape[1], 3))

    model = Sequential()
    model.add(BatchNormalization(input_shape=input_shape))
    # The output of the layer above must match the backbone's input shape.
    model.add(pretrain_model)
    model.add(GlobalAveragePooling2D(data_format=None))

    # Sigmoid instead of softmax: a sample can carry several labels at once
    # (https://www.dlology.com/blog/how-to-choose-last-layer-activation-and-loss-function/)
    model.add(Dense(n_out, activation='sigmoid',
                    activity_regularizer=keras.regularizers.l2(1e-5)))
    return model


In [None]:
# Reset the Keras backend session before building the model.
keras.backend.clear_session()

# Input is (IMG_SIZE, IMG_SIZE, CHANNELS); 28 output classes.
model = create_model(input_shape=(IMG_SIZE, IMG_SIZE, CHANNELS), n_out=28)
import tensorflow as tf
import keras.backend as K
from sklearn.metrics import f1_score
#Ref: https://en.wikipedia.org/wiki/F1_score
def f1_Macro(y_true, y_pred):
    """Soft per-class F1 score computed from predicted probabilities.

    The sums run over axis 0, so the result holds one F1 value per column of the inputs.
    ``y_pred`` is used as-is (not thresholded), which makes the counts "soft".
    The per-class vector is returned unchanged; presumably Keras reduces it to its mean
    when reporting the metric - verify for the installed Keras version.
    """
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    # K.epsilon() keeps every denominator non-zero.
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = (2 * p * r) / (p + r + K.epsilon())
    return f1

# binary_crossentropy because several labels can be active per sample
# (categorical_crossentropy would be the choice for a single-label target).
model.compile(
    optimizer=Adam(1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy', f1_Macro],
)

model.summary()

In [None]:
# With augmentation each batch holds batch_size/2 original and batch_size/2 augmented samples,
# so 198 epochs x 30 steps x 5 = 29700 original samples are drawn from the 30000-sample split.
total_epochs = 198
batch_size = 10
stepsPerEpoch = 30

# Save the model to disk whenever the monitored quantity improves.
checkpointer = ModelCheckpoint(
    '../working/XceptionSig.model',
    verbose=2,
    save_best_only=True)

# Shuffle the sample indices with a fixed seed, then split: the first 30000 shuffled indices
# form the training set, the remainder the validation set.
np.random.seed(2500)
indexes = np.arange(train_dataset_info.shape[0])
np.random.shuffle(indexes)
train_indexes, valid_indexes = indexes[:30000], indexes[30000:]

# Train and validation data generators (validation without augmentation).
train_generator = data_generator.create_train(
    train_dataset_info[train_indexes],
    batch_size,
    (IMG_SIZE, IMG_SIZE, CHANNELS),
    augument=True)
validation_generator = data_generator.create_train(
    train_dataset_info[valid_indexes],
    500,
    (IMG_SIZE, IMG_SIZE, CHANNELS),
    augument=False)

# Train the model. Validation uses a single fixed batch taken once from the validation generator.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=stepsPerEpoch,  # number of batches per epoch
    epochs=total_epochs,
    validation_data=next(validation_generator),
    verbose=1,
    callbacks=[checkpointer])

# Summarize the loss history.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
def _select_labels(score_predict):
    """Turn one vector of 28 class scores into the array of predicted class indices.

    Every class scoring >= 0.40 is selected. Depending on how many classes pass that
    threshold, runner-up classes are added with a lower bar:
      * exactly 2 pass -> add the 3rd-best class if its score is > 0.30
      * exactly 1 pass -> add the 2nd and/or 3rd-best class if its score is > 0.25
      * none pass      -> always take the best class, plus the 2nd and/or 3rd-best
                          class if its score is > 0.20
    """
    # Indices of the three highest scores, best first.
    top3 = score_predict.argsort()[-3:][::-1]

    selected = np.arange(28)[score_predict >= 0.40]
    n_confident = len(selected)

    if n_confident == 2:
        if score_predict[top3[2]] > 0.30:
            selected = np.append(selected, top3[2])
    elif n_confident == 1:
        for runner_up in top3[1:]:
            if score_predict[runner_up] > 0.25:
                selected = np.append(selected, runner_up)
    elif n_confident == 0:
        selected = np.append(selected, top3[0])
        for runner_up in top3[1:]:
            if score_predict[runner_up] > 0.20:
                selected = np.append(selected, runner_up)

    return selected


print("Started predicting test set....")
submit = pd.read_csv('../input/sample_submission.csv')

predicted = []
for name in tqdm(submit['Id']):
    path = os.path.join('../input/test/', name)
    image = data_generator.load_image(path, (IMG_SIZE, IMG_SIZE, CHANNELS))
    # Predict on a batch of one image and take its score vector.
    score_predict = model.predict(image[np.newaxis])[0]

    label_predict = _select_labels(score_predict)
    predicted.append(' '.join(str(l) for l in label_predict))

# Write the space-separated label strings into the submission file.
submit['Predicted'] = predicted
submit.to_csv('BN_Xception_GAP_Predictions.csv', index=False)

print("Done...")