In [None]:
from pathlib import Path
import cv2
import matplotlib.pyplot as plt

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import tensorflow as tf
from statistics import mean
from matplotlib.colors import ListedColormap

In [None]:
# Load the training annotations; every later cell filters this table by image id.
train_data = pd.read_csv('../input/sartorius-cell-instance-segmentation/train.csv')
# Sample submission file; its 'id' column is used later to list the test images.
sample_submission=pd.read_csv('../input/sartorius-cell-instance-segmentation/sample_submission.csv')

In [None]:
# https://www.kaggle.com/c/sartorius-cell-instance-segmentation/discussion/291627
def rle_decode(mask_rle, shape=(520, 704, 1), color=1):
    """Decode a run-length-encoded mask string into an image array.

    Args:
        mask_rle: Space-separated ``start length start length ...`` string.
            Starts are 1-based indices into the row-major flattened image.
        shape: Output shape, ``(height, width)`` or ``(height, width, channels)``.
        color: Value written into every masked pixel. Either a scalar or a
            sequence with one entry per channel (e.g. an RGB triple when
            ``shape[2] == 3``).

    Returns:
        Array of the requested ``shape`` with masked pixels set to ``color``
        and everything else 0. The dtype is ``uint8`` for an integer/bool
        ``color`` (the default) and ``float32`` otherwise.
    """
    tokens = mask_rle.split()
    starts = np.asarray(tokens[0::2], dtype=int) - 1  # RLE starts are 1-based
    lengths = np.asarray(tokens[1::2], dtype=int)
    ends = starts + lengths

    color = np.asarray(color)
    dtype = np.uint8 if color.dtype.kind in "biu" else np.float32
    # Flatten only the two spatial axes and keep the channel axis (if any), so a
    # per-channel color broadcasts into each run and the final reshape is valid.
    flat = np.zeros((shape[0] * shape[1],) + tuple(shape[2:]), dtype=dtype)
    for lo, hi in zip(starts, ends):
        flat[lo:hi] = color
    return flat.reshape(shape)

def rle_encode(img):
    """Run-length encode a binary mask.

    The mask is flattened in row-major order and encoded as a space-separated
    ``start length start length ...`` string with 1-based starts. An all-zero
    mask yields an empty string.
    """
    # Zero sentinels on both ends make every run start and end show up as a
    # value change between neighbouring elements.
    padded = np.concatenate([[0], img.flatten(), [0]])
    boundaries = np.where(padded[:-1] != padded[1:])[0] + 1
    # Even slots hold run starts, odd slots hold run ends; turn ends into lengths.
    boundaries[1::2] = boundaries[1::2] - boundaries[::2]
    return ' '.join(map(str, boundaries))

In [None]:
# Each annotation row encodes a single mask region: decode the first
# annotation of one image and display it on its own.
mask = train_data[train_data["id"] == "0030fd0e6378"]["annotation"].tolist()[0]
img = rle_decode(mask)
plt.imshow(img, cmap="gray");

In [None]:
def plot_masks(image_id, colors=False):
    """Show a training image, the image with its masks overlaid, and the masks alone.

    Args:
        image_id: Value of the ``id`` column in ``train_data``; also the PNG
            file stem inside the competition's ``train`` folder.
        colors: When True, every annotation is drawn in a random RGB color;
            otherwise all annotations are merged into one single-channel mask.

    NOTE(review): the ``colors=True`` path passes a ``color`` keyword and a
    3-channel shape to ``rle_decode`` -- confirm that ``rle_decode`` supports both.
    """
    annotations = train_data[train_data["id"] == image_id]["annotation"].tolist()

    channels = 3 if colors else 1
    mask = np.zeros((520, 704, channels))
    for rle in annotations:
        if colors:
            mask += rle_decode(rle, shape=(520, 704, 3), color=np.random.rand(3))
        else:
            mask += rle_decode(rle, shape=(520, 704, 1))
    # Overlapping annotations add up; bring the sum back into [0, 1].
    mask = mask.clip(0, 1)

    bgr = cv2.imread(f"../input/sartorius-cell-instance-segmentation/train/{image_id}.png")
    image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # (title, layers drawn bottom-to-top as (data, imshow kwargs)) for each panel.
    panels = [
        ('Input image', [(image, {})]),
        ('Input image with mask', [(image, {}), (mask, {'alpha': 0.1})]),
        ('Only mask', [(mask, {})]),
    ]

    plt.figure(figsize=(18, 6))
    for position, (title, layers) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        for data, imshow_kwargs in layers:
            plt.imshow(data, **imshow_kwargs)
        plt.title(title)
        plt.axis("off")

    plt.show()

In [None]:
sample_ids = ['0030fd0e6378', '0140b3c8f445', '01ae5a43a2ab']

# Show image, overlay and mask for a few hand-picked training images.
# plot_masks loads the image itself, so nothing is read from disk here.
for sample_id in sample_ids:
    # Cell type of the first annotation row belonging to this image.
    celltype = train_data[train_data['id'] == sample_id]['cell_type'].tolist()[0]
    print('ID:', sample_id, ', CellType:', celltype)
    plot_masks(sample_id, colors=False)

In [None]:
# Reference: https://www.kaggle.com/keegil/keras-u-net-starter-lb-0-277
IMG_HEIGHT = 520
IMG_WIDTH = 704
IMG_CHANNELS = 1
TRAIN_PATH = '../input/sartorius-cell-instance-segmentation/train/'

train_ids = train_data['id'].unique().tolist()
test_ids = sample_submission['id'].unique().tolist()

# Get and resize train images and masks
X_train = np.zeros((train_data['id'].nunique(), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)
Y_train = np.zeros((train_data['id'].nunique(), IMG_HEIGHT, IMG_WIDTH, 1), dtype=np.bool)

In [None]:
from tqdm import tqdm
for n, id_ in tqdm(enumerate(train_ids), total=len(train_ids)):
    path = TRAIN_PATH + id_
    img = cv2.imread(path + '.png')[:,:]
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32) -125
    img = np.expand_dims(img, axis = 2)
    X_train[n] = img
    
    labels = train_data[train_data["id"]
                        == id_]["annotation"].tolist()
    mask = np.zeros((520, 704, 1))
    for label in labels:
        mask += rle_decode(label, shape=(520, 704, 1))
    mask = mask.clip(0, 1)

    Y_train[n] = mask
print("Done")

In [None]:
# Get and resize test images
test_images_id = []
X_test = np.zeros((sample_submission['id'].nunique(), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)
for n, id_ in tqdm(enumerate(test_ids), total=len(test_ids)):
    path = TRAIN_PATH.replace('train', 'test') + id_
    img = cv2.imread(path + '.png')[:,:]
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32) -125
    img = np.expand_dims(img, axis = 2)
    X_test[n] = img
    test_images_id.append(id_)
print("Done")

In [None]:
# Sanity check: array shapes of training inputs, training masks and test inputs.
print(X_train.shape,Y_train.shape,X_test.shape)

In [None]:
# Inspect one training sample: show its image and mask, then print value ranges.
sample_id_num = 40
sample_image = X_train[sample_id_num][:, :, 0]
sample_mask = Y_train[sample_id_num][:, :, 0]

plt.imshow(sample_image, cmap='gray')
plt.show()
plt.imshow(sample_mask)
plt.show()

print('Input image:', 'Min:', sample_image.min(), '; Max:', sample_image.max(), '; Mean:', sample_image.mean())
print('Mask:', 'Min:', sample_mask.min(), '; Max:', sample_mask.max(), '; Mean:', sample_mask.mean())

In [None]:
#dice_coefficient
def dice_coefficient(y_true, y_pred):
    """Dice overlap ``2 * sum(t * p) / (sum(t + p) + eps)`` over all elements.

    ``eps`` is the Keras backend epsilon and keeps the ratio finite when both
    inputs sum to zero.

    NOTE(review): the inputs are multiplied and added as-is, so they presumably
    need compatible dtypes -- confirm for callers passing tensors.
    """
    eps = tf.keras.backend.epsilon()
    overlap = tf.reduce_sum(y_true * y_pred)
    total = tf.reduce_sum(y_true + y_pred)
    return (2 * overlap) / (total + eps)

In [None]:
"""from torch import nn
NN = nn.Sequential(
    nn.Conv2d(1, 20, kernel_size=5, padding="same"),
    nn.BatchNorm2d(20),
    nn.ReLU(),
    nn.Conv2d(20, 10, kernel_size=1),

    nn.Conv2d(10, 10, kernel_size=5, padding="same"),
    nn.BatchNorm2d(10),
    nn.ReLU(),
    nn.Conv2d(10, 1, kernel_size=1),
)
from torchsummary import summary
summary(model_to_transfer, input_size=[IMG_WIDTH,IMG_HEIGHT,IMG_CHANNELS])
"""
#input_var.shape[-3:]

In [None]:
import keras
from keras.models import Model, load_model
from keras import layers

# Small fully-convolutional network: two 5x5 conv + batch-norm stages, each
# followed by a 1x1 convolution. 'same' padding and stride 1 keep the spatial
# size, so the output has one value per input pixel.
model = keras.Sequential([
    # Convolutional layer 1
    # Keras expects input_shape as (height, width, channels), matching the
    # (N, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS) layout of X_train / X_test.
    keras.layers.Conv2D(filters=20, kernel_size=5, strides=1,
                  padding='same', input_shape=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS),
                  activation='relu'),
    keras.layers.BatchNormalization(),

    # Convolutional layer 2
    keras.layers.Conv2D(filters=10, kernel_size=1),

    # Convolutional layer 3
    keras.layers.Conv2D(filters=10, kernel_size=5, strides=1,
                  padding='same', activation='relu'),
    keras.layers.BatchNormalization(),

    # Convolutional layer 4: 1x1 projection to a single channel with no
    # activation, i.e. the model outputs raw scores (logits).
    keras.layers.Conv2D(filters=1, kernel_size=1),
])

In [None]:
from tensorflow.keras.losses import BinaryCrossentropy
# from_logits=True: the loss is given raw scores (the model's last layer has
# no activation), not probabilities.
loss = BinaryCrossentropy(from_logits=True)

In [None]:
# Compile with the Adam optimizer and the binary cross-entropy loss defined above.
model.compile(optimizer='adam', loss=loss)
#model.summary()

In [None]:
# Fit model
from keras.callbacks import EarlyStopping

n_epochs = 50
batch_size = 32
# Early stopping with a patience of 20 epochs.
earlystopper = EarlyStopping(patience=20, verbose=1)

# 15% of the training arrays are held out for validation.
results = model.fit(
    X_train,
    Y_train,
    validation_split=0.15,
    batch_size=batch_size,
    epochs=n_epochs,
    callbacks=[earlystopper],
)
print("Done!")

In [None]:
# Training vs. validation loss per epoch, in the order named in the legend.
plt.figure(figsize=(14, 4))
for history_key in ('loss', 'val_loss'):
    plt.plot(results.history[history_key])
plt.title('model loss')
plt.ylabel('Loss')
plt.xlabel('epoch')
plt.legend(['loss', 'val_loss'], loc='upper right')
plt.show()

In [None]:
# Shapes of the training inputs and masks.
print(X_train.shape)
print(Y_train.shape)

In [None]:
# Model outputs for the training images (same data the model was fitted on).
preds_train = model.predict(X_train, verbose=1)

In [None]:
# Display the shape of the prediction array.
preds_train.shape

In [None]:
# Threshold predictions.
# The model outputs logits (its last layer has no activation and the loss uses
# from_logits=True), so the p = 0.5 decision boundary is at logit 0, not 0.5.
preds_train_t = (preds_train > 0.0).astype(np.uint8)

In [None]:
# Thresholded prediction for the first training image.
plt.imshow(preds_train_t[0], cmap="gray");

In [None]:
# Vectorised with NumPy end to end; no per-pixel Python objects are created.

def get_threshold(Y, pred):
    """Find the score cut-off that maximises IoU against a ground-truth mask.

    Pixels are ranked by descending score. For every k, the IoU of "the k
    highest-scoring pixels are positive" versus ``Y`` is computed, and the
    score of the k-th pixel at the best k is returned as the threshold.

    Args:
        Y: Ground-truth mask (bool or 0/1), any shape.
        pred: Per-pixel scores with the same number of elements as ``Y``.

    Returns:
        ``(best_threshold, best_IoU)``.

    Raises:
        ValueError: If the inputs are empty (raised by ``argmax``).
    """
    scores = np.asarray(pred).ravel()
    truth = np.asarray(Y).ravel()

    order = np.argsort(scores)[::-1]
    truth_sorted = truth[order]
    # true_pos[k-1] = ground-truth positives among the k highest-scoring pixels.
    true_pos = np.cumsum(truth_sorted)
    predicted_pos = np.arange(1, len(truth_sorted) + 1)
    # union = predicted positives + actual positives - intersection
    IoU = true_pos / (predicted_pos + np.sum(truth_sorted) - true_pos)

    best_IoU_idx = IoU.argmax()
    best_threshold = scores[order[best_IoU_idx]]
    best_IoU = IoU[best_IoU_idx]

    return best_threshold, best_IoU

In [None]:
# Shapes of training inputs, model outputs and masks.
print(X_train.shape)
print(preds_train.shape)
print(Y_train.shape)

In [None]:
# Best threshold and IoU for the first training image only.
get_threshold(Y_train[0], preds_train[0])

In [None]:
# Best threshold / IoU pair for every training image.
per_image_results = [
    get_threshold(Y, P)
    for Y, P in tqdm(zip(Y_train, preds_train), total=Y_train.shape[0])
]
img_thresholds = [threshold for threshold, _ in per_image_results]   # one for each image
img_IoUs = [iou for _, iou in per_image_results]

In [None]:
# Use the mean of the per-image optimal thresholds as one global threshold;
# the standard deviation shows how much the per-image optimum varies.
best_threshold = np.mean(img_thresholds)
best_threshold_spread = np.std(img_thresholds)
avg_IoU = mean(img_IoUs)

print(f"Best threshold: {best_threshold:.3g} (+-{best_threshold_spread:.3g}), Avg. Train IoU: {avg_IoU:.3f}")

In [None]:
# Dice overlap between the training masks and the raw model outputs.
# NOTE(review): preds_train are un-thresholded outputs of a model trained with
# from_logits=True, so this may not be a conventional Dice score -- confirm intent.
dice_coefficient(Y_train, preds_train)

In [None]:
# Binary training predictions using the global threshold found above.
pred_Y = (preds_train >= best_threshold)
    
def plot(img_Y, img_pred):
    """Draw a colour-coded comparison of a ground-truth mask and a prediction.

    Every pixel is assigned one of four classes:
        0 (black)  - both 0
        1 (gray)   - predicted 1, truth 0 (false positive)
        2 (orange) - predicted 0, truth 1 (false negative)
        3 (green)  - both 1 (correct)

    Args:
        img_Y: Ground-truth mask with values 0/1 (or bool).
        img_pred: Predicted mask with values 0/1 (or bool), same shape.
    """
    output = np.zeros(np.shape(img_Y), dtype=np.uint8)
    output[(img_Y == 0) & (img_pred == 1)] = 1
    output[(img_Y == 1) & (img_pred == 0)] = 2
    output[(img_Y == 1) & (img_pred == 1)] = 3

    plt.figure(figsize=(10,10))
    # Pin the colour range to the four class codes. With autoscaling, an image
    # lacking one of the classes would be stretched onto the wrong colours.
    plt.imshow(output, cmap=ListedColormap(['black', 'gray', 'orange', 'green']),
               vmin=0, vmax=3)
    plt.xticks([])
    plt.yticks([]);

In [None]:
# Visualise the prediction errors for the first N training images.
N = 5
for i in range(N):
    img_Y = Y_train[i]
    img_pred = pred_Y[i]
    
    plot(img_Y, img_pred)
    plt.show()

# Colour legend for the figures above:
# green: correct prediction
# gray: false positive (too much)
# orange: false negative (missed)

In [None]:
# Predict on the test images and binarise with the threshold tuned on training data.
preds_test = model.predict(X_test, verbose=1)
preds_test_t = (preds_test >= best_threshold).astype(np.uint8)

In [None]:
# Shape of one binarised test prediction.
preds_test_t[1].shape

In [None]:
# Test samples: show a randomly chosen test image next to its predicted mask.
from random import randint
ix = randint(0, len(preds_test_t)-1)
print(ix)
plt.imshow(X_test[ix])
plt.show()
# squeeze drops the trailing single-channel axis before plotting
plt.imshow(np.squeeze(preds_test_t[ix]))
plt.show()

In [None]:
# Shapes of the first three binarised test predictions.
for test_index in range(3):
    print(preds_test_t[test_index].shape)

In [None]:
def check_overlap(msk):
    """Return True if any pixel is set in more than one slice of the last axis.

    Args:
        msk: Array whose last axis enumerates masks; non-zero means "set".

    Returns:
        NumPy boolean: True when at least one position has two or more masks set.
    """
    # Builtin bool instead of np.bool: that alias was removed in NumPy 1.24.
    binary = msk.astype(bool).astype(np.uint8)
    return np.any(np.sum(binary, axis=-1) > 1)

In [None]:
# Run the overlap check on every binarised test prediction.
for test_mask in preds_test_t:
    print(check_overlap(test_mask))

In [None]:
# Split a binary mask into one mask per connected component (one per predicted
# instance) for the submission.
# seen on https://www.kaggle.com/c/sartorius-cell-instance-segmentation/discussion/288376
def post_process(mask, min_size=80):
    """Split a binary mask into per-component masks, dropping small components.

    Args:
        mask: Binary mask; it is cast to ``uint8`` before labelling.
        min_size: Components with at most this many pixels are discarded.

    Returns:
        List of ``float32`` arrays (1 inside the component, 0 elsewhere), each
        with the shape of the label image returned by OpenCV.
    """
    num_component, component = cv2.connectedComponents(mask.astype(np.uint8))
    predictions = []
    # Labels start at 1 here; label 0 is skipped.
    for c in range(1, num_component):
        p = (component == c)
        if p.sum() > min_size:
            # Take the output shape from the label image rather than a
            # hard-coded (520, 704), so other image sizes work too.
            predictions.append(p.astype(np.float32))
    return predictions

In [None]:
# Try the component splitting on a ground-truth mask: first show the full mask.
plt.imshow(Y_train[4], cmap="gray");

In [None]:
# Label the connected components of that mask and display the count returned by OpenCV.
num_component, component = cv2.connectedComponents(Y_train[4].astype(np.uint8))
num_component

In [None]:
# Label image: each component is drawn with its own gray level.
plt.imshow(component, cmap="gray");

In [None]:
# Isolate the pixels carrying label 1 and show them alone.
compenent_1 = (component == 1)
plt.imshow(compenent_1, cmap="gray");

In [None]:
# Same splitting via post_process; display the shape of the first kept component.
final = post_process(Y_train[4])
final[0].shape

In [None]:
# First component mask returned by post_process.
plt.imshow(final[0], cmap="gray");

In [None]:
# Old submission format: one RLE string per test image (whole mask, not split
# into instances). Displays the length of the first encoded string.
predicted2 = [rle_encode(test_mask2) for test_mask2 in preds_test_t]
len(predicted2[0])

In [None]:
def remove_isolated_points_from_rle(strin):
    """Drop every run of length 1 from a run-length-encoded mask string.

    Args:
        strin: Space-separated ``start length start length ...`` string. It
            may be empty (a mask with no foreground).

    Returns:
        The RLE string without its single-pixel runs; empty if nothing remains.

    Raises:
        ValueError: If the string holds an odd number of tokens.
    """
    # split() without an argument returns [] for an empty string, whereas
    # split(" ") returns [''] and would make the pairing below fail.
    tokens = strin.split()
    if len(tokens) % 2:
        raise ValueError("RLE string must contain start/length pairs")

    kept = []
    for start, length in zip(tokens[0::2], tokens[1::2]):
        if length != "1":
            kept.extend((start, length))
    return ' '.join(kept)

In [None]:
# Remove single-pixel runs from every encoded mask.
predicted_filt = [remove_isolated_points_from_rle(s) for s in predicted2]

In [None]:
# New version: split every test mask into its components and remember, for
# each component, the id of the image it came from.
predicted_nucleus = []
test_nucleus_image_id = []

for source_image_id, binary_mask in zip(test_images_id, preds_test_t):
    for nucl in post_process(binary_mask):
        predicted_nucleus.append(nucl)
        test_nucleus_image_id.append(source_image_id)

In [None]:
# First per-component mask extracted from the test predictions.
plt.imshow(predicted_nucleus[0], cmap="gray");

In [None]:
# Encode every per-component mask, then drop single-pixel runs; the first
# string is printed before and after filtering for comparison.
predicted2 = [rle_encode(test_mask2) for test_mask2 in predicted_nucleus]
print(predicted2[0])
predicted_filt = [remove_isolated_points_from_rle(s) for s in predicted2]
print(predicted_filt[0])

In [None]:
# One submission row per predicted component: source image id plus its
# filtered RLE string.
submit = pd.DataFrame({'id': test_nucleus_image_id, 'predicted': predicted_filt})

In [None]:
# Size and first rows of the submission table.
print(submit.shape)
submit.head()

In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv('submission.csv', index=False)