In [None]:
import sys
# Make the EfficientNet-PyTorch sources from the attached input dataset importable
# by appending their directory to the module search path.
package_path = '../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master'
sys.path.append(package_path)

In [None]:
# Install pycocotools, pytorch_zoo and the HPA cell segmentation package from
# local paths under ../input (quiet mode); versions are whatever those paths hold.
!pip install -q "../input/pycocotools/pycocotools-2.0-cp37-cp37m-linux_x86_64.whl"
!pip install -q "../input/hpapytorchzoozip/pytorch_zoo-master"
!pip install -q "../input/hpacellsegmentatorraman/HPA-Cell-Segmentation"

In [None]:
import pandas as pd
import os

# Previously generated submission; also used later as a fallback result.
already_processed = pd.read_csv('../input/b7model/ensemble.csv')
processed_ids = set(already_processed.ID)
print(len(processed_ids))

# TEST is True when the test folder holds exactly four files per
# already-processed ID (presumably one file per colour channel).
_test_file_count = len(os.listdir('../input/hpa-single-cell-image-classification/test'))
TEST = _test_file_count == len(processed_ids) * 4
if TEST:
    print("In test mode")

# The skip-list is cleared again, so downstream code skips no image.
processed_ids = set()

In [None]:
import numpy as np
import hpacellseg.cellsegmentator as cellsegmentator
from hpacellseg.utils import label_cell
import time
import os
import sys
import cv2
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield consecutive slices of ``lst`` that hold at most ``n`` items each."""
    total = len(lst)
    for start in range(0, total, n):
        stop = start + n
        yield lst[start:stop]


def process_single_image(file_name, rescaled=False, resized=True):
    """Load the first channel of an image file.

    Args:
        file_name: Path of the image to read with OpenCV.
        rescaled: When true, divide the pixel values by 255.0.
        resized: When true, resize the channel to 512x512 before any rescaling.

    Returns:
        A 2-D array holding channel 0 of the decoded image.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``file_name``.
    """
    raw = cv2.imread(file_name)
    if raw is None:
        # cv2.imread reports a missing or unreadable file by returning None;
        # fail with the offending path instead of an opaque TypeError.
        raise FileNotFoundError(f'Could not read image: {file_name}')
    image = raw[:, :, 0]
    if resized:
        image = cv2.resize(image, (512, 512))
    if rescaled:
        image = image / 255.0
    return image

def process_folder(folder, model_folder, save_folder, prefix, num_in_chunk=24):
    """Segment every image in ``folder`` and save one labelled cell mask per image.

    Image stems are the part of each file name before the first underscore;
    stems listed in the notebook-level ``processed_ids`` set are skipped.
    Red, yellow and blue channels are loaded at 512x512 and scaled to [0, 1],
    segmented in chunks, and the resulting cell mask is resized back to the
    size of the original red image with nearest-neighbour interpolation.

    Args:
        folder: Directory holding ``<stem>_<colour>.png`` files.
        model_folder: Directory holding the nuclei and cell model weights.
        save_folder: Parent directory for the mask output folder.
        prefix: Prefix of the output folder name (``<prefix>_cell_mask``).
        num_in_chunk: Number of images segmented per batch.

    Returns:
        Path of the directory the compressed ``.npz`` masks were written to.
    """
    start = time.time()
    NUC_MODEL = f'{model_folder}/dpn_unet_nuclei_v1.pth'
    CELL_MODEL = f'{model_folder}/dpn_unet_cell_3ch_v1.pth'
    segmentator = cellsegmentator.CellSegmentator(NUC_MODEL, CELL_MODEL, device='cuda', multi_channel_model=True)

    # Sorted so that the processing order is deterministic between runs.
    file_name_all = sorted({x.split('_')[0] for x in os.listdir(folder)})
    file_name = [x for x in file_name_all if x not in processed_ids]
    cell_dir = f'{save_folder}/{prefix}_cell_mask'
    os.makedirs(cell_dir, exist_ok=True)
    for chunk in chunks(file_name, num_in_chunk):
        rcs = [process_single_image(f'{folder}/{stem}_red.png', True) for stem in chunk]
        bcs = [process_single_image(f'{folder}/{stem}_blue.png', True) for stem in chunk]
        ycs = [process_single_image(f'{folder}/{stem}_yellow.png', True) for stem in chunk]
        # Channel order handed to the cell model: red, yellow, blue.
        rgb_batch = list(map(lambda x: np.stack(x, axis=2), zip(rcs, ycs, bcs)))
        nuc_seg = segmentator.pred_nuclei(bcs)
        cell_seg = segmentator.pred_cells(rgb_batch, precombined=True)

        for i, stem in enumerate(chunk):
            # ndarray.shape is (rows, cols, ...) i.e. (height, width, ...).
            orig_height, orig_width = cv2.imread(f'{folder}/{stem}_red.png').shape[:2]
            _, cell_mask = label_cell(nuc_seg[i], cell_seg[i])
            # cv2.resize expects dsize as (width, height); passing the shape
            # in (height, width) order transposes the size of non-square masks.
            cell_mask = cv2.resize(cell_mask, (orig_width, orig_height), interpolation=cv2.INTER_NEAREST)
            np.savez_compressed(f'{cell_dir}/{stem}', cell_mask)
    end = time.time()
    print(f'Segmentation took {end - start}')
    return cell_dir

In [None]:
import base64
from pycocotools import _mask as coco_mask
import typing as t
import zlib


def encode_binary_mask(mask: np.ndarray) -> bytes:
    """Converts a binary mask into OID challenge encoding ascii text.

    The mask is RLE-encoded with the COCO mask API, zlib-compressed at the
    best compression level and finally base64-encoded.

    Args:
        mask: Boolean array that is 2-D once singleton axes are squeezed out.

    Returns:
        The base64 encoding as ``bytes`` (callers decode it to text).

    Raises:
        ValueError: If ``mask`` is not boolean or is not 2-D after squeezing.
    """

    # check input mask --
    # np.bool_ is the boolean scalar type; the bare ``np.bool`` alias no longer
    # exists in current NumPy releases.
    if mask.dtype != np.bool_:
        raise ValueError(
            "encode_binary_mask expects a binary mask, received dtype == %s" %
            mask.dtype)

    mask = np.squeeze(mask)
    if len(mask.shape) != 2:
        # The shape tuple must be wrapped, otherwise %-formatting unpacks it
        # and raises TypeError instead of the intended ValueError.
        raise ValueError(
            "encode_binary_mask expects a 2d mask, received shape == %s" %
            (mask.shape,))

    # convert input mask to expected COCO API input --
    mask_to_encode = mask.reshape(mask.shape[0], mask.shape[1], 1)
    mask_to_encode = mask_to_encode.astype(np.uint8)
    mask_to_encode = np.asfortranarray(mask_to_encode)

    # RLE encode mask --
    encoded_mask = coco_mask.encode(mask_to_encode)[0]["counts"]

    # compress and base64 encoding --
    binary_str = zlib.compress(encoded_mask, zlib.Z_BEST_COMPRESSION)
    base64_str = base64.b64encode(binary_str)
    return base64_str

def save_encoding(mask, name):
    """Encode every labelled region of ``mask`` into a lookup table.

    Labels ``1 .. max(mask)`` are each turned into a boolean mask and encoded
    with ``encode_binary_mask``.

    Args:
        mask: Integer label image; 0 is treated as background.
        name: Image identifier used as key prefix.

    Returns:
        Dict mapping ``"<name>_<label>"`` to the encoded mask. Labels whose
        encoding fails are reported and left out.
    """
    lookup_image = {}
    # An empty mask has no maximum; treat it as "no cells".
    maxnum = int(np.max(mask)) if np.size(mask) else 0
    for i in range(1, maxnum + 1):
        key = "_".join([name, str(i)])
        try:
            lookup_image[key] = encode_binary_mask(mask == i)
        except Exception as exc:
            # Best effort: one bad cell must not lose the rest of the image,
            # but the failure is reported rather than silently swallowed.
            print(f'Could not encode mask {key}: {exc!r}')
    return lookup_image

def save_all_mask(folder, save_name):
    """Encode every mask archive in ``folder`` and persist the combined lookup.

    Args:
        folder: Directory of ``.npz`` archives whose ``arr_0`` entry is a
            label mask.
        save_name: Path of the ``.npy`` file the combined dict is saved to.

    Returns:
        Dict mapping ``"<stem>_<label>"`` to the encoded cell mask, where
        ``stem`` is the file name up to its first dot.
    """
    mask_lookup = {}
    for file_name in os.listdir(folder):
        # Close the archive as soon as the array has been read.
        with np.load(os.path.join(folder, file_name)) as archive:
            mask = archive['arr_0']
        stem = file_name.split('.')[0]
        mask_lookup.update(save_encoding(mask, stem))
    with open(save_name, 'wb') as out_file:
        # A dict is stored as a pickled object array.
        np.save(out_file, mask_lookup)
    return mask_lookup

In [None]:
from math import ceil
import numpy as np
import pandas as pd
import tensorflow as tf


def _bytes_feature(value):
    """Wrap a byte string in a single-entry bytes ``tf.train.Feature``.

    Values of the tensor type produced by ``tf.constant`` are converted with
    ``.numpy()`` first.
    """
    constant_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, constant_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _array_feature(value):
    """Store the raw ``tobytes()`` dump of ``value`` as a bytes feature."""
    payload = tf.train.BytesList(value=[value.tobytes()])
    return tf.train.Feature(bytes_list=payload)

def _int64_feature(value):
    """Wrap a single integer in an int64 ``tf.train.Feature``."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

def createExample(im_set, name, size, label=None):
    """Build a ``tf.train.Example`` for one cropped cell.

    Args:
        im_set: Four arrays, read by position as red, blue, yellow, green.
        name: Cell identifier, stored UTF-8 encoded.
        size: ``(rows, cols)`` of the crops.
        label: Optional label string; the ``label`` feature is only written
            when it is not None.
    """
    r, c = size
    feature = {'name': _bytes_feature(name.encode('utf-8'))}
    if label is not None:
        feature['label'] = _bytes_feature(label.encode('utf-8'))
    feature['row'] = _int64_feature(r)
    feature['col'] = _int64_feature(c)
    # Positional channel layout of im_set.
    for position, colour in enumerate(('red', 'blue', 'yellow', 'green')):
        feature[colour] = _array_feature(im_set[position])
    return tf.train.Example(features=tf.train.Features(feature=feature))

def bbox2(img):
    """Return ``(rmin, rmax, cmin, cmax)`` of the non-zero area of ``img``.

    Both maxima are inclusive indices. Raises IndexError when ``img`` has no
    non-zero element.
    """
    occupied_rows = np.where(np.any(img, axis=1))[0]
    occupied_cols = np.where(np.any(img, axis=0))[0]
    first_row, last_row = occupied_rows[[0, -1]]
    first_col, last_col = occupied_cols[[0, -1]]
    return first_row, last_row, first_col, last_col

def make_square(h, w):
    """Return ``(top, bottom, left, right)`` padding that squares an h x w box.

    The shorter dimension is padded; an odd difference puts the extra unit on
    the bottom/right side.
    """
    gap = abs(h - w)
    leading = gap // 2
    trailing = gap - leading
    if h > w:
        return 0, 0, leading, trailing
    if h < w:
        return leading, trailing, 0, 0
    return 0, 0, 0, 0

def calculate_new_size(row, col, max_size=250):
    """Shrink ``(row, col)`` so that the longer side equals ``max_size``.

    Sizes that already fit are returned unchanged. Otherwise the aspect ratio
    is kept (truncating to int) and the shorter side is clamped to at least
    one pixel, so very elongated boxes never yield a zero-sized dimension.

    Args:
        row: Height of the box.
        col: Width of the box.
        max_size: Largest allowed side length.

    Returns:
        ``(new_row, new_col)``.
    """
    if row <= max_size and col <= max_size:
        return row, col
    if row > col:
        scale = max_size * 1.0 / row
        return max_size, max(1, int(col * scale))
    scale = max_size * 1.0 / col
    return max(1, int(scale * row)), max_size

def generate_cropped(writer, images, mask, stem, label=None):
    """Write one TFRecord example per labelled cell of ``mask``.

    For each label ``1 .. max(mask)`` the bounding box of the cell is cropped
    from every channel, pixels outside the cell are zeroed, and the crop is
    resized to the size given by ``calculate_new_size``.

    Args:
        writer: Object with a ``write(bytes)`` method.
        images: Channel arrays aligned with ``mask``.
        mask: Integer label image; 0 is background.
        stem: Image identifier; examples are named ``"<stem>_<label>"``.
        label: Optional label string forwarded to ``createExample``.

    Cells that cannot be processed are reported and skipped.
    """
    max_label = int(np.max(mask)) if np.size(mask) else 0
    for i in range(1, max_label + 1):
        cur = (mask == i)
        if not cur.any():
            # Label ids may have gaps; an absent id has no bounding box and
            # must not abort the remaining cells.
            continue
        try:
            rmin, rmax, cmin, cmax = bbox2(cur)
            # NOTE(review): rmax/cmax are inclusive, so these slices drop the
            # last row and column of the box. Left unchanged on purpose.
            cropped_mask = cur[rmin:rmax, cmin:cmax]
            new_size = calculate_new_size(rmax - rmin, cmax - cmin)
            cropped_ims = []
            for im in images:
                crop = im[rmin:rmax, cmin:cmax]
                crop_masked = np.multiply(crop, cropped_mask)
                # cv2.resize takes (width, height).
                crop_resized = cv2.resize(crop_masked, (new_size[1], new_size[0]))
                cropped_ims.append(crop_resized)
            new_name = stem + "_" + str(i)
            ex = createExample(cropped_ims, new_name, new_size, label)
            writer.write(ex.SerializeToString())
        except Exception as exc:
            # Best effort per cell, but say which one was dropped and why.
            print(f'Skipping cell {stem}_{i}: {exc!r}')

def process_data(df, name, train_folder, mask_folder, test=False):
    """Write the cropped cells of a set of images into the TFRecord ``name``.

    Args:
        df: With ``test`` false, a frame with ``ID`` and ``Label`` columns;
            with ``test`` true, an iterable of image ids.
        name: Output TFRecord path.
        train_folder: Directory with ``<id>_<colour>.png`` channel images.
        mask_folder: Directory with ``<id>.npz`` label masks.
        test: Whether labels are unavailable.
    """
    if test:
        label_lookup = None
        image_set = df
    else:
        label_lookup = dict(zip(list(df.ID), list(df.Label)))
        image_set = df['ID']

    def read_channel(image_id, colour):
        # First channel of the decoded image.
        return cv2.imread(f'{train_folder}/{image_id}_{colour}.png')[:, :, 0]

    with tf.io.TFRecordWriter(name) as writer:
        for imid in image_set:
            channel = {
                colour: read_channel(imid, colour)
                for colour in ('red', 'blue', 'green', 'yellow')
            }
            mask = np.load(f'{mask_folder}/{imid}.npz')['arr_0']
            label = None if test else label_lookup[imid]
            # Order handed on: red, blue, yellow, green.
            ordered = [channel['red'], channel['blue'], channel['yellow'], channel['green']]
            generate_cropped(writer, ordered, mask, imid, label)


def slice_to_tfrec(train_folder, mask_folder, collection, start_ind, name, test=False, num_in=2500, split_size=100):
    """Convert a slice of ``collection`` into numbered TFRecord files.

    ``collection[start_ind:start_ind + num_in]`` is cut into pieces of
    ``split_size`` items; piece ``i`` is written to ``<name>_<i>.tfrec``.
    """
    t_begin = time.time()
    selected = collection[start_ind:start_ind + num_in]
    n_parts = ceil(len(selected) / split_size)
    for part in range(n_parts):
        lower = part * split_size
        upper = (part + 1) * split_size
        cur_name = f'{name}_{part}.tfrec'
        print(cur_name)
        process_data(selected[lower:upper], cur_name, train_folder, mask_folder, test)
        print(f'Done {part}')
    print(f'Create tfrec took {time.time() - t_begin}')

In [None]:
from tensorflow.keras.applications import EfficientNetB0, EfficientNetB1, EfficientNetB7, DenseNet121
from tensorflow.python.keras.layers import Dense, Flatten, GlobalAveragePooling2D, Dropout
from tensorflow.keras import Model

def b0_model(size):
    """Return a headless EfficientNetB0 (``weights=None``) for size x size x 3 input."""
    return EfficientNetB0(weights=None, include_top=False, input_shape=(size, size, 3))

def b7_model(size):
    """Return a headless EfficientNetB7 (``weights=None``) for size x size x 3 input."""
    return EfficientNetB7(weights=None, include_top=False, input_shape=(size, size, 3))

def b1_model(size):
    """Return a headless EfficientNetB1 (``weights=None``) for size x size x 3 input."""
    return EfficientNetB1(weights=None, include_top=False, input_shape=(size, size, 3))

def densenet_model(size):
    """Return a headless DenseNet121 (``weights=None``) for size x size x 3 input."""
    return DenseNet121(
        weights=None,
        include_top=False,
        input_tensor=None,
        input_shape=(size, size, 3),
    )

def make_model(base_model_fn, size, core_name, weight_folder):
    """Assemble a 19-output sigmoid classifier and load its saved weights.

    Args:
        base_model_fn: Callable taking ``size`` and returning the backbone.
        size: Side length passed to ``base_model_fn``.
        core_name: Weight file name without the ``.h5`` extension.
        weight_folder: Directory that holds the weight file.

    Returns:
        The Keras model with the backbone marked non-trainable.
    """
    backbone = base_model_fn(size)
    backbone.trainable = False
    pooled = GlobalAveragePooling2D()(backbone.output)
    regularised = Dropout(0.5)(pooled)
    predictions = Dense(19, activation='sigmoid')(regularised)
    model = Model(inputs=backbone.input, outputs=predictions)
    weights_path = f'{weight_folder}/{core_name}.h5'
    model.load_weights(weights_path)
    return model

In [None]:
def record_to_test_v8(record):
    """Parse one serialized example into ``(image, name)``.

    The image stacks the red, green and blue crops (in that order) along the
    last axis; the yellow channel is parsed but not used.
    """
    record_format = {'name': tf.io.FixedLenFeature([], tf.string)}
    for dimension in ('row', 'col'):
        record_format[dimension] = tf.io.FixedLenFeature([], tf.int64)
    for colour in ('red', 'blue', 'yellow', 'green'):
        record_format[colour] = tf.io.FixedLenFeature([], tf.string)

    example = tf.io.parse_single_example(record, record_format)
    shape = (example['row'], example['col'])

    def as_plane(colour):
        # Raw uint8 bytes reshaped to the stored (row, col) size.
        return tf.reshape(tf.io.decode_raw(example[colour], out_type=tf.uint8), shape)

    rc = as_plane('red')
    bc = as_plane('blue')
    gc = as_plane('green')
    return tf.stack([rc, gc, bc], axis=2), example['name']


def preprocessing(size, image, name):
    """Resize ``image`` with padding to ``size`` x ``size``; ``name`` is dropped."""
    padded = tf.image.resize_with_pad(image, size, size)
    return padded

In [None]:
def build_decoder_cell():
    """Return a function that reads a PNG path into a 3-channel image tensor."""
    def decode(path):
        return tf.image.decode_png(tf.io.read_file(path), channels=3)
    return decode

def make_dataset(size, path):
    """Build a batch-size-1 image dataset from ``<path>/test*.tfrec``.

    Prints the number of matched files. Each record is parsed with
    ``record_to_test_v8`` and resized by ``preprocessing``.
    """
    tfrec_files = tf.io.gfile.glob(path + '/test*.tfrec')
    print(len(tfrec_files))
    records = tf.data.TFRecordDataset(tfrec_files)
    parsed = records.map(record_to_test_v8)
    resized = parsed.map(lambda image, cell_name: preprocessing(size, image, cell_name))
    return resized.batch(1)

In [None]:
def build_decoder(target_size=(512, 512)):
    """Return a function decoding a PNG path to a float image of ``target_size``.

    Pixels are cast to float32 and divided by 255.0 before resizing.
    """
    def decode(path):
        pixels = tf.image.decode_png(tf.io.read_file(path), channels=3)
        scaled = tf.cast(pixels, tf.float32) / 255.0
        return tf.image.resize(scaled, target_size)
    return decode

def build_dataset_full_size(paths):
    """Create a prefetched, batch-size-1 dataset of decoded images from ``paths``."""
    autotune = tf.data.experimental.AUTOTUNE
    path_dataset = tf.data.Dataset.from_tensor_slices(paths)
    decode = build_decoder()
    images = path_dataset.map(decode, num_parallel_calls=autotune)
    return images.batch(1).prefetch(autotune)

def predict_whole_image(folder, model, core_name, image_ids=None):
    """Predict on the green-channel image of each id and save the results.

    Args:
        folder: Directory with ``<id>_green.png`` files.
        model: Object with a ``predict`` method accepting the image dataset.
        core_name: Base name of the ``.npy`` file the lookup is saved to.
        image_ids: Ids to predict. When omitted, the notebook-level
            ``image_set`` variable is used, which keeps existing calls
            working; pass it explicitly to avoid that hidden dependency.

    Returns:
        Dict mapping each image path to its prediction row.
    """
    if image_ids is None:
        image_ids = image_set  # notebook-level global, must already exist
    names = [f'{folder}/{x}_green.png' for x in image_ids]
    print(len(names))
    results = model.predict(build_dataset_full_size(names))
    lookup = {n: results[ind, :] for ind, n in enumerate(names)}
    with open(f'{core_name}.npy', 'wb') as f:
        # Saved as a pickled object array.
        np.save(f, lookup)
    return lookup

In [None]:
import os

def generate_submission_v2(lookup, mask, name, lookup_encoding):
    """Build the submission CSV line for one image.

    Args:
        lookup: Dict mapping ``"<name>_<label>"`` to a sequence of per-class
            scores.
        mask: Label image of the picture; its shape gives the image size.
        name: Image identifier.
        lookup_encoding: Dict mapping ``"<name>_<label>"`` to the encoded
            cell mask as bytes.

    Returns:
        ``"<name>,<width>,<height>,<predictions>\\n"`` where the predictions
        are space-terminated ``"<class> <score> <mask>"`` triples. Cells with
        no prediction or no encoding are reported and left out.
    """
    # ndarray.shape is (rows, cols) = (height, width), while the CSV header
    # expects ImageWidth before ImageHeight.
    height, width = mask.shape[0], mask.shape[1]
    maxnum = int(np.max(mask)) if mask.size else 0
    parts = ["{},{},{},".format(name, width, height)]
    for i in range(1, maxnum + 1):
        key = "_".join([name, str(i)])
        if key not in lookup:
            print(key + " not found")
            continue
        if key not in lookup_encoding:
            # Skip rather than raise so one cell cannot sink the whole file.
            print(key + " has no mask encoding")
            continue
        encoded = lookup_encoding[key].decode()
        for idx, p in enumerate(list(lookup[key])):
            parts.append("{} {} {} ".format(idx, p, encoded))
    parts.append('\n')
    # Joined once instead of growing a string inside the loop.
    return "".join(parts)

def generate(folder, lookup_file, save_file, lookup_encoding):
    """Write the submission CSV for every mask archive in ``folder``.

    Args:
        folder: Directory of ``.npz`` archives whose ``arr_0`` is a label mask.
        lookup_file: ``.npy`` file holding the pickled prediction dict.
        save_file: Path of the CSV to write.
        lookup_encoding: Dict of encoded cell masks passed on to
            ``generate_submission_v2``.
    """
    # The context manager closes the CSV even if a later step raises.
    with open(save_file, 'w') as fs:
        fs.write('ID,ImageWidth,ImageHeight,PredictionString\n')
        # NOTE: allow_pickle executes pickled code; only load trusted files.
        lookup = np.load(lookup_file, allow_pickle=True).item()
        for file_name in os.listdir(folder):
            with np.load(os.path.join(folder, file_name)) as archive:
                mask = archive['arr_0']
            stem = file_name.split('.')[0]
            fs.write(generate_submission_v2(lookup, mask, stem, lookup_encoding))

In [None]:
def record_to_test_name_v8(record):
    """Parse one serialized example and return only its ``name`` feature."""
    record_format = {'name': tf.io.FixedLenFeature([], tf.string)}
    for dimension in ('row', 'col'):
        record_format[dimension] = tf.io.FixedLenFeature([], tf.int64)
    for colour in ('red', 'blue', 'yellow', 'green'):
        record_format[colour] = tf.io.FixedLenFeature([], tf.string)
    parsed = tf.io.parse_single_example(record, record_format)
    return parsed['name']

def generate_name(path):
    """Collect the cell names stored in ``<path>/test*.tfrec``.

    The names are printed by count, saved to ``name_list.npy`` and the name of
    that file is returned.
    """
    tfrec_files = tf.io.gfile.glob(path + '/test*.tfrec')
    name_records = tf.data.TFRecordDataset(tfrec_files).map(record_to_test_name_v8)
    dataset_name_list = []
    for raw_name in name_records:
        dataset_name_list.append(raw_name.numpy().decode('utf-8'))
    print(len(dataset_name_list))
    saved_name = 'name_list.npy'
    np.save(saved_name, dataset_name_list)
    return saved_name

In [None]:
def convert_to_dictionary(predictions, dataset, dictionary_name):
    """Pair each name in ``dataset`` with its prediction row and save the dict.

    Row ``i`` of ``predictions`` belongs to the ``i``-th name. The dict is
    written to ``dictionary_name`` and also returned.
    """
    lookup = {name: predictions[row, :] for row, name in enumerate(dataset)}
    with open(dictionary_name, 'wb') as out_file:
        np.save(out_file, lookup)
    return lookup

In [None]:
def predict_and_save(core_name, attributes, dataset_name_list, lookup_encoding, cell_folder, weight_folder='../input/hpamodels'):
    """Run one model over the TFRecords in ``/tmp`` and save its predictions.

    Args:
        core_name: Model name; selects the weight file and names the output
            ``./<core_name>.npy``.
        attributes: Dict with ``'size'`` (input side length) and ``'model'``
            (backbone factory).
        dataset_name_list: Cell names in dataset order.
        lookup_encoding: Accepted but not used here.
        cell_folder: Accepted but not used here.
        weight_folder: Directory with the ``.h5`` weight files.

    Prints the model name, the prediction time and the total time.
    """
    print(core_name)
    t_begin = time.time()
    input_size = attributes['size']
    model = make_model(attributes['model'], input_size, core_name, weight_folder)
    cells = make_dataset(input_size, '/tmp')

    t_predict = time.time()
    results = model.predict(cells)
    print(time.time() - t_predict)

    convert_to_dictionary(results, dataset_name_list, f'./{core_name}.npy')
    print(time.time() - t_begin)

In [None]:
def ensemble(s, scores):
    """Blend several saved prediction dicts with a weighted sum.

    Args:
        s: Paths of ``.npy`` files, each holding a pickled dict that maps a
            cell key to a prediction array.
        scores: One weight per file, in the same order. Extra files beyond
            ``len(scores)`` are ignored.

    Returns:
        Dict with the keys of the first file mapped to the float64 weighted
        sum; it is also saved to ``./ensemble.npy``.

    Raises:
        IndexError: If there are more weights than files, or no files.
    """
    if len(scores) > len(s):
        raise IndexError('ensemble received more scores than prediction files')
    # NOTE: allow_pickle executes pickled code; only load trusted files.
    dfs = [np.load(prediction_file, allow_pickle=True).item() for prediction_file in s]
    newlookup = {}
    for k in dfs[0].keys():
        collected_pred = [p[k] for p in dfs]
        # Sized from the data instead of a hard-coded class count.
        final_pred = np.zeros(np.shape(collected_pred[0]), dtype=np.float64)
        for score, pred in zip(scores, collected_pred):
            final_pred += score * pred
        newlookup[k] = final_pred
    with open('./ensemble.npy', 'wb') as f:
        np.save(f, newlookup)
    return newlookup

In [None]:
def combine_whole_image(orig, image_pred, scores):
    """Blend per-cell predictions with the prediction of their whole image.

    Args:
        orig: ``.npy`` file with a dict of cell key -> prediction array.
        image_pred: ``.npy`` file with a dict of image path -> prediction.
        scores: ``scores[0]`` weights the cell prediction, ``scores[1]`` the
            whole-image prediction.

    Image paths are reduced to the file-name part before the first
    underscore, and cell keys are matched on their part before the first
    underscore. The result is saved to ``./ensemble_whole.npy`` and returned.
    """
    cell_preds = np.load(orig, allow_pickle=True).item()
    image_preds = np.load(image_pred, allow_pickle=True).item()
    by_image_id = {
        path.split('/')[-1].split('_')[0]: pred
        for path, pred in image_preds.items()
    }
    newlookup = {}
    for cell_key, cell_pred in cell_preds.items():
        blended = cell_pred * scores[0]
        image_id = cell_key.split('_')[0]
        blended += scores[1] * by_image_id[image_id]
        newlookup[cell_key] = blended
    with open('./ensemble_whole.npy', 'wb') as out_file:
        np.save(out_file, newlookup)
    return newlookup

In [None]:
import traceback

# Segment the test images, encode the masks and write the cell crops to
# TFRecords under /tmp. Failures are tolerated so the notebook keeps running,
# but the traceback is printed so the cause is not lost.
try:
    cell_folder = process_folder('../input/hpa-single-cell-image-classification/test', '../input/hpacellsegmentatormodelweights', '/tmp', 'test')
    mask_lookup = save_all_mask(cell_folder, './mask_lookup.npy')
    image_set = [x.split('.')[0] for x in os.listdir(cell_folder)]
    slice_to_tfrec('../input/hpa-single-cell-image-classification/test', cell_folder, image_set, 0, '/tmp/test', True, len(image_set))
    dset = make_dataset(224, '/tmp')
    dataset_name_list = generate_name('/tmp')
except Exception:
    # ``Exception`` rather than a bare except, so Ctrl-C still interrupts.
    print("Preprocessing failed")
    traceback.print_exc()

In [None]:
import traceback

# Weight-file name -> input size and backbone factory of each cell-level model.
core_name_dict = {
    'model-121-rgb-tpu-aug-kfold-0-smooth-255-224-v8': {'size': 224, 'model': densenet_model},
    'model-b1-rgb-tpu-aug-kfold-0-smooth-255-v8-224': {'size': 224, 'model': b1_model},
}

# Predict with every cell-level model, then average the two results.
# Failures are tolerated, but the traceback is printed for diagnosis.
try:
    dataset_name_list = np.load('./name_list.npy')
    lookup_encoding = np.load('./mask_lookup.npy', allow_pickle=True).item()
    for name, attrs in core_name_dict.items():
        predict_and_save(name, attrs, dataset_name_list, lookup_encoding, cell_folder)
    first_ensemble = ensemble(['model-b1-rgb-tpu-aug-kfold-0-smooth-255-v8-224.npy', 'model-121-rgb-tpu-aug-kfold-0-smooth-255-224-v8.npy'],
                              [0.5, 0.5])
except Exception:
    # ``Exception`` rather than a bare except, so Ctrl-C still interrupts.
    print("Prediction cell failed")
    traceback.print_exc()

In [None]:
import traceback

# Predict on the whole green-channel images and blend that with the cell-level
# ensemble (weights 0.8 / 0.2). Failures are tolerated but reported in full.
try:
    model_full = make_model(b7_model, 512, 'full-image-green-remaining', '../input/hpamodels')
    model_full_lookup = predict_whole_image('../input/hpa-single-cell-image-classification/test', model_full, 'full-image-green-remaining')
    final_ensemble = combine_whole_image('./ensemble.npy', 'full-image-green-remaining.npy', [0.8, 0.2])
except Exception:
    # ``Exception`` rather than a bare except, so Ctrl-C still interrupts.
    print("Prediction image failed")
    traceback.print_exc()

In [None]:
import traceback

save_file = 'submission_others.csv'
# Write the blended predictions as a submission-format CSV. Failures are
# tolerated but reported in full.
try:
    generate(cell_folder, './ensemble_whole.npy', save_file, lookup_encoding)
except Exception:
    # ``Exception`` rather than a bare except, so Ctrl-C still interrupts.
    print("Saving prediction failed")
    traceback.print_exc()

In [None]:
import traceback

# Use the freshly written CSV when it can be read; otherwise fall back to the
# previously generated submission loaded earlier in the notebook.
try:
    final_submission = pd.read_csv(save_file)
except Exception:
    # ``Exception`` rather than a bare except, so Ctrl-C still interrupts.
    print("Failed to read")
    traceback.print_exc()
    final_submission = already_processed

In [None]:
# Start from the sample submission so the output has exactly its rows and
# columns, then overwrite PredictionString wherever a prediction exists.
sample = pd.read_csv('../input/hpa-single-cell-image-classification/sample_submission.csv')
data = final_submission[['ID', 'PredictionString']]
data = data.rename(columns={'PredictionString': 'pred'})

sub = pd.merge(
    sample,
    data,
    how="left",
    left_on='ID',
    right_on='ID',
)

print(sub.head())

def isNaN(num):
    """Return True for NaN, the only value that is not equal to itself."""
    return num != num

# One .loc assignment on the frame itself. The previous chained form
# (sub.PredictionString.loc[i] = ...) writes through an intermediate Series,
# which pandas does not guarantee to propagate back to ``sub``.
has_pred = sub['pred'].notna()
sub.loc[has_pred, 'PredictionString'] = sub.loc[has_pred, 'pred']

sub = sub[sample.columns]
sub.to_csv('submission.csv', index=False)

In [None]:
import shutil

import glob

images_dir_path = '/tmp'

# Remove only what this notebook wrote (the mask folder and the test TFRecords)
# instead of deleting the whole system temp directory, which may hold files
# that belong to other processes.
_generated_paths = [os.path.join(images_dir_path, 'test_cell_mask')]
_generated_paths += glob.glob(os.path.join(images_dir_path, 'test_*.tfrec'))

for _path in _generated_paths:
    try:
        if os.path.isdir(_path):
            shutil.rmtree(_path)
        elif os.path.exists(_path):
            os.remove(_path)
    except OSError as e:
        print(e)