# **How you might be able to clean the data**

* The first part until the data cleaning part was taken from @https://www.kaggle.com/its7171/mmdetection-for-segmentation-training
* The basic idea is to use cell segmentor to get a data set where each image shows a single cell
* Then training a classifier on single cell images
* Such a dataset will contain faulty data because:
    1. the labels correspond to a bunch of cells (the whole image) without telling which individual cell they apply to
    2. there is no guarantee that each cell actually expresses the protein of interest
    3. even if the cell expresses it there is no guarantee that it is visible properly through the green marker for example due to photobleaching
    4. overexposure might lead to bright signal in the green channel but the characteristic feature for this protein might not be visible due to loss of information 
    5. there are errors in the segmentation masks
 
 
* In order to get rid of some of the faulty data the following criterion is applied:
    1. from the pixels corresponding to no cell, the background level of the green channel is calculated
    2. for each cell the area of pixels above this background level is divided by the cell's entire area (this quantity is referred to as fraction in the code)
    3. each cell with a fraction below a lower threshold or above an upper threshold is discarded as faulty
    4. the thresholds are calculated as two percentiles of the distribution of the fractions of cells with the same protein of interest
    
   
    

In [None]:
from itertools import groupby
import numpy as np
from tqdm import tqdm
import pandas as pd
import os
import pickle
import cv2
from multiprocessing import Pool
import matplotlib.pyplot as plt

In [None]:
# Experiment name and detector config name used to label the outputs.
exp_name = "v3"
conf_name = "mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco"
# Directory with one compressed .npz cell mask per image ID.
cell_mask_dir = '../input/hpa-mask/hpa_cell_mask'
# Root directory of the competition data (train.csv and the image folders).
ROOT = '../input/hpa-single-cell-image-classification/'
train_or_test = 'train'
# Directory the merged JPEG images are written to.
img_dir = f'../work/mmdet_{exp_name}_{train_or_test}'
# Create the output directory including parents; no error if it already exists.
# (os.makedirs replaces the IPython-only "!mkdir -p" shell escape.)
os.makedirs(img_dir, exist_ok=True)
df = pd.read_csv(os.path.join(ROOT, 'train.csv'))

# this script takes more than 9hours for full data.
debug = False
if debug:
    # Keep only the first four rows for a quick dry run.
    df = df[:4]

# helper funcs

In [None]:
# convert segmentation mask image to run length encoding
MAX_GREEN = 64  # filter out dark green cells


def get_rles_from_mask(image_id, class_id):
    """Encode every cell of an image's label mask as an uncompressed RLE.

    The mask is loaded from ``{cell_mask_dir}/{image_id}.npz`` (array
    ``arr_0``). Label 0 is skipped. Unless ``class_id`` is the string
    ``'18'``, a cell is also skipped when the maximum of the green channel
    inside the cell is below ``MAX_GREEN``.

    Returns:
        (rle_list, height, width) where height/width are the first two
        dimensions of the mask.
    """
    mask = np.load(f'{cell_mask_dir}/{image_id}.npz')['arr_0']
    apply_green_filter = class_id != '18'
    if apply_green_filter:
        green_img = read_img(image_id, 'green')

    rle_list = []
    for label in np.unique(mask):
        if label == 0:
            continue
        cell_pixels = mask == label
        # Drop cells whose brightest green pixel is still too dark.
        if apply_green_filter and (green_img * cell_pixels).max() < MAX_GREEN:
            continue
        rle_list.append(coco_rle_encode(cell_pixels))

    height, width = mask.shape[0], mask.shape[1]
    return rle_list, height, width

def coco_rle_encode(mask):
    """Return the uncompressed run-length encoding of ``mask``.

    The mask is flattened in column-major (Fortran) order and the lengths of
    consecutive runs of equal values are collected. If the first value
    equals 1, a leading 0 is inserted so that the first count always refers
    to a run of zeros.

    The run lengths are computed with vectorized NumPy operations instead of
    iterating over every pixel in Python; the result is the same.

    Args:
        mask: 2-D array (typically boolean).

    Returns:
        dict with ``'counts'`` (list of Python ints) and ``'size'``
        (``list(mask.shape)``).
    """
    flat = np.asarray(mask).ravel(order='F')
    counts = []
    if flat.size:
        # Positions where the value differs from its predecessor start a new run.
        run_starts = np.flatnonzero(flat[1:] != flat[:-1]) + 1
        edges = np.concatenate(([0], run_starts, [flat.size]))
        counts = np.diff(edges).tolist()
        if flat[0] == 1:
            counts.insert(0, 0)
    return {'counts': counts, 'size': list(mask.shape)}

# mmdet custom dataset generator
def mk_mmdet_custom_data(image_id, class_id):
    """Build the annotation record of one image.

    The record has the keys ``filename``, ``width``, ``height`` and ``ann``.
    ``ann`` is an empty dict when no cell passes ``get_rles_from_mask``;
    otherwise it holds ``bboxes`` (float32), ``labels`` (zeros, placeholder)
    and ``masks``.

    NOTE(review): ``mutils`` is not imported in the visible part of this
    file -- presumably ``pycocotools.mask``; confirm it is in scope.
    """
    rles, height, width = get_rles_from_mask(image_id, class_id)
    record = {
        'filename': image_id+'.jpg',
        'width': width,
        'height': height,
        'ann': {},
    }
    if not rles:
        return record

    rles = mutils.frPyObjects(rles, height, width)
    boxes = mutils.toBbox(rles)
    # Columns 2/3 become columns 0/1 plus the original 2/3
    # (presumably x, y, w, h -> x1, y1, x2, y2).
    boxes[:, 2:4] += boxes[:, 0:2]
    record['ann'] = {
        'bboxes': np.array(boxes, dtype=np.float32),
        'labels': np.zeros(len(boxes)), # dummy data.(will be replaced later)
        'masks': rles,
    }
    return record

# print utility from public notebook
def print_masked_img(image_id, mask):
    """Plot the image, the mask, and the mask overlaid on the image.

    The image is loaded with ``load_RGBY_image`` using the module-level
    ``train_or_test`` setting; the three panels are drawn side by side.
    """
    img = load_RGBY_image(image_id, train_or_test)

    # (title, list of (array, imshow keyword arguments)) for each panel
    panels = [
        ('Image', [(img, {})]),
        ('Mask', [(mask, {})]),
        ('Image + Mask', [(img, {}), (mask, {'alpha': 0.6})]),
    ]

    plt.figure(figsize=(15, 15))
    for position, (title, layers) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        for data, options in layers:
            plt.imshow(data, **options)
        plt.title(title)
        plt.axis('off')
    plt.show()
    
# image loader, using rgb only here
def load_RGBY_image(image_id, train_or_test='train', image_size=None):
    """Load the red, green and blue channel images and stack them.

    Each channel is read with ``read_img``; the yellow channel is not
    loaded. The result has the channel axis last, in the order
    red, green, blue.
    """
    channels = [
        read_img(image_id, color, train_or_test, image_size)
        for color in ("red", "green", "blue")
    ]
    # (channel, row, col) -> (row, col, channel)
    return np.array(channels).transpose(1, 2, 0)

# single-channel image reader
def read_img(image_id, color, train_or_test='train', image_size=None):
    """Read one channel image and return it as an array with values <= 255.

    Args:
        image_id: image identifier used in the file name.
        color: channel name used in the file name (e.g. ``'green'``).
        train_or_test: sub-directory of ``ROOT`` to read from.
        image_size: if given, the image is resized to
            ``(image_size, image_size)``.

    Returns:
        The image array. If the loaded data contains values above 255 it is
        reduced to uint8.

    Raises:
        AssertionError: if the file does not exist.
        IOError: if OpenCV cannot decode the file.
    """
    filename = f'{ROOT}/{train_or_test}/{image_id}_{color}.png'
    assert os.path.exists(filename), f'not found {filename}'
    img = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
    if img is None:
        # cv2.imread reports decoding failures by returning None.
        raise IOError(f'could not decode {filename}')
    if image_size is not None:
        img = cv2.resize(img, (image_size, image_size))
    if img.max() > 255:
        # Assumes 16-bit data (max 65535) -- TODO confirm. Integer division
        # by 256 maps it onto 0..255. The previous divisor 255 produced
        # values up to 257, which wrapped around in uint8 and turned the
        # brightest pixels dark.
        img = (img // 256).astype('uint8')
    return img

# make annotation helper called multi processes
def mk_ann(idx):
    """Create the annotation for row ``idx`` of ``df`` and save its JPEG.

    Returns:
        (annotation dict, idx, image_id)
    """
    row = df.iloc[idx]
    image_id, class_id = row.ID, row.Label
    anno = mk_mmdet_custom_data(image_id, class_id)
    # NOTE(review): the array is stacked as red, green, blue while OpenCV
    # writers usually expect blue, green, red -- confirm the channel order
    # of the saved file is what the consumer expects.
    merged = load_RGBY_image(image_id, train_or_test)
    cv2.imwrite(f'{img_dir}/{image_id}.jpg', merged)
    return anno, idx, image_id

# checking segment mask
To extract the individual cells, [CellSegmentator](https://github.com/CellProfiling/HPA-Cell-Segmentation) can be used.
The extracted segmentation masks are stored in [this dataset](https://www.kaggle.com/its7171/hpa-mask).

These mask files are made as follows:
<pre>
nucl_mask, cell_mask = segmentCell(im, segmentator)
np.savez_compressed(f'{cell_dir}/{image_id}', cell_mask)
np.savez_compressed(f'{nucl_dir}/{image_id}', nucl_mask)
</pre>
So you can load the mask as follows:
<pre>
cell_mask = np.load(f'{cell_dir}/{image_id}.npz')['arr_0']
nucl_mask = np.load(f'{nucl_dir}/{image_id}.npz')['arr_0']
</pre>


In [None]:
# Directory with one compressed .npz cell mask per image ID.
cell_mask_dir = '../input/hpa-mask/hpa_cell_mask'    
# Show the first two images of df next to their cell masks.
for idx in range(2):
    image_id = df.iloc[idx].ID
    # The mask array is stored under the default key 'arr_0'.
    cell_mask = np.load(f'{cell_mask_dir}/{image_id}.npz')['arr_0']
    print_masked_img(image_id, cell_mask)

# **Data Cleaning**

# **Calculating fraction as metric for expression level of target protein**

In [None]:
def background(mask, ch):
    """Return ``ch`` with every pixel that belongs to a cell set to 0.

    Cell pixels (mask > 0) are zeroed, not removed, so the result has the
    same shape as ``ch`` and the zeros take part in any statistic computed
    on it afterwards.
    """
    outside_cells = np.logical_not(binary_mask(mask))
    return outside_cells * ch

def threshold(bg, p):
    """Return the ``p``-quantile (``p`` in [0, 1]) of all values in ``bg``."""
    level = np.quantile(bg, p)
    return level


def bbox(img):
    """Return the bounding box of the non-zero region of a 2-D array.

    Returns:
        (rmin, rmax, cmin, cmax): indices of the first and last row and of
        the first and last column containing a non-zero value. Both maxima
        are inclusive.

    Raises:
        IndexError: if ``img`` has no non-zero element.
    """
    occupied_rows = np.nonzero(np.any(img, axis=1))[0]
    occupied_cols = np.nonzero(np.any(img, axis=0))[0]
    rmin, rmax = occupied_rows[[0, -1]]
    cmin, cmax = occupied_cols[[0, -1]]
    return rmin, rmax, cmin, cmax

def crop(img, bb, index=None):
    """Cut a bounding-box region out of ``img``.

    Two call forms are supported (previously two functions of the same name
    were defined and the second silently replaced the first):

    * ``crop(img, bb)`` -- ``bb`` is ``(rmin, rmax, cmin, cmax)`` as
      returned by ``bbox``.
    * ``crop(img, cell_mask, index)`` -- the bounding box of label
      ``index`` in ``cell_mask`` is computed first.

    ``rmax`` and ``cmax`` are inclusive, so the slice ends one past them;
    slicing up to ``rmax``/``cmax`` dropped the last row and column of
    every cell.
    """
    if index is not None:
        bb = bbox(bb == index)
    rmin, rmax, cmin, cmax = bb[0], bb[1], bb[2], bb[3]
    return img[rmin:rmax + 1, cmin:cmax + 1]
    
def binary_mask(mask):
    """Return a boolean array that is True where ``mask`` is greater than 0."""
    return mask > 0

def fraction(croppedimg, croppedmask, th):
    """Return the share of masked pixels whose intensity exceeds ``th``.

    Pixels outside the mask are set to 0 before the comparison. The count
    of pixels above ``th`` is divided by the number of pixels where
    ``croppedmask > 0``.
    """
    inside = croppedmask > 0
    above_level = (croppedimg * inside) > th
    return np.sum(above_level) / np.sum(inside)

def cell_mask_to_fraction(cell_mask, img, thp, chindex):
    """Compute the fraction value of every cell in one image.

    Args:
        cell_mask: 2-D label array; 0 marks pixels outside any cell.
        img: image array with the channel axis last.
        thp: quantile (0..1) passed to ``threshold`` to derive the
            background level.
        chindex: index of the channel to evaluate.

    Returns:
        List with one fraction per cell label, in ascending label order.
        Label 0 is skipped, matching ``get_fraction``.
    """
    ch = img[:, :, chindex]
    th = threshold(background(cell_mask, ch), thp)

    result = []
    for index in np.unique(cell_mask):
        if index == 0:
            continue
        # Compute the box once and use the two-argument crop; the
        # three-argument call used before raised a TypeError because the
        # later two-argument definition of crop is the one in effect.
        bb = bbox(cell_mask == index)
        cri = crop(ch, bb)
        crm = crop(cell_mask, bb)
        result.append(fraction(cri, crm == index, th))

    return result
    
    
def get_fraction(cell_mask_dir, image_id, chindex=1, thp=0.99):
    """Compute id, bounding box and fraction for every cell of one image.

    Args:
        cell_mask_dir: directory containing ``{image_id}.npz`` mask files.
            The mask is read from this directory (the argument used to be
            ignored in favour of the module-level setting).
        image_id: image identifier.
        chindex: channel index into the image from ``load_RGBY_image``.
        thp: quantile (0..1) used to derive the background level.

    Returns:
        (idext, bbs, fracs): three parallel lists with the cell id
        ``'{image_id}_{label}'``, the bounding box as a tuple of plain ints,
        and the fraction.
    """
    cell_mask = np.load(f'{cell_mask_dir}/{image_id}.npz')['arr_0']
    ch = load_RGBY_image(image_id)[:, :, chindex]
    th = threshold(background(cell_mask, ch), thp)

    idext = []
    bbs = []
    fracs = []
    for i in np.unique(cell_mask):
        # Skip the background by value; dropping the first unique label
        # would discard a real cell whenever label 0 does not occur.
        if i == 0:
            continue
        bb = bbox(cell_mask == i)

        idext.append(f'{image_id}_{i}')
        # Plain ints keep the text form "(r0, r1, c0, c1)" when the table is
        # written to CSV, independent of how NumPy prints its scalars.
        bbs.append(tuple(int(v) for v in bb))

        cri = crop(ch, bb)
        crm = crop(cell_mask, bb)
        fracs.append(fraction(cri, crm == i, th))

    return idext, bbs, fracs

def load_cell_mask(image_id):
    """Load the cell mask of ``image_id`` from the module-level ``cell_mask_dir``.

    The array stored under the key ``'arr_0'`` of the .npz file is returned.
    """
    mask_path = f'{cell_mask_dir}/{image_id}.npz'
    return np.load(mask_path)['arr_0']


def calculate_fractions(chindex=1, thp=0.99):
    """Compute the fraction of every cell in all single-label images of ``df``.

    Images whose label contains ``'|'`` (more than one label) are skipped.
    The result is written to ``df_fractions.csv`` and returned.

    Args:
        chindex: channel index evaluated by ``get_fraction``.
        thp: quantile (0..1) used to derive the background level.

    Returns:
        DataFrame with the columns ``cell_ID``, ``bbox``, ``fraction`` and
        ``Label``. ``Label`` is stored as int so that comparisons such as
        ``Label == 0`` behave the same as on a table read back from CSV.
    """
    cell_mask_dir = '../input/hpa-mask/hpa_cell_mask'
    idext, bbs, fracs, labels = [], [], [], []
    n = len(df)
    for idx, (image_id, label) in enumerate(zip(df['ID'], df['Label'])):
        if idx % 100 == 0:
            print(str(idx) + '|' + str(n))
        label = str(label)
        if '|' in label:
            continue
        ids, boxes, fractions = get_fraction(cell_mask_dir, image_id, chindex, thp)
        idext.extend(ids)
        # Plain ints keep the CSV text form of the boxes stable.
        bbs.extend(tuple(int(v) for v in box) for box in boxes)
        fracs.extend(fractions)
        # Record the label while the image is at hand instead of searching
        # df again for every single cell afterwards.
        labels.extend([int(label)] * len(ids))

    print("Number of cells: ")
    print(len(idext))

    df_fractions = pd.DataFrame(
        data={'cell_ID': idext, 'bbox': bbs, 'fraction': fracs, 'Label': labels}
    )
    df_fractions.to_csv('df_fractions.csv', index=False)

    return df_fractions

In [None]:
# Switch: True recomputes the fractions with calculate_fractions(),
# False loads a precomputed table from CSV instead.
calculate_Fractions = False

In [None]:
# Obtain the per-cell fraction table, either freshly computed or precomputed.
if calculate_Fractions:
    df_fractions = calculate_fractions()
else:
    # Precomputed table from an attached input dataset.
    df_fractions = pd.read_csv('../input/hpafractions/hpa_fractions.csv')
    

In [None]:
def rgby_from_frac_id(frac_id, bbox):
    """Return the image crop of one cell with all other pixels set to 0.

    Args:
        frac_id: cell id of the form ``'{image_id}_{label}'``.
        bbox: bounding box passed on to ``crop``.

    Returns:
        The cropped multi-channel image multiplied by the cell's own mask.
    """
    parts = frac_id.split("_")
    image_id, label = parts[0], int(parts[1])

    own_pixels = crop(load_cell_mask(image_id), bbox) == label
    # Add a trailing axis so the mask broadcasts over the channels.
    own_pixels = own_pixels[:, :, np.newaxis]

    return crop(load_RGBY_image(image_id), bbox) * own_pixels
    
        
def bbox_from_string(bboxstring):
    """Convert a stored bounding box into a list of ints.

    Accepts the text form found in the CSV file, e.g. ``"(1, 2, 3, 4)"`` or
    ``"[1, 2, 3, 4]"``. It also accepts an actual sequence of numbers, which
    is what the ``bbox`` column holds when the fractions were computed in
    the current session rather than read from CSV.

    Raises:
        ValueError: if an element cannot be converted to int.
    """
    if isinstance(bboxstring, str):
        parts = bboxstring.strip().strip("()[]").split(",")
    else:
        parts = bboxstring
    return [int(c) for c in parts]


def get_entries_in_percentiles(label, pl, pu):
    """Split the cells of one label into low, middle and high fraction groups.

    Args:
        label: value of the ``Label`` column to select.
        pl: lower quantile level (0..1).
        pu: upper quantile level (0..1).

    Returns:
        (extremel, middle, extremeu): rows with a fraction below the lower
        quantile, rows between the two quantiles (both bounds included),
        and rows above the upper quantile. Including the bounds in
        ``middle`` makes the three groups cover every row; with strict
        comparisons on both sides, rows exactly equal to a quantile
        appeared in no group.
    """
    df_label = df_fractions[df_fractions['Label'] == label]
    fractions = df_label['fraction']
    ql, qu = np.quantile(fractions.to_numpy(), [pl, pu])

    extremel = df_label[fractions < ql]
    extremeu = df_label[fractions > qu]
    middle = df_label[(fractions >= ql) & (fractions <= qu)]

    return extremel, middle, extremeu

def get_percentiles(label, pl, pu):
    """Return the ``pl``- and ``pu``-quantile of the fractions of one label.

    The rows of the module-level ``df_fractions`` whose ``Label`` equals
    ``label`` are used. Returns ``(lower, upper)``.
    """
    selected = df_fractions[df_fractions['Label'] == label]
    values = selected.fraction.to_list()
    upper = np.quantile(values, pu)
    lower = np.quantile(values, pl)
    return lower, upper

def get_extremes(label, pl, pu):
    """Return the rows of one label below the ``pl``- and above the ``pu``-quantile.

    Both comparisons are strict. Returns ``(extremel, extremeu)``.
    """
    df_label = df_fractions[df_fractions['Label'] == label]
    values = df_label.fraction.to_list()
    upper_level = np.quantile(values, pu)
    lower_level = np.quantile(values, pl)

    too_low = df_label['fraction'] < lower_level
    too_high = df_label['fraction'] > upper_level
    return df_label[too_low], df_label[too_high]



def visualize_random(df_sub, n, ci=None):
    """Show randomly chosen cells of ``df_sub`` in a two-column grid.

    Args:
        df_sub: table with the columns ``cell_ID`` and ``bbox``.
        n: number of cells to show; an odd number is rounded down to the
            next even one so the grid is filled completely.
        ci: if given, only this channel of each cell image is shown.

    Rows are drawn independently, so the same cell can appear more than
    once. Nothing is plotted when ``df_sub`` is empty or ``n`` is below 2.
    """
    n_ev = n - (n % 2)
    if n_ev <= 0 or len(df_sub) == 0:
        print("visualize_random: nothing to show")
        return

    # randint's upper bound is exclusive, so every row including the last
    # one can be picked (scaling by len - 1 and truncating never reached it).
    picks = np.random.randint(0, len(df_sub), size=n_ev)

    # squeeze=False keeps axs two-dimensional even for a single row.
    fig, axs = plt.subplots(n_ev // 2, 2, figsize=(15, 15), squeeze=False)
    for ax, i in zip(axs.flat, picks):
        frac_id = df_sub.cell_ID.values[i]
        box = bbox_from_string(df_sub.bbox.values[i])
        im = rgby_from_frac_id(frac_id, box)
        ax.imshow(im if ci is None else im[:, :, ci])
        


# Look at the distribution and visualize some examples

In [None]:
# Histogram of the fraction values of all cells whose Label equals 0.
plt.hist(df_fractions[df_fractions['Label'] == 0].fraction.to_list())

In [None]:
# Quantile levels (on a 0..1 scale) that bound the range of accepted fractions.
lowerpercent = 0.001
upperpercent = 1 - 0.001

# Split the cells with Label 0 into below / between / above these quantiles.
extremel, middle, extremeu = get_entries_in_percentiles(0, lowerpercent, upperpercent)

Show some random examples from below the lower percentile:

In [None]:
# 20 random cells from the group below the lower quantile.
visualize_random(extremel, 20)

Show some random examples between the lower and upper percentile:

In [None]:
# 20 random cells from the group between the two quantiles.
visualize_random(middle, 20)

Show some random examples above the upper percentile:

In [None]:
# 20 random cells from the group above the upper quantile.
visualize_random(extremeu, 20)

In [None]:
# Lower and upper quantile of the fractions of the cells with Label 0.
ql, qu = get_percentiles(0, lowerpercent, upperpercent)


# **Output**

In [None]:
# Start with every cell flagged as clean, then unflag the outliers of each
# label. The flag column is initialised once, before the loop; resetting it
# inside the loop discarded the result of all labels but the last.
df_fractions["clean"] = True
labels = np.unique(df_fractions.Label)

for l in labels:
    ql, qu = get_percentiles(l, lowerpercent, upperpercent)
    same_label = df_fractions['Label'] == l
    # Faulty: fraction below the lower OR above the upper quantile of this
    # label (the upper comparison must use qu, not ql).
    outlier = (df_fractions['fraction'] < ql) | (df_fractions['fraction'] > qu)
    df_fractions.loc[same_label & outlier, 'clean'] = False

In [None]:
# Write the flagged table to the working directory. Unlike the
# df_fractions.csv export in calculate_fractions, index=False is not passed
# here, so the row index is written as an additional first column.
df_fractions.to_csv("df_fractions_clean.csv")

# **Outlook**

You could try different metrics for detecting faulty data, such as image entropy or the Brenner focus measure. You could also apply them to the other channels, for example to detect whether a nucleus is present.