#### this notebook is part of the documentation on my HPA approach  
    -> main notebook: https://www.kaggle.com/philipjamessullivan/0-hpa-approach-summary

# 1: make masks, get bboxes, make dataframe
## GOAL:
extract the bounding boxes of the individual cells contained in each image  

## RESULTS:
**dataset name:** "hpa-data" (linked to this notebook)  
---> contains two files:  

**file1 type:** dictionary (pickled)  
**file1 name:** "bboxesone.pkl"  
**file1 contents:** maps each image ID to the list of bounding boxes (`[ymin, ymax, xmin, xmax]`) of the cells found in that image  

**file2 type:** dataframe (pickled)   
**file2 name:** "hpa-data.pkl"  
**file2 contents:** one row per bounding box, holding the image ID, the cell number within the image, the image's labels and the bbox coordinates (columns `ID`, `cell`, `Label`, `ymin`, `ymax`, `xmin`, `xmax`)

In [None]:
#constants
# Folder holding the per-channel training PNGs; file names are built as
# IMG_FOLDER_PATH + <image id> + "_<colour>.png".
IMG_FOLDER_PATH="../input/hpa-single-cell-image-classification/train/"
# CSV with one row per training image: its "ID" and its label(s).
CSV_FILE_PATH="../input/hpa-single-cell-image-classification/train.csv"
# NOTE(review): not referenced by any cell visible in this notebook --
# presumably the output folder for saved masks; confirm before removing.
MASK_FOLDER_PATH="./masks"

In [None]:
#load HPA dataset
import pandas as pd

# Declared here as well so this cell can be run without the constants cell.
CSV_FILE_PATH="../input/hpa-single-cell-image-classification/train.csv"

# One row per training image: an "ID" column plus the label column.
id_labels_array = pd.read_csv(CSV_FILE_PATH)

# Image IDs in file order; the segmentation loop iterates over this list.
id_array = id_labels_array["ID"].tolist()

# Map every image ID to the value of the first non-ID column (its label).
# Selecting that column directly avoids transposing the whole frame into one
# column per image just to build the dictionary.
labels_dict = id_labels_array.set_index("ID").iloc[:, 0].to_dict()

In [None]:
%%capture
#function for cell segmentation
!pip install https://github.com/CellProfiling/HPA-Cell-Segmentation/archive/master.zip
import hpacellseg.cellsegmentator as cellsegmentator
from hpacellseg.utils import label_cell, label_nuclei
from PIL import Image
import numpy as np
# Paths to the pretrained weight files handed to the segmentator below
# (first the nuclei model, then the cell model).
NUC_MODEL = "../input/hpacellsegmentatormodelweights/dpn_unet_nuclei_v1.pth"
CELL_MODEL = "../input/hpacellsegmentatormodelweights/dpn_unet_cell_3ch_v1.pth"
# Single segmentator instance, created once and reused by get_mask() for
# every image.
# NOTE(review): multi_channel_model=True presumably matches the "3ch" cell
# weights and the three channels passed to pred_cells -- confirm against the
# hpacellseg documentation.
segmentator = cellsegmentator.CellSegmentator(
    NUC_MODEL,
    CELL_MODEL,
    scale_factor=0.25,
    device="cpu", # CPU on purpose: a GPU gives no major speed-up because loading the image files is the bottleneck
    padding=False,
    multi_channel_model=True,
)

#function to get mask only using supplied img_id
def get_mask(img_id):
    """Segment one image and return its labelled cell mask.

    Args:
        img_id: Image ID; the channel files are read from
            IMG_FOLDER_PATH + img_id + "_<red|yellow|blue>.png".

    Returns:
        The cell label mask produced by ``label_cell`` as a numpy array of
        the smallest unsigned integer dtype able to hold the highest label:
        ``uint8`` for up to 255 labels (as before), a wider type otherwise.
    """
    channels = {}
    for colour in ("red", "yellow", "blue"):
        # Convert inside the `with` so the file handle is released as soon
        # as the pixel data has been read.
        with Image.open(IMG_FOLDER_PATH + img_id + "_" + colour + ".png") as img:
            channels[colour] = np.asarray(img)

    # Nuclei are predicted from the blue channel alone; the cell model gets
    # the red, yellow and blue channels, each wrapped in a one-image list.
    nuc_segmentations = segmentator.pred_nuclei([channels["blue"]])
    cell_segmentations = segmentator.pred_cells([
        [channels["red"]],
        [channels["yellow"]],
        [channels["blue"]],
    ])

    # label_cell also returns a nuclei mask, which is not needed here.
    _, mask = label_cell(nuc_segmentations[0], cell_segmentations[0])
    mask = np.asarray(mask)

    # A plain uint8 cast wraps labels above 255 (cell 256 would become
    # background, cell 257 would merge with cell 1), so widen when needed.
    highest_label = int(mask.max(initial=0))
    return mask.astype(np.min_scalar_type(highest_label))

#function for bbox creation
def get_bboxes(mask):
    """Return the bounding box of every labelled cell in ``mask``.

    Args:
        mask: Integer label array (at least 2-D) in which 0 marks the
            background and every other value identifies one cell.

    Returns:
        A list with one ``[ymin, ymax, xmin, xmax]`` entry per cell label,
        in ascending label order. All four values are inclusive pixel
        indices. The list is empty when the mask contains no cell.
    """
    mask = np.asarray(mask)
    bboxes = []
    # np.unique returns the labels sorted, so the box order is deterministic
    # and needs no Python-level pass over every pixel.
    for cell_id in np.unique(mask):
        if cell_id == 0:
            # Background is skipped rather than removed, so a mask without
            # any background pixel does not raise.
            continue
        a = np.where(mask == cell_id)
        bboxes.append([np.min(a[0]), np.max(a[0]), np.min(a[1]), np.max(a[1])])
    return bboxes

In [None]:
#NOTE: running this step over the whole training set takes longer than the 9h limit of a kaggle notebook:
##SOLUTION1: split the dataset into about 20 parts and run one kaggle notebook per part, in parallel
##SOLUTION2: download the data and compute on a local machine
##  --> PROBLEM WITH SOLUTION2: uploading large datasets to kaggle is very buggy

import pickle
from tqdm import tqdm

# Segment each selected image and store its cell bounding boxes under the
# image ID.
bboxes_dict = dict()
demo_ids = id_array[:10]    ###LIMITED TO 10 ONLY FOR DEMONSTRATION
for img_id in tqdm(demo_ids):
    bboxes_dict[img_id] = get_bboxes(get_mask(img_id))

# Save the ID -> bounding-box-list mapping to disk.
with open('bboxes_dict.pkl', 'wb') as pkl_file:
    pickle.dump(bboxes_dict, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Flatten the ID -> bounding-box mapping into one record per cell.
# The records are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and calling it per row copied
# the whole frame every time.
DF_COLUMNS = ['ID', 'cell', 'Label', 'ymin', 'ymax', 'xmin', 'xmax']
rows = []
for img_id, img_bboxes in bboxes_dict.items():
    # Cells are numbered from 1 in the order their boxes are stored.
    for cell_number, bbox in enumerate(img_bboxes, start=1):
        ymin, ymax, xmin, xmax = bbox
        rows.append({'ID': img_id,
                     'cell': cell_number,
                     'Label': labels_dict[img_id],
                     'ymin': ymin,
                     'ymax': ymax,
                     'xmin': xmin,
                     'xmax': xmax})

# Passing the column list keeps the column order and yields an empty frame
# with the expected columns when there are no rows.
df = pd.DataFrame(rows, columns=DF_COLUMNS)

# Save the per-cell table to disk.
with open('hpa_data_df.pkl', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Show the per-cell DataFrame (one row per bounding box); as the last
# expression of the cell it is rendered as the cell output.
df

In [None]:
# Show the dictionary mapping each image ID to its list of
# [ymin, ymax, xmin, xmax] bounding boxes.
bboxes_dict