# 1. Data Exploration

* The dataset comes from the following Kaggle competition:
* https://www.kaggle.com/c/sartorius-cell-instance-segmentation/data



In [None]:
!pip install pycocotools

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython import display
from PIL import Image, ImageEnhance
import matplotlib.image as mpimg
import json,itertools
import skimage.io as io
from pathlib import Path
from pycocotools.coco import COCO


# Training Dataframe
* Previewing the training data
* Creating a function to display an image from the files provided 

In [None]:
# Load the competition's train.csv into a DataFrame.
df = pd.read_csv('../input/sartorius-cell-instance-segmentation/train.csv')
# Preview the first rows (shown as the cell output).
df.head()
#print(df.loc[0, 'annotation'])


In [None]:
def displayImage(id,folder):
    """Show one competition image as a grayscale matplotlib plot.

    Args:
        id: Image identifier, i.e. the file name without the ``.png`` suffix.
        folder: Sub-folder of the competition input directory that holds the
            image (for example ``'train'``).
    """
    # NOTE: `id` shadows the builtin of the same name; the parameter name is
    # kept so that existing keyword callers keep working.
    imgPath = "../input/sartorius-cell-instance-segmentation/" + folder + "/" + id + '.png'
    img = mpimg.imread(imgPath)
    plt.imshow(img,cmap="gray")


displayImage('0a6ecc5fe78a','train')

# 2. Data Preprocessing 

COCO is an object detection, segmentation, and captioning dataset. Its annotation format is well suited to describing cells, and it is the format in which Detectron2 accepts data for image segmentation. The focus of this part of the notebook is to ensure that the data is correctly organized in the COCO format.

* https://www.kaggle.com/coldfir3/efficient-coco-dataset-generator
* https://towardsdatascience.com/how-to-work-with-object-detection-datasets-in-coco-format-9bf4fb5848a4

More links about COCO Dataset

* https://cocodataset.org/#home
* https://github.com/cocodataset/cocoapi 
* https://www.kaggle.com/eigrad/convert-rle-to-bounding-box-x0-y0-x1-y1

In [None]:
#Expected output structure
'''
{
    "categories": [
        {
            "name": "shsy5y",
            "id": 1
        },
        {
            "name": "astro",
            "id": 2
        },
        {
            "name": "cort",
            "id": 3
        }
    ],
    "images": [
        {
            "id": "0030fd0e6378",
            "width": 704,
            "height": 520,
            "file_name": "train/0030fd0e6378.png"
        },
    ],
    "annotations": [
        {
            "segmentation": {
                "counts": [
                    299687,
                    7,
                    513,
                    19,
                    501,
                    25,
                    495,
                    .
                    .
                    .
                ], 
                "size": [
                    520,
                    704
                ]
            }, 
            "bbox": [
                679,
                149,
                25,
                37
            ], 
            "area": 420, 
            "image_id": "ffc2ead3e8cc", 
            "category_id": 0,
            "iscrowd": 0, 
            "id": 73506
        }
    ]
}
'''

# Create a mask on the area where a cell exists

* The purpose of this function is to return a binary mask in which pixels belonging to the cell are marked as 1 and background pixels are marked as 0
* The RLE is decoded by taking the start and the length of each run of pixels and filling that run with 1s to denote that the cell is present
    
Sample Annotation input

118145 6 118849 7 119553 8 120257 8 120961 9 121665 10 122369 12 123074 13 123778 14 124482 15 125186 16 125890 17 126594 18 127298 19 128002 20 128706 21 129410 22 130114 23 130818 24 131523 24 132227 25 132931 25 133635 24 134339 24 135043 23 135748 21 136452 19 137157 16 137864 11 138573 4


In [None]:
def covertRLE(annotation,height,width):
    """Decode a run-length-encoded annotation string into a binary mask.

    Args:
        annotation: Space-separated integers read as alternating
            ``start length`` pairs. Starts are 1-based positions in the
            flattened (row-major) image.
        height: Number of rows of the output mask.
        width: Number of columns of the output mask.

    Returns:
        A ``(height, width)`` ``uint8`` array holding 1 inside every run and
        0 everywhere else.
    """
    tokens = np.asarray(annotation.split(), dtype=int)
    # Even positions hold the starts (shifted to 0-based), odd positions the lengths.
    run_starts = tokens[0::2] - 1
    run_stops = run_starts + tokens[1::2]

    flat_mask = np.zeros(width * height, dtype=np.uint8)
    for lo, hi in zip(run_starts, run_stops):
        flat_mask[lo:hi] = 1

    # Rows first, columns second.
    return flat_mask.reshape(height, width)
    

# Decode one annotation from the training frame and plot the resulting mask.
singleCell = df.loc[2534, 'annotation']
#print(singleCell)
#print(df.loc[2534, 'id'])
# Mask dimensions handed to the decoder: number of rows, then number of columns.
height = 520
width = 704
convertedImg = covertRLE(singleCell,height,width)

# Show the decoded mask in grayscale.
plt.imshow(convertedImg,cmap='gray');
#print(convertedImg)
#ansArr = createImgMask(singleAnn,width,height)
#print(ansArr.shape)


# Count the number of pixels that have cells
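* This function returns the mask as alternating run lengths: the number of 0s, then the number of 1s, then 0s again, and so on, counted in column-major order (a leading 0 is emitted when the mask starts with a 1)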

 https://newbedev.com/encode-numpy-array-using-uncompressed-rle-for-coco-dataset

In [None]:
def getCountsOfMask(binary_mask):
    """Encode a mask as uncompressed run-length counts.

    The mask is flattened in column-major (Fortran) order and the lengths of
    the consecutive runs of equal values are collected. If the first pixel is
    1, a leading 0 is emitted so that the first count always refers to zeros.

    Args:
        binary_mask: 2D array (or array-like) of 0/1 values.

    Returns:
        ``{'counts': [...], 'size': [rows, cols]}`` where ``counts`` is a list
        of plain Python ints.
    """
    mask = np.asarray(binary_mask)
    rle = {'counts': [], 'size': list(mask.shape)}

    flat = mask.ravel(order='F')
    if flat.size == 0:
        return rle

    # Positions at which the value differs from the previous pixel mark the
    # start of a new run; computed in NumPy instead of a per-pixel Python loop.
    run_starts = np.flatnonzero(flat[1:] != flat[:-1]) + 1
    boundaries = np.concatenate(([0], run_starts, [flat.size]))
    counts = np.diff(boundaries).tolist()

    # The counts must begin with the number of zeros.
    if flat[0] == 1:
        counts.insert(0, 0)

    rle['counts'] = counts
    return rle


# Small single-row mask used to check the encoder: ones at columns 5 and 9.
test_binary_mask =  np.array([[  0,   0,   0,   0,   0,  1,  0,   0,   0,   1]], dtype=np.uint8)

# Expected encoding of the mask above:
#{'counts': [5, 1, 3, 1], 'size': [1, 10]}

'''
test_binary_mask =  np.array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
                                     [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
                                     [  0,   0,   0,   0,   0,   1,   1,   1,   0,   0],
                                     [  0,   0,   0,   0,   0,   1,   1,   1,   0,   0],
                                     [  0,   0,   0,   0,   0,   1,   1,   1,   0,   0],
                                     [  0,   0,   0,   0,   0,   1,   1,   1,   0,   0],
                                     [  1,   0,   0,   0,   0,   0,   0,   0,   0,   0],
                                     [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
                                     [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=np.uint8)

'''

# Print the run-length encoding of the test mask for a visual check.
print(getCountsOfMask(test_binary_mask))

# Convert the Dataframe into a readable .json file

* This function will create a category map of all the types of cells that are 
present in the training file.


In [None]:
def createCategoryMapByCellType(train_df):
    """Build the COCO ``categories`` list from the cell types in a dataframe.

    Args:
        train_df: DataFrame with a ``cell_type`` column.

    Returns:
        A list of ``{'name': <cell type>, 'id': <int>}`` dicts, one per
        distinct cell type, in the sorted order produced by ``np.unique``.
        Ids start at 1.
    """
    uniqueCellCategories = np.unique(train_df['cell_type'].values)
    # np.unique already removes duplicates, so no membership check is needed.
    return [
        {'name': cellCategory, 'id': category_id}
        for category_id, cellCategory in enumerate(uniqueCellCategories, start=1)
    ]

def getCategoryID(cat_name,all_cats):
    """Look up the id of the category called ``cat_name``.

    Args:
        cat_name: Category name to search for.
        all_cats: Iterable of dicts carrying ``'name'`` and ``'id'`` keys.

    Returns:
        The ``'id'`` of the first entry whose ``'name'`` equals ``cat_name``,
        or ``None`` when no entry matches.
    """
    matching_ids = (entry['id'] for entry in all_cats if entry['name'] == cat_name)
    return next(matching_ids, None)
    
# Try the category helpers on a small positional slice of the dataframe.
sample_rows = df.iloc[73506:73512,:]
# The category ids here are derived only from the cell types present in the slice.
cat_map = createCategoryMapByCellType(sample_rows)
get_id = getCategoryID('astro',cat_map)
#print(sample_rows)
#print(cat_map)
#print(get_id)

# Function that builds the COCO .json file

In [None]:
from tqdm.notebook import tqdm
from pycocotools import mask as maskUtils
from joblib import Parallel, delayed

def createCocoStructure(train_df):
    """Convert annotation rows into a COCO-style dictionary.

    Args:
        train_df: DataFrame with one annotation per row and the columns
            ``id`` (image id), ``annotation`` (RLE string), ``width``,
            ``height`` and ``cell_type``.

    Returns:
        A dict with three keys: ``'categories'`` (one entry per cell type),
        ``'images'`` (one entry per distinct image id) and ``'annotations'``
        (one entry per row, with RLE segmentation, bounding box and area).

    Raises:
        ValueError: If a row's annotation decodes to an empty mask, because
            no bounding box can be computed for it.
    """
    cat_map = createCategoryMapByCellType(train_df)
    all_cell_arr = {
        'categories' : cat_map,
        'images' : [],
        'annotations':[]
    }
    # Several rows refer to the same image; each image is registered only once.
    seen_image_ids = set()

    # Passing the total lets tqdm show real progress instead of a bare counter.
    for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
        row_id = row.id
        if row_id not in seen_image_ids:
            seen_image_ids.add(row_id)
            all_cell_arr['images'].append({
                'id':row_id,
                'width' : int(row.width),
                'height' : int(row.height),
                'file_name': 'train/'+row_id+'.png'
            })

        mask = covertRLE(row.annotation, row.height, row.width)

        # Bounding box around the mask, as [x, y, width, height].
        ys, xs = np.nonzero(mask)
        if xs.size == 0:
            raise ValueError(
                'annotation %r of image %r decodes to an empty mask' % (idx, row_id)
            )
        x1, x2 = xs.min(), xs.max()
        y1, y2 = ys.min(), ys.max()

        all_cell_arr['annotations'].append({
            'segmentation' : getCountsOfMask(mask),
            'bbox' : [int(x1),int(y1),int(x2-x1+1),int(y2-y1+1)],
            'area':int(np.sum(mask)),
            'image_id':row_id,
            'category_id': getCategoryID(row.cell_type,cat_map),
            'iscrowd' : 0,
            'id' :idx
        })

    return all_cell_arr
    
#sample_rows = df.iloc[73506:73512,:]
#print(sample_rows)
#root = createCocoStructure(sample_rows) 

#Save the test file
#with open('annotations_sample_test.json', 'w', encoding='utf-8') as f:
#    json.dump(root, f, ensure_ascii=True, indent=4)
#print(root)


# Split the Data to Train and Test Sets 
Since there are 73585 rows (58868 + 14717), the data can be split as
1. 80% Training - 58868
2. 20% Testing - 14717


In [None]:
 
from sklearn.model_selection import train_test_split
# A fixed seed makes the split reproducible, so regenerated annotation files
# always contain the same rows.
# NOTE(review): the split is made per row, so rows that share the same image
# `id` may be divided between both sets - confirm whether a per-image split is
# intended.
df_train, df_test = train_test_split(df, test_size=0.20, stratify=df['cell_type'], random_state=42)
df_train.head()
#print(df_train.shape)


In [None]:
# Preview the first rows of the 20% split.
df_test.head()
#print(df_test.shape)

In [None]:
def createAndSaveFile(df,fileName):
    """Build the COCO structure for ``df`` and write it to ``<fileName>.json``.

    Args:
        df: DataFrame handed to ``createCocoStructure``.
        fileName: Output file name without the ``.json`` extension.
    """
    target = fileName + '.json'
    payload = createCocoStructure(df)
    # UTF-8 file, ASCII-escaped JSON, pretty-printed with a 4-space indent.
    with open(target, 'w', encoding='utf-8') as out:
        json.dump(payload, out, ensure_ascii=True, indent=4)
    

In [None]:
#createAndSaveFile(df_train,'annotations_train_final')
#createAndSaveFile(df_test,'annotations_test_final')

In [None]:
#View Test file - increase number of lines to see more 
!head -n 10 ../input/annotations-test-final/annotations_test_final.json

# Ensure coco dataset is able to categorize images
* Each annotation in the dataset describes a single cell, which is why the iscrowd attribute was set to 0
* Now we can see if Coco is able to identify the other cell pixels based on the data of that one cell segmentation
* Loading 10 images to check the box outlines

# The following API functions are defined:
* COCO       - COCO API class that loads a COCO annotation file and prepares data structures.
* decodeMask - Decode binary mask M encoded via run-length encoding.
* encodeMask - Encode binary mask M using run-length encoding.
* getAnnIds  - Get ann ids that satisfy given filter conditions.
* getCatIds  - Get cat ids that satisfy given filter conditions.
* getImgIds  - Get img ids that satisfy given filter conditions.
* loadAnns   - Load anns with the specified ids.
* loadCats   - Load cats with the specified ids.
* loadImgs   - Load imgs with the specified ids.
* annToMask  - Convert segmentation in an annotation to binary mask.
* showAnns   - Display the specified annotations.
* loadRes    - Load algorithm results and create API for accessing them.
* download   - Download COCO images from mscoco.org server.

https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py

In [None]:
 
# Load the generated training annotations and draw the annotations of the
# last 10 images next to the raw images as a visual sanity check.
dataDir=Path('../input/sartorius-cell-instance-segmentation')
trFile = Path('../input/annotations-train-final/annotations_train_final.json')
coco = COCO(trFile)
imgIds = coco.getImgIds()

# Only the last 10 image records are needed, so only those are loaded.
imgs = coco.loadImgs(imgIds[-10:])
# squeeze=False keeps `axs` two-dimensional even when a single image is
# loaded, so every row of `axs` is always a (left, right) pair of axes.
_,axs = plt.subplots(len(imgs),2,figsize=(40,15 * len(imgs)),squeeze=False)
for img, ax in zip(imgs, axs):
    I = io.imread(dataDir/img['file_name'])
    annIds = coco.getAnnIds(imgIds=[img['id']])
    anns = coco.loadAnns(annIds)
    # Left: raw image. Right: the same image with the annotations drawn on top.
    ax[0].imshow(I)
    ax[1].imshow(I)
    plt.sca(ax[1])
    coco.showAnns(anns, draw_bbox=True)