# Cellpose Data Preparation

Say we have two images with IDs "as98d7fk3u" and "cvi8b8798s". Cellpose then
expects the data to be laid out in a single directory like this:

**./dir**
- as98d7fk3u.tif
- as98d7fk3u_masks.tif
- cvi8b8798s.tif
- cvi8b8798s_masks.tif


This notebook provides a general framework that you can adapt to your own data.


In [1]:
from skimage.transform import resize
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import numpy as np
import glob
import cv2
import os

from cellpose.utils import diameters

2021-12-30 10:24:01,795 [INFO] WRITING LOG OUTPUT TO /home/ryan/.cellpose/run.log


In [2]:
def rle_decode(rle_list, shape = (520, 704), arr_type = np.uint16):
    """
    Decode a list of RLE encoded masks into a single labelled
    instance mask.

    Each RLE string is a whitespace separated sequence of
    "start length" pairs. Starts are 1-based positions into the
    flattened (row-major) image. The i-th string of the list is
    painted with label i + 1 and 0 is left as background. Where
    masks overlap, the later mask overwrites the earlier one.

    Args:
        rle_list (list): List of RLE encoded masks (str) for
        a whole image.

        shape (tuple): (height, width) of the final image.

        arr_type (numpy dtype): dtype of the returned array.
        Labels are not range checked against this dtype.

    Returns:
        np.array of shape `shape` and dtype `arr_type`

    Raises:
        ValueError: If an RLE string has an odd number of
        tokens, i.e. a start without a matching length.

    """
    # Paint into a flat buffer; it is reshaped to 2-D only at the very end.
    flat_mask = np.zeros(shape[0] * shape[1], dtype = np.uint64)

    for label, rle in enumerate(rle_list, start = 1):
        np_rle = np.array(rle.split(), dtype = np.uint64)

        # Without this check NumPy may silently broadcast a short `lengths`
        # array against the starts and decode a wrong mask.
        if np_rle.size % 2 != 0:
            raise ValueError(
                f"RLE for mask {label} has an odd number of values "
                f"({np_rle.size}); expected 'start length' pairs."
            )

        first_indices = np_rle[0 : : 2] - 1   # 1-based start -> 0-based index
        lengths       = np_rle[1 : : 2]
        last_indices  = first_indices + lengths

        for first, last in zip(first_indices, last_indices):
            flat_mask[first : last] = label

    return flat_mask.reshape(shape).astype(arr_type)

In [9]:
# Input root of the raw data, and the folder that receives the
# Cellpose-formatted files.
data_dir = "../../data/"
to_dir   = "./cellpose_data_shsy5y"

# Every path under <data_dir>/train, sorted so the processing order is stable.
dir_list = sorted(
    glob.glob(os.path.join(data_dir, "train/*"))
)

# Make sure the output folder exists (no error if it already does).
os.makedirs(to_dir, exist_ok = True)

# Annotation table; later cells read its "id", "cell_type" and
# "annotation" columns.
df = pd.read_csv(os.path.join(data_dir, "train.csv"))

In [14]:
# Pool per-mask size values over every "astro" image and remember which
# images contributed. Both lists are reset here so the cell can be re-run.
diameters_ = []
ids = []

for file in dir_list:
    # Image id = file name without directory or extension. os.path copes with
    # whichever path separator glob returned on this platform.
    image_id  = os.path.splitext(os.path.basename(file))[0]
    df_id     = df[df["id"] == image_id]

    # All rows of one image share its cell type; each row holds one RLE mask.
    # NOTE(review): raises IndexError if an image has no rows in `df`.
    cell_type   = df_id["cell_type"].tolist()[0]
    annotations = df_id["annotation"].tolist()

    # Signature reminder: diameters(masks, omni=False, dist_threshold=1)
    if cell_type == "astro":
        # NOTE(review): element [1] of cellpose's `diameters` result looks
        # like sqrt(pixel count) per mask, i.e. not rescaled to an
        # equivalent-circle diameter -- confirm before using the mean as a
        # Cellpose `diameter` setting.
        per_mask_values = diameters(rle_decode(annotations))[1]
        diameters_.extend(per_mask_values.tolist())
        ids.append(image_id)

    # Export of image / mask pairs in the Cellpose layout (currently disabled):
    #if cell_type == "shsy5y":
    #    image = cv2.imread(file, cv2.IMREAD_GRAYSCALE).astype(np.uint16)
    #    mask  = rle_decode(annotations).astype(np.uint16)
    #
    #    mask = Image.fromarray(mask)
    #    mask.save(f"{to_dir}/{image_id}_masks.tif")
    #
    #    image = Image.fromarray(image)
    #    image.save(f"{to_dir}/{image_id}.tif")
    

In [13]:
# Mean of the per-mask values pooled from the astro images.
print(np.asarray(diameters_).mean())

14.225759641883307


In [15]:
# Ids of the images that contributed values to `diameters_`.
ids

['0140b3c8f445',
 '085eb8fec206',
 '08f52aa2add3',
 '0a6ecc5fe78a',
 '0c90b86742b2',
 '0e1e2b68fa58',
 '100681b6cc7a',
 '11c2e4fcac6d',
 '129f894abe35',
 '13325f865bb0',
 '1395c3f12b7c',
 '144a7a69f67d',
 '15aeb12e7a83',
 '174793807517',
 '17754cb5b287',
 '182c3da676bd',
 '1874b96fd317',
 '194f7e69779b',
 '1d2396667910',
 '1d618b80769f',
 '1de9612cb6e1',
 '1ea4e44e5497',
 '24a07145b24d',
 '26d58ec4353a',
 '279107cc7fe4',
 '296926b5656b',
 '29dfe87f3a44',
 '2be2ec84ac11',
 '2c2cb870da85',
 '2c7b7d0a1573',
 '2d9fd17da790',
 '2dbfcf0fc496',
 '34bd8ce0c802',
 '37dd4dd6e76e',
 '393c8540c6fa',
 '3bcc8ba1dc17',
 '41a1f09b4f4e',
 '4318b7f15a71',
 '45a1f06614f0',
 '45b966b60d4b',
 '47fb5fcff2de',
 '48383b66ebd5',
 '4984db4ec8f3',
 '4cd85ba270d0',
 '4de92f67c5b8',
 '52ea449bc02d',
 '52f65c9194c0',
 '541ab846bcb0',
 '541d7fd43b66',
 '549d34aaf226',
 '5507c81bdcb6',
 '551738f39e65',
 '58b543a236c8',
 '5a05cc33a43d',
 '5b0469b7bd04',
 '5c84424f601c',
 '5df720a4fad6',
 '6064a286cbf3',
 '619f91a5c197