# Data Explore
The dataset we use contains the training images and their labels. Detailed information is shown below.

## Task
Our task is to detect and delineate distinct objects of interest in biological images depicting neuronal cell types commonly used in the study of neurological disorders.

In [None]:
import os
import cv2

import numpy as np 
import pandas as pd

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

# Load the training label table (train.csv) into a DataFrame.
df_train = pd.read_csv('../input/sartorius-cell-instance-segmentation/train.csv')

### 1. Label
The metadata, i.e. the labels, is given in the `train.csv` file, which contains 2 numerical features and 7 categorical ones. The dataset is well designed and contains no empty values.

In [None]:
# Preview the first rows of the label table.
df_train.head()

In [None]:
# Print the column dtypes and non-null counts of the label table.
df_train.info()

### 1.1 Image Information
According to `train.csv`, all the images have the same shape, *704 x 520*, so there is no variable-resolution problem. However, there are only 606 images in the train set. The number of rows in this file is far greater than 606, which indicates that a single image contains more than one instance.

In [None]:
# Count the distinct values of the `id` column, i.e. the number of images.
print(f'Number of images: {df_train.id.nunique()}')

The number of instances differs from image to image, varying from 4 to 790.

In [None]:
fig, ax = plt.subplots()

# Count the rows of the label table per image id and sort ascending so the
# bar chart rises monotonically from left to right.
ninstances_per_image = df_train['id'].value_counts().sort_values()
# Replace the image ids with positional ranks. The length is taken from the
# data itself rather than a hard-coded image count, so the cell keeps working
# if the table changes.
ninstances_per_image.index = range(len(ninstances_per_image))
# Report the median explicitly; a bare expression in the middle of a cell
# is evaluated but never displayed.
print(f'Median instances per image: {ninstances_per_image.median()}')
ninstances_per_image.plot.bar(ax=ax)

# One bar per image: individual tick labels would be unreadable.
ax.set_xticklabels([])
ax.set_xlabel('Images')
ax.set_ylabel('Number of Instances')
plt.show()

Observing this data, it is easy to see that each image is associated with a single cell type. The types are cort (neurons), shsy5y (neuroblastoma) and astro (astrocytes).

In [None]:
fig, ax = plt.subplots()

# Collapse the table to one entry per (image id, cell type) pair, then count
# how many of those entries fall under each cell type and draw the counts.
(
    df_train
    .groupby(['id', 'cell_type'])['cell_type']
    .first()
    .value_counts()
    .plot.bar(ax=ax)
)

ax.set_xlabel('Cell Types')
ax.set_ylabel('Number of Images')
fig.tight_layout()
plt.show()

## 2. Image
Here we show 3 images of each cell type.

In [None]:
def decode_rle_mask(rle_mask, shape):
    """Decode a run-length-encoded annotation into a binary mask.

    Args:
        rle_mask: Whitespace-separated string of alternating
            ``start length`` integers. Starts are 1-based offsets into
            the flattened image.
        shape: Sequence whose first two entries are the mask height and
            width; any further entries are ignored.

    Returns:
        A ``np.uint8`` array of shape ``(shape[0], shape[1])`` holding 1
        inside every encoded run and 0 elsewhere.

    Raises:
        ValueError: If ``rle_mask`` holds an odd number of tokens, i.e. a
            start without a matching length.
    """
    tokens = rle_mask.split()
    if len(tokens) % 2:
        # Without this check a lone trailing start can broadcast against a
        # single length and silently produce a wrong mask.
        raise ValueError(
            f'RLE mask must contain start/length pairs, got {len(tokens)} tokens'
        )

    # Even positions are starts, odd positions are run lengths.
    starts = np.asarray(tokens[0::2], dtype=int) - 1  # 1-based -> 0-based
    lengths = np.asarray(tokens[1::2], dtype=int)
    ends = starts + lengths

    mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for start, end in zip(starts, ends):
        mask[start:end] = 1

    # Runs index the flattened image, so fold back to 2-D at the end.
    return mask.reshape(shape[0], shape[1])

def visualize_image(df, image_id):
    """Show one training image next to the same image overlaid with its masks.

    Args:
        df: Label table providing the columns ``id``, ``cell_type``,
            ``plate_time``, ``sample_date``, ``sample_id`` and
            ``annotation``.
        image_id: Value of the ``id`` column selecting the image; it is
            also used as the PNG file name under the train directory.

    Raises:
        IndexError: If no row of ``df`` has this ``image_id``.
        FileNotFoundError: If the image file cannot be read.
    """
    # Filter once and reuse the result instead of re-evaluating the same
    # boolean mask for every column.
    rows = df.loc[df['id'] == image_id]
    image_path = rows['id'].values[0]
    cell_type = rows['cell_type'].values[0]
    plate_time = rows['plate_time'].values[0]
    sample_date = rows['sample_date'].values[0]
    sample_id = rows['sample_id'].values[0]

    file_path = f'../input/sartorius-cell-instance-segmentation/train/{image_path}.png'
    image = cv2.imread(file_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise FileNotFoundError(f'Could not read image: {file_path}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    fig, axes = plt.subplots(figsize=(10, 10), ncols=2)
    fig.tight_layout(pad=5.0)

    axes[0].imshow(image, cmap='gray')

    # Union of all instance masks, accumulated in place so that only one
    # decoded mask is held in memory at a time.
    mask = np.zeros(image.shape, dtype=bool)
    for annotation in rows['annotation'].values:
        mask |= decode_rle_mask(rle_mask=annotation, shape=image.shape) == 1
    axes[1].imshow(image, cmap='gray')
    axes[1].imshow(mask, alpha=0.4)

    for ax in axes:
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.tick_params(axis='x', labelsize=10, pad=10)
        ax.tick_params(axis='y', labelsize=10, pad=10)

    axes[0].set_title(f'{image_path} - {cell_type} Annotations\n{plate_time} - {sample_date} - {sample_id}', fontsize=10, pad=12)
    axes[1].set_title('Segmentation Mask', fontsize=10, pad=12)
    plt.show()
    plt.close(fig)

#### astro

In [None]:
# The label table can hold many rows for the same image id, so de-duplicate
# the ids before sampling; a single draw of 3 then yields 3 distinct images.
select_image_ids = (
    df_train.loc[df_train['cell_type'] == 'astro', 'id']
    .drop_duplicates()
    .sample(3)
    .to_list()
)

for image_id in select_image_ids:
    visualize_image(df=df_train, image_id=image_id)

#### cort

In [None]:
# The label table can hold many rows for the same image id, so de-duplicate
# the ids before sampling; a single draw of 3 then yields 3 distinct images.
select_image_ids = (
    df_train.loc[df_train['cell_type'] == 'cort', 'id']
    .drop_duplicates()
    .sample(3)
    .to_list()
)

for image_id in select_image_ids:
    visualize_image(df=df_train, image_id=image_id)

#### shsy5y

In [None]:
# The label table can hold many rows for the same image id, so de-duplicate
# the ids before sampling; a single draw of 3 then yields 3 distinct images.
select_image_ids = (
    df_train.loc[df_train['cell_type'] == 'shsy5y', 'id']
    .drop_duplicates()
    .sample(3)
    .to_list()
)

for image_id in select_image_ids:
    visualize_image(df=df_train, image_id=image_id)

* `astro` instances are the largest
* `cort` instances are smaller than those of the other cell types and roughly circular
* `shsy5y` instances are slightly bigger and more abundant than `cort` instances