![Segment multi-organ functional tissue units](https://storage.googleapis.com/kaggle-competitions/kaggle/34547/logos/header.png?t=2022-02-15-22-37-27) > Segment multi-organ functional tissue units

## Please upvote if you find it useful! Thanks

## **Elementary Data Exploration**

In [None]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import tifffile
from plotly import tools
from tqdm.auto import tqdm

In [None]:
# Root of the competition data and the two image folders inside it.
BASE_PATH = '../input/hubmap-organ-segmentation'
TRAIN_PATH, TEST_PATH = (
    os.path.join(BASE_PATH, folder) for folder in ('train_images', 'test_images')
)

# Report how many directory entries each image folder contains.
print(
    f'Total number of train images = {len(os.listdir(TRAIN_PATH))}'
)
print(
    f'Total number of test images = {len(os.listdir(TEST_PATH))}'
)

**Train masks**

In addition to the individual IDs for each picture, train.csv also includes an RLE-encoded version of the mask for the image's objects.

In [None]:
# Load train.csv from BASE_PATH into a DataFrame; read_image() later looks up
# the 'id' and 'rle' columns of this table.
df_train = pd.read_csv(os.path.join(BASE_PATH, 'train.csv'))

# Preview the first rows (rendered as the cell output).
df_train.head()

**Submission df**

In [None]:
# Load sample_submission.csv from BASE_PATH into a DataFrame.
df_sub = pd.read_csv(os.path.join(BASE_PATH, 'sample_submission.csv'))

# Display the whole table (rendered as the cell output).
df_sub

## Utility Functions

In [None]:
# https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode
def rle2mask(mask_rle, shape):
    """Decode a run-length-encoded string into a binary mask.

    mask_rle: whitespace-separated "start length" pairs, where each start is
        a 1-based position into the flattened mask.
    shape: 2-tuple used to reshape the flat buffer; the buffer is transposed
        afterwards, so the result has shape (shape[1], shape[0]).
    Returns a uint8 numpy array, 1 - mask, 0 - background.
    """
    tokens = mask_rle.split()

    # Even-positioned tokens are run starts, odd-positioned ones run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat[begin:end] = 1

    return flat.reshape(shape).T


def read_image(image_id, scale=None, verbose=1):
    """Load one training image together with its decoded segmentation mask.

    Args:
        image_id: value matched against the ``id`` column of ``df_train``;
            also the stem of the ``.tiff`` file read from ``TRAIN_PATH``.
        scale: optional downscale factor. When truthy, image and mask are
            both resized to ``(width // scale, height // scale)``.
        verbose: when truthy, print the array shapes (before and, if
            resized, after resizing).

    Returns:
        Tuple ``(image, mask)``. The mask is decoded with ``rle2mask`` and
        has the same first two dimensions as the image.

    Raises:
        IndexError: if ``image_id`` has no row in ``df_train``.
    """
    image = tifffile.imread(os.path.join(TRAIN_PATH, f"{image_id}.tiff"))

    # 5-D arrays are squeezed and moved to channels-last.
    # NOTE(review): assumes the squeezed array is (channels, rows, cols) — confirm.
    if image.ndim == 5:
        image = image.squeeze().transpose(1, 2, 0)

    encoded = df_train.loc[df_train['id'] == image_id, 'rle']
    if encoded.empty:
        raise IndexError(f"image_id {image_id!r} not found in df_train")

    # rle2mask transposes its result, so pass (cols, rows) to get (rows, cols).
    mask = rle2mask(encoded.values[0], (image.shape[1], image.shape[0]))

    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")
        print(f"[{image_id}] Mask shape: {mask.shape}")

    if scale:
        # cv2.resize takes the target size as (width, height).
        new_size = (image.shape[1] // scale, image.shape[0] // scale)
        image = cv2.resize(image, new_size)
        # Nearest-neighbour keeps the mask strictly binary; the default
        # interpolation blends labels along object borders.
        mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)

        if verbose:
            print(f"[{image_id}] Resized Image shape: {image.shape}")
            print(f"[{image_id}] Resized Mask shape: {mask.shape}")

    return image, mask


def read_test_image(image_id, scale=None, verbose=1):
    """Load one test image.

    Args:
        image_id: stem of the ``.tiff`` file read from ``TEST_PATH``.
        scale: optional downscale factor. When truthy, the image is resized
            to ``(width // scale, height // scale)``.
        verbose: when truthy, print the array shape (before and, if resized,
            after resizing).

    Returns:
        The image array.
    """
    # Read from TEST_PATH so this agrees with the directory the notebook
    # lists above; the previous hard-coded "test/" folder did not match it.
    image = tifffile.imread(os.path.join(TEST_PATH, f"{image_id}.tiff"))

    # 5-D arrays are squeezed and moved to channels-last.
    # NOTE(review): assumes the squeezed array is (channels, rows, cols) — confirm.
    if image.ndim == 5:
        image = image.squeeze().transpose(1, 2, 0)

    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")

    if scale:
        # cv2.resize takes the target size as (width, height).
        new_size = (image.shape[1] // scale, image.shape[0] // scale)
        image = cv2.resize(image, new_size)

        if verbose:
            print(f"[{image_id}] Resized Image shape: {image.shape}")

    return image


def plot_image_and_mask(image, mask, image_id):
    """Draw three side-by-side panels: the image, the image with the mask
    overlaid at 50% opacity, and the mask alone.

    image_id is only used in the panel titles.
    """
    # (title, draw the image?, extra imshow kwargs for the mask or None to skip it)
    panels = (
        (f"Image {image_id}", True, None),
        (f"Image {image_id} + mask", True, {"alpha": 0.5}),
        ("Mask", False, {}),
    )

    plt.figure(figsize=(16, 10))
    for position, (title, draw_image, mask_kwargs) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        if draw_image:
            plt.imshow(image)
        if mask_kwargs is not None:
            plt.imshow(mask, cmap="hot", **mask_kwargs)
        plt.title(title, fontsize=18)

    plt.show()
    
    
def plot_grid_image_with_mask(image, mask):
    """Show the largest centred square crop of the image with the mask
    overlaid at 50% opacity, without axes.
    """
    plt.figure(figsize=(16, 16))

    dim0, dim1 = image.shape[0], image.shape[1]
    side = min(dim0, dim1)

    # Centre the square window along each of the first two axes.
    off0 = (dim0 - side) // 2
    off1 = (dim1 - side) // 2
    window = (slice(off0, off0 + side), slice(off1, off1 + side))

    plt.imshow(image[window])
    plt.imshow(mask[window], cmap="hot", alpha=0.5)
    plt.axis("off")

    plt.show()
    

def plot_slice_image_and_mask(image, mask, start_h, end_h, start_w, end_w):
    """Draw a rectangular crop as three axis-less panels: image, image with
    mask overlay (50% opacity), and mask alone.

    The crop covers [start_h:end_h] on the first axis and [start_w:end_w] on
    the second axis of both arrays.
    """
    plt.figure(figsize=(16, 5))

    rows = slice(start_h, end_h)
    cols = slice(start_w, end_w)
    sub_image = image[rows, cols, :]
    sub_mask = mask[rows, cols]

    for panel in (1, 2, 3):
        plt.subplot(1, 3, panel)
        if panel != 3:
            plt.imshow(sub_image)
        if panel == 2:
            plt.imshow(sub_mask, cmap="hot", alpha=0.5)
        elif panel == 3:
            plt.imshow(sub_mask, cmap="hot")
        plt.axis("off")

    plt.show()

## Visualizations of Images and Masks

In [None]:
# Parallel lists holding the ids, pixel arrays and masks of the sampled images.
image_ids, images, masks = [], [], []

for item in tqdm(range(4)):
    # randint returns a length-1 array here; take its single element.
    # Each draw is independent, so the same row can be picked more than once.
    idx = np.random.randint(0, high=len(df_train), size=(1,))[0]
    image_id = df_train.iloc[idx]["id"]
    image, mask = read_image(image_id)

    image_ids.append(image_id)
    images.append(image)
    masks.append(mask)

In [None]:
# Show the sampled images in a 2x2 grid without axes.
plt.figure(figsize=(16, 16))
for ind, (image_id, image) in enumerate(zip(image_ids, images)):
    # subplot positions are 1-based, enumerate is 0-based
    plt.subplot(2, 2, ind + 1)
    plt.imshow(image)
    plt.axis("off")

In [None]:
# Same 2x2 grid, with each mask drawn over its image at 50% opacity.
plt.figure(figsize=(16, 16))
for ind, (image_id, image, mask) in enumerate(zip(image_ids, images, masks)):
    # subplot positions are 1-based, enumerate is 0-based
    plt.subplot(2, 2, ind + 1)
    plt.imshow(image)
    plt.imshow(mask, cmap="hot", alpha=0.5)
    plt.axis("off")

## **Metadata Analysis**

The plotting approach in this section is adapted from [this HuBMAP + HPA exploratory data analysis notebook](https://www.kaggle.com/code/rhtsingh/hubmap-hpa-exploratory-data-analysis).

In [None]:
def bar_hor(df, col, title, color, w=None, h=None, lm=0, limit=100, return_trace=False, rev=False, xlb = False):
    """Horizontal bar chart of the value counts of ``df[col]``.

    df, col: DataFrame and the column whose values are counted.
    title, w, h, lm: figure title, width, height and left margin.
    color: passed through as the bar marker colour.
    limit: number of categories kept from the value counts.
    rev: when truthy keep the last ``limit`` categories of the value counts
        instead of the first ``limit``.
    xlb: when truthy, used as the y-axis labels in place of the category names.
    return_trace: when truthy, return the ``go.Bar`` trace instead of
        building and showing a figure.
    """
    counts = df[col].value_counts()
    kept = counts.tail(limit) if rev else counts.head(limit)

    # Both axes are taken in reversed order of the kept value counts.
    categories = kept.index[::-1]
    frequencies = kept.values[::-1]

    labels = xlb if xlb else categories
    trace = go.Bar(y=labels, x=frequencies, orientation = 'h', marker=dict(color=color))
    if return_trace:
        return trace

    figure = go.Figure(
        data=[trace],
        layout=dict(title=title, margin=dict(l=lm), width=w, height=h),
    )
    figure.show()

In [None]:
# Bar chart of row counts per data_source value; xlb replaces the category
# names on the y axis with custom labels.
# NOTE(review): xlb is applied positionally, so the labels assume the order in
# which bar_hor emits the categories — confirm against the data.
bar_hor(df_train, 'data_source', 'Data Source Distribution', ["#F1948A", '#ABEBC6'], h=350, w=600, lm=200, xlb = ['Source : HPA','Target : HuBMAP'])

In [None]:
def gp(df, col, title):
    """Build two bar traces giving, for each value of ``col``, the percentage
    of rows with that value that come from each data source.

    Args:
        df: DataFrame with a ``data_source`` column and the column ``col``.
        col: name of the column to break down.
        title: unused; kept so existing callers keep working.

    Returns:
        ``(hpa_trace, hubmap_trace)`` — ``go.Bar`` traces for the rows whose
        ``data_source`` is ``"HPA"`` and ``"HuBMAP"`` respectively.
    """
    # Rows per category over the whole frame: the denominator of each share.
    totals = df[col].value_counts().to_dict()

    def _share_trace(source, name, color):
        """Return the percentage-share bar trace for one data source."""
        counts = df.loc[df["data_source"] == source, col].value_counts()
        shares = [
            float(count) * 100 / totals[category]
            for category, count in counts.items()
        ]
        return go.Bar(x=counts.index, y=shares, name=name, marker=dict(color=color))

    # Trace names follow the labels used for the data-source chart instead of
    # the former 'Target : 1' / 'Target : 0', which did not describe the data.
    trace1 = _share_trace("HPA", 'Source : HPA', "#96D38C")
    trace2 = _share_trace("HuBMAP", 'Target : HuBMAP', "#FEBFB3")
    return trace1, trace2

In [None]:
# One row of three bar charts: overall counts per sex, then the per-sex share
# of rows coming from each data source.
tr0 = bar_hor(df_train, "sex", "Distribution of Gender Variable", "#f975ae", w=700, lm=100, return_trace=True)
tr1, tr2 = gp(df_train, 'sex', 'Distribution of Source with Gender')

# gp returns the HPA trace first and the HuBMAP trace second, so the middle
# panel is titled HPA (the previous "HMAP" title named neither source).
fig = tools.make_subplots(
    rows=1,
    cols=3,
    print_grid=False,
    subplot_titles=["Gender Distribution", "Gender, Data Source=HPA", "Gender, Data Source=HuBMAP"],
)
fig.append_trace(tr0, 1, 1)
fig.append_trace(tr1, 1, 2)
fig.append_trace(tr2, 1, 3)
fig['layout'].update(height=350, showlegend=False, margin=dict(l=50))
fig.show()

In [None]:
# Bar chart of row counts per organ; return_trace=False makes bar_hor build
# and show the figure itself.
bar_hor(df_train, "organ", "Organ Type Distribution" ,"#f975ae", w=700, lm=100, return_trace= False)

In [None]:
# Distribution plot of the 'age' column of the training table.
plt.figure(figsize=(12,5))
plt.title("Distribution of Age")
# NOTE(review): distplot is reported as deprecated in recent seaborn releases —
# confirm against the installed version before relying on it.
ax = sns.distplot((df_train["age"]))

In [None]:
# Age density curves, one per value of the 'sex' column.
plt.figure(figsize=(12, 5))
plt.style.use('default')

sns.kdeplot(df_train['age'][df_train['sex'] == "Male"], label='Gender == Male')
sns.kdeplot(df_train['age'][df_train['sex'] == "Female"], label='Gender == Female')

plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
plt.legend()