### Imports

In [None]:
%%time
%autosave 60
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import gc
gc.enable();
import glob
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import plotly
from plotly import tools
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline as pyo
import plotly.io as pio
import plotly.graph_objects as go
pio.templates.default = 'plotly_white'

import cv2
import tifffile

import warnings
warnings.simplefilter("ignore")

### Paths

In [None]:
# Locations of the competition data, all relative to the dataset root.
ROOT = Path('../input/hubmap-organ-segmentation/')
TRAIN_CSV_PATH = ROOT / "train.csv"
TRAIN_IMAGES_PATH = ROOT / "train_images/"
TRAIN_ANNOTATIONS_PATH = ROOT / "train_annotations/"

# print([x for x in list(ROOT.iterdir())])

# `squeeze=True` was dropped: the keyword is deprecated in pandas 1.4 and
# removed in pandas 2.0, and it only has an effect on single-column input.
dataframe = pd.read_csv(TRAIN_CSV_PATH, low_memory=False)

# assert len(list(TRAIN_IMAGES_PATH.glob("*"))) == dataframe.shape[0]

dataframe.head()

### Utils

In [None]:
def rle2mask(mask_rle, shape):
    """Decode a run-length-encoded string into a binary mask.

    Args:
        mask_rle: Whitespace-separated string of alternating
            ``start length`` tokens, where ``start`` is 1-based.
        shape: Two-element sequence; the flat buffer is reshaped to
            ``shape`` and then transposed.

    Returns:
        ``np.uint8`` array of shape ``(shape[1], shape[0])`` holding 1 inside
        the encoded runs and 0 elsewhere.
    """
    tokens = mask_rle.split()
    # Even positions are run starts (shifted to 0-based), odd positions are lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat[begin:end] = 1

    # The runs index the flat buffer in the order given by `shape`; the
    # transpose swaps the two axes of the final mask.
    return flat.reshape(shape).T


def read_image(image_id):
    """Read one training image and decode its mask.

    The TIFF is loaded from ``TRAIN_IMAGES_PATH`` and the RLE string is taken
    from the first row of the module-level ``dataframe`` whose ``id`` equals
    ``image_id``.

    Args:
        image_id: Value matching the ``id`` column and the TIFF file stem.

    Returns:
        ``(image, mask)`` tuple; the mask comes from ``rle2mask``.
    """
    tiff_path = TRAIN_IMAGES_PATH / f"{image_id}.tiff"
    image = tifffile.imread(str(tiff_path))

    # 5-D arrays are squeezed and their first remaining axis is moved last.
    # Presumably this turns a channel-first layout into (H, W, C) -- confirm.
    if image.ndim == 5:
        image = image.squeeze().transpose(1, 2, 0)

    rle = dataframe.loc[dataframe["id"] == image_id, "rle"].values[0]
    # rle2mask transposes its result, so the two leading axes are passed swapped.
    mask = rle2mask(rle, (image.shape[1], image.shape[0]))

    return image, mask


def plot_image_and_mask(image, mask, image_id):
    """Show an image, the image with its mask overlaid, and the mask alone.

    Args:
        image: Array accepted by ``imshow``.
        mask: Array accepted by ``imshow``; drawn with the "hot" colormap.
        image_id: Identifier used in the first two panel titles.
    """
    plt.figure(figsize=(16, 10))
    title_font = 18

    # Panel 1: the raw image.
    raw_ax = plt.subplot(1, 3, 1)
    raw_ax.imshow(image)
    raw_ax.set_title(f"Image {image_id}", fontsize=title_font)

    # Panel 2: the mask drawn half-transparent on top of the image.
    overlay_ax = plt.subplot(1, 3, 2)
    overlay_ax.imshow(image)
    overlay_ax.imshow(mask, cmap="hot", alpha=0.5)
    overlay_ax.set_title(f"Image {image_id} + mask", fontsize=title_font)

    # Panel 3: the mask on its own.
    mask_ax = plt.subplot(1, 3, 3)
    mask_ax.imshow(mask, cmap="hot")
    mask_ax.set_title("Mask", fontsize=title_font)

    plt.show()

### Load images and masks

In [None]:
N_SAMPLES = 9  # the grids plotted later in this notebook are 3x3
SEED = 42      # fixed so a fresh "Restart & Run All" loads the same samples

# Draw distinct row positions (no replacement) so the same image cannot
# appear twice; cap the count in case the frame has fewer rows.
rng = np.random.default_rng(SEED)
sample_rows = rng.choice(
    len(dataframe), size=min(N_SAMPLES, len(dataframe)), replace=False
)

image_ids, images, masks = [], [], []

for idx in tqdm(sample_rows, total=len(sample_rows), position=0, desc="Reading image and mask"):
    image_id = dataframe["id"].iloc[idx]
    image, mask = read_image(image_id)

    images.append(image)
    masks.append(mask)
    image_ids.append(image_id)

### Plot images

In [None]:
# 3x3 grid of the loaded images, one panel per sample, axes hidden.
grid = plt.figure(figsize=(16, 16))
for slot, (image_id, image) in enumerate(zip(image_ids, images), start=1):
    panel = grid.add_subplot(3, 3, slot)
    panel.imshow(image)
    panel.axis("off")

### Plot images with masks

In [None]:
# 3x3 grid of the loaded images with their masks overlaid half-transparent.
grid = plt.figure(figsize=(16, 16))
for slot, (image_id, image, mask) in enumerate(zip(image_ids, images, masks), start=1):
    panel = grid.add_subplot(3, 3, slot)
    panel.imshow(image)
    panel.imshow(mask, cmap="hot", alpha=0.5)
    panel.axis("off")

### Single Sample Analysis

#### Sample 1

In [None]:
# First loaded sample: image, overlay and mask side by side.
plot_image_and_mask(image=images[0], mask=masks[0], image_id=image_ids[0])

#### Sample 2

In [None]:
# Second loaded sample; the id index must match the image/mask index so the
# figure title names the image that is actually shown.
plot_image_and_mask(images[1], masks[1], image_ids[1])

### Metadata Analysis

In [None]:
def bar_hor(df, col, title, color, w=None, h=None, lm=0, limit=100, return_trace=False, rev=False, xlb = False):
    """Horizontal bar chart of the value counts of ``df[col]``.

    Args:
        df: DataFrame holding the column to count.
        col: Column name passed to ``value_counts``.
        title: Figure title (used only when the figure is shown).
        color: Marker color(s) for the bars.
        w, h: Figure width and height.
        lm: Left margin of the figure.
        limit: Number of categories kept (most frequent, or least frequent
            when ``rev`` is true).
        return_trace: When true, return the ``go.Bar`` trace instead of
            showing a figure.
        rev: Take the tail of the counts instead of the head.
        xlb: Optional replacement for the bar labels; used when truthy.

    Returns:
        The ``go.Bar`` trace when ``return_trace`` is true, otherwise ``None``.
    """
    counts = df[col].value_counts()
    kept = counts.tail(limit) if rev else counts.head(limit)

    # Reversed so the first kept category ends up at the top of the chart.
    bar_values = kept.values[::-1]
    bar_labels = xlb if xlb else kept.index[::-1]

    trace = go.Bar(y=bar_labels, x=bar_values, orientation = 'h', marker=dict(color=color))
    if return_trace:
        return trace

    figure = go.Figure(
        data=[trace],
        layout=dict(title=title, margin=dict(l=lm), width=w, height=h),
    )
    figure.show()

def bar_hor_noagg(x, y, title, color, w=None, h=None, lm=0, limit=100, rt=False):
    """Horizontal bar chart from pre-computed values (no aggregation).

    Args:
        x: Values placed on the vertical axis of the chart.
        y: Values placed on the horizontal axis of the chart.
        title: Figure title (used only when the figure is shown).
        color: Marker color(s) for the bars.
        w, h: Figure width and height.
        lm: Left margin of the figure.
        limit: Accepted but not used by this function.
        rt: When true, return the trace instead of showing a figure.

    Returns:
        The ``go.Bar`` trace when ``rt`` is true, otherwise ``None``.
    """
    # The bars are horizontal, so `x` feeds the y-axis and `y` the x-axis.
    bar = go.Bar(y=x, x=y, orientation = 'h', marker=dict(color=color))
    if not rt:
        figure = go.Figure(
            data=[bar],
            layout=dict(title=title, margin=dict(l=lm), width=w, height=h),
        )
        figure.show()
        return None
    return bar


def bar_ver_noagg(x, y, title, color, w=None, h=None, lm=0, rt = False):
    """Vertical bar chart from pre-computed values (no aggregation).

    Args:
        x: Values placed on the horizontal axis.
        y: Values placed on the vertical axis.
        title: Figure title (used only when the figure is shown).
        color: Marker color(s) for the bars.
        w, h: Figure width and height.
        lm: Left margin of the figure.
        rt: When true, return the trace instead of showing a figure.

    Returns:
        The ``go.Bar`` trace when ``rt`` is true, otherwise ``None``.
    """
    bar = go.Bar(y=y, x=x, marker=dict(color=color))
    if not rt:
        figure = go.Figure(
            data=[bar],
            layout=dict(title=title, margin=dict(l=lm), width=w, height=h),
        )
        figure.show()
        return None
    return bar
    
def gp(df, col, title):
    """Build bar traces showing each data source's share per category of ``col``.

    For every distinct value of ``col`` that occurs in a data source, the bar
    height is the percentage of all rows with that value that belong to the
    source.

    Args:
        df: DataFrame with a ``data_source`` column and the column ``col``.
        col: Name of the column to break down.
        title: Not used; kept so existing callers keep working.

    Returns:
        ``(hpa_trace, hubmap_trace)``: two ``go.Bar`` traces, the first for
        rows with ``data_source == "HPA"`` and the second for
        ``data_source == "HuBMAP"``.
    """
    totals = df[col].value_counts()

    def _share_trace(source, color):
        # Counts of `col` within one source, turned into a percentage of the
        # overall count of the same category.
        counts = df.loc[df["data_source"] == source, col].value_counts()
        shares = [
            float(count) * 100 / totals.loc[category]
            for category, count in counts.items()
        ]
        # The trace is named after the subset it actually plots.
        return go.Bar(
            x=counts.index,
            y=shares,
            name=f"Data Source : {source}",
            marker=dict(color=color),
        )

    trace1 = _share_trace("HPA", "#96D38C")
    trace2 = _share_trace("HuBMAP", "#FEBFB3")
    return trace1, trace2

#### Source Distribution

In [None]:
# Build the bar labels from the data so every label matches its bar.
# bar_hor draws the value counts in reversed order, so the labels are
# reversed the same way.
source_labels = [
    f"Source : {name}"
    for name in dataframe['data_source'].value_counts().index[::-1]
]
bar_hor(dataframe, 'data_source', 'Data Source Distribution', ["#F1948A", '#ABEBC6'], h=350, w=600, lm=200, xlb=source_labels)

#### Gender Distribution

In [None]:
# Overall gender counts, plus the share of each data source per gender.
tr0 = bar_hor(dataframe, "sex", "Distribution of Gender Variable", "#f975ae", w=700, lm=100, return_trace=True)
tr1, tr2 = gp(dataframe, 'sex', 'Distribution of Source with Gender')

# gp() returns the HPA trace first and the HuBMAP trace second, so the
# subplot titles follow that order.
fig = make_subplots(
    rows=1,
    cols=3,
    print_grid=False,
    subplot_titles=["Gender Distribution", "Gender, Data Source=HPA", "Gender, Data Source=HuBMAP"],
)
# add_trace / update_layout replace the deprecated append_trace and
# fig['layout'].update calls.
fig.add_trace(tr0, row=1, col=1)
fig.add_trace(tr1, row=1, col=2)
fig.add_trace(tr2, row=1, col=3)
fig.update_layout(height=350, showlegend=False, margin=dict(l=50))
fig.show()

#### Organ Type Distribution

In [None]:
# Horizontal bar chart of how many rows each organ has.
bar_hor(
    dataframe,
    "organ",
    "Organ Type Distribution",
    "#f975ae",
    w=700,
    lm=100,
    return_trace=False,
)

#### Age Distribution

In [None]:
# Histogram of age with a KDE overlay. `sns.distplot` is deprecated; this is
# the `histplot` equivalent (density-normalised bars, KDE extended past the
# data range with cut=3). Requires seaborn >= 0.11.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(dataframe["age"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("Distribution of Age")
ax.set_xlabel("age")
ax.set_ylabel("Density");

#### Age Distribution by Gender

In [None]:
# One KDE curve of age per gender, drawn on the same axes.
plt.figure(figsize=(12, 5))
plt.style.use('default')
for sex in ("Male", "Female"):
    ages = dataframe.loc[dataframe['sex'] == sex, 'age']
    sns.kdeplot(ages, label=f'Gender == {sex}')
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
plt.legend()

## WORK IN PROGRESS ...