In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

###  *In this notebook, we build a basic understanding of the CSV files and the images provided in the dataset*

In [None]:
# PATHS
# Input locations, given relative to the notebook's working directory.
TRAIN_IMAGES = "../input/hubmap-organ-segmentation/train_images/*.tiff"  # glob pattern matching the training TIFF files
TRAIN_CSV = "../input/hubmap-organ-segmentation/train.csv"  # training metadata table
TEST_CSV = "../input/hubmap-organ-segmentation/test.csv"  # test metadata table

## Analyzing csv files

In [None]:
# Load the training metadata table and preview its first rows.
train_df = pd.read_csv(TRAIN_CSV)
train_df.head()

In [None]:
# (number of rows, number of columns) of the training table.
train_df.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
train_df.info()

In [None]:
# Bar chart of the number of distinct values in every column except the
# per-sample identifier ("id") and the run-length-encoded mask ("rle").
plt.figure(figsize=(10,5))
plt.title("Unique values in train columns")
unique_counts = train_df.drop(columns=["id", "rle"], errors="ignore").nunique().to_dict()
# x/y are passed by keyword: positional data arguments are deprecated in
# seaborn 0.12 and rejected by later releases.
ax = sns.barplot(x=list(unique_counts.keys()), y=list(unique_counts.values()))
# Label every bar container so the annotation does not depend on how seaborn
# groups the bars internally.
for container in ax.containers:
    ax.bar_label(container)
plt.show()

In [None]:
# Print the frequency table of each selected metadata column.
# NOTE: the loop variable `i` stays bound to the last column name ("sex")
# after the loop finishes; do not rely on it in other cells.
for i in ["organ","data_source","pixel_size","tissue_thickness","sex"]:
    print(train_df[i].value_counts())

In [None]:
# Percentage share of each value of the "sex" column.
# The column is named explicitly so the cell does not depend on a variable
# left over from another cell and works under Restart & Run All.
sex_share = train_df["sex"].value_counts(normalize=True).mul(100)
ax = sex_share.plot.bar()
ax.set_title("Gender distribution")
ax.set_ylabel("Share of samples (%)")
# One decimal is enough for a percentage label.
ax.bar_label(ax.containers[0], fmt="%.1f%%")
plt.show()

* #### The columns "data_source", "pixel_size" and "tissue_thickness" each have only a single unique value
* #### Column "sex" has two values: Male and Female
* #### Column "organ" has 5 unique values

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the "age" column.
# `describe` must be called; without the parentheses the cell only displays
# the bound-method object.
train_df["age"].describe()

In [None]:
# Horizontal box plot of the "age" column across all training rows.
plt.title("Age distribution")
sns.boxplot(data=train_df, x="age")

In [None]:
# One horizontal box per value of "sex", showing the spread of "age" within it.
plt.title("Gender wise age distribution")
sns.boxplot(
    data=train_df,
    x="age",
    y="sex",
)

In [None]:
# Count each (height, width) combination: looking at the height alone cannot
# show whether the images are square, so both dimensions are tabulated together.
train_df[["img_height", "img_width"]].value_counts()

In [None]:
# Test file: load the test metadata table and preview it.
# NOTE(review): the original note says it holds a "single value" (presumably a single row) — confirm.
test_df = pd.read_csv(TEST_CSV)
test_df.head()


## Visualizing images

In [None]:
#https://www.kaggle.com/code/pestipeti/decoding-rle-masks/notebook
def rle2mask(mask_rle, shape=(3000,3000)):
    '''
    Decode a run-length-encoded mask string into a binary 2-D array.

    mask_rle: whitespace-separated "start length start length ..." pairs,
              where each start is a 1-based index into the flattened mask
    shape: (width,height) of the mask; the flat buffer is reshaped to this
           and then transposed, so the result has shape (shape[1], shape[0])
    Returns a numpy uint8 array, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even positions hold run starts (converted to 0-based), odd positions run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, length in zip(run_starts, run_lengths):
        flat[begin:begin + length] = 1

    return flat.reshape(shape).T

In [None]:
print("Original images")
# One sample per organ: the last row of each organ group.
organs_df = train_df.groupby("organ").nth(-1).reset_index()
# squeeze=False keeps `ax` a 2-D array even when only one organ is present,
# so the panels can always be iterated.
fig, ax = plt.subplots(1, len(organs_df), figsize=(15, 8), squeeze=False)

# Pair each sample with its panel explicitly rather than indexing the axes by
# the frame's index labels (and avoid binding `_`, which IPython uses for the
# last cell output).
for panel, (row_label, row) in zip(ax[0], organs_df.iterrows()):
    image = plt.imread("../input/hubmap-organ-segmentation/train_images/"+str(row.id)+".tiff")
    panel.imshow(image)
    panel.set_title(row["organ"])
plt.show()

In [None]:
print("Original images with rle mask")

# One sample per organ: the last row of each organ group.
organs_df = train_df.groupby("organ").nth(-1).reset_index()
# squeeze=False keeps `ax` a 2-D array even when only one organ is present,
# so the panels can always be iterated.
fig, ax = plt.subplots(1, len(organs_df), figsize=(15, 8), squeeze=False)

# Pair each sample with its panel explicitly rather than indexing the axes by
# the frame's index labels (and avoid binding `_`, which IPython uses for the
# last cell output).
for panel, (row_label, row) in zip(ax[0], organs_df.iterrows()):
    image = plt.imread("../input/hubmap-organ-segmentation/train_images/"+str(row.id)+".tiff")
    # rle2mask takes shape as (width, height) and returns the transposed array,
    # i.e. (height, width), which is what imshow needs to line up with the image.
    mask = rle2mask(row.rle, shape=(row.img_width, row.img_height))
    panel.imshow(image)
    panel.imshow(mask, alpha=0.2)  # translucent overlay so the tissue stays visible
    panel.set_title(row["organ"])
plt.show()

### *Adding more EDA and viz* ... 

## DO UPVOTE PLEASE!