In [None]:
import json
import os

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

In [None]:
# Notebook-wide setup: switch Jedi off in IPython's tab completer and
# apply seaborn's "whitegrid" theme.
%config Completer.use_jedi = False
sns.set_theme(style="whitegrid")

In [None]:
# Directory the glomerulus annotation JSON files are read from.
root_annot_path = '../input/hubmap-kidney-segmentation/train'
# CSV describing the dataset; the image file name, pixel dimensions and
# annotation file name columns are read from it further down.
dset_info_path = '../input/hubmap-kidney-segmentation/HuBMAP-20-dataset_information.csv'

In [None]:
# Load the dataset information table and preview its first rows.
dset_info = pd.read_csv(dset_info_path)
dset_info.head()

In [None]:
# Build a column-oriented table with one entry per annotated glomerulus:
# the size of the source image plus bounding-box, area and perimeter
# statistics of the annotation polygon. The key order below is the column
# order of the DataFrame created from this dict.
glomerulu_dict = {'filename': [],
                  'width_img': [],
                  'height_img': [],
                  'width_seg': [],
                  'height_seg': [],
                  'area': [],
                  'perimeter': [],
                  'center_x': [],
                  'center_y': [],
                  'w/h': [],
                  }
cols = ['width_pixels', 'height_pixels',
        'glomerulus_segmentation_file']

for w_img, h_img, json_filename in tqdm(dset_info[cols].values):

    json_path = os.path.join(root_annot_path, json_filename)
    # Rows whose annotation file is not present under root_annot_path are
    # skipped on purpose rather than treated as an error.
    if not os.path.exists(json_path):
        continue

    with open(json_path) as json_file:
        data = json.load(json_file)

    # Same for every glomerulus of this file, so compute it once per file.
    file_stem = json_filename.split('.')[0]

    for glomerulu_info in data:
        # The coordinates are treated as a list of rings (GeoJSON Polygon
        # layout -- assumed, confirm against the annotation files). Only
        # the first ring is measured; converting that ring alone avoids a
        # ragged-array ValueError when further rings of a different length
        # are present.
        rings = glomerulu_info['geometry']['coordinates']
        contour = np.asarray(rings[0], dtype=np.float32)

        area = cv2.contourArea(contour)
        perimeter = cv2.arcLength(contour, True)  # True: closed contour

        # Axis-aligned bounding box of the ring.
        xmin, ymin = contour.min(axis=0)
        xmax, ymax = contour.max(axis=0)

        width = xmax - xmin
        height = ymax - ymin

        # Centre of the bounding box (not the polygon centroid).
        center_x = (xmin + xmax) / 2
        center_y = (ymin + ymax) / 2

        # A zero-height contour would yield inf here, and a non-finite
        # value breaks the histogram of this column later on; NaN is
        # stored instead so the entry is simply ignored by the plots.
        aspect_ratio = width / height if height > 0 else np.nan

        glomerulu_dict['filename'].append(file_stem)
        glomerulu_dict['width_img'].append(w_img)
        glomerulu_dict['height_img'].append(h_img)
        glomerulu_dict['width_seg'].append(width)
        glomerulu_dict['height_seg'].append(height)
        glomerulu_dict['w/h'].append(aspect_ratio)
        glomerulu_dict['area'].append(area)
        glomerulu_dict['perimeter'].append(perimeter)
        glomerulu_dict['center_x'].append(center_x)
        glomerulu_dict['center_y'].append(center_y)
                

In [None]:
# Turn the collected per-glomerulus columns into a DataFrame and preview it.
glomerulu_df = pd.DataFrame(glomerulu_dict)
glomerulu_df.head()

In [None]:
# Column dtypes and non-null counts of the per-glomerulus table.
glomerulu_df.info()

In [None]:
# Summary statistics of the numeric columns of the per-glomerulus table.
glomerulu_df.describe()

In [None]:
# File names ordered by how many glomeruli each one contains (most first);
# used as the x-axis order of the per-file plots.
sorted_filenames = glomerulu_df['filename'].value_counts().index
sorted_filenames

In [None]:
# Bar chart of the number of annotated glomeruli per file, most populated
# file first.
fig = plt.figure(figsize=(15, 5))
ax = fig.gca()
plt.xticks(rotation=45)
sns.countplot(x='filename', order=sorted_filenames,
              data=glomerulu_df, ax=ax)
ax.set_title('Number of glomerulus for each image');

In [None]:
# Scatter of image width against height, with each point labelled by its
# image file name.
fig = plt.figure(figsize=(10, 10))
ax = fig.gca()
labelled_sizes = zip(dset_info['image_file'],
                     dset_info['width_pixels'],
                     dset_info['height_pixels'])
for image_name, img_width, img_height in labelled_sizes:
    ax.text(img_width, img_height, image_name, fontsize=10)
sns.scatterplot(x="width_pixels", y="height_pixels",
                data=dset_info, ax=ax);

In [None]:
# Per-file box plot of glomerulus bounding-box widths.
fig = plt.figure(figsize=(15, 5))
ax = fig.gca()
plt.xticks(rotation=45)
sns.boxplot(x='filename', y='width_seg', order=sorted_filenames,
            data=glomerulu_df, ax=ax)
ax.set_title('Segment width');

In [None]:
# Per-file box plot of glomerulus bounding-box heights.
fig = plt.figure(figsize=(15, 5))
ax = fig.gca()
plt.xticks(rotation=45)
sns.boxplot(x='filename', y='height_seg', order=sorted_filenames,
            data=glomerulu_df, ax=ax)
ax.set_title('Segment height');

In [None]:
# Joint distribution of glomerulus bounding-box width and height over all files.
sns.jointplot(data=glomerulu_df, x="width_seg", y="height_seg");

In [None]:
# Per-file box plot of the bounding-box width/height ratio.
fig = plt.figure(figsize=(15, 5))
ax = fig.gca()
plt.xticks(rotation=45)
sns.boxplot(x='filename', y='w/h', order=sorted_filenames,
            data=glomerulu_df, ax=ax)
ax.set_title('Width/height');

In [None]:
# Two stacked histograms of the width/height ratio: the full distribution
# on top, and the same histogram zoomed onto the tail above 1.5 below.
fig = plt.figure(figsize=(10, 6))
fig.subplots_adjust(hspace=.5)

ax_full = fig.add_subplot(2, 1, 1)
ax_full.set_title('Width/height')
sns.histplot(x='w/h', data=glomerulu_df, ax=ax_full)

# The limits are fixed before drawing so the plot call does not rescale
# the zoomed view.
ax_tail = fig.add_subplot(2, 1, 2)
ax_tail.set_xlim(1.5, 2.45)
ax_tail.set_ylim(0, 20)
ax_tail.set_title('Width/height > 1.5')
sns.histplot(x='w/h', data=glomerulu_df, ax=ax_tail);

In [None]:
# Number of glomeruli whose bounding-box width/height ratio exceeds 1.5.
(glomerulu_df['w/h']>1.5).sum()

In [None]:
# Joint distribution of glomerulus bounding-box centre coordinates, pooled over all files.
sns.jointplot(data=glomerulu_df, x="center_x", y="center_y");

In [None]:
# Per-file box plot of glomerulus contour areas.
fig = plt.figure(figsize=(15, 5))
ax = fig.gca()
plt.xticks(rotation=45)
sns.boxplot(x='filename', y='area', order=sorted_filenames,
            data=glomerulu_df, ax=ax)
ax.set_title('Area');

In [None]:
# Histogram of glomerulus contour areas over all files.
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
sns.histplot(x='area', data=glomerulu_df, ax=ax)
ax.set_title('Area');