# Contents 
1. [Libs & functions](#1)
2. [Summary](#2)
    * 2.1 [Source dataframe files](#2.1)
3. [Datasets preparation](#3)
    * 3.1 [Example of train dataframe](#3.1)
    * 3.2 [Example of test dataframe](#3.2)
4. [Images](#4)
    * 4.1 [Image file names](#4.1)
    * 4.2 [Train images visualization](#4.2)
    * 4.3 [Test images visualization](#4.3)
5. [Metadata EDA](#5)
6. [Glomerulus cells EDA](#6)
    * 6.1 [Correlations](#6.1)
    * 6.2 [Regplot glomerulus cells on pix / bmi_kg/m^2](#6.2)


# <a id='1'>Libs & functions</a>

In [None]:
import PIL
import gc
import random
import tifffile
import cv2
import json
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

import warnings

# Silence every warning category for the rest of the notebook.
warnings.filterwarnings("ignore")

# Root of the dataset and its train / test sub-directories.
PATH = "../input/hubmap-kidney-segmentation/"
PATH_TRAIN = f"{PATH}train/"
PATH_TEST = f"{PATH}test/"

In [None]:
# useful functions

def js2df(path):
    '''
    Read the json file at `path` and return its content as a DataFrame
    '''
    with open(path) as handle:
        return pd.DataFrame(json.load(handle))

def shoelace_formula(mat):
    '''
    Area of a polygon computed with the shoelace formula.

    mat: array-like with the vertex coordinates; anything that can be viewed
         as [Ncoords, 2] is accepted (e.g. [1, Ncoords, 2] or a nested list).
         The ring may be open or closed (first vertex repeated at the end).
    Returns the absolute area (independent of vertex order) as a float,
    0.0 when fewer than 3 vertices are given.
    '''
    # float64 keeps the coordinate products from wrapping around when the input is int32
    m = np.asarray(mat, dtype=np.float64).reshape(-1, 2)
    if len(m) < 3:
        # a point or a segment encloses no area
        return 0.0
    x, y = m[:, 0], m[:, 1]
    # np.roll(..., -1) pairs every vertex with its successor, wrapping last -> first
    sum1 = np.sum(x * np.roll(y, -1))
    sum2 = np.sum(np.roll(x, -1) * y)
    return np.abs(sum1 - sum2) / 2

def open_img(path):
    '''
    Read a .tiff file into a numpy array.
    A 3-dimensional array is returned untouched; any other array is squeezed
    and its axes are reordered with transpose(1, 2, 0)
    (presumably channel-first -> channel-last; confirm against the data).
    '''
    image = np.array(tifffile.imread(path))
    if image.ndim == 3:
        return image
    return np.squeeze(image).transpose(1, 2, 0)

def rescale_img(image, scale=1):
    '''
    Resize image with cv2.resize, multiplying width and height by `scale`
    (the target sizes are truncated to int)
    '''
    height, width = image.shape[0], image.shape[1]
    target_size = (int(width*scale), int(height*scale))  # cv2 expects (width, height)
    return cv2.resize(image, target_size)

def mask_img(image, mask_list, scale=1, rgbcolor=(70, 255, 0), imgw=0.4, maskw=1):
    '''
    Overlay polygons on an image.
    Every polygon of mask_list is multiplied by `scale`, cast to int and filled
    with `rgbcolor` on an all-zero uint8 canvas of the image shape; the canvas is
    then blended with the image using the weights imgw (image) and maskw (canvas).
    '''
    scaled_polygons = []
    for polygon in mask_list:
        scaled_polygons.append((polygon*scale).astype(int))
    overlay = np.zeros(image.shape, dtype=np.uint8)
    overlay = cv2.fillPoly(overlay, scaled_polygons, color=rgbcolor)
    blended = cv2.addWeighted(image, imgw, overlay, maskw, 0)
    return blended

def zoom_img(img, pos1, pos2, scale=1):
    '''
    Crop a rectangle out of an image.
    pos1 is the (x, y) start corner, pos2 the (x, y) end corner; both are
    multiplied by `scale` and truncated to int before slicing.
    '''
    col_start = int(pos1[0]*scale)
    row_start = int(pos1[1]*scale)
    col_stop = int(pos2[0]*scale)
    row_stop = int(pos2[1]*scale)
    rows = slice(row_start, row_stop)
    cols = slice(col_start, col_stop)
    return img[rows, cols, :]

def my_style_ax(ax, title='', xlabel='', ylabel='', tfontsize=15, ylfontsize=18):
    '''
    Apply the common look of this notebook to an axis: set the title and the
    axis labels (horizontal y label), remove the y ticks and hide the
    right, top and left spines.
    '''
    ax.set_title(title, fontsize=tfontsize)
    ax.get_yaxis().set_ticks([])
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel, rotation=0, labelpad=45, fontsize=ylfontsize)
    for side in ('right', 'top', 'left'):
        ax.spines[side].set_visible(False)

def preproccess_traindf(traincsv_path, infocsv_path, train_path):
    '''
    Build the train dataframe: the rows of the dataset-information csv whose
    image is listed in the train csv, extended with the following columns:
        anatomical_structures_segmention_geometry(dict: structure name -> polygon coords of shape [1, Ncoords, 2])
        kidney_bbox([xmin, ymin, xmax, ymax] over all anatomical polygons)
        kidney_area(summed area of the polygons whose name contains "cortex")
        glomerulus_coords(list of glom. coordinates)
        glom_num(number of glomerulus cells)
        avg_g_area(average area of glom. cell in pixels, NaN when the image has none)
        glor_cells_on_pix(glom_num * avg_g_area / kidney_area, i.e. the share of the cortex area covered by glomeruli)

    traincsv_path: csv with an 'id' column (image file name without '.tiff')
    infocsv_path: dataset-information csv
    train_path: string prefix that is put in front of the json file names
    '''
    # get train dfs, train names
    train_names = [image_id + '.tiff' for image_id in pd.read_csv(traincsv_path)['id']]
    df_info = pd.read_csv(infocsv_path)
    df_train = df_info[df_info['image_file'].isin(train_names)].reset_index(drop=True)

    # new columns: anatomical geometry(cortex and etc.), kidney bbox, kidney area
    anat_struct_res = []
    kidney_bbox_res = []
    kidney_area_res = []
    for anatomy_file in df_train['anatomical_structures_segmention_file']:
        # anatomical coords
        df_anatom = js2df(train_path + anatomy_file)
        geometry = {}
        for _, feature in df_anatom.iterrows():
            name = feature['properties']['classification']['name']
            raw_coords = feature['geometry']['coordinates']
            # Decide on the nesting depth of the raw lists instead of building a ragged
            # np.array (which raises on NumPy >= 1.24): in a single polygon
            # raw_coords[0][0] is a point [x, y], in a multi polygon it is a whole ring.
            if isinstance(raw_coords[0][0][0], (list, tuple)):
                for nnum, polygon in enumerate(raw_coords):
                    # first ring of every member polygon, keyed name0, name1, ...
                    geometry[name + str(nnum)] = np.array([polygon[0]])
            else:
                # first ring only, shape [1, Ncoords, 2]
                # NOTE(review): two features with the same name overwrite each other here
                geometry[name] = np.array([raw_coords[0]])
        anat_struct_res.append(geometry)
        # get kidney bbox: stack the vertices of all polygons into one [Ncoords, 2] array
        acords = np.hstack(list(geometry.values())).squeeze()
        kidney_bbox_res.append([np.min(acords[:, 0]), np.min(acords[:, 1]), np.max(acords[:, 0]), np.max(acords[:, 1])])
        # kidney area: only the cortex polygons are counted
        kidney_area_res.append(
            sum(shoelace_formula(v) for k, v in geometry.items() if 'cortex' in str(k).lower())
        )
    # add in dataframe
    df_train['anatomical_structures_segmention_geometry'] = anat_struct_res
    df_train['kidney_bbox'] = kidney_bbox_res
    df_train['kidney_area'] = kidney_area_res

    # new columns: glomerulus_coords, glom_num, avg_g_area
    glom_coords_res = []
    glom_num_res = []
    avg_g_area_res = []
    for glom_file in df_train['glomerulus_segmentation_file']:
        # glomerulus cells coordinates
        df_glom = js2df(train_path + glom_file)
        glomerulus_coords = [
            np.array(feature['geometry']['coordinates']) for _, feature in df_glom.iterrows()
        ]
        g_area = sum(shoelace_formula(g_coords) for g_coords in glomerulus_coords)
        glom_coords_res.append(glomerulus_coords)
        # number of glomeruluses
        glom_num = len(df_glom)
        glom_num_res.append(glom_num)
        # average size of glomeruluses cells; an image without annotations gets NaN
        # instead of raising ZeroDivisionError
        avg_g_area_res.append(g_area / glom_num if glom_num else np.nan)
    # add in dataframe
    df_train['glomerulus_coords'] = glom_coords_res
    df_train['glom_num'] = glom_num_res
    df_train['avg_g_area'] = avg_g_area_res
    df_train['glor_cells_on_pix'] = df_train['glom_num']*df_train['avg_g_area'] / df_train['kidney_area']

    # parenthesised so that every intermediate really is part of the del statement
    del (df_info, train_names, anat_struct_res, kidney_bbox_res, kidney_area_res,
         glom_coords_res, glom_num_res, avg_g_area_res)
    gc.collect()
    return df_train


def preproccess_testdf(traincsv_path, infocsv_path, test_path):
    '''
    Build the test dataframe: the rows of the dataset-information csv whose
    image is NOT listed in the train csv, extended with the following columns:
        anatomical_structures_segmention_geometry(dict: structure name -> polygon coords of shape [1, Ncoords, 2])
        kidney_bbox([xmin, ymin, xmax, ymax] over all anatomical polygons)

    traincsv_path: csv with an 'id' column (image file name without '.tiff')
    infocsv_path: dataset-information csv
    test_path: string prefix that is put in front of the json file names
    '''
    # get test dfs, train names
    train_names = [image_id + '.tiff' for image_id in pd.read_csv(traincsv_path)['id']]
    df_info = pd.read_csv(infocsv_path)
    df_test = df_info[~df_info['image_file'].isin(train_names)].reset_index(drop=True)

    # new columns: anatomical geometry(cortex and etc.), kidney bbox
    anat_struct_res = []
    kidney_bbox_res = []
    for anatomy_file in df_test['anatomical_structures_segmention_file']:
        # anatomical coords
        df_anatom = js2df(test_path + anatomy_file)
        geometry = {}
        for _, feature in df_anatom.iterrows():
            name = feature['properties']['classification']['name']
            raw_coords = feature['geometry']['coordinates']
            # Decide on the nesting depth of the raw lists instead of building a ragged
            # np.array (which raises on NumPy >= 1.24): in a single polygon
            # raw_coords[0][0] is a point [x, y], in a multi polygon it is a whole ring.
            if isinstance(raw_coords[0][0][0], (list, tuple)):
                for nnum, polygon in enumerate(raw_coords):
                    # first ring of every member polygon, keyed name0, name1, ...
                    geometry[name + str(nnum)] = np.array([polygon[0]])
            else:
                # first ring only, shape [1, Ncoords, 2]
                # NOTE(review): two features with the same name overwrite each other here
                geometry[name] = np.array([raw_coords[0]])
        anat_struct_res.append(geometry)
        # get kidney bbox: stack the vertices of all polygons into one [Ncoords, 2] array
        acords = np.hstack(list(geometry.values())).squeeze()
        kidney_bbox_res.append([np.min(acords[:, 0]), np.min(acords[:, 1]), np.max(acords[:, 0]), np.max(acords[:, 1])])
    # add in dataframe
    df_test['anatomical_structures_segmention_geometry'] = anat_struct_res
    df_test['kidney_bbox'] = kidney_bbox_res

    del df_info, train_names, anat_struct_res, kidney_bbox_res
    gc.collect()
    return df_test

# <a id='2'>Summary</a>

In [None]:
# load the two source tables exactly as they are stored on disk
df_train = pd.read_csv(f"{PATH}train.csv")
df_info = pd.read_csv(f"{PATH}HuBMAP-20-dataset_information.csv")

- # <a id='2.1'>Source dataframe files</a>


**train.csv**

- *id* - id of image
- *encoding* - RLE encoded segmentation masks

In [None]:
print("-----------------train.csv-----------------")
print("length: ", len(df_train))
# one row per column of train.csv: dtype, missing / present counts, distinct values
df_train_info = pd.DataFrame({
    'type': df_train.dtypes,
    'nans': df_train.isnull().sum(),
    'not-null count': df_train.notna().sum(),
    'unique values': df_train.apply(pd.Series.nunique),
})
df_train_info


In [None]:
# show two randomly chosen rows of the raw train.csv frame
df_train.sample(2)

**HuBMAP-20-dataset_information.csv**

- *image_file* - name of image file in .tiff format
- *width_pixels* - image width
- *height_pixels* - image height
- *anatomical_structures_segmention_file* - name of .json file, storing segments(polygons) of kidney parts(cortex/medulla)
- *glomerulus_segmentation_file* - name of .json file, storing segments(polygons) of glomerulus cells
- *patient_number* - patient number
- *race* - race of patient
- *sex* - patient gender
- *ethnicity* - ethnicity of patient
- *age* - patient age
- *weight_kilograms* - weight of patient in kg. 
- *height_centimeters* - height of patient in cm.
- *bmi_kg/m^2* - body mass index (weight in kilograms / (height in meters)^2)
- *laterality* - laterality of kidney(left / right)
- *percent_cortex* - percent of cortex (outer part of the kidney)
- *percent_medulla* - percent of medulla (inner part of the kidney)

In [None]:
print("-----------------HuBMAP-20-dataset_information.csv-----------------")
print("length: ", len(df_info))
# one row per column of the information csv: dtype, missing / present counts, distinct values
df_info_info = pd.DataFrame({
    'type': df_info.dtypes,
    'nans': df_info.isnull().sum(),
    'not-null count': df_info.notna().sum(),
    'unique values': df_info.apply(pd.Series.nunique),
})
df_info_info

In [None]:
# show two randomly chosen rows of the dataset-information frame
df_info.sample(2)

# <a id='3'>Datasets preparation</a>
Here I build two DataFrames to simplify working with the dataset:
- train.csv, added columns:
    - **anatomical_structures_segmention_geometry**, dict with coordinates of polygons of cortex/medulla segments
    - **kidney_bbox**, kidney bounding box
    - **glomerulus_coords**, list of glomerulus polygons coordinates
    - **glom_num**, number of glomerulus cells
    - **avg_g_area**, average area of glom. cell in pixels
    - **glor_cells_on_pix**, share of the cortex area covered by glomerulus cells (glom_num * avg_g_area / cortex area)
- test.csv, added columns:
    - **anatomical_structures_segmention_geometry**
    - **kidney_bbox**

In [None]:
# build the enriched train / test frames from the same two source csv files
train_csv = PATH + 'train.csv'
info_csv = PATH + 'HuBMAP-20-dataset_information.csv'
df_train = preproccess_traindf(train_csv, info_csv, PATH_TRAIN)
df_test = preproccess_testdf(train_csv, info_csv, PATH_TEST)

<a id='3.1'>Example of train dataframe</a>

In [None]:
# first two rows of the preprocessed train dataframe
df_train.head(2)

<a id='3.2'>Example of test dataframe</a>

In [None]:
# first two rows of the preprocessed test dataframe
df_test.head(2)

# <a id='4'>Images</a>

- # <a id='4.1'>Image file names</a>

In [None]:
# list the image files that belong to each split
for label, frame in (("Train images: ", df_train), ("Test images: ", df_test)):
    print(label, frame['image_file'].to_list())

- # <a id='4.2'>Train images visualization</a>

In [None]:
# draw train samples: one row per image with the original, the anatomical
# segmentation (plus kidney bbox) and the glomerulus mask
SCALE_COEF = 0.1
# at most 8 images are shown, fewer when the train frame is shorter
n_samples = min(8, len(df_train))
# squeeze=False keeps `ax` two-dimensional even for a single row
fig, ax = plt.subplots(ncols=3, nrows=n_samples, figsize=(16, 50), squeeze=False)
fig.subplots_adjust(wspace=0, hspace=0)


for i in range(n_samples):
    img_name = df_train['image_file'][i]
    img_width, img_height = df_train['width_pixels'][i], df_train['height_pixels'][i]
    img_gl = df_train['glomerulus_coords'][i]
    # bbox and polygons are scaled by the same factor as the image below
    img_bbox = [int(coord*SCALE_COEF) for coord in df_train['kidney_bbox'][i]]
    img_anatomy = {
        name: (polygon*SCALE_COEF).astype('int32')
        for name, polygon in df_train['anatomical_structures_segmention_geometry'][i].items()
    }

    # original image
    img = open_img(PATH_TRAIN + img_name)
    img = rescale_img(img, SCALE_COEF)
    ax[i][0].imshow(img)
    ax[i][0].set_title(f"Orig. {img_name}, {img_width}x{img_height}", fontsize=13)
    ax[i][0].axis('off')


    # bboxed image, add anatomical structures segmentation polygons
    anatomical_segmented_img = img.copy()
    # draw bbox for kidney
    anatomical_segmented_img = cv2.rectangle(anatomical_segmented_img,
                                             (img_bbox[0], img_bbox[1]), (img_bbox[2], img_bbox[3]),
                                             (0, 0, 255), 20)
    # create segments mask
    for k, v in img_anatomy.items():
        mask_polygons = v.squeeze()
        mask = np.zeros(img.shape, dtype=np.uint8)
        rand_colors = tuple(random.randint(0, 255) for _ in range(3))
        # fillPoly (as in mask_img) instead of fillConvexPoly: the anatomical
        # outlines are arbitrary polygons, not guaranteed to be convex
        mask = cv2.fillPoly(mask, [mask_polygons], color=rand_colors)
        anatomical_segmented_img = cv2.addWeighted(anatomical_segmented_img, 1, mask, 1, 0)

        # put text on segments, anchored at the polygon centroid (m10/m00, m01/m00)
        M = cv2.moments(mask_polygons)
        if M["m00"] == 0:
            # degenerate polygon: no centroid to anchor the label on
            continue
        x, y = int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"])
        anatomical_segmented_img = cv2.putText(anatomical_segmented_img, k, (x-200, y),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                5,
                                (0,0,0),
                                10, cv2.LINE_AA)
    ax[i][1].imshow(anatomical_segmented_img)
    ax[i][1].set_title(f"Anatomicaly segmented {img_name}", fontsize=13)
    ax[i][1].axis('off')

    # mask
    masked_img = img.copy()
    masked_img = mask_img(masked_img, img_gl, scale=SCALE_COEF)
    ax[i][2].imshow(masked_img)
    ax[i][2].set_title(f"Masked {img_name}", fontsize=13)
    ax[i][2].axis('off')


In [None]:
# successive zooms into the masked train image with index 2, shown on a 2x2 grid
SCALE_COEF = 1
fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(10, 7))
fig.subplots_adjust(hspace=0)
ax = ax.flatten()


sample_idx = 2
img_filename = df_train['image_file'][sample_idx]
img_gl = df_train['glomerulus_coords'][sample_idx]
img_bbox_kidney = df_train['kidney_bbox'][sample_idx]

img = mask_img(
    rescale_img(open_img(PATH_TRAIN + img_filename), SCALE_COEF),
    img_gl, maskw=0.4, scale=SCALE_COEF
)

# every crop is taken from the previous one, so the coordinates are relative to it
x_min, y_min, x_max, y_max = img_bbox_kidney
zoomed1 = zoom_img(img, (x_min, y_min), (x_max, y_max), scale=SCALE_COEF)
zoomed2 = zoom_img(zoomed1, (0, 2500), (20000, 22500))
zoomed3 = zoom_img(zoomed2, (5000, 4000), (15000, 8000))
zoomed4 = zoom_img(zoomed3, (2200, 2250), (2456, 2506))
for axis, crop in zip(ax, (zoomed1, zoomed2, zoomed3, zoomed4)):
    axis.imshow(crop)

<a id='4.3'>Test images visualization</a>

In [None]:
# draw test samples: one row per image with the original and the anatomical
# segmentation (plus kidney bbox)
SCALE_COEF = 0.1
# at most 5 images are shown, fewer when the test frame is shorter
n_samples = min(5, len(df_test))
# squeeze=False keeps `ax` two-dimensional even for a single row
fig, ax = plt.subplots(ncols=2, nrows=n_samples, figsize=(16, 30), squeeze=False)
fig.subplots_adjust(wspace=0, hspace=0)


for i in range(n_samples):
    img_name = df_test['image_file'][i]
    img_width, img_height = df_test['width_pixels'][i], df_test['height_pixels'][i]
    # bbox and polygons are scaled by the same factor as the image below
    img_bbox = [int(coord*SCALE_COEF) for coord in df_test['kidney_bbox'][i]]
    img_anatomy = {
        name: (polygon*SCALE_COEF).astype('int32')
        for name, polygon in df_test['anatomical_structures_segmention_geometry'][i].items()
    }

    # original image
    img = open_img(PATH_TEST + img_name)
    img = rescale_img(img, SCALE_COEF)
    ax[i][0].imshow(img)
    ax[i][0].set_title(f"Orig. {img_name}, {img_width}x{img_height}", fontsize=13)
    ax[i][0].axis('off')


    # bboxed image, add anatomical structures segmentation polygons
    anatomical_segmented_img = img.copy()
    # draw bbox for kidney
    anatomical_segmented_img = cv2.rectangle(anatomical_segmented_img,
                                             (img_bbox[0], img_bbox[1]), (img_bbox[2], img_bbox[3]),
                                             (0, 0, 255), 20)
    # create segments mask
    for k, v in img_anatomy.items():
        mask_polygons = v.squeeze()
        mask = np.zeros(img.shape, dtype=np.uint8)
        rand_colors = tuple(random.randint(0, 255) for _ in range(3))
        # fillPoly (as in mask_img) instead of fillConvexPoly: the anatomical
        # outlines are arbitrary polygons, not guaranteed to be convex
        mask = cv2.fillPoly(mask, [mask_polygons], color=rand_colors)
        anatomical_segmented_img = cv2.addWeighted(anatomical_segmented_img, 1, mask, 1, 0)

        # put text on segments, anchored at the polygon centroid (m10/m00, m01/m00)
        M = cv2.moments(mask_polygons)
        if M["m00"] == 0:
            # degenerate polygon: no centroid to anchor the label on
            continue
        x, y = int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"])
        anatomical_segmented_img = cv2.putText(anatomical_segmented_img, k, (x-200, y),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                5,
                                (0,0,0),
                                10, cv2.LINE_AA)
    ax[i][1].imshow(anatomical_segmented_img)
    ax[i][1].set_title(f"Anatomicaly segmented {img_name}", fontsize=13)
    ax[i][1].axis('off')

# <a id='5'>Metadata EDA</a>
- Area of image
- Sex
- Age
- Body mass index
- Laterality
- Race

In [None]:
# metadata EDA: one row per metadata field, train split on the left, test split on the right
sns.set_style("white")
fig, ax = plt.subplots(ncols=2, nrows=6, figsize=(12, 40))
fig.subplots_adjust(wspace=0)

# column 0 <- train, column 1 <- test; only the left axis of a row carries the y label
splits = (("TRAIN", df_train), ("TEST", df_test))

# image area (row 0), in units of 10^5 pixels
for col, (split, frame) in enumerate(splits):
    areas = frame['width_pixels'] * frame['height_pixels'] / 10**5
    # both frames were built with reset_index(drop=True), so the positions
    # returned by argmin / argmax are valid labels as well
    i_min, i_max = np.argmin(areas), np.argmax(areas)
    area_min = f"{frame['width_pixels'][i_min]}x{frame['height_pixels'][i_min]}"
    area_max = f"{frame['width_pixels'][i_max]}x{frame['height_pixels'][i_max]}"
    sns.distplot(areas, ax=ax[0][col])
    my_style_ax(
        ax[0][col],
        title=f"{split} | min: {area_min}, max: {area_max}",
        xlabel="pix*10^5",
        ylabel='Images area' if col == 0 else '',
        tfontsize=13
    )

# numeric fields: age (row 2) and body mass index (row 3)
numeric_rows = (
    (2, 'age', "Age"),
    (3, 'bmi_kg/m^2', "Body mass ind."),
)
for row, column, ylabel in numeric_rows:
    for col, (split, frame) in enumerate(splits):
        # missing values are dropped rather than replaced by 0, so they neither
        # add a fake spike at 0 nor show up as the minimum in the title
        values = frame[column].dropna()
        sns.distplot(values, ax=ax[row][col])
        my_style_ax(
            ax[row][col],
            title=f"{split} | min: {values.min()}, max: {values.max()}",
            ylabel=ylabel if col == 0 else ''
        )

# categorical fields: sex (row 1), laterality (row 4), race (row 5);
# each entry lists the (category value, caption) pairs reported in the title
categorical_rows = (
    (1, 'sex', 'Sex', (('Male', 'male'), ('Female', 'female'))),
    (4, 'laterality', 'Literaluty', (('Right', 'right'), ('Left', 'left'))),
    (5, 'race', 'Race', (('White', 'white'), ('Black or African American', 'black'))),
)
for row, column, ylabel, reported in categorical_rows:
    for col, (split, frame) in enumerate(splits):
        values = frame[column]
        counts = values.value_counts()
        # x= keyword: the first positional parameter of countplot is `data` in recent seaborn
        sns.countplot(x=values, ax=ax[row][col])
        # .get(..., 0) so a category that is absent from a split reads as 0 instead of raising
        summary = " / ".join(f"{counts.get(category, 0)} {caption}" for category, caption in reported)
        my_style_ax(
            ax[row][col],
            title=f"{split} | {summary}",
            ylabel=ylabel if col == 0 else ''
        )

# <a id='6'>Glomerulus cells EDA</a>

In [None]:
# Glomerulus statistics of the train images: three overall distributions,
# then glor_cells_on_pix compared between sex, race and laterality groups
_, axs = plt.subplots(nrows=6, ncols=1, figsize=(7, 33))
axs = axs.flatten()

# gl. avg area / gl. count / gl. content on cortex pixels
gl_stats = (
    (df_train['avg_g_area'] / 10**6, 'avg_g_area * 10^-6'),
    (df_train['glom_num'], 'glom_num'),
    (df_train['glor_cells_on_pix'], 'glor_cells_on_pix'),
)
for axis, (values, ylabel) in zip(axs[:3], gl_stats):
    # df_train has a 0..N-1 index (reset_index in preprocessing), so the
    # positions from argmin / argmax can be used as labels
    i_min, i_max = np.argmin(values), np.argmax(values)
    sns.distplot(values, ax=axis)
    my_style_ax(
        axis,
        title=f"min: {np.round(values[i_min], 3)}({df_train['image_file'][i_min]}), "
              f"max: {np.round(values[i_max], 3)}({df_train['image_file'][i_max]})",
        ylabel=ylabel
    )

# sex
fem = df_train[df_train['sex'] == 'Female']['glor_cells_on_pix']
male = df_train[df_train['sex'] == 'Male']['glor_cells_on_pix']
sns.distplot(fem, ax=axs[3], label='Female')
sns.distplot(male, ax=axs[3], label='Male')
my_style_ax(
    axs[3],
    title=f'mean male: {round(male.mean(), 4)}, mean female: {round(fem.mean(), 4)}',
    # raw string: '\ ' is not a valid escape sequence in a normal literal
    ylabel=r'sex \ glor_cells_on_pix'
)
axs[3].legend()
# race
rw = df_train[df_train['race'] == 'White']['glor_cells_on_pix']
# select the group by its own value (the one used in the metadata EDA) so that
# other or missing races are not counted under the 'Black' label
rb = df_train[df_train['race'] == 'Black or African American']['glor_cells_on_pix']
sns.distplot(rw, ax=axs[4], label='White')
sns.distplot(rb, ax=axs[4], label='Black')
my_style_ax(
    axs[4],
    title=f'mean white: {round(rw.mean(), 4)}, mean black: {round(rb.mean(), 4)}',
    ylabel=r'race \ glor_cells_on_pix'
)
axs[4].legend()
# laterality
lr = df_train[df_train['laterality'] == 'Right']['glor_cells_on_pix']
ll = df_train[df_train['laterality'] == 'Left']['glor_cells_on_pix']
sns.distplot(lr, ax=axs[5], label='Right')
sns.distplot(ll, ax=axs[5], label='Left')
my_style_ax(
    axs[5],
    title=f'mean right: {round(lr.mean(), 4)}, mean left: {round(ll.mean(), 4)}',
    ylabel=r'laterality \ glor_cells_on_pix'
)
axs[5].legend()

- # <a id='6.1'>Correlations(pearson)</a>

In [None]:
# pairwise correlations between the glomerulus statistics and the numeric patient metadata

cols_num = [
    'age', 'avg_g_area', 'glom_num', 'glor_cells_on_pix',
    'bmi_kg/m^2', 'height_centimeters', 'weight_kilograms',
]
corr_matrix = df_train[cols_num].corr()
sns.heatmap(corr_matrix, annot=True)

- # <a id='6.2'>Regplot: glomerulus cells on pixel vs. bmi_kg/m^2 (corr ~ -0.96)</a>

In [None]:
# regression plot on the rows where both values are present
reg_cols = ['glor_cells_on_pix', 'bmi_kg/m^2']
df_glor_bmi_reg = df_train.loc[:, reg_cols].dropna()
sns.regplot(data=df_glor_bmi_reg, x=reg_cols[0], y=reg_cols[1])