In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os, fnmatch
def find(pattern, path):
    """Recursively collect files under ``path`` whose name matches ``pattern``.

    ``pattern`` is a shell-style wildcard as understood by ``fnmatch.fnmatch``
    (e.g. ``'*.png'``); it is not a regular expression. Only the file name is
    tested, never the directory part of the path.

    Args:
        pattern: fnmatch-style wildcard applied to each file name.
        path: Root directory to walk.

    Returns:
        A sorted list of matching paths, each built as
        ``os.path.join(root, name)``. The list is empty when nothing matches.
    """
    matches = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(path)
        for name in files
        if fnmatch.fnmatch(name, pattern)
    ]
    # os.walk yields entries in a filesystem-dependent order; sort so that code
    # which indexes into or samples from this list behaves the same on every run.
    matches.sort()
    return matches

In [None]:
# Root directory that is scanned for segmented lung images.
FIND_FOLDER = "../input/unet-lung-segmentation-dataset-siim-covid/segmented_data/segmented_data"

# Collect every PNG below that directory and report how many were found.
png_files = find(pattern='*.png', path=FIND_FOLDER)
print(len(png_files), "Files Found.")

In [None]:
import random
num_subset = 3  # number of example images shown in the preview figures
random.seed(42)  # fixed seed: the same draw is made for a given file list

# random.sample draws WITHOUT replacement, so one file can never appear twice in
# the preview (random.choices may repeat a path). The min() keeps the draw valid
# when fewer than num_subset files were found.
subset_png_files = random.sample(png_files, k=min(num_subset, len(png_files)))
subset_png_files

In [None]:
import matplotlib.pyplot as plt
import cv2

In [None]:
def masking_ratio(img):
    """Return the fraction of elements in ``img`` that are non-zero.

    Args:
        img: Array-like image, e.g. the result of ``cv2.imread(path, 0)``.
            Arrays of any dimensionality are accepted; for a multi-channel
            image every channel value is counted as one element.

    Returns:
        A float in ``[0, 1]``: ``count(img != 0) / img.size``. An image with
        no elements yields ``0.0`` instead of a 0/0 division.

    Raises:
        ValueError: If ``img`` is ``None``, which is what ``cv2.imread``
            returns when a file cannot be read.
    """
    if img is None:
        raise ValueError("masking_ratio received None instead of an image (unreadable file?)")

    pixels = np.asarray(img)
    if pixels.size == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    # Counting over the flattened size (not Shape_X * Shape_Y) removes the
    # requirement that the input be exactly two-dimensional.
    return np.sum(pixels != 0) / pixels.size

In [None]:
# Show the sampled images side by side with the "viridis" colour map.
fig, ax = plt.subplots(nrows=1, ncols=num_subset, figsize=(20, 12))

for i, path in enumerate(subset_png_files):
    panel = ax[i]
    image = cv2.imread(path, 0)  # flag 0: read as a single-channel image
    panel.imshow(image, cmap="viridis")
    panel.axis('off')

plt.show()

In [None]:
# Same preview as a grayscale rendering of the sampled images.
fig, ax = plt.subplots(nrows=1, ncols=num_subset, figsize=(20, 12))

for i, path in enumerate(subset_png_files):
    current_axis = ax[i]
    current_axis.imshow(cv2.imread(path, 0), cmap="gray")
    current_axis.axis('off')

plt.show()

In [None]:
# Print the masking ratio of each previewed image, one value per line.
for path in subset_png_files:
    image = cv2.imread(path, 0)
    print(masking_ratio(image))

### Properly Segmented Masks must have a High Masking Ratio

In [None]:
import tqdm
# One masking ratio per file, stored at the same position as its path in png_files.
masking_ratio_vals = np.zeros(len(png_files))

# Wrap the list itself rather than the enumerate object: tqdm can then read the
# length and show a percentage / ETA instead of only a running counter.
for i, path in enumerate(tqdm.tqdm(png_files)):
    masking_ratio_vals[i] = masking_ratio(cv2.imread(path, 0))

In [None]:
# Use matplotlib's 'dark_background' style sheet for the plots that follow.
plt.style.use('dark_background')

In [None]:
# Histogram of the masking ratio over all files.
plt.figure(figsize=(20, 12))
plt.hist(masking_ratio_vals, bins=100)

# Place a tick every 0.1 across the observed range. The stop value is
# max + one step (not max + 1): ticks a full unit beyond the data would widen
# the x-axis and squeeze the histogram into a fraction of the figure.
tick_step = 0.1
plt.xticks(np.arange(masking_ratio_vals.min(),
                     masking_ratio_vals.max() + tick_step,
                     tick_step))
plt.xlabel("masking ratio")
plt.ylabel("number of images")
plt.show()

Judging from the histogram above, `masking ratio > 0.12` looks like a sensible threshold for treating a mask as well segmented.

In [None]:
# Masks whose masking ratio exceeds this value are flagged as well segmented.
THRESH_VALUE = 0.12

In [None]:
num_samples=50  # NOTE(review): not referenced by any cell visible here

# Positions into png_files / masking_ratio_vals, ordered from the lowest to the
# highest masking ratio.
sorted_masks_idx=masking_ratio_vals.argsort()

# Pick num_subset examples from each region so the selection always matches the
# num_subset axes allocated by the plotting cells (previously a fixed width of 3).
# The start ranks are hand-picked: rank 10 sits near the low end, while rank
# 7000 assumes the dataset holds more than 7000 files -- with fewer files the
# "good" slice comes back shorter or empty.
bad_masks_idx=sorted_masks_idx[10:10 + num_subset]
good_masks_idx=sorted_masks_idx[7000:7000 + num_subset]

### Visualize Good Masks

In [None]:
fig, ax = plt.subplots(1, num_subset, figsize=(20, 12))

# enumerate supplies the axis position (c) alongside the file index (i), so no
# hand-maintained counter is needed.
for c, i in enumerate(good_masks_idx):
    ax[c].imshow(cv2.imread(png_files[i], 0), cmap="gray")
    ax[c].axis('off')
    print("Masking Ratio Values :", masking_ratio_vals[i])

plt.show()

### Visualize Bad Masks

In [None]:
fig, ax = plt.subplots(1, num_subset, figsize=(20, 12))

# enumerate supplies the axis position (c) alongside the file index (i), so no
# hand-maintained counter is needed.
for c, i in enumerate(bad_masks_idx):
    ax[c].imshow(cv2.imread(png_files[i], 0), cmap="gray")
    ax[c].axis('off')
    print("Masking Ratio Values :", masking_ratio_vals[i])

plt.show()

### Hence it is definitely advisable to threshold using masking ratio!

In [None]:
# Build one metadata row per PNG (file name, path, masking ratio and whether the
# ratio exceeds THRESH_VALUE) and write the table to CSV without the index.
df = pd.DataFrame.from_dict(
    {
        'image_id': [os.path.basename(png_path) for png_path in png_files],
        'image_path': png_files,
        'masking_ratio': masking_ratio_vals,
        'is_well_segmented': masking_ratio_vals > THRESH_VALUE,
    }
)
df.to_csv('segmented_metadata.csv', index=False)