This notebook uses the following Kaggle notebook as a reference: https://www.kaggle.com/jandal487/siim-covid19-classification-detection

# Prep

In [None]:
# Load Grassroots DICOM (GDCM) for xray DICOM files
!pip install python-gdcm -q

# Load glob2
!pip install glob2

# Load tqdm
!pip install tqdm

# !pip install --upgrade numpy==1.20.0 --no-binary

In [None]:
# Loading necessary packages
import ast
import os
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob2
from tqdm.notebook import tqdm
import cv2
import gdcm
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from fastai.vision.all import *
from fastai.medical.imaging import *
from torchvision.utils import save_image

In [None]:
# Root directory of the competition data; every path below is built from it.
SOURCE = '/kaggle/input/siim-covid19-detection'
# Show the top-level entries of the dataset directory.
os.listdir(SOURCE)

In [None]:
# Load the three CSV tables shipped with the dataset: image-level labels,
# study-level labels and the sample submission.
train_image_level = pd.read_csv(f'{SOURCE}/train_image_level.csv')
train_study_level = pd.read_csv(f'{SOURCE}/train_study_level.csv')
sample_submission = pd.read_csv(f'{SOURCE}/sample_submission.csv')

# Data Overview and processing

In [None]:
# Preview the first ten rows of the image-level table.
train_image_level.head(10)

In [None]:
# Preview the first rows of the study-level table.
train_study_level.head()

In [None]:
# Preview the first rows of the sample submission.
sample_submission.head()

In [None]:
# XRAY Files
def get_dcm_files(path, recurse=True, folders=None):
    """Collect the `.dcm` files found under `path`.

    Thin wrapper around `get_files` that fixes the extension filter to
    `.dcm`; `recurse` and `folders` are forwarded to it unchanged.
    """
    dcm_extensions = ['.dcm']
    return get_files(
        path,
        extensions=dcm_extensions,
        recurse=recurse,
        folders=folders,
    )

# Locations of the train and test DICOM trees inside the dataset.
TRAIN_DIR = f'{SOURCE}/train/'
TEST_DIR =  f'{SOURCE}/test/'

# Lists of every .dcm file found under each tree.
train_dcm = get_dcm_files(TRAIN_DIR)
test_dcm = get_dcm_files(TEST_DIR)

# Read the second training file as a sample X-ray.
# NOTE(review): `dcmread()` is called on the path object itself — presumably a
# method added by `fastai.medical.imaging`; confirm.
xray_sample = train_dcm[1].dcmread()

In [None]:
# Display the sample X-ray read above.
xray_sample.show()

In [None]:
# Join the image-level and study-level labels into one table keyed by image id.
# Both `id` columns lose their last six characters (presumably the
# `_study` / `_image` suffix — confirm against the CSVs).
train_study_level['StudyInstanceUID'] = train_study_level['id'].map(lambda study_id: study_id[:-6])

train_map = (
    train_image_level
    .merge(train_study_level, on='StudyInstanceUID')
    .assign(ImageID=lambda merged: merged['id_x'].map(lambda image_id: image_id[:-6]))
    .drop(columns=['id_y', 'id_x'])   # the raw ids are no longer needed
    .set_index('ImageID')
    .sort_values('StudyInstanceUID')
)

In [None]:
# Preview the first two rows of the merged table.
train_map.head(2)

In [None]:
# Count how many rows (images) each study has.
train_map['StudyInstanceUID'].value_counts()

In [None]:
# Inspect every row that belongs to one specific study.
train_map[train_map['StudyInstanceUID'] == '0fd2db233deb']

In [None]:
# functions to show images with boxes or training images

import re
import matplotlib.patches as patches

def show_img_with_boxes(image, boxes):
    """Draw a DICOM image with a red outline for every box.

    Args:
        image: Object whose `dcmread()` result exposes `pixel_array`
            (e.g. an entry of `train_dcm`).
        boxes: Iterable of indexable items; elements 0-3 of each item are
            converted to float and used as x, y, width and height of the
            rectangle.
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(10, 10)
    ax.imshow(image.dcmread().pixel_array)

    for box in boxes:
        x, y, width, height = (float(box[k]) for k in range(4))
        outline = patches.Rectangle((x, y), width, height,
                                    edgecolor='red', fill=False)
        ax.add_patch(outline)

    plt.show()

def show_training_img(image_num=0):
    """Show one training X-ray with its annotated boxes and print its study labels.

    Args:
        image_num: Position of the image in the module-level `train_dcm` list.

    Raises:
        KeyError: If the id derived from the file name is not in the index
            of `train_map`.
    """
    dcm_path = train_dcm[image_num]
    # The image id is the file name without its extension. Taking it from the
    # path avoids the previous regex, whose `[0-z]` class also matched
    # punctuation and whose non-raw string used invalid escape sequences.
    image_id = os.path.splitext(os.path.basename(str(dcm_path)))[0]

    record = train_map.loc[image_id]
    boxes_str = record.boxes

    # Non-string values (presumably NaN for images without annotations) mean
    # there is nothing to draw.
    if isinstance(boxes_str, str):
        # The annotation is a Python-literal list of dicts. literal_eval also
        # handles integer and negative coordinates, which the old
        # "digits.digits" regex silently dropped.
        parsed = ast.literal_eval(boxes_str)
        if isinstance(parsed, dict):
            parsed = [parsed]
        boxes = [(b['x'], b['y'], b['width'], b['height']) for b in parsed]
    else:
        boxes = []

    print(record[['Negative for Pneumonia',
       'Typical Appearance', 'Indeterminate Appearance',
       'Atypical Appearance']])
    show_img_with_boxes(dcm_path, boxes)

In [None]:
# Show the first two training images with their boxes and labels.
for i in range(2):
    show_training_img(i)

# EDA for the box sizes - searching for patterns

In [None]:
# Collect every annotated box into one table, one row per box.
box_records = []
for boxes_str in train_map['boxes']:
    # Non-string values (presumably NaN for images without annotations)
    # contribute no boxes.
    if not isinstance(boxes_str, str):
        continue
    # The annotation is a Python-literal list of dicts; literal_eval also
    # accepts integer and negative coordinates, which the previous
    # "digits.digits" regex silently skipped.
    parsed = ast.literal_eval(boxes_str)
    if isinstance(parsed, dict):
        parsed = [parsed]
    box_records.extend(
        {'x': float(b['x']), 'y': float(b['y']),
         'width': float(b['width']), 'height': float(b['height'])}
        for b in parsed
    )

# Build the frame once from the collected records: growing it with
# DataFrame.append per row is quadratic, and append was removed in pandas 2.0.
boxes_data = pd.DataFrame(box_records, columns=['x', 'y', 'width', 'height'])

In [None]:
# Preview the first rows of the per-box table.
boxes_data.head()

In [None]:
# Summary statistics of the box columns.
boxes_data.describe()

In [None]:
# Distribution of box widths.
sns.displot(x=boxes_data['width'])

In [None]:
# Distribution of box heights.
sns.displot(x=boxes_data['height'])

In [None]:
# Box width against box height, one point per box.
sns.scatterplot(data=boxes_data, x='width', y='height')

In [None]:
# Distribution of the height/width ratio of the boxes, with a KDE overlay.
sns.displot(boxes_data['height']/boxes_data['width'], kde=True)

In [None]:
from sklearn.cluster import KMeans

# kmeans for width and height data of boxes
# .copy() gives an independent frame, so adding the `labels` column below does
# not write into a slice of `boxes_data` (which raises SettingWithCopyWarning).
box_sizes = boxes_data[['width', 'height']].copy()
# Fixed seed so the cluster assignments are reproducible across runs.
kmeans = KMeans(n_clusters=11, random_state=42)
kmeans.fit(box_sizes[['width', 'height']])
box_sizes['labels'] = kmeans.labels_

# Scatter of the boxes coloured by cluster, plus the size of each cluster.
sns.scatterplot(data=box_sizes, x='width', y='height', hue='labels', palette='rainbow')
print(box_sizes['labels'].value_counts())

In [None]:
# kmeans considering ratio of height / width
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `boxes_data` (which raises SettingWithCopyWarning).
box_size_ratio = boxes_data[['width', 'height']].copy()
# The ratio is multiplied by 1000 to adjust its scale (presumably so it is not
# dwarfed by the pixel-sized width/height in the distance computation).
box_size_ratio['ratio'] = box_size_ratio['height'] / box_size_ratio['width'] * 1000 # adjusting scale of ratio
# Fixed seed so the cluster assignments are reproducible across runs.
kmeans2 = KMeans(n_clusters=7, random_state=42)
kmeans2.fit(box_size_ratio[['width', 'height', 'ratio']])
box_size_ratio['labels'] = kmeans2.labels_

# Scatter of the boxes coloured by cluster, plus the size of each cluster.
sns.scatterplot(data=box_size_ratio, x='width', y='height', hue='labels', palette='rainbow')
print(box_size_ratio['labels'].value_counts())

In [None]:
# Cluster centres of the first k-means fit (width/height only): one row per
# cluster, extended with the height/width ratio of that centre.
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['width', 'height'])
centers = centers.assign(ratio=centers['height'] / centers['width'])

# Print the ratios and plot their distribution in bins of 0.1.
print(centers['ratio'])
sns.displot(centers['ratio'], binwidth=0.1)

In [None]:
# visualization: draw every cluster-centre box on a sample image, widest first,
# each one shifted by a fixed step so the boxes are laid out diagonally.
spacing_x = 350
spacing_y = 200

average_boxes = centers.sort_values('width', ascending=False)

# Derive the number of offsets from the data instead of hard-coding 11, so the
# cell keeps working when the number of clusters changes.
n_boxes = len(average_boxes)
average_boxes['x'] = [spacing_x * k for k in range(1, n_boxes + 1)]
average_boxes['y'] = [spacing_y * k for k in range(1, n_boxes + 1)]

# show_img_with_boxes expects (x, y, width, height) per box.
average_boxes = average_boxes[['x', 'y', 'width', 'height']].to_numpy()

show_img_with_boxes(train_dcm[1], average_boxes)

# check image pixel data

In [None]:
# Shape of the pixel array of the first training image.
train_dcm[0].dcmread().pixel_array.shape

In [None]:
# Pixel-array shapes of the first 100 training images only (a sample, not the
# full set), stacked into one array with a row per image.
sizes_list = [dcm_file.dcmread().pixel_array.shape for dcm_file in train_dcm[:100]]

image_array = np.array(sizes_list)


In [None]:
# `pixel_array.shape` is (rows, columns), i.e. (height, width), so the first
# column holds heights and the second widths.
img_sizes = pd.DataFrame(image_array, columns=['height', 'width'])

In [None]:
# Preview the first rows of the sampled image sizes.
img_sizes.head()

In [None]:
# Add a `ratio` column: the `height` column divided by the `width` column.
img_sizes['ratio'] = img_sizes['height']/img_sizes['width']

In [None]:
# Distribution of the `ratio` column over the sampled images.
sns.displot(img_sizes['ratio'])

In [None]:
# Five most frequent values of the `width` column in the sample.
img_sizes['width'].value_counts().head(5)

In [None]:
# Five most frequent values of the `height` column in the sample.
img_sizes['height'].value_counts().head(5)

In [None]:
# Five most frequent values of the `ratio` column in the sample.
img_sizes['ratio'].value_counts().head(5)

In [None]:
# Five most frequent combinations of all columns (whole rows) in the sample.
img_sizes.value_counts().head(5)