# 0. Objective

The objective of this notebook is to provide a minimum viable pipeline using [fastai](https://docs.fast.ai/) for classification and detection of opacity in X-ray images. Anyone who is new to this competition can start from this notebook and learn the following:

* Basic understanding of the nature of medical imaging data
* Introduction to fastai library for [medical imaging](https://docs.fast.ai/medical.imaging)
* Submission of predictions

Progress so far:

    1. Data Overview           (Done)
    2. Data Preprocessing      (Done)
    3. Preparing Datablock     (Done)
    4. Training                (Done)
    5. Predictions             (In Progress)
    6. Submission              (TBD)

I am using the following two notebooks as references:
* https://www.kaggle.com/avirdee/siim-covid-19-initial-pipeline-fastai
* https://github.com/muellerzr/Practical-Deep-Learning-for-Coders-2.0/blob/master/Computer%20Vision/06_Object_Detection.ipynb

    

# 1. Data Overview

In [None]:
# Load Grassroots DICOM (GDCM) for xray DICOM files
!pip install python-gdcm -q

# Load glob2
!pip install glob2

# Load tqdm
!pip install tqdm

In [None]:
# Loading necessary packages
import os
from datetime import datetime
import pandas as pd
import numpy as np
import glob2
from tqdm.notebook import tqdm
import cv2
import gdcm
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from fastai.vision.all import *
from fastai.medical.imaging import *
from torchvision.utils import save_image

In [None]:
# Root folder of the competition data; every CSV and DICOM path below is built from it.
SOURCE = '/kaggle/input/siim-covid19-detection'
# List the top-level contents as a quick check that the dataset is attached.
os.listdir(SOURCE)

In [None]:
# Image-level table: the cells below use its `id`, `label` and `StudyInstanceUID` columns.
train_image_level = pd.read_csv(f'{SOURCE}/train_image_level.csv')
# Study-level table: the cells below use its `id` column and the four appearance columns.
train_study_level = pd.read_csv(f'{SOURCE}/train_study_level.csv')
# Submission template (only previewed in this notebook so far).
sample_submission = pd.read_csv(f'{SOURCE}/sample_submission.csv')

In [None]:
train_image_level.head()

In [None]:
train_study_level.head()

In [None]:
sample_submission.head()

In [None]:
# XRAY Files
def get_dcm_files(path, recurse=True, folders=None):
    "Collect the `.dcm` files under `path` (recursively by default), limited to `folders` when given."
    dicom_only = ['.dcm']
    found = get_files(path, extensions=dicom_only, recurse=recurse, folders=folders)
    return found

# Read DICOM files
# Both directory constants end with '/', which matters wherever a path is later
# built from them by plain string concatenation.
TRAIN_DIR = f'{SOURCE}/train/'
TEST_DIR =  f'{SOURCE}/test/'
# Lists of every .dcm file found under the train / test folders.
train_dcm = get_dcm_files(TRAIN_DIR)
test_dcm = get_dcm_files(TEST_DIR)

# Look at one sample X-ray: read the second file in the list and display its DICOM dataset.
xray_sample = train_dcm[1].dcmread()
xray_sample

In [None]:
xray_sample.show()

# 2. Data Preprocessing

In [None]:
# Merging study_level and image_level
# rename id column in study_level to StudyInstanceUID so it matches the key in image_level
train_study_level.rename(columns = {'id':'StudyInstanceUID'}, inplace = True)

# remove the literal '_study' text from StudyInstanceUID (regex=False: plain substring, not a pattern)
train_study_level['StudyInstanceUID'] = train_study_level['StudyInstanceUID'].str.replace('_study', '', regex=False)

# merge: one row per image, with its study's class columns attached (pandas default inner join)
df_train = pd.merge(train_image_level, train_study_level, on='StudyInstanceUID')

# remove the literal '_image' text from id column
df_train['id'] = df_train['id'].str.replace('_image', '', regex=False)

# rename id column as imageID and shorten the four target column names in one pass
df_train.rename(columns = {
    'id': 'imageID',
    'Negative for Pneumonia': 'negative',
    'Typical Appearance': 'typical',
    'Indeterminate Appearance': 'indeterminate',
    'Atypical Appearance': 'atypical',
}, inplace = True)

# Create a new target column holding the name of the single flagged class per row
categories = ['negative','typical','indeterminate','atypical']
df = df_train[categories]
flagged = df.ne(0)
# The target is only well defined when exactly one class column is non-zero in every row;
# fail loudly here instead of silently assigning a wrong or shifted label.
assert (flagged.sum(axis=1) == 1).all(), "each row must have exactly one non-zero class column"
# idxmax works row by row on the frame's own index, so the result stays aligned with
# df_train whatever its index looks like.
df_train["target"] = flagged.astype(int).idxmax(axis=1)
df_train.head()

In [None]:
# Creating path column for each image
TRAIN_DIR = f'{SOURCE}/train/'
paths = []

# Files sit two levels below the study folder and are named after the image id
# (the lookup helpers later match `fn.name[:-4]` against imageID). Match on the
# image id as well as the study id: globbing on the study alone and taking the
# first hit gives every image of a multi-image study the same file.
for study_id, image_id in tqdm(zip(df_train['StudyInstanceUID'], df_train['imageID']),
                               total=len(df_train)):
    pattern = os.path.join(TRAIN_DIR, study_id, '*', f'{image_id}.dcm')
    # sorted() makes the choice deterministic should more than one file match
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f'No DICOM file found for image {image_id} (pattern: {pattern})')
    paths.append(matches[0])

df_train['path'] = paths
df_train[:5]

In [None]:
# Calculate number of bounding boxes
# Source: https://www.kaggle.com/avirdee/siim-covid-19-initial-pipeline-fastai
# Each annotation in `label` takes 6 space-separated tokens, so the number of
# annotations is the token count divided by 6.
# NOTE(review): a label made of a single 6-token entry counts as 1 box whatever
# its class token says — confirm that is intended for images without findings.
num_of_boxes = [len(label.split(' ')) // 6 for label in df_train['label']]

df_train['num_of_boxes'] = num_of_boxes
df_train.head()

In [None]:
df_train['num_of_boxes'].value_counts()

In [None]:
# Parse label column
# Every annotation is 6 tokens; tokens 2..5 of each annotation are the four
# values kept as box coordinates (the first two tokens are skipped).
TOKENS_PER_BOX = 6
COORDS_OFFSET = 2
COORDS_PER_BOX = 4
# Keep box and label counts in step: the label lookup used for the DataBlock must
# emit one label per parsed box, so raise this cap only after confirming that
# lookup handles more than 4 boxes per image.
MAX_BOXES = 4

bboxes = []
for label, box_count in zip(df_train['label'], df_train['num_of_boxes']):
    val = label.split(' ')
    # Start from an empty list for every row so a row can never inherit the
    # previous row's boxes (which happened for counts outside 1..4).
    boxes = []
    for k in range(min(box_count, MAX_BOXES)):
        start = k * TOKENS_PER_BOX + COORDS_OFFSET
        boxes += val[start:start + COORDS_PER_BOX]
    bboxes.append(boxes)

df_train['parsed_label'] = bboxes
df_train.head()

# 3. Preparing DataBlock

In [None]:
# Subsetting df_train on columns required for datablock
# NOTE: the column order is load-bearing. The lookup helpers below index the
# NumPy copy of this frame by position: 0 = imageID, 1 = target, -2 = parsed_label.
df_datablock = df_train[['imageID', 'target', 'parsed_label', 'path']].copy()
df_datablock.head()

In [None]:
# Build the list of image files once; get_items() simply hands this list back.
im_df = df_datablock['path'].unique()
fns = list(map(Path, map(str, im_df)))

def get_items(noop):
    "Return the precomputed file list `fns`; the argument is ignored."
    return fns

In [None]:
# Convert data frame to numpy array for faster processing
# Columns keep the order of df_datablock, and the helpers below address them by position.
df_np = df_datablock.to_numpy()
# Show the first row as a sanity check of that layout.
df_np[0]

In [None]:
def get_tmp_bbox(fn):
    "Look up the parsed coordinate strings for the image named by `fn` in `df_np` and return them as a float array."
    image_id = fn.name[:-4]  # file name without its 4-character extension
    matching = df_np[np.where(df_np[:, 0] == image_id)]
    coord_strings = matching[:, -2][0]  # parsed_label of the first matching row
    parsed = []
    for token in coord_strings:
        parsed.append(np.fromstring(token, sep=','))
    return np.array(parsed)

def get_tmp_lbl(fn):
    """Return the target label of the image named by `fn`, repeated once per box.

    The image is found by comparing the file name without its 4-character
    extension to column 0 of `df_np`. The number of boxes is derived from the
    length of the parsed coordinate list (4 values per box), rounded up, and
    never less than 1. For 0-16 coordinates this is the same result as the
    previous threshold ladder; longer lists now get a matching number of labels
    instead of being capped at 4.
    """
    rows = np.where(df_np[:, 0] == fn.name[:-4])
    labels = df_np[rows][:, 1]
    n_coords = len(df_np[rows][:, -2][0])
    # ceil(n_coords / 4), with a floor of one label
    n_boxes = max(-(-n_coords // 4), 1)
    return np.tile(labels, n_boxes)

In [None]:
get_tmp_bbox(get_items(SOURCE)[2])

In [None]:
get_tmp_lbl(get_items(SOURCE)[2])

In [None]:
# Build the lookup row for the first file: [image id, box array, label array].
bboxs = get_tmp_bbox(fns[0])
lbls = get_tmp_lbl(fns[0])
# dtype=object lets a single cell hold a whole array.
arr = np.array([fns[0].name[:-4], bboxs, lbls], dtype=object)
arr

In [None]:
# Whole dataset
# Build the (n_files, 3) lookup table [image id, box array, label array] in one go.
# It is rebuilt from scratch for all of `fns`, so re-running the cell cannot append
# duplicate rows, and preallocated, so the table is not copied on every iteration
# as it was with repeated np.vstack. The table is 2-D even for a single file.
arr = np.empty((len(fns), 3), dtype=object)
for row, path in enumerate(fns):
    arr[row, 0] = path.name[:-4]
    arr[row, 1] = get_tmp_bbox(path)
    arr[row, 2] = get_tmp_lbl(path)

In [None]:
def get_bbox(fn):
    "Return the box array stored in `arr` for the image named by `fn`."
    image_id = fn.name[:-4]  # file name without its 4-character extension
    hits = arr[np.where(arr[:, 0] == image_id)]
    first_hit = hits[0]
    return first_hit[1]

def get_lbl(fn):
    "Return the label array stored in `arr` for the image named by `fn`."
    image_id = fn.name[:-4]  # file name without its 4-character extension
    hits = arr[np.where(arr[:, 0] == image_id)]
    first_hit = hits[0]
    return first_hit[-1]

In [None]:
get_bbox(get_items(SOURCE)[2])

In [None]:
get_lbl(get_items(SOURCE)[2])

In [None]:
# Source: https://www.kaggle.com/avirdee/siim-covid-19-initial-pipeline-fastai
class HistView(PILDicom):
    "View histogram scaled version of the pixel array"
    @classmethod
    def create(cls, fn:(Path, str, bytes))->None:
        """Read a DICOM from a path or raw bytes and return it as an 8-bit `HistView` image.

        The pixel data is histogram-scaled with `hist_scaled()`, shifted so its
        minimum is 0, stretched so its maximum is 1 and converted to uint8 in 0..255.

        Raises `TypeError` when `fn` is not a `Path`, `str` or `bytes`.

        NOTE(review): the `->None` annotation is kept on purpose; it presumably
        mirrors fastai's own `create` signatures, where the annotation affects how
        the result type is handled — confirm before changing it.
        """
        if isinstance(fn, bytes): im = pydicom.dcmread(pydicom.filebase.DicomBytesIO(fn))
        elif isinstance(fn, (Path, str)): im = pydicom.dcmread(fn)
        else: raise TypeError(f"HistView.create expects Path, str or bytes, got {type(fn).__name__}")
        scaled = np.array(im.hist_scaled())
        scaled = scaled - np.min(scaled)
        peak = np.max(scaled)
        # A constant image has peak == 0 after the shift; skip the division so the
        # result is all zeros instead of NaN being cast to uint8.
        if peak > 0: scaled = scaled / peak
        scaled = (scaled * 255).astype(np.uint8)
        pill_im = Image.fromarray(scaled)
        return cls(pill_im)

In [None]:
# Seed first — presumably so the random train/validation split below is repeatable.
set_seed(7)
# Three blocks: the image (opened through HistView) is the only input (n_inp=1);
# the boxes and the per-box labels are the two targets, fetched by get_bbox / get_lbl.
# Images are padded/resized to 128 px per item, then augmented and normalized per batch.
# NOTE(review): Flip() and Dihedral() are both applied — the dihedral transform
# presumably already covers flips, so one of them may be redundant; confirm.
datablock = DataBlock(blocks=(ImageBlock(cls=HistView), BBoxBlock, BBoxLblBlock),
                 get_items=get_items,
                 splitter=RandomSplitter(),
                 get_y=[get_bbox, get_lbl],
                 item_tfms=[Resize(128, method='pad'),],
                 batch_tfms=[Rotate(), Flip(), Dihedral(), Normalize.from_stats(*imagenet_stats)],
                 n_inp=1)

# get_items ignores its argument and returns the precomputed `fns` list, so the
# TRAIN_DIR passed here does not select which files are loaded.
dls = datablock.dataloaders(TRAIN_DIR, bs=128)
dls.show_batch(max_n=20, ncols=5)

In [None]:
# Checking shape of a batch
# `batch` is indexed as 0, 1 and 2 in the cells below, matching the three blocks
# of the DataBlock in order (image, boxes, box labels).
batch = dls.one_batch()
batch[0].shape

In [None]:
batch[1].shape

In [None]:
batch[1][0]

In [None]:
batch[2].shape

In [None]:
batch[2][0]

# 4. Training

In [None]:
# Source: https://github.com/muellerzr/Practical-Deep-Learning-for-Coders-2.0/blob/master/Computer%20Vision/06_Object_Detection.ipynb
# Clone the reference repo and change into its "Computer Vision" folder — presumably
# so that `from imports import *` in the next cell resolves to that folder's module.
# NOTE: the clone is not pinned to a commit, and the relative %cd changes the working
# directory for every later cell. This cell is not safe to re-run in the same session:
# the clone target already exists and the relative path no longer resolves.
!git clone https://github.com/muellerzr/Practical-Deep-Learning-for-Coders-2.0.git
%cd "Practical-Deep-Learning-for-Coders-2.0/Computer Vision"

In [None]:
from imports import *

In [None]:
encoder = create_body(resnet34, pretrained=True)

In [None]:
arch = RetinaNet(encoder, get_c(dls), final_bias=-4)

In [None]:
# Loss function
# Passed to RetinaNetFocalLoss below as `ratios` / `scales` — presumably the anchor-box
# aspect ratios and relative scales; confirm against the cloned repo's implementation.
ratios = [1/2,1,2]
scales = [1,2**(-1/3), 2**(-2/3)]

In [None]:
crit = RetinaNetFocalLoss(scales=scales, ratios=ratios)

In [None]:
def _retinanet_split(m):
    "Split the parameters of `m` into two groups: the encoder, and all remaining layers."
    head = nn.Sequential(m.c5top6, m.p6top7, m.merges, m.smoothers, m.classifier, m.box_regressor)
    groups = L(m.encoder, head)
    return groups.map(params)

In [None]:
learn = Learner(dls, arch, loss_func=crit, splitter=_retinanet_split)

In [None]:
learn.freeze()

In [None]:
learn.fit_one_cycle(2, slice(1e-5, 1e-4))

In [None]:
# Saving weights of the trained model
TRAINED_MODELS_DIR = '/kaggle/working/trained_models_dir/'
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# as soon as the folder existed from an earlier run.
os.makedirs(TRAINED_MODELS_DIR, exist_ok=True)

# Timestamped name so repeated runs do not overwrite earlier weights.
timestamp = datetime.now().strftime("_%Y%m%d_%H%M%S_")
file_name = TRAINED_MODELS_DIR + "trainedModelWeights" + timestamp
learn.save(file = file_name)

In [None]:
# Exporting the trained model
# A fresh timestamp is taken here, so the suffix of the export can differ from
# the one used for the weights file in the previous cell.
timestamp = datetime.now().strftime("_%Y%m%d_%H%M%S_")
file_name = TRAINED_MODELS_DIR + "trainedModelExport" + timestamp + ".pkl"
# NOTE(review): the exported Learner presumably needs the custom definitions used
# above (HistView, get_items, get_bbox, get_lbl, the loss) to be available when it
# is loaded again — confirm before relying on the .pkl in another notebook.
learn.export(fname = file_name)
# List the folder to confirm both files were written.
os.listdir(TRAINED_MODELS_DIR)

# 5. Predictions

In [None]:
# Hard-coded test image used as a smoke test for prediction.
# TEST_DIR already ends with '/', so the resulting string contains '//'.
sample_img_path = TEST_DIR + '/2fb11712bc93/b056067b8455/a29c5a68b07b.dcm'
sample_img_path

In [None]:
learn.predict(sample_img_path)

# 6. Submission