References for loading DICOM images: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way
and https://www.kaggle.com/tanlikesmath/siim-covid-19-detection-a-simple-eda

In [None]:
# Install the extra packages this notebook needs; the conda output is discarded.
# gdcm: presumably required by pydicom to decode compressed DICOM pixel data — TODO confirm.
!conda install -c conda-forge gdcm -y >> /dev/null
# timm and wandb are imported by later cells.
!pip install timm
!pip install wandb

In [None]:
!wandb login 528ac788f9f5a417bf1f94f3f1423607eb82d84c

# Classification Training Script
This script is written to be easy to modify, so that it can facilitate different types of experiments.

In [None]:
import glob
import os
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import PIL

from sklearn.model_selection import GroupKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

import torch
import torchvision
from torch.utils.data.dataset import Dataset

import timm

import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

import albumentations as A
from albumentations.pytorch import ToTensorV2

# Root of the competition dataset; the two training CSVs are read from here.
DATA_DIR = "/kaggle/input/siim-covid19-detection"
# NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed.
RESIZE_DIR = "/kaggle/input/resized/"
# Target size passed to resize_xray() and used when rescaling bounding boxes.
SIZE = (512, 512)
# Number of GroupKFold splits, and of train/valid folder pairs created on disk.
FOLDS = 5
# Forwarded to YOLOv5's train.py as --epochs.
EPOCHS = 10
# Selects the label map; only the value 4 has a map defined in this notebook.
NUM_CLASSES = 4
# Forwarded to YOLOv5's train.py as --batch.
BATCHSIZE = 24

In [None]:
import wandb
# Authenticate this Python session with Weights & Biases.
wandb.login()

## Clean column names in CSVs
We clean up a few column names in `train_image_level.csv` and `train_study_level.csv` so that the two CSVs can be merged into one.
We also rename the columns to simplified names for later use.

In [None]:
# Image-level table: suffix the study UID with "_study" (presumably so it
# matches the ids in the study-level file — verify) and expose it as `study_id`.
train_images_df = pd.read_csv(os.path.join(DATA_DIR, 'train_image_level.csv'))
train_images_df['StudyInstanceUID'] += "_study"
train_images_df = train_images_df.rename(columns={"StudyInstanceUID": "study_id"})

# Study-level table: keep only the first word of every column name, then
# rename `id` so both tables share the `study_id` join key.
train_study_df = pd.read_csv(os.path.join(DATA_DIR, 'train_study_level.csv'))
train_study_df.columns = [column.split(' ')[0] for column in train_study_df.columns]
train_study_df = train_study_df.rename(columns={"id": "study_id"})

## Label Map Creation
At prediction time, we will need these labels in string form. These maps can also be modified to train different types of models, like Binary classification.

In [None]:
# Maps each study-level label column to the integer class id used for training.
if NUM_CLASSES == 4:
    NAME_TO_LABEL_MAP = {
        "Negative": 0,
        "Typical": 1,
        "Indeterminate": 2,
        "Atypical": 3,
    }
else:
    # Without this branch an unsupported NUM_CLASSES silently leaves the map
    # undefined and only surfaces later as a NameError inside the label helpers.
    raise ValueError(
        f"No label map is defined for NUM_CLASSES={NUM_CLASSES}; add one here."
    )


def get_str_label(row):
    """Return the name of the first label column that is truthy in ``row``.

    Columns are tried in ``NAME_TO_LABEL_MAP`` order; ``None`` is returned
    when none of them is set.
    """
    return next((name for name in NAME_TO_LABEL_MAP if row[name]), None)
def get_int_label(row):
    """Return the integer class id of the first label column truthy in ``row``.

    Columns are tried in ``NAME_TO_LABEL_MAP`` order; ``None`` is returned
    when none of them is set.
    """
    matches = (
        class_id for name, class_id in NAME_TO_LABEL_MAP.items() if row[name]
    )
    return next(matches, None)

## Group K Fold
We create a `fold` column to be used while we run cross validation.
We choose the `study_id` column created above to split into the number of groups defined by the `FOLDS` variable.

In [None]:
# Derive the integer and the string class label of every study from its
# per-class columns (integer label first, to keep the column order).
for label_column, label_fn in (("int_label", get_int_label), ("str_label", get_str_label)):
    train_study_df[label_column] = train_study_df.apply(label_fn, axis=1)

# -1 marks "no fold yet"; the loop below writes the validation fold of each row.
# Rows are addressed through .loc with positional indices, which assumes the
# frame still has its default 0..n-1 index.
train_study_df['fold'] = -1
gkf = GroupKFold(n_splits=FOLDS)
study_groups = train_study_df["study_id"].tolist()
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_study_df, groups=study_groups)):
    train_study_df.loc[val_idx, 'fold'] = fold

### Plotting Group Counts
We see from the plots below that each group is fairly balanced. The `Typical` category is the most common followed by `Negative`.

In [None]:
# One count plot per fold, with one bar per string label.
sns.catplot(x = "str_label", col="fold", data=train_study_df, kind="count")

## Clean Study level data
1. As per the recommendations made [here](https://www.kaggle.com/c/siim-covid19-detection/discussion/246597), for studies with more than one image it appears that there is only one image that has bounding boxes.

2. As per the recommendation made [here](https://www.kaggle.com/c/siim-covid19-detection/discussion/240250#1351079), only the label of the image with bounding boxes is retained, since the other images were not looked at by the annotators.

3. For studies that have more than one image but no bounding boxes associated with them, it is unclear which image was looked at; therefore all images in those studies are retained.

In [None]:
# Attach the study-level labels and fold to every image row.
train_samples_df = train_images_df.merge(train_study_df, on="study_id", how="inner").reset_index(drop=True)

# Per study: how many non-null image ids and how many non-null `boxes`
# entries it has, with the studies holding the most images first.
box_and_images_counts_df = (
    train_samples_df.groupby("study_id")[["id", "boxes"]]
    .count()
    .sort_values(by="id", ascending=False)
    .reset_index()
    .rename(columns={"id": "id_count", "boxes": "boxes_count"})
)

In [None]:
# Distribution of the number of images per study.
sns.countplot(data=box_and_images_counts_df, x="id_count")

In [None]:
# Give every image row the id_count / boxes_count of the study it belongs to.
train_samples_df = train_samples_df.merge(box_and_images_counts_df, on="study_id", how="inner")

In [None]:
# train_samples_df.sort_values(["id_count", "boxes_count"], ascending=False, inplace=True)
# Show the first five rows of the merged table as a sanity check.
train_samples_df.head(5)


In [None]:
# For every fold create the train/valid folders for both images and labels.
for fold in range(FOLDS):
    for kind in ('images', 'labels'):
        for split in ('train', 'valid'):
            os.makedirs(f'/kaggle/tmp/covid/{kind}/{split}{fold}', exist_ok=True)

In [None]:
def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Load a DICOM file and return its pixels as an 8-bit grayscale array.

    Args:
        path: Location of the DICOM file.
        voi_lut: When true, pass the raw pixels through ``apply_voi_lut`` to
            get the "human-friendly" view; otherwise use the raw pixel array.
        fix_monochrome: When true, invert images whose
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``.

    Returns:
        A ``numpy.uint8`` array min-max scaled to 0..255. An image whose
        pixels are all equal comes back as all zeros.
    """
    # Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way
    # dcmread is the supported reader; read_file is a deprecated alias that
    # newer pydicom releases no longer provide.
    dicom = pydicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to
    # "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0; dividing would produce NaNs that are
    # then cast to uint8, so leave the zeros untouched in that case.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

def resize_xray(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image resized to ``size``.

    With ``keep_ratio`` false the image is resized to exactly ``size``; with
    it true ``Image.thumbnail`` is used instead, which modifies the image in
    place. ``resample`` is forwarded to whichever PIL call is used.
    """
    # Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image
    picture = Image.fromarray(array)

    if not keep_ratio:
        return picture.resize(size, resample)

    # thumbnail() returns None and edits `picture` itself.
    picture.thumbnail(size, resample)
    return picture

In [None]:
image_id = []
dim0 = []
dim1 = []
save_dir = '/kaggle/working/train/'

os.makedirs(save_dir, exist_ok=True)

# Convert every training DICOM into a resized JPEG in `save_dir`, recording
# each image's id and original shape (dim0, dim1 = xray.shape[0], xray.shape[1])
# so the bounding boxes can be rescaled later.
for dirname, _, filenames in tqdm(os.walk(os.path.join(DATA_DIR, 'train'))):
    for file in filenames:
        # Split the extension off properly rather than replacing the substring
        # 'dcm' wherever it happens to occur in the file name.
        stem, extension = os.path.splitext(file)
        if extension.lower() != '.dcm':
            # read_xray() only understands DICOM input; ignore stray files.
            continue

        # set keep_ratio=True to have original aspect ratio
        xray = read_xray(os.path.join(dirname, file))
        im = resize_xray(xray, SIZE)
        im = np.array(im)
        cv2.imwrite(os.path.join(save_dir, stem + '.jpg'), im)

        image_id.append(stem)
        dim0.append(xray.shape[0])
        dim1.append(xray.shape[1])

In [None]:
# Later cells join on `id` and strip everything after "_" again, so the ids
# collected from the file names get an "_image" suffix here. The endswith()
# guard keeps the cell idempotent: re-running it must not append the suffix twice.
image_id = [img if img.endswith('_image') else img + '_image' for img in image_id]
meta = pd.DataFrame(list(zip(image_id, dim0, dim1)), columns=['id', 'dim0', 'dim1'])

In [None]:
# Add the original image dimensions (dim0, dim1) to every sample row.
train_samples_df = train_samples_df.merge(meta, on="id", how="inner")

In [None]:
from shutil import copyfile

# One image id per line is expected. Line endings are stripped: raw
# readlines() entries end in '\n' and would never equal a value of the `id`
# column, so nothing would ever be skipped.
# NOTE(review): entries must be spelled exactly like the `id` column —
# confirm whether the file includes the "_image" suffix.
with open('/kaggle/input/ignorelist/ignore.txt', 'r') as f:
    ignoreList = [line.strip() for line in f if line.strip()]
ignored_ids = set(ignoreList)  # O(1) membership tests inside the loops

for fold in range(FOLDS):
    # Copy each image into the split folder it belongs to for this fold:
    # rows whose fold equals the current one are validation, the rest training.
    for row in tqdm(train_samples_df.itertuples(index=False), total=len(train_samples_df)):
        if row.id in ignored_ids:
            continue
        image_name = f'{row.id.split("_")[0]}.jpg'
        split = 'valid' if row.fold == fold else 'train'
        copyfile(
            f'/kaggle/working/train/{image_name}',
            f'/kaggle/tmp/covid/images/{split}{fold}/{image_name}',
        )

In [None]:
# Fetch YOLOv5 and make its checkout the working directory for the cells below.
# NOTE(review): re-running this cell presumably clones/cds relative to the new
# working directory — confirm before re-executing.
!git clone https://github.com/ultralytics/yolov5  
%cd yolov5

# Install YOLOv5's own requirements quietly.
%pip install -qr requirements.txt  

import torch
# Report the torch version and the name of GPU 0, or 'CPU' when CUDA is unavailable.
print(f"Setup complete. Using torch {torch.__version__} ({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

In [None]:
import yaml

# Write one dataset description per fold, pointing at that fold's train and
# valid image folders and declaring the single 'opacity' class.
for fold in range(FOLDS):
    data_yaml = {
        'train': f'/kaggle/tmp/covid/images/train{fold}',
        'val': f'/kaggle/tmp/covid/images/valid{fold}',
        'nc': 1,
        'names': ['opacity'],
    }
    serialized = yaml.dump(data_yaml, default_flow_style=True)
    # Note that I am creating the file in the yolov5/data/ directory.
    with open(f'/kaggle/working/yolov5/data/data{fold}.yaml', 'w') as outfile:
        outfile.write(serialized)

In [None]:
def get_bbox(row):
    """Parse ``row.label`` into a list of ``[v0, v1, v2, v3]`` float boxes.

    The label is a space-separated string read in groups of six tokens. The
    first two tokens of each group are skipped and the remaining four are
    converted to floats. A trailing group with fewer than six tokens is
    still converted but not returned.
    """
    tokens = row.label.split(' ')
    boxes = []
    for start in range(0, len(tokens), 6):
        coords = [float(token) for token in tokens[start + 2:start + 6]]
        # Only complete groups contribute a box.
        if len(coords) == 4:
            boxes.append(coords)
    return boxes

# Scale the bounding boxes according to the size of the resized image. 
def scale_bbox(row, bboxes, size=None):
    """Rescale boxes from the original image to the resized image.

    Args:
        row: Object with ``dim0`` and ``dim1`` attributes holding the original
            array shape (``xray.shape[0]`` and ``xray.shape[1]``, i.e. rows
            and columns).
        bboxes: Iterable of ``[xmin, ymin, xmax, ymax]`` in original pixels.
        size: Optional ``(width, height)`` of the resized image; defaults to
            the module-level ``SIZE``.

    Returns:
        List of ``[xmin, ymin, xmax, ymax]`` integer boxes in resized pixels.
    """
    # SIZE is handed to PIL's resize() and to get_yolo_format_bbox() as
    # (width, height), so x scales with size[0] over the original column
    # count (dim1) and y with size[1] over the original row count (dim0).
    # Identical to the previous behaviour for square sizes.
    target_w, target_h = SIZE if size is None else size
    scale_x = target_w / row.dim1
    scale_y = target_h / row.dim0

    scaled_bboxes = []
    for bbox in bboxes:
        # Round to 4 decimals first, then truncate to whole pixels.
        x = int(np.round(bbox[0] * scale_x, 4))
        y = int(np.round(bbox[1] * scale_y, 4))
        x1 = int(np.round(bbox[2] * scale_x, 4))
        y1 = int(np.round(bbox[3] * scale_y, 4))

        scaled_bboxes.append([x, y, x1, y1])  # xmin, ymin, xmax, ymax

    return scaled_bboxes

# Convert the bounding boxes in YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    """Convert ``[xmin, ymin, xmax, ymax]`` boxes to normalised YOLO boxes.

    Each result is ``[x_center, y_center, width, height]`` divided by the
    image width or height. The centre is the min corner plus half the extent
    rounded with ``np.round`` to a whole pixel.
    """
    def to_yolo(box):
        xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]
        width = xmax - xmin
        height = ymax - ymin
        center_x = xmin + int(np.round(width / 2))
        center_y = ymin + int(np.round(height / 2))
        return [center_x / img_w, center_y / img_h, width / img_w, height / img_h]

    return [to_yolo(box) for box in bboxes]

In [None]:

# Delete every file in the fold-0 validation label folder.
# NOTE(review): presumably clears stale labels from an earlier run; only
# valid0 is cleaned — confirm whether the other label folders need it too.
!rm /kaggle/tmp/covid/labels/valid0/*

In [None]:
# ignoreList may hold raw readlines() output whose entries end in a newline;
# normalise it so the membership test below can actually match an image id.
ignored_ids = {entry.strip() for entry in ignoreList}

# Write one YOLO label file per image and fold: into train{fold} when the
# image's fold differs from the current one, into valid{fold} otherwise.
for fold in range(FOLDS):
    for row in tqdm(train_samples_df.itertuples(index=False), total=len(train_samples_df)):
        if row.id in ignored_ids:
            continue
        # Get image id
        img_id = row.id.split("_")[0]
        split = 'valid' if row.fold == fold else 'train'
        file_name = f'/kaggle/tmp/covid/labels/{split}{fold}/{img_id}.txt'

        # boxes_count is a per-study total, so it is also positive for the
        # box-less images of a study in which another image has boxes.
        # Require this image's own `boxes` value as well; otherwise its
        # placeholder label would be parsed into a bogus box.
        has_boxes = row.boxes_count > 0 and pd.notna(row.boxes)

        lines = []
        if has_boxes:
            # Parse, rescale to the resized image, then convert to YOLO format.
            bboxes = get_bbox(row)
            scale_bboxes = scale_bbox(row, bboxes)
            yolo_bboxes = get_yolo_format_bbox(SIZE[0], SIZE[1], scale_bboxes)
            # Class id 0 is the only class ('opacity').
            lines = [
                ' '.join(str(value) for value in [0] + bbox) + '\n'
                for bbox in yolo_bboxes
            ]

        # An image without boxes still gets an (empty) label file.
        with open(file_name, 'w') as f:
            f.writelines(lines)
            

In [None]:
# fold = 0
# data_file = 'data'+str(fold)+'.yaml'
# !python train.py --img {SIZE[0]} \
#                  --batch {BATCHSIZE} \
#                  --epochs {EPOCHS} \
#                  --data {data_file} \
#                  --weights yolov5l6.pt \
#                  --evolve \
#                  --cache

In [None]:
# Train YOLOv5 on one fold, using the data{fold}.yaml written earlier and the
# SIZE / BATCHSIZE / EPOCHS constants defined near the top of the notebook.
# NOTE(review): --evolve presumably switches train.py to hyperparameter
# evolution rather than a single training run — confirm this is intended.
fold = 1
data_file = 'data'+str(fold)+'.yaml'
!python train.py --img {SIZE[0]} \
                 --batch {BATCHSIZE} \
                 --epochs {EPOCHS} \
                 --data {data_file} \
                 --weights yolov5l6.pt \
                 --evolve \
                 --cache

In [None]:
# fold = 2
# data_file = 'data'+str(fold)+'.yaml'
# !python train.py --img {SIZE[0]} \
#                  --batch {BATCHSIZE} \
#                  --epochs {EPOCHS} \
#                  --data {data_file} \
#                  --weights yolov5l6.pt \
#                  --evolve \
#                  --cache

In [None]:
# fold = 3
# data_file = 'data'+str(fold)+'.yaml'
# !python train.py --img {SIZE[0]} \
#                  --batch {BATCHSIZE} \
#                  --epochs {EPOCHS} \
#                  --data {data_file} \
#                  --weights yolov5l6.pt \
#                  --evolve \
#                  --cache

In [None]:
# fold = 4
# data_file = 'data'+str(fold)+'.yaml'
# !python train.py --img {SIZE[0]} \
#                  --batch {BATCHSIZE} \
#                  --epochs {EPOCHS} \
#                  --data {data_file} \
#                  --weights yolov5l6.pt \
#                  --evolve \
#                  --cache