## High-level info

* Ideal image resolution: https://pubs.rsna.org/doi/10.1148/ryai.2019190177
    * Performance plateaus around 300 x 300 -- use a higher resolution (512 x 512) for nodule pathologies
* Outputs:
    * We are going to output the label of the study
        * One of: Atypical, Typical, Indeterminate, Negative
    * Also the coordinates of the bounding box if possible
    

## What do we know about DICOM files
DICOM files seem to contain not only the image but also its metadata:
* Image type
* Study date and time
* Patient UID - name, ID, sex
* Body part examined
* samples per pixel
* Bit information (high bit)
* Image size (number of rows and columns)
* Number of elements (pixel data)

Information about Pydicom (full user guide is [here](https://pydicom.github.io/pydicom/stable/old/pydicom_user_guide.html)):

Important methods and properties:
***FileDataset* object**
* `Dataset.pixel_array` -- converts pydicom object into pixel array
* `Dataset.PatientName` -- returns the value of the `PatientName` data element stored in the dataset (the same attribute-style access works for other DICOM keywords)
* `Dataset.Rows` & `Dataset.Columns` -- returns the number of rows and columns, respectively

### Implementing YOLOv5 Preprocessing Pipeline

In [None]:
from sklearn import model_selection
import cv2
from matplotlib import patches
import ast
from tqdm import tqdm
import os
import pydicom as dicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import numpy as np
from pathlib import Path
import pandas as pd
from fastai.medical.imaging import *
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
# Run if you don't have file structure
!rm -r labels images
# Directories of the YOLO layout (images/ and labels/, each with a train and a
# validation split). exist_ok=True makes the cell safe to re-run.
dirs = ['images/', 'labels/', 'images/train', 'images/validation', 'labels/train', 'labels/validation']
# Loop variable is not called `dir` so the builtin dir() stays usable in later cells.
for dir_path in dirs:
    os.makedirs(dir_path, exist_ok=True)

In [None]:
# Load the image-level training CSV for a first look; the preprocessing cell
# further down reads the same file again before adding columns.
train = pd.read_csv('../input/siim-covid19-detection/train_image_level.csv')

In [None]:
train

In [None]:
def get_bbox_format(row):
    """Parse one cell of the ``boxes`` column into a list of box value lists.

    Parameters
    ----------
    row : str, float or None
        Either a string holding a Python-literal list of dicts (one dict per
        box), or a missing value (NaN / None) when the image has no boxes.

    Returns
    -------
    list of list
        One inner list per box, holding the dict's values in their stored
        order. A missing cell yields the placeholder ``[[0, 0, 1, 1]]``.
    """
    # Test for "missing" by value, not by identity: `row is np.nan` is only
    # true for that one singleton object and misses other NaN floats and None.
    if row is None or (isinstance(row, float) and np.isnan(row)):
        return [[0, 0, 1, 1]]
    # literal_eval only evaluates literals, so the CSV text is never executed.
    bbox_list = ast.literal_eval(row)
    return [list(box.values()) for box in bbox_list]
            
# Build the image-level table: parsed boxes, the bare image id, and the
# study / series / file path of each DICOM image.
train = pd.read_csv('../input/siim-covid19-detection/train_image_level.csv')
train['bbox'] = train.boxes.apply(get_bbox_format)
# Keep only the part of the id that precedes the "_image" suffix.
train['image'] = train['id'].str.extract('(.*)_image')
img_paths = get_dicom_files('../input/siim-covid19-detection/train')

# The ids are read off each path: <study>/<series>/<image>.dcm
path_df = pd.DataFrame(
    [(path.stem, path.parent.parent.name, path.parent.name, path)
     for path in tqdm(img_paths, total=len(img_paths))],
    columns=['image', 'study', 'series', 'path'],
)
# One left join replaces a full-column comparison per file. keep='last'
# mirrors the old loop, where a later file with the same stem overwrote an
# earlier one; images without a file keep NaN in the three new columns.
train = train.merge(path_df.drop_duplicates('image', keep='last'), on='image', how='left')

In [None]:
train.sample(5)

In [None]:
# Collapse the four one-hot study-level columns into a single string label
# and a single integer class id per study.
study_df = pd.read_csv('../input/siim-covid19-detection/train_study_level.csv')

# (one-hot CSV column, short class name); the list position is the class id.
class_table = [
    ('Negative for Pneumonia', "negative"),
    ('Typical Appearance', "typical"),
    ('Indeterminate Appearance', "indeterminate"),
    ('Atypical Appearance', "atypical"),
]

conditions = [study_df[column] == 1 for column, _ in class_table]
choices_str = [name for _, name in class_table]
choices_int = list(range(len(class_table)))

# Rows where none of the columns equals 1 get None for both labels.
study_df['label'] = np.select(conditions, choices_str, None)
study_df['label_int'] = np.select(conditions, choices_int, None)
# Keep only the part of the id that precedes the "_study" suffix.
study_df['study'] = study_df['id'].str.extract('(.*)_study')

In [None]:
# Attach the study-level label to every image row; the left join keeps images
# whose study has no entry in study_df (their label columns become NaN).
image_cols = ['image', 'series', 'study', 'path', 'bbox']
label_cols = ['study', 'label', 'label_int']
labeled_df = train[image_cols].merge(study_df[label_cols], how='left', on='study')
labeled_df.head()

In [None]:
# Shuffle the rows with a fixed seed, then hold out 10% for validation.
# NOTE(review): the split is made per row, with no grouping by study, so images
# of one study may end up on both sides -- confirm this is acceptable.
num_images = len(labeled_df)
shuffled_df = labeled_df.sample(n=num_images, random_state=42)
train_df, valid_df = (
    part.reset_index(drop=True)  # fresh 0..n-1 index for each split
    for part in model_selection.train_test_split(
        shuffled_df,
        train_size=0.9,
        shuffle=True,
        random_state=42,
    )
)

In [None]:
def label_img(df, idx):
    """Plot one DICOM image with its bounding boxes and study label.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``bbox``, ``path`` and ``label`` columns.
    idx : int
        Positional (``iloc``) index of the row to plot.

    Every entry of the row's ``bbox`` list is drawn as a red rectangle with
    the label text at its (x, y) corner, including the ``[0, 0, 1, 1]``
    placeholder used for images without boxes.
    """
    boxes = df.bbox.iloc[idx]
    image_path = df.path.iloc[idx]
    label = df.label.iloc[idx]
    fig, ax = plt.subplots()
    # dcmread is the supported reader and the one the other cells use;
    # read_file is a deprecated alias.
    img_info = dicom.dcmread(image_path)
    img_arr = apply_voi_lut(img_info.pixel_array, img_info)
    ax.imshow(img_arr)
    # An empty box list simply draws nothing.
    for box in boxes:
        x1, y1, w, h = box
        ax.add_patch(patches.Rectangle((x1, y1), w, h, linewidth=1, edgecolor='r', facecolor='none', label=f'{label}'))
        ax.annotate(label, (x1, y1), color='r', weight='bold', fontsize=15, ha='left', va='bottom')

In [None]:
# Preview the single row at position 4 of the merged table.
for i in labeled_df.index[4:5]:
    label_img(labeled_df, i)

In [None]:
# Preview the first five images of the training split.
for i in train_df.index[:5]:
    label_img(train_df, i)

## Organize files into Yolo file structure
```
|-- main
    |-- images
    |   |-- train
    |   |-- validation
    |
    |-- labels
        |-- train
        |-- validation
```

In [None]:
def get_bbox(df, idx, normalize=True):
    """Convert one row's boxes to YOLO-style ``[class, cx, cy, w, h]`` lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``path``, ``bbox`` and ``label_int`` columns.
    idx : int
        Positional (``iloc``) index of the row.
    normalize : bool, default True
        If True, divide x-values by the image's column count and y-values by
        its row count; otherwise return centre and size in pixels.

    Returns
    -------
    list of list
        One ``[label_int, center_x, center_y, width, height]`` entry per box
        of the row; every box gets the row's single ``label_int``.
    """
    row = df.iloc[idx]
    img_class = row.label_int
    # Only the Rows / Columns header fields are needed, so stop reading the
    # file before the (large) pixel data.
    header = dicom.dcmread(row.path, stop_before_pixels=True)
    xsize = header.Columns
    ysize = header.Rows

    yolo_bboxes = []
    for x1, y1, w, h in row.bbox:
        # Input boxes are (x, y, width, height) from one corner; YOLO wants
        # the box centre instead.
        bx = x1 + w / 2
        by = y1 + h / 2
        if normalize:
            yolo_bboxes.append([img_class, bx / xsize, by / ysize, w / xsize, h / ysize])
        else:
            yolo_bboxes.append([img_class, bx, by, w, h])

    return yolo_bboxes

def convert_to_png(img_path, output_path, size=(512, 512)):
    """Read a DICOM file and save it as a resized 8-bit image.

    Parameters
    ----------
    img_path : str or path-like
        DICOM file to read.
    output_path : str or path-like
        Destination file; the extension selects the format ``cv2.imwrite``
        writes.
    size : tuple of int, default (512, 512)
        Output ``(width, height)`` passed to ``cv2.resize``; the smaller size
        keeps the exported dataset compact.
    """
    img_dicom = dicom.dcmread(img_path)
    # Conversion to image using min-max scaling and conversion to 8-bit from [@heyytanay](https://www.kaggle.com/heyytanay)
    data = apply_voi_lut(img_dicom.pixel_array, img_dicom)
    if img_dicom.PhotometricInterpretation == "MONOCHROME1":
        # Flip the intensities of MONOCHROME1 images.
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; skip the division so it
    # becomes an all-zero image instead of 0/0 NaNs.
    if peak > 0:
        data = data / peak
    img_arr = (data * 255).astype(np.uint8)
    # Resize image when converting to png to save space
    img_arr_resize = cv2.resize(img_arr, dsize=size, interpolation=cv2.INTER_CUBIC)
    # str() so pathlib.Path destinations are accepted as well.
    cv2.imwrite(str(output_path), img_arr_resize)

#### Test conversion to png

In [None]:
# Smoke-test the conversion on a single DICOM file; the result is written to
# ./test_img.png in the working directory.
path = r'../input/siim-covid19-detection/train/2a234c42eaac/a4dd021980a5/e24d2e46a243.dcm'
convert_to_png(path, './test_img.png')

In [None]:
def file_organizer(df, dataset):
    """Export every row of ``df`` as an image file plus a YOLO label file.

    For each row this writes ``./images/{dataset}/{image}.png`` and
    ``./labels/{dataset}/{image}.txt``. A row whose processing raises
    ``RuntimeError`` is reported with a printed message and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the ``image`` and ``path`` columns, plus whatever
        ``get_bbox`` reads.
    dataset : str
        Split sub-directory name, e.g. ``'train'`` or ``'validation'``.
    """
    # enumerate gives the positional index get_bbox expects (it uses iloc),
    # so the export no longer depends on df having a 0..n-1 index.
    for pos, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
        image_id = row.image
        image_path = row.path
        try:
            labels = get_bbox(df, pos, normalize=True)

            # Convert the image first: if it fails, no label file is left
            # behind, and no separate throwaway pixel decode is needed.
            convert_to_png(image_path, f"./images/{dataset}/{image_id}.png")
        except RuntimeError:
            print(f'could not process {image_id}')
            continue

        # One "class cx cy w h" line per box, with an integer class id.
        # reshape(-1, 5) keeps the array 2-D, so an empty box list writes an
        # empty file.
        label_rows = np.asarray(labels, dtype=float).reshape(-1, 5)
        np.savetxt(f"./labels/{dataset}/{image_id}.txt", label_rows, fmt='%d %.6f %.6f %.6f %.6f')

# Export both splits into ./images/<split>/ and ./labels/<split>/.
file_organizer(train_df, 'train')
file_organizer(valid_df, 'validation')

In [None]:
!zip -r training_dataset.zip .

### Inference for YoloV5 in future notebook!

---
### Extra: Reading from files

In [None]:
# The exported file already holds 8-bit (0-255) pixels, so it is shown as-is.
# Multiplying the uint8 array by 255 would overflow and wrap the values.
Image.open('./images/train/a1b9944654af.png').convert('L')

In [None]:
# Read one generated label file back as raw text to inspect it.
content = Path('./labels/train/a1b9944654af.txt').read_text()
content

In [None]:
# NOTE(review): repeats the train export already run in the export cell above;
# running it regenerates every train image and label file -- confirm it is needed.
file_organizer(train_df, 'train')