In [None]:
# Let's get all the relevant libraries

# Data Management
import numpy as np
import pandas as pd
import os
import cv2 
import json 
from glob import glob
from PIL import Image

# Dicom readers 
import pydicom 
from pydicom.pixel_data_handlers.util import apply_voi_lut

# Plotting and Visualization
import seaborn as sns 
import matplotlib.pyplot as plt

# Miscellaneous 
from tqdm.auto import tqdm

#Torch 
import torch 

In [None]:
# Side length (in pixels) of the square PNGs written by resize_and_save / save_test;
# also passed to YOLOv5 train.py as --img.
IMG_SIZE = 512
# Passed to YOLOv5 train.py as --batch.
BATCH_SIZE = 16
# Passed to YOLOv5 train.py as --epochs.
EPOCHS = 40

In [None]:
# List the contents of the competition data directory.
os.listdir('/kaggle/input/siim-covid19-detection')

In [None]:
# Root directory of the competition data; every path below is built from it.
dataset = '/kaggle/input/siim-covid19-detection'

In [None]:
# Study-level training table (train_study_level.csv).
train_study_df = pd.read_csv(dataset + '/train_study_level.csv')
train_study_df 

In [None]:
# Image-level training table (train_image_level.csv).
train_image_df = pd.read_csv(dataset + '/train_image_level.csv')
train_image_df

In [None]:
# Count the rows whose "boxes" value is missing.
print("There are {} images with no bounding boxes in the dataset"
                      .format(train_image_df["boxes"].isna().sum()))

In [None]:
train_image_df["label"]

# Let's have a look at the labels: each one starts with the class, opacity or none.
# opacity means that the image contains a bounding box, none means that there is no such box.
# The last 4 numbers are the coordinates of the box, in the following format:
# xmin ymin xmax ymax
# If the class is none, the values are 0 0 1 1.

In [None]:
# Let's get an idea of what is asked in the submission file.
# For each id in the test dataset we need to return a PredictionString that includes
# the opacity or none label (disease or no disease) and, for opacity, the coordinates
# of every box.

submission_df = pd.read_csv(dataset + '/sample_submission.csv')
print(submission_df.shape)
# head() renders the first rows as one table and, unlike .loc[i] for i in range(10),
# cannot fail when the file has fewer than 10 rows.
submission_df.head(10)

In [None]:
# The train_study file also gives us, for each study, which kind of pneumonia is
# associated with the patient.

# Let's plot the number of studies in each subtype
study_label_cols = ['Negative for Pneumonia', 'Typical Appearance',
                    'Indeterminate Appearance', 'Atypical Appearance']
subtypes = train_study_df.groupby(study_label_cols).count().reset_index()
# Name every group after the label column that is set (== 1) for it. This replaces a
# hand-written list that had to match both the groupby sort order and the number of groups.
subtypes["label"] = subtypes[study_label_cols].idxmax(axis=1)

# plt.subplots returns (fig, ax); hand the axes to seaborn explicitly.
fig, ax = plt.subplots(figsize=(21,10))
ax = sns.barplot(x=subtypes.label, y=subtypes.id, palette="deep", orient='v', ax=ax)

In [None]:
# Let's see the distribution between opacity and none
# The first token of every label string is its class.
class_counts = train_image_df["label"].apply(lambda x: x.split(" ")[0]).value_counts()
class_df = class_counts.reset_index()
# Plot straight from the Series so that names and counts always stay paired.
# The columns produced by value_counts().reset_index() are named differently across
# pandas versions, and a hard-coded ["opacity", "none"] assumes which class is more frequent.
sns.barplot(x=class_counts.values, y=class_counts.index.tolist(), palette="deep", orient='h')

In [None]:
# Now let's create a column with the study ids, to make life a bit easier:
# keep the part of "id" that precedes the first underscore.
train_study_df["study_id"] = train_study_df["id"].apply(lambda x: x.split("_")[0])
train_study_df

In [None]:
# Let's create a final train dataframe with all the information:
# join the image-level rows with their study-level row on the study id.
train = pd.merge(train_image_df, train_study_df, 
                 left_on="StudyInstanceUID", right_on="study_id")
# "StudyInstanceUID" duplicates "study_id"; "id_y" is the study table's own "id" column.
train.drop([ "StudyInstanceUID", "id_y"], axis=1, inplace=True)
train

In [None]:
# Display only: the sorted frame is not assigned back to `train`.
train.sort_values('study_id')

In [None]:
# "id_x" is the image table's "id" column (suffix added by the merge); restore its name.
train = train.rename(columns={"id_x":"id"})

In [None]:
# Make a list of the paths of all the training DICOM files
dicom_paths = glob(f'{dataset}/train/*/*/*.dcm')

In [None]:
# The sample submission is used as the list of test ids.
test_df = pd.read_csv(dataset + '/sample_submission.csv')

In [None]:
test_df

In [None]:
# Paths of all the test DICOM files
test_path = glob(f'{dataset}/test/*/*/*.dcm')

In [None]:
# One row per test DICOM; "id" is the file name with ".dcm" replaced by "_image".
test_dcm = pd.DataFrame({'dcm_path':test_path})
test_dcm['id']  = test_dcm.dcm_path.map(lambda x: x.split('/')[-1].replace('.dcm','_image'))
test_dcm

In [None]:
# Same for the training files: a DataFrame with the path and the derived "id"
dcm_df = pd.DataFrame({'dcm_path':dicom_paths})
dcm_df['id'] = dcm_df.dcm_path.map(lambda x: x.split('/')[-1].replace('.dcm','_image'))
dcm_df

In [None]:
# Merge both dataframes to have the paths in the train DataFrame
train = train.merge(dcm_df, on='id', how='left')
train

In [None]:
# Merge both dataframes to have the paths in the test DataFrame.
# Left join: ids without a matching DICOM file get a NaN dcm_path.
test = test_df.merge(test_dcm, on='id', how='left')
test

In [None]:
# Drop every row that contains a NaN (this includes the rows without a dcm_path).
test = test.dropna()

In [None]:
test

In [None]:
train_dev = train[:200]
train_dev

In [None]:
valid_dev = train[-100:]
valid_dev

In [None]:
# The dicom to array function simply reads the dicom image, and returns a numpy array
# Then, the plot_img and plot_imgs functions can plot one or several images


def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as a uint8 array scaled to 0-255.

    Args:
        path: Path of the .dcm file.
        voi_lut: If True, pass the pixel data through ``apply_voi_lut``;
            if False, use the raw ``pixel_array`` unchanged.
        fix_monochrome: If True, invert images whose PhotometricInterpretation
            is "MONOCHROME1" (max - value) before scaling.

    Returns:
        numpy.ndarray of dtype uint8 with the shape of the pixel data. The
        minimum maps to 0 and the maximum to 255; a constant image maps to
        all zeros.
    """
    # dcmread is the current name of the reader; read_file is a deprecated alias.
    dicom = pydicom.dcmread(path)
    pixels = dicom.pixel_array
    # Fall back to the raw pixels so that `array` is always bound when voi_lut is False.
    array = apply_voi_lut(pixels, dicom) if voi_lut else pixels
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        array = np.amax(array) - array
    array = array - np.min(array)
    peak = np.max(array)
    if peak == 0:
        # Constant image: avoid 0/0, which would yield NaNs and undefined uint8 values.
        return np.zeros(array.shape, dtype=np.uint8)
    array = array / peak
    return (array * 255).astype(np.uint8)

def plot_img(img, size=(7, 7), is_rgb=True, title="", cmap='gray'):
    """Show a single image in a new figure.

    Args:
        img: Image array handed to ``plt.imshow``.
        size: Figure size passed to ``plt.figure`` as ``figsize``.
        is_rgb: Accepted for interface compatibility; not used.
        title: Figure-level title.
        cmap: Colormap passed to ``plt.imshow``.
    """
    canvas = plt.figure(figsize=size)
    plt.imshow(img, cmap=cmap)
    # The figure created above is the current one, so this is its suptitle.
    canvas.suptitle(title)
    plt.show()


def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title='',cmap='gray', img_size=(512,512)):
    """Show several images on a grid with `cols` columns.

    Args:
        imgs: Sequence of image arrays.
        cols: Number of grid columns.
        size: Size of one grid cell; the figure is (cols*size, rows*size).
        is_rgb: Accepted for interface compatibility; not used.
        title: Figure-level title.
        cmap: Colormap passed to ``plt.imshow``.
        img_size: If not None, every image is resized to this size with
            ``cv2.resize`` before plotting.
    """
    # Ceiling division gives exactly as many rows as needed; the previous
    # `len(imgs)//cols + 1` added an empty row whenever len(imgs) was a multiple of cols.
    # At least one row is kept so the figure has a non-zero height for an empty list.
    rows = max(1, (len(imgs) + cols - 1) // cols)
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        fig.add_subplot(rows, cols, i+1)
        plt.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()


In [None]:
# Let's look at one image 
img = dicom2array(dicom_paths[20])
plot_img(img)

In [None]:
# Let's look at several images 

imgs = [dicom2array(path) for path in dicom_paths[:4]]
plot_imgs(imgs)

In [None]:
# Let's draw some bounding boxes, to visualize the task.
# The function plot_bboxes_with_label takes as input a label name and a number n, and
# plots n images selected by that label, with their associated boxes.

# I know that in this project the positive classes for COVID should be green, and
# everything else yellow.
# I will keep the colors below for the sake of development, and we will see later on.

# Credits to:  https://www.kaggle.com/piantic/siim-fisabio-rsna-covid-19-detection-basic-eda

from colorama import Fore, Back, Style

# Box color for each study-level class vector. The key is the string form of the
# [Typical, Indeterminate, Atypical] values of a row, in the order of `class_names`.
label2color = {
    '[1, 0, 0]': [255,0,0], # Typical Appearance
    '[0, 1, 0]': [0,255,0], # Indeterminate Appearance
    '[0, 0, 1]': [0,0,255], # Atypical Appearance
    '[0, 0, 0]': None, # negative
}

# Columns read from a row to build the label2color key.
class_names = ['Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']

def plot_bboxes_with_label(label_name, n): 
    """Plot up to `n` training images selected by a study-level label, with their boxes.

    Rows come from the global `train` frame. For "Negative for Pneumonia" the rows
    where that column is 0 are used; for any other label name the rows where that
    column is 1 are used. Boxes are colored according to `label2color`.

    Args:
        label_name: Name of a study-level label column of `train`.
        n: Maximum number of images to plot.
    """
    print('Typical Appearance: ' + Fore.RED + 'Red',Style.RESET_ALL)
    print('Indeterminate Appearance: '  + Fore.GREEN + 'Green',Style.RESET_ALL)
    print('Atypical Appearance: ' + Fore.BLUE + 'Blue',Style.RESET_ALL)

    thickness = 2
    scale = 5  # images and boxes are shrunk by this factor before drawing

    flag = 0 if label_name == "Negative for Pneumonia" else 1

    imgs = []
    for _, row in train[train[label_name]==flag].iloc[:n].iterrows():
        # Draw on the image the row describes. Taking the first file of the study can
        # pick a different image when a study holds several, so that lookup is only
        # kept as a fallback for rows without a usable 'dcm_path'.
        img_path = row.get('dcm_path')
        if not isinstance(img_path, str):
            study_id = row['study_id']
            img_path = glob(f'{dataset}/train/{study_id}/*/*')[0]

        img = dicom2array(img_path)
        img = cv2.resize(img, None, fx=1/scale, fy=1/scale)
        img = np.stack([img, img, img], axis=-1)  # grayscale -> 3 channels for colored boxes

        # Build the key from plain ints so it matches the label2color keys exactly.
        color = label2color[str([int(v) for v in row[class_names]])]

        # The label string is a sequence of 6-token groups; tokens 2..5 of each group
        # are xmin, ymin, xmax, ymax.
        tokens = row['label'].split(' ')
        bboxes = [
            [float(tok)/scale for tok in tokens[start+2:start+6]]
            for start in range(0, len(tokens) - 5, 6)
        ]

        if color is not None:  # no color is defined for the all-zero class vector
            for box in bboxes:
                img = cv2.rectangle(
                    img,
                    (int(box[0]), int(box[1])),
                    (int(box[2]), int(box[3])),
                    color, thickness
                )
        img = cv2.resize(img, (512,512))
        imgs.append(img)

    # No trailing `del`: the names are locals, and deleting them raised
    # UnboundLocalError whenever no row matched.
    plot_imgs(imgs, cmap=None)

In [None]:
# This cell plots several images with their bounding boxes.
# You can change the label to plot images from different categories.
# Note: for "Negative for Pneumonia" the function selects the rows where that column is 0.
plot_bboxes_with_label("Negative for Pneumonia", 4)

# Now let's clone the model and save the images in different directories for future use

In [None]:
# Working directory that will hold the YOLOv5 checkout.
os.makedirs('/kaggle/working/tmp/', exist_ok=True)

In [None]:
%cd /kaggle/working/tmp

In [None]:
# Clone YOLOv5 into the current directory (/kaggle/working/tmp).
!git clone https://github.com/ultralytics/yolov5

In [None]:
# Install the requirements listed by the checkout.
%cd yolov5
!pip install -r requirements.txt

In [None]:
%ls

In [None]:
# Relative paths: these are created inside the yolov5 checkout, the current directory
# after the %cd above. They are the image/label directories used by the cells below.
os.makedirs('data/images/train', exist_ok=True)
os.makedirs('data/images/valid', exist_ok=True)

os.makedirs('data/labels/train', exist_ok=True)
os.makedirs('data/labels/valid', exist_ok=True)


In [None]:
%cd data

In [None]:
# Create the dataset .yaml file passed to train.py via --data
import yaml

# train / val: directories holding the resized PNGs.
# nc / names: two classes; index 1 ('opacity') is the class id written to the label files.
data_yaml = dict(
    train = '/kaggle/working/tmp/yolov5/data/images/train',
    val = '/kaggle/working/tmp/yolov5/data/images/valid',
    nc = 2,
    names = ['none', 'opacity']
)

# Note that I am creating the file in the yolov5/data/ directory.
with open('/kaggle/working/tmp/yolov5/data/data.yaml', 'w') as outfile:
    yaml.dump(data_yaml, outfile, default_flow_style=True)
    
# Print the file that was just written.
%cat /kaggle/working/tmp/yolov5/data/data.yaml

In [None]:
from PIL import Image
dim0 = []
dim1 = []
def resize_and_save(end_path, df):
    """Convert every DICOM listed in `df` to an IMG_SIZE x IMG_SIZE PNG in `end_path`.

    Args:
        end_path: Directory the PNG files are written to.
        df: DataFrame with a 'dcm_path' column holding the DICOM file paths.

    Returns:
        DataFrame with one row per image that was actually saved. 'dim0' and
        'dim1' are shape[0] and shape[1] of the array before resizing; 'id' is
        the file name up to its first dot.
    """
    records = []
    for path in tqdm(df['dcm_path'], total=df.shape[0]):
        try:
            array = dicom2array(path)
            img = Image.fromarray(cv2.resize(array, (IMG_SIZE, IMG_SIZE)))
            filename = path.split('/')[-1].split('.')[0]
            img.save(os.path.join(end_path, f'{filename}.png'))
        except RuntimeError as err:
            # Still best effort, but no longer silent: say which file was skipped.
            print(f'Skipping {path}: {err}')
            continue
        # Recorded only after the save succeeded, so the three columns always have
        # the same length and never list a file that was not written.
        records.append((array.shape[0], array.shape[1], filename))
    return pd.DataFrame(records, columns=['dim0', 'dim1', 'id'])

In [None]:
# Let's save the resized images as PNG files for training.
# Each call returns the original dimensions (dim0, dim1) and the file-name stem (id)
# of the images it wrote.

dims_train = resize_and_save('/kaggle/working/tmp/yolov5/data/images/train/', train_dev)
dims_valid = resize_and_save('/kaggle/working/tmp/yolov5/data/images/valid/', valid_dev)

In [None]:
# Let's change the "id" columns from "<name>_image" to "<name>.png"
# NOTE: this overwrites "id" in place, so re-running the cell finds no "_image" to replace.
train['id'] = train['id'].apply(lambda x: x.replace('_image','.png'))

train_dev['id'] = train_dev['id'].apply(lambda x: x.replace('_image','.png'))

valid_dev['id'] = valid_dev['id'].apply(lambda x: x.replace('_image','.png'))
valid_dev

In [None]:
# Give the dims frames the same "<name>.png" ids so they can be merged on "id".
dims_valid['id'] = dims_valid['id'].astype(str) + '.png'
dims_train['id'] = dims_train['id'].astype(str) + '.png'
dims_train

In [None]:
# Attach dim0 / dim1 to every row. Left join: rows without a match in the dims frame
# get NaN dimensions.
train_dev = train_dev.merge(dims_train, on='id', how='left')
valid_dev = valid_dev.merge(dims_valid, on='id', how='left')

In [None]:
train_dev

In [None]:
# Get the raw bounding box by parsing the row value of the label column.
# Ref: https://www.kaggle.com/yujiariyasu/plot-3positive-classes
def get_bbox(row):
    """Parse the space-separated `row.label` string into a list of boxes.

    The string is read in groups of six tokens. The first two tokens of a
    group are ignored and the remaining four are converted to floats. Only
    complete groups produce a box.

    Returns:
        List of [float, float, float, float] lists, one per complete group.
    """
    tokens = row.label.split(' ')
    boxes = []
    for start in range(0, len(tokens), 6):
        group = tokens[start:start + 6]
        # Tokens after the first two are converted even in a short trailing group.
        coords = [float(tok) for tok in group[2:]]
        if len(group) == 6:
            boxes.append(coords)
    return boxes

# Scale the bounding boxes according to the size of the resized image. 
def scale_bbox(row, bboxes, img_size=None):
    """Scale boxes from the original image size to the resized square image.

    Args:
        row: Object with `dim0` (original height) and `dim1` (original width).
        bboxes: List of [xmin, ymin, xmax, ymax] in original pixel coordinates.
        img_size: Side length of the resized image; defaults to the global IMG_SIZE.

    Returns:
        List of [xmin, ymin, xmax, ymax] lists of ints in resized-image pixels.

    Raises:
        ValueError: If a scaled coordinate is NaN (e.g. `dim0`/`dim1` are NaN).
    """
    if img_size is None:
        img_size = IMG_SIZE

    # Get scaling factor
    scale_x = img_size/row.dim1
    scale_y = img_size/row.dim0

    scaled_bboxes = []
    for bbox in bboxes:
        # Round to the nearest pixel. int(np.round(v, 4)) only rounded to 4 decimals
        # and then truncated, shifting every coordinate towards zero.
        x = int(np.round(bbox[0]*scale_x))
        y = int(np.round(bbox[1]*scale_y))
        x1 = int(np.round(bbox[2]*scale_x))
        y1 = int(np.round(bbox[3]*scale_y))

        scaled_bboxes.append([x, y, x1, y1]) # xmin, ymin, xmax, ymax

    return scaled_bboxes

# Convert the bounding boxes in YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    """Convert pixel boxes to YOLO format, normalized by the image size.

    Args:
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        bboxes: List of [xmin, ymin, xmax, ymax] in pixels.

    Returns:
        List of [x_center, y_center, width, height], each value in [0, 1].
    """
    yolo_boxes = []
    for bbox in bboxes:
        # Clip the corners to the image so the normalized values stay in [0, 1],
        # as the YOLO label format requires.
        xmin = min(max(bbox[0], 0), img_w)
        ymin = min(max(bbox[1], 0), img_h)
        xmax = min(max(bbox[2], 0), img_w)
        ymax = min(max(bbox[3], 0), img_h)

        w = xmax - xmin
        h = ymax - ymin
        # Exact center: rounding w/2 to an integer first moved the center by up to
        # half a pixel for no benefit, since the result is a float anyway.
        xc = xmin + w/2
        yc = ymin + h/2

        yolo_boxes.append([xc/img_w, yc/img_h, w/img_w, h/img_h]) # x_center y_center width height

    return yolo_boxes

In [None]:
# image_level: the first token of the label string.
# id: swap the ".png" suffix for ".txt" so it can be used as the label file name.
train_dev['image_level'] = train_dev['label'].str.split(' ').str[0]
train_dev['id'] = train_dev['id'].str.replace('.png', '.txt', regex=False)

valid_dev['image_level'] = valid_dev['label'].str.split(' ').str[0]
valid_dev['id'] = valid_dev['id'].str.replace('.png', '.txt', regex=False)

In [None]:
# Prepare the txt label files for the training images.
# One file per row whose image_level is 'opacity', one line per box:
# "1 x_center y_center width height". No file is written for the other rows.

skipped_train = []
# iterrows() does not depend on the frame having a 0..n-1 index, unlike .loc[i].
for _, row in tqdm(train_dev.iterrows(), total=len(train_dev)):
    if row.image_level != 'opacity':
        continue

    file_name = f'/kaggle/working/tmp/yolov5/data/labels/train/{row.id}'

    try:
        # Get bboxes
        bboxes = get_bbox(row)
        # Scale bounding boxes
        scale_bboxes = scale_bbox(row, bboxes)
        # Format for YOLOv5
        yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)
    except ValueError:
        # Raised for instance when dim0/dim1 are NaN because no resized image was
        # recorded for this row. Keep going, but remember which rows were skipped.
        skipped_train.append(row.id)
        continue

    with open(file_name, 'w') as f:
        for bbox in yolo_bboxes:
            # 1 is the index of 'opacity' in data.yaml (names = ['none', 'opacity'])
            f.write(' '.join(str(value) for value in [1] + bbox))
            f.write('\n')

if skipped_train:
    print(f'Skipped {len(skipped_train)} training rows: {skipped_train}')

In [None]:
# Prepare the txt label files for the validation images.
# One file per row whose image_level is 'opacity', one line per box:
# "1 x_center y_center width height". No file is written for the other rows.

skipped_valid = []
# iterrows() does not depend on the frame having a 0..n-1 index, unlike .loc[i].
for _, row in tqdm(valid_dev.iterrows(), total=len(valid_dev)):
    if row.image_level != 'opacity':
        continue

    file_name = f'/kaggle/working/tmp/yolov5/data/labels/valid/{row.id}'

    try:
        # Get bboxes
        bboxes = get_bbox(row)
        # Scale bounding boxes
        scale_bboxes = scale_bbox(row, bboxes)
        # Format for YOLOv5
        yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)
    except ValueError:
        # Raised for instance when dim0/dim1 are NaN because no resized image was
        # recorded for this row. Keep going, but remember which rows were skipped.
        skipped_valid.append(row.id)
        continue

    with open(file_name, 'w') as f:
        for bbox in yolo_bboxes:
            # 1 is the index of 'opacity' in data.yaml (names = ['none', 'opacity'])
            f.write(' '.join(str(value) for value in [1] + bbox))
            f.write('\n')

if skipped_valid:
    print(f'Skipped {len(skipped_valid)} validation rows: {skipped_valid}')

In [None]:
%cd /kaggle/working/tmp/yolov5/data/labels/train
%ls

In [None]:
# Let's verify that this is what we want

# `with` closes the file; the previous `f.close` (without parentheses) never did.
with open('000a312787f2.txt', 'r') as f:
    content = f.read()
print(content)


In [None]:
# Install W&B, log in to your account and paste the API key

# A note here: you can create a wandb account and log in by uncommenting the last line of
# this cell. This will save the run on your account, allow you to visualize the results very
# easily, and give you access to several valuable options and tools.
!pip install -q --upgrade wandb
# Login 
import wandb
#wandb.login()

In [None]:
# If you are running the model while being logged in a wandb account, remove the 
# calling "WANDB_MODE="dryrun" 
%cd /kaggle/working/tmp/yolov5
!WANDB_MODE="dryrun" python train.py --img {IMG_SIZE} \
                 --batch {BATCH_SIZE} \
                 --epochs {EPOCHS} \
                 --data data.yaml \
                 --weights yolov5s.pt \
                # --save_period 1\
                 --project kaggle-siim-covid

In [None]:
# Confusion matrix image from the training run directory (runs/train/exp).
plt.figure(figsize=(30,15))
plt.axis('off')
plt.imshow(plt.imread('/kaggle/working/tmp/yolov5/runs/train/exp/confusion_matrix.png'));

In [None]:
# Move into the run directory; the relative file names in the next cells resolve against it.
%cd /kaggle/working/tmp/yolov5/runs/train/exp
%ls

In [None]:
# This shows a batch of the validation data with the corresponding labels
plt.figure(figsize=(15,15))
plt.imshow(plt.imread('val_batch0_labels.jpg'))

In [None]:
# Shows R_curve.png from the run directory.
plt.imshow(plt.imread('R_curve.png'))

In [None]:
# Shows P_curve.png from the run directory.
plt.imshow(plt.imread('P_curve.png'))

In [None]:
# This shows all the results curves
plt.figure(figsize=(20,30))
plt.imshow(plt.imread('results.png'))

In [None]:
# The weights are stored here, and could be used for inference 
%cd /kaggle/working/tmp/yolov5/kaggle-siim-covid/exp/weights
%ls

In [None]:
weights = '/kaggle/working/tmp/yolov5/kaggle-siim-covid/exp/weights/best.pt'

In [None]:
%cd /kaggle/working/tmp/yolov5/data/images

In [None]:
# Relative path: created inside data/images, the current directory after the %cd above.
os.makedirs('test', exist_ok=True)

In [None]:
def save_test(end_path, df):
    """Convert every DICOM listed in `df` to an IMG_SIZE x IMG_SIZE PNG in `end_path`.

    Args:
        end_path: Directory the PNG files are written to.
        df: DataFrame with a 'dcm_path' column holding the DICOM file paths.

    Returns:
        None. A file that raises RuntimeError is reported and skipped.
    """
    for path in tqdm(df['dcm_path'], total=df.shape[0]):
        try:
            array = dicom2array(path)
            img = Image.fromarray(cv2.resize(array, (IMG_SIZE, IMG_SIZE)))
            # File name up to its first dot, e.g. ".../abc.dcm" -> "abc".
            filename = path.split('/')[-1].split('.')[0]
            img.save(os.path.join(end_path, f'{filename}.png'))
        except RuntimeError as err:
            # Still best effort, but no longer silent: say which file was skipped.
            print(f'Skipping {path}: {err}')

In [None]:
# We save the test images in a new folder. Not all the images are necessary: you can make
# this smaller by slicing the dataframe.
# 'test' is a relative path; it resolves against the current directory set by the
# earlier %cd (data/images).
save_test('test', test)

In [None]:
%cd /kaggle/working/tmp/yolov5

In [None]:
# This makes all the necessary predicitions 

!python detect.py --weights /kaggle/working/tmp/yolov5/kaggle-siim-covid/exp/weights/best.pt /kaggle/working/tmp/yolov5/kaggle-siim-covid/exp/weights/last.pt --img 512 --source data/images/test

In [None]:
%cd /kaggle/working/tmp/yolov5/runs/detect/
%ls
# Show the first five entries of runs/detect/exp on a 2 x 3 grid.
# os.listdir returns names in arbitrary order, so which five are shown is not fixed.
directory = os.listdir('exp')
plt.figure(figsize=(15,15))
for i, file in enumerate((directory)[0:5]):
    img = plt.imread('exp/' + file)
    plt.subplot(2, 3, i+1)
    plt.imshow(img)
    