## Import libraries

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
from os import listdir, makedirs, symlink, chdir
from os.path import isfile, join

from shutil import copyfile, rmtree

from IPython.display import Image, clear_output  # to display images
clear_output()

## Setup YOLOv5

In [None]:
# One-time YOLOv5 setup: everything below is skipped when the repository
# directory already exists.
if not os.path.exists('/kaggle/working/yolov5/'):
    # The clone lands in the current working directory.
    # NOTE(review): presumably the kernel starts in /kaggle/working, so the clone
    # ends up at the path tested above - confirm.
    !git clone https://github.com/ultralytics/yolov5  # clone repo
    chdir('yolov5')  # relative chdir; requirements.txt below is resolved from here
    !pip install pycocotools -qr requirements.txt  # install dependencies
    !pip uninstall -y wandb  # open wandb bugs 

    # torch is only imported here to report the installed version and the device in use.
    import torch
    print('Setup complete. Using torch %s %s' % (torch.__version__, torch.cuda.get_device_properties(0) if torch.cuda.is_available() else 'CPU'))

## Setup directories

In [None]:
# --- Read-only inputs -------------------------------------------------------
# Test images and sample submission are generated in my other notebook
# x-ray-with-yolov5-data-setup.
test_path = '/kaggle/input/x-ray-with-yolov5-test-setup'
in_images_path = join(test_path, 'test_images')
# Weights generated with X-ray_with_YoloV5 (training)
training_weights_path = '/kaggle/input/yolov5xrayrunsweights'
# Original competition dicom files
orig_dicom_files = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/test'

# --- Writable working area --------------------------------------------------
working_path = '/kaggle/working'
output_path = join(working_path, 'output')
data_path = join(working_path, 'data')
images_path = join(data_path, 'images')

# --- Working paths for yolov5 -----------------------------------------------
yolo_path = join(working_path, 'yolov5')
yolo_runs = join(yolo_path, 'runs')

## Setup images directory with links to images

In [None]:
# Populate images_path with one symlink per source JPEG.
makedirs(images_path, exist_ok=True)

# Each link is checked individually, so the cell can be re-run safely: links that
# already exist (e.g. after an interrupted run) are skipped instead of making
# symlink() raise FileExistsError, and non-jpg files in the source directory no
# longer influence whether linking happens.
for img_file in listdir(in_images_path):
    if not img_file.endswith('.jpg'):
        continue
    link_path = join(images_path, img_file)
    # lexists() is also True for a dangling link, which symlink() would refuse to overwrite
    if not os.path.lexists(link_path):
        symlink(join(in_images_path, img_file), link_path)

## Setup weights from training execution
Training was done in [https://www.kaggle.com/amirsher/x-ray-with-yolov5-training/](https://www.kaggle.com/amirsher/x-ray-with-yolov5-training/)

In [None]:
# Place the weight file produced by the training run inside the yolov5 directory
best_weights = 'best.pt'
copyfile(
    join(training_weights_path, best_weights),  # source: training output dataset
    join(yolo_path, best_weights),              # destination: yolov5 working copy
)

In [None]:
# Sample submission for the test images; it also carries the original image
# sizes (image_h / image_w) that are used later to scale the detections.
csv_file = join(test_path, 'sample_submission.csv')
sub_csv_df = pd.read_csv(csv_file)

sub_csv_df.head(5)


## Inference

In [None]:
# Where the YOLO results are expected: <yolo_path>/<project>/exp
project = 'xray'
detect_path = join(yolo_path, project, 'exp')            # image results
labels_path = join(yolo_path, project, 'exp', 'labels')  # labels *.txt results

In [None]:
# Clear previous results
# Removes the whole detection directory (images and labels); ignore_errors=True
# makes this a no-op when the directory does not exist yet.
rmtree(detect_path, ignore_errors=True)

In [None]:
# Run from the yolov5 directory: detect.py, the relative weights file name and the
# relative project directory are all resolved from here.
chdir(yolo_path)

# Confidence threshold passed to detect.py; it is also reported in the summary printed later.
conf = 0.01

# $name is expanded by IPython to the value of the Python variable `name`.
# --save-txt / --save-conf request the per-image label files that are parsed later
# (each parsed line ends with a confidence value).
# NOTE(review): --exist-ok presumably makes detect.py reuse <project>/exp instead of
# creating exp2, exp3, ... - confirm against the YOLOv5 detect.py options.
!python detect.py --weights $best_weights --img 512 --conf $conf --source $images_path --save-txt --save-conf --project $project --exist-ok


## Define Dicom_image class to examine outputs

In [None]:
# import packages for dicom image
import cv2
import matplotlib
import imgaug as ia
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
from imgaug import augmenters as iaa
from matplotlib import colors
import random
import pydicom as dicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from os.path import join, exists


# Define dicom_image class for displaying dicom image files with detections (removed unnecessary functions from my original class)
class Dicom_image:
    '''
    A DICOM X-ray image together with the bounding boxes attached to it.

    The pixel data is read once at construction (see read_xray) and kept in
    image_orig; bounding boxes are imgaug BoundingBox objects collected in bbs.
    '''
    def __init__(self, path, image_name, classes=None):
        '''
        Import image
        :param path: Path to the directory that contains the dicom file
        :param image_name: image name, with or without the '.dicom' extension
        :param classes: list of class names indexed by class number (default: empty list)

        If the file does not exist an error is printed and the image attributes
        (image_orig, bbs, classes, ...) are left unset.
        '''
        self.path = path
        if image_name.endswith('.dicom'):
            self.file_name = image_name
            self.image_name = image_name[:-6]  # strip the '.dicom' extension
        else:
            self.file_name = image_name + '.dicom'
            self.image_name = image_name
        self.file_path = join(self.path, self.file_name)
        if exists(self.file_path):
            self.dicom_file = dicom.dcmread(self.file_path)
            self.image_orig = self.read_xray()
            self.orig_size = (self.image_orig.shape[0],self.image_orig.shape[1]) # Original image size before resizing - (h,w)
            self.bbs = [] # Initial bounding boxes
            # A fresh list per instance: a mutable default argument would be shared between instances
            self.classes = [] if classes is None else classes
            self.image_aug = None # Augmented image
            self.bbs_aug = []  # bounding boxes after augmentation
            self.image_resize = None # image after resize
            self.bbs_resize= [] # bounding boxes after resize
        else:
            print (f"Error! {self.file_path} does not exists!")

    def read_xray(self, voi_lut = True, fix_monochrome = True):
        '''
        Read image data from the dicom file
        (reference: https://www.kaggle.com/raddar/vinbigdata-competition-jpg-data-2x-downsampled)
        :param voi_lut: apply the VOI LUT stored in the dicom file (default=True)
        :param fix_monochrome: invert images whose PhotometricInterpretation is MONOCHROME1 (default=True)
        :return: uint8 image with 3 identical channels, contrast-enhanced with CLAHE
        '''
        data_file = self.dicom_file
        # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
        if voi_lut:
            data = apply_voi_lut(data_file.pixel_array, data_file)
        else:
            data = data_file.pixel_array

        # depending on this value, X-ray may look inverted - fix that:
        if fix_monochrome and data_file.PhotometricInterpretation == "MONOCHROME1":
            data = np.amax(data) - data
        # Stretch the values to the 0..255 range
        data = data - np.min(data)
        peak = np.max(data)
        if peak > 0:  # a constant image would otherwise divide by zero
            data = data / peak
        data = (data * 255.0).astype(np.uint8)
        # Use CLAHE (Contrast Limited Adaptive Histogram Equalization) to improve contrast
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        data = clahe.apply(data)
        data = cv2.cvtColor(data,cv2.COLOR_GRAY2RGB) # convert to 3 channel grayscale
        return data

    def add_bbox(self, xmin, ymin, xmax, ymax, cl_label):
        '''
        Add a bounding box to list of bounding boxes in the image
        :param xmin, ymin, xmax, ymax: bounding box coordinates
        :param cl_label: string label class of the bounding box
        '''
        self.bbs.append(BoundingBox(x1=xmin, y1=ymin, x2=xmax, y2=ymax, label=cl_label))

    def add_bbox_from_df(self, df):
        '''
        Add bounding boxes to the list of bounding boxes from a pandas dataframe
        :param df: dataframe with columns image_id, x_min, y_min, x_max, y_max and cl;
                   only rows whose image_id equals self.image_name are used
        '''
        image_df = df[df['image_id']==self.image_name]
        for _, row in image_df.iterrows():
            self.add_bbox(row.x_min, row.y_min ,row.x_max, row.y_max,str(row.cl))

    def clear_bbs(self):
        '''
        Remove all bounding boxes from the list of bounding boxes
        '''
        self.bbs = []


    def resize (self, img=None, bounding_boxes=None, downscale_factor = 1, max_dim = None):
        '''
        Resize image
        :param img: image to resize (default: the original image)
        :param bounding_boxes: list of bounding boxes or BoundingBoxesOnImage (default: self.bbs)
        :param downscale_factor: downscale factor (default = 1)
        :param max_dim: downscale so that largest dimension (height or width) is equal max_dim. The other dimension reduced proportionately.
        :return: (resized image, bounding boxes projected on the resized image);
                 both are also stored in image_resize / bbs_resize
        '''
        img = self.image_orig if img is None else img
        img_size = (img.shape[0], img.shape[1])
        bounding_boxes = self.bbs if bounding_boxes is None else bounding_boxes
        if isinstance(bounding_boxes, BoundingBoxesOnImage):
            bbs = bounding_boxes
        else:
            bbs = BoundingBoxesOnImage(bounding_boxes, shape=img.shape)

        if downscale_factor > 1:  # Resize by a constant factor
            new_shape = tuple([int(x / downscale_factor) for x in img_size])
        elif max_dim is not None: # Resize so that maximum length (width or height) will be max_dim. The other dimension resized proportionately
            # Only height and width take part: img.shape may also contain the channel count
            downscale_factor =  max(img_size) / max_dim
            new_shape = tuple([round(x / downscale_factor) for x in img_size])
        else:
            new_shape = img_size # Don't resize

        self.image_resize = ia.imresize_single_image(img, new_shape)
        self.bbs_resize = bbs.on(self.image_resize)
        return self.image_resize, self.bbs_resize

    def parse_label(self, bb):
        '''
        Convert label index to label class name and color
        :param bb: bounding box with label
        :return: (color, class name) - color is an RGB tuple scaled to 0..255; the class
                 name is '' when the label is not a valid index into self.classes
        '''
        color_pallete = ['lightcoral','brown','red','darksalmon','chocolate',
                  'darkorange','darkgoldenrod','gold','olive','yellow',
                  'green','lime','blue','darkorchid']
        try:
            class_code = int(bb.label)
            class_name = self.classes[class_code]
        except (ValueError, TypeError, LookupError):
            # if class number was not specified, or label was not an index to class:
            # fall back to a random palette color and an empty name
            class_code = random.randrange(len(color_pallete))
            class_name = ''
        # Wrap around so that class lists longer than the palette still get a color
        palette_color = color_pallete[class_code % len(color_pallete)]
        cl_color = tuple(np.array(colors.to_rgb(palette_color))*255)
        return cl_color, class_name

    def show_image(self, img=None, bounding_boxes=None, showbbs=True, showlabel=True):
        '''
        Show the image with bounding boxes
        :param img: image to be processed
        :param bounding_boxes: list of bounding boxes
        :param showbbs: flag to show the bounding boxes, if there are any (default=True);
                        only consulted when showlabel is False
        :param showlabel: flag to show the labels of the bounding boxes, if there are any (default=True)
        '''
        img = self.image_orig if img is None else img
        bounding_boxes = self.bbs if bounding_boxes is None else bounding_boxes
        if isinstance(bounding_boxes, BoundingBoxesOnImage):
            bbs = bounding_boxes.bounding_boxes
        else:
            bbs = bounding_boxes

        for bb in bbs:
            bbox_color, bbox_label = self.parse_label(bb)
            if showlabel:
                bbl = bb.copy(label=bbox_label) # replace label from class number to class name
                img = bbl.draw_on_image(image=img, color=bbox_color)
            elif showbbs:
                img = bb.draw_box_on_image(image=img, color=bbox_color)  # only draw box without label
        ia.imshow(img)
        

In [None]:
# Display xray image from dicom with bounding boxes that are attached to the image

# Class names, indexed by the class number used in the detections
classes = [
    'Aortic enlargement',
    'Atelectasis',
    'Calcification',
    'Cardiomegaly',
    'Consolidation',
    'ILD',
    'Infiltration',
    'Lung Opacity',
    'Nodule/Mass',
    'Other lesion',
    'Pleural effusion',
    'Pleural thickening',
    'Pneumothorax',
    'Pulmonary fibrosis',
]

def show_dicom(fpath, image_id, df):
    '''
    Display one dicom image, downscaled to at most 700 pixels per side, with
    the labelled bounding boxes that df holds for it.
    :param fpath: directory that contains the dicom files
    :param image_id: image name
    :param df: dataframe of bounding boxes (see Dicom_image.add_bbox_from_df)
    '''
    xray = Dicom_image(path=fpath, image_name=image_id, classes=classes)
    xray.add_bbox_from_df(df)
    small_img, small_bbs = xray.resize(max_dim=700)
    xray.show_image(img=small_img, bounding_boxes=small_bbs, showlabel=True)

## Process detection output files

In [None]:
# Get detections from txt files into a DataFrame
#
# Every line of a label file is split into: class, centre x, centre y, width,
# height, confidence. The box geometry is multiplied by the original image size
# (image_w / image_h from sub_csv_df) to obtain pixel coordinates.

num_detections = 0
num_detected_files = 0
my_columns = ['image_id','cl','prob','x_min','y_min','x_max','y_max','rel_cx','rel_cy','rel_h','rel_w','rel_area']
# Rows are collected in a plain list and converted once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists in pandas 2.x.
detect_rows = []

for txt_file in listdir(labels_path):
    if not txt_file.endswith('.txt'):
        continue
    num_detected_files += 1
    image_id = txt_file[:-4]
    df = sub_csv_df.loc[sub_csv_df['image_id']==image_id]  # locate image in submission dataframe
    if df.empty:
        raise KeyError(f'{image_id} has a label file but is not listed in the submission dataframe')
    image_h = int(df['image_h'].iloc[0])
    image_w = int(df['image_w'].iloc[0])

    # The context manager closes the file even if a line fails to parse
    with open(join(labels_path, txt_file), 'r') as label_file:
        for line in label_file:
            if not line.strip():
                continue  # tolerate blank lines
            (cl,cx,cy,w,h,prob) = line.split()
            bbox_w = float(w)*image_w
            bbox_h = float(h)*image_h
            bbox_cx = float(cx)*image_w
            bbox_cy = float(cy)*image_h
            x_min = int(bbox_cx - bbox_w/2)
            x_max = int(bbox_cx + bbox_w/2)
            y_min = int(bbox_cy - bbox_h/2)
            y_max = int(bbox_cy + bbox_h/2)
            # NOTE(review): rounding to one decimal stores every confidence below 0.05 as 0.0 - confirm this is intended
            prob_round = round(float(prob), 1)
            rel_h, rel_w = round(float(h),3), round(float(w),3)
            rel_cx, rel_cy = round(float(cx),2), round(float(cy),2)
            rel_area = round(rel_h*rel_w,4)
            num_detections += 1
            detect_rows.append([image_id,cl,prob_round,x_min,y_min,x_max,y_max,rel_cx,rel_cy,rel_h,rel_w,rel_area])

# An empty list still yields a frame with all the expected columns
detect_df = pd.DataFrame(detect_rows, columns=my_columns)
            

In [None]:
# Two-line summary of the detection run
print(
    f'Using confidence threshold of {conf}:',
    f'Found {num_detections} detections in {num_detected_files} files.',
    sep='\n',
)

In [None]:
# Order the detections by image, then by ascending probability within each image
sort_keys = ['image_id', 'prob']
detect_df = detect_df.sort_values(by=sort_keys)
detect_df.head(5)

## Examine the output and clean some errors

In [None]:
%%script echo skipping
# Disabled cell: the %%script magic hands the cell body to `echo skipping` instead
# of running it as Python. Remove the first line to enable the seaborn setup that
# the other examination cells rely on.
import seaborn as sns
sns.set(rc={'figure.figsize':(20,10)})

In [None]:
%%script echo skipping
# Disabled cell (see the %%script line): pins matplotlib to an older release.
!pip install matplotlib==3.1.3  # workaround a bug in matplotlib that happen sometimes

In [None]:
%%script echo skipping
# Disabled cell (see the %%script line).
# Check for each class - x,y scatter; area histogram. Look for abnormalities.

# Class number to examine; 'cl' is stored as a string, hence the str() in the filter
check_class = 13
detect_df_filt = detect_df[detect_df['cl']==str(check_class)]
print (f'showing class {check_class} = {classes[check_class]}')
print ('Number of detections:', len(detect_df_filt))

# Relative box centres of the selected class, marker size by probability
sns.scatterplot(data=detect_df_filt, x="rel_cx", y="rel_cy", hue="cl", size="prob")

In [None]:
%%script echo skipping
# Disabled cell (see the %%script line): distribution of the relative box area for
# the class selected in detect_df_filt.
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm before re-enabling.
sns.distplot(a=detect_df_filt['rel_area'],norm_hist=False)

In [None]:
%%script echo skipping
# Disabled cell (see the %%script line): 2-D density of the relative box centres
# for the class selected in detect_df_filt.
sns.displot(data=detect_df_filt, x="rel_cx", y="rel_cy", kind="kde")

In [None]:
%%script echo skipping
# Disabled cell (see the %%script line): list the detections of the selected class
# whose relative area is above 0.10, as candidates for manual inspection.
examine_df = detect_df_filt[detect_df_filt['rel_area']>0.10]
print ('Num cases = ', len(examine_df))
examine_df.head()

In [None]:
%%script echo skipping
# Disabled cell (see the %%script line): display one original dicom image with all
# of its detections drawn on it.
show_dicom(orig_dicom_files,'19c44a83a0b26af42aefbcbf7b9f9380', detect_df)

## Remove possible errors (by statistics and examinations)
Create filters according to the above findings. The filters in the cell below are currently commented out, so no detections are removed.

In [None]:
# Candidate error filters derived from the examination above (currently disabled):
# err_filter1 = ~((detect_df['cl']=='1') & (detect_df['rel_area']<0.02))
# err_filter2 = ~((detect_df['cl']=='6') & (detect_df['rel_cy']>0.7))
# err_filter3 = ~((detect_df['cl']=='8') & (detect_df['rel_area']>0.01))
# err_filter4 = ~((detect_df['cl']=='11') & (detect_df['rel_area']>0.07))

# error_filter = err_filter1 & err_filter2 & err_filter3 & err_filter4

# Columns needed for the submission string
reduced_cols = ['image_id','cl','prob','x_min','y_min','x_max','y_max']
# With the filters enabled this would be:
# reduced_df = detect_df[error_filter][reduced_cols]
reduced_df = detect_df.loc[:, reduced_cols]
reduced_df.head(5)

In [None]:
# Report how many detections survived the error filters
num_detections = detect_df.shape[0]
num_final = reduced_df.shape[0]
print(f'Total detections:{num_detections}. Total removed:{num_detections-num_final}. Detections left:{num_final}')

In [None]:
# Create submission
#
# Every detection is formatted as "<class> <prob> <x_min> <y_min> <x_max> <y_max>".
# An image with detections gets them joined by spaces in place of the default
# string; an image without detections keeps its existing PredictionString.

default_detect = '14 1 0 0 1 1'

# Group the formatted detections by image, keeping the row order of reduced_df
detections_by_image = {}
for row in reduced_df.itertuples(index=False):
    detection = '%s %.1f %d %d %d %d' % tuple(row[1:])
    detections_by_image.setdefault(row.image_id, []).append(detection)

# Fail early, with the offending ids, instead of on an opaque index lookup
unknown_ids = set(detections_by_image) - set(sub_csv_df['image_id'])
if unknown_ids:
    raise KeyError(f'Detections for images missing from the submission dataframe: {sorted(unknown_ids)}')

# Build the new strings without touching sub_csv_df: mutating it in place made a
# second run of this cell append every detection again.
prediction_strings = []
for image_id, current in zip(sub_csv_df['image_id'], sub_csv_df['PredictionString']):
    detections = detections_by_image.get(image_id)
    if not detections:
        prediction_strings.append(current)               # nothing detected: keep as is
    elif default_detect in current:
        prediction_strings.append(' '.join(detections))  # remove default
    else:
        prediction_strings.append(current + ' ' + ' '.join(detections))  # add detections

submission_df = (
    sub_csv_df
    .drop(['image_h', 'image_w'], axis=1)
    .assign(PredictionString=prediction_strings)
)
submission_df.head(10)

In [None]:
# Leave the yolov5 directory before writing the submission file
chdir(working_path)

submit_file = join(output_path, 'submission.csv')
makedirs(output_path, exist_ok=True)
submission_df.to_csv(submit_file, index=False)
print('created file ', submit_file)

In [None]:
# Remove files that hide the submission output
# Deletes the cloned yolov5 directory (including the detection results inside it)
# and the directory of image symlinks; ignore_errors=True tolerates missing directories.
rmtree(yolo_path, ignore_errors=True)
rmtree(data_path, ignore_errors=True)
