Official Ultralytics YoloV5 Kaggle notebook: [Yolov5-kaggle](https://www.kaggle.com/ultralytics/yolov5-ultralytics)

Yolov5 implementation notebook: [Yolo v5 Object Detection Tutorial](https://jooskorstanje.com/yolov5-training-a-custom-object-detection-model.html)
Tutorial: [https://towardsdatascience.com/yolo-v5-object-detection-tutorial-2e607b9013ef](https://towardsdatascience.com/yolo-v5-object-detection-tutorial-2e607b9013ef)

Ultralytics YoloV5 wiki: [https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data](https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data)

Another tutorial: [https://medium.com/towards-artificial-intelligence/yolo-v5-object-detection-on-a-custom-dataset-61d478bc08f9](https://medium.com/towards-artificial-intelligence/yolo-v5-object-detection-on-a-custom-dataset-61d478bc08f9)

And another: [https://lionbridge.ai/articles/create-an-end-to-end-object-detection-pipeline-using-yolov5/](https://lionbridge.ai/articles/create-an-end-to-end-object-detection-pipeline-using-yolov5/)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pydicom as dicom
import cv2
import os
from os.path import join, exists

## Get Data
### I use a revised CSV file with some extra fields that I added in a previous run.
### (The code that generated these fields appears, commented out, a few cells below.)
- w,h: width and height of the bounding box (bbox) in pixels.
- rad_score: Radiologist score, based on the number of abnormalities each radiologist annotated. A higher score means the radiologist detected more abnormalities, and may therefore be more experienced and more accurate (my assumption).
- img_w, img_h: the original image width and height.
- w_r, h_r: width and height of the bbox relative to the image size (between 0 and 1).
- area_r - relative bbox area size.
- c_x, c_y - relative x,y center location of the bbox.

In [None]:
# Kaggle input locations: the competition data and the dataset holding the revised CSV files
path = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/'
path_csv_new = '/kaggle/input/vinbigdataxraytraincsvnew/'
path_train = join(path, 'train')

# train_new.csv is the revised train.csv with the extra fields described above
train_df = pd.read_csv(join(path_csv_new, 'train_new.csv'))
samp_subm = pd.read_csv(join(path, 'sample_submission.csv'))

In [None]:
# Report the size of both tables, then preview the training annotations
print('Number train samples:', len(train_df))
print('Number test samples:', len(samp_subm))
train_df.head()

### Get a list of class names in order of class id

In [None]:
# Build the list of class names, ordered by class id
label_pairs = set(zip(train_df.class_id, train_df.class_name))  # unique (id, name) pairs
class_ids, class_names = zip(*label_pairs)
classes = [str(class_names[pos]) for pos in np.argsort(class_ids)]
classes

### Split the data into 2 DataFrames - one with the images that have labels, and one with the images in which no abnormalities were detected.

In [None]:
## detected_df was generated and saved by a previous version of this notebook, so it is read back from the dataset here.
## The two commented lines below show how it was originally derived from train_df.

#detected_df = train_df[train_df.class_id  < 14].sort_values(by=['image_id','class_id','rad_score'], ascending=False)
#detected_df = detected_df[detected_df.area_r < 0.7] # Remove bboxes that bigger than 70% of the image (not helpfull)

# DF of labeled images with grouping info (the generating code appears later, commented out)
detected_csv = join(path_csv_new, 'detected_df.csv')
detected_df = pd.read_csv(detected_csv, index_col=0)

# Rows with class_id 14 are kept separately as the "no label" rows
no_label_rows = train_df['class_id'] == 14
nolabel_df = train_df.loc[no_label_rows].copy()

print('Number of detections:', len(detected_df))
detected_df.head(10)

### Get lists of image names.
- list_of_images - list of all images
- list_of_images_no_label - list of images without labels
- list_of_images_labled - list of images with labels

In [None]:
# Unique image ids: all images, images without labels, images with labels
list_of_images = train_df['image_id'].unique().tolist()
list_of_images_no_label = nolabel_df['image_id'].unique().tolist()
list_of_images_labled = detected_df['image_id'].unique().tolist()

for caption, image_names in (
    ('Total number of images: ', list_of_images),
    ('Number of images without labels: ', list_of_images_no_label),
    ('Number of images with labels: ', list_of_images_labled),
):
    print(caption, len(image_names))

## Find groups of similar bboxes, and create average bboxes
### Mark bbox type:
- Single: Only one bbox of that class is in the area of the image.
- Group#: bbox is part of a group # of similar bboxes (same class, matching IOU), detected by different radiologists.
- Average#: bbox is created by average of group #.

In [None]:
def bb_iou (boxA, boxB):
    '''
    Calculate the IOU (Intersection Over Union) of two bounding boxes.
    Each box is indexable as: box[0] - x_min, box[1] - y_min, box[2] - x_max, box[3] - y_max
    :param boxA: first bounding box
    :param boxB: second bounding box
    :return: 0 when the boxes do not overlap, otherwise intersection area / union area
    '''
    # Width and height of the overlap rectangle (non-positive when the boxes are disjoint)
    overlap_w = min(boxA[2], boxB[2]) - max(boxA[0], boxB[0])
    overlap_h = min(boxA[3], boxB[3]) - max(boxA[1], boxB[1])
    if overlap_w <= 0 or overlap_h <= 0:
        return 0

    intersection = overlap_w * overlap_h
    # Areas of the two boxes
    area_a = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
    area_b = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))
    # The union counts the overlap once
    union = float(area_a + area_b - intersection)
    return intersection / union

In [None]:
# # Find groups of bboxes by calculating IOU between pairs of same classes.
# ## This block is commented out after generating the data and creating the new detected_df.csv

# detected_df['bbox_type'] = ''  # Create new field bbox_type for grouping info

# from tqdm.auto import tqdm

# # Set thresholds
# IOU_threshold = 0.4 # IOU threshold that consider 2 bboxes (of the same class) as being the same annotation

# for image_id in tqdm(list_of_images_labled):
#     image_data = detected_df[detected_df['image_id'] == image_id]  # Dataframe containing all rows for the specific image   
    
#     list_of_classes = image_data['class_id'].unique().tolist() # All class id's in the image
   
#     for class_id in list_of_classes:
#         # Combine ovelapping bboxes with same class_id, and separate them from non overlapping bboxes
#         class_id_bboxes = image_data[image_data['class_id']==class_id].sort_values('rad_score', ascending=False) # DF of rows with same class_id
#         group = []
#         group_indx = []
#         group_num=-1  # Count different groups of the same class
        
#         for i1, row1 in class_id_bboxes.iterrows():
#             if i1 in class_id_bboxes.index:
#                 boxA = (row1.x_min, row1.y_min ,row1.x_max, row1.y_max)
#                 row_dict = row1.to_dict()
#                 group.append(boxA)
#                 group_indx.append(i1)
#                 class_id_bboxes.drop(i1,inplace=True, errors='ignore')
#                 for i2, row2 in class_id_bboxes.iterrows():  # search for other boxes that overlap with boxA
#                     boxB = (row2.x_min, row2.y_min ,row2.x_max, row2.y_max)
#                     if bb_iou (boxA, boxB) > IOU_threshold:
#                         group.append(boxB)
#                         group_indx.append(i2)
#                         class_id_bboxes.drop(i2,inplace=True,errors='ignore')
#                 if len(group)==1:
#                     # update bbox_type to 'Single'
#                     detected_df.loc[group_indx, 'bbox_type'] = 'Single'
#                 elif len(group)>1:
#                     #need to create new row with image_id and update bbox_type for the group and average
#                     group_num += 1
#                     detected_df.loc[group_indx, 'bbox_type'] = 'Group'+str(group_num)
#                     row_dict['bbox_type'] = 'Average'+str(group_num)
#                     row_dict['x_min'] = round(sum([i[0] for i in group])/len(group))
#                     row_dict['y_min'] = round(sum([i[1] for i in group])/len(group))
#                     row_dict['x_max'] = round(sum([i[2] for i in group])/len(group))
#                     row_dict['y_max'] = round(sum([i[3] for i in group])/len(group))
#                     row_dict['w'] = row_dict['x_max'] - row_dict['x_min']
#                     row_dict['h'] = row_dict['y_max'] - row_dict['y_min']
#                     row_dict['w_r'] = round(row_dict['w'] / row_dict['img_w'], 3)
#                     row_dict['h_r'] = round(row_dict['h'] / row_dict['img_h'], 3)
#                     row_dict['area_r'] = round(row_dict['w_r'] * row_dict['h_r'],3)
#                     row_dict['c_x'] = round((row_dict['x_min'] + row_dict['w']/2) / row_dict['img_w'], 3)
#                     row_dict['c_y'] = round((row_dict['y_min'] + row_dict['h']/2) / row_dict['img_h'], 3)
#                     #detected_df = detected_df.append(row_dict, ignore_index=True)
#                     detected_df.loc[detected_df.index.max() + 1] = row_dict
                    
#                 group=[]
#                 group_indx = []
                  

In [None]:
# detected_df = detected_df.sort_values(by=['image_id','class_id','bbox_type'], ascending=False)
# detected_df.head(20)

# Explore the Data
## Exploring distribution of labels

In [None]:
# Plot label distribution: one bar per class name, bar height = number of annotation rows

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
label_counts = train_df['class_name'].value_counts()  # computed once and reused for both axes
x = label_counts.keys()
y = label_counts.values
ax.bar(x, y)
# Rotate the existing tick labels instead of calling set_xticklabels(): that method
# expects the tick positions to be fixed first and can otherwise warn or mislabel ticks.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Distribution of the labels')
plt.grid()
plt.show()

In [None]:

# Per-class label counts, broken down by bbox_type (Single / Group# / Average#)
labels = classes[:-1]  # every class except the last entry of `classes`
orig_vals = []       # labels that are not averages
total_vals = []      # all labels
no_groups_vals = []  # averages + singles
avg_vals = []        # averages only

for label in labels:
    class_rows = detected_df[detected_df['class_name'] == label]
    bbox_kind = class_rows['bbox_type']
    total = len(class_rows.index)  # total number of labels
    avgs = len(class_rows[bbox_kind.str.contains("Average")].index)  # labels created by averaging a group
    singles = len(class_rows[bbox_kind == 'Single'].index)
    orig_vals.append(total - avgs)
    total_vals.append(total)
    no_groups_vals.append(avgs + singles)
    avg_vals.append(avgs)


def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        bar_center = rect.get_x() + rect.get_width() / 2
        ax.annotate('{}'.format(height),
                    xy=(bar_center, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


x = np.arange(len(labels))  # the label locations
width = 0.23  # the width of the bars

fig, ax = plt.subplots(figsize=(12, 8))

# Four bars per class, placed side by side around each label location:
# (offset in bar widths, values, legend entry)
bar_specs = [
    (-1.5, orig_vals, 'Original'),
    (-0.5, total_vals, 'Original+Avg'),
    (0.5, no_groups_vals, 'No_Groups'),
    (1.5, avg_vals, 'Averages only'),
]
rects1, rects2, rects3, rects4 = [
    ax.bar(x + width * offset, values, width, label=legend_name)
    for offset, values, legend_name in bar_specs
]

# Axis label, title, custom x-axis tick labels and legend
ax.set_ylabel('Sums')
ax.set_title('Distribution of Labels')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=90)
ax.legend()

for bar_group in (rects1, rects2, rects3, rects4):
    autolabel(bar_group)

fig.tight_layout()
plt.grid()
plt.show()


In [None]:
import seaborn as sns

# Violin plot of the relative bbox area (area_r) for each class
sns.catplot(
    data=detected_df,
    kind="violin",
    x="class_name",
    y="area_r",
    split=False,
    height=8.27,
    aspect=11.7/8.27,
)

In [None]:
# Inspect the detections whose bbox covers more than 40% of the image
detected_df.loc[detected_df['area_r'] > 0.4]
# Conclusion - need to remove detections with area_r > 0.4 - cover too much general area

In [None]:
# Add the bbox Aspect-Ratio parameter (width divided by height)
detected_df['AR'] = detected_df['w'].div(detected_df['h'])
detected_df.head()

In [None]:
# Violin plot of the bbox aspect ratio (AR) for each class
sns.catplot(
    data=detected_df,
    kind="violin",
    x="class_name",
    y="AR",
    split=False,
    height=8.27,
    aspect=11.7/8.27,
)

In [None]:
# Inspect the detections that are more than 10 times wider than they are tall
detected_df.loc[detected_df['AR'] > 10]

In [None]:
# Remove labels that are not helpful: keep only bboxes with area_r < 0.4 and AR < 10
detected_df = detected_df[(detected_df['area_r'] < 0.4) & (detected_df['AR'] < 10)]

In [None]:
# Checking the distribution of radiologists
# The idea is that radiologists that are more active (more labelings) would probably be more accurate,
# so later, when a reference bbox is chosen to combine other bboxes, the reference is taken
# from the labeling of the higher scored radiologist.

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
rad_counts = detected_df['rad_id'].value_counts()  # computed once and reused for both axes
x = rad_counts.keys()
y = rad_counts.values
ax.bar(x, y)
# Rotate the existing tick labels instead of calling set_xticklabels(): that method
# expects the tick positions to be fixed first and can otherwise warn or mislabel ticks.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Distribution of the radiologists')
plt.grid()
plt.show()

## Add some columns
### This code was used to generate train_new.csv and is now in comments.

In [None]:
# # add width, height to df
# train_df['w'] = train_df['x_max'] - train_df['x_min']
# train_df['h'] = train_df['y_max'] - train_df['y_min']
# #train_df = train_df[train_df['w'] > 1500 ]

# train_df.head()

In [None]:
# # Checking the distibution of radiologists
# # The idea is that radiologists that are more active (more labelings) would probably be more accurate
# # so later when I choose a reference bbox to combine other bboxes I would choose the reference
# # from the labling of the higher scored radiologist.

# fig, ax = plt.subplots(1, 1, figsize=(12, 4))
# non_empty_train_df = train_df[train_df["class_id"]!=14]
# x = non_empty_train_df['rad_id'].value_counts().keys()
# y = non_empty_train_df['rad_id'].value_counts().values
# ax.bar(x, y)
# ax.set_xticklabels(x, rotation=90)
# ax.set_title('Distribution of the radiologists')
# plt.grid()
# plt.show()

In [None]:
# rad_list = x.tolist() # Radiologists that labled abnormalities
# rad_scores = list(range((len(rad_list)),0,-1))  # Score of radiologists according to the number of abnormalities found

# all_rads = train_df['rad_id'].unique().tolist()  # All Radiologists in the dataframe
# rad_no_score = [i for i in all_rads if i not in rad_list]  # Radiologists that didn't find any abnormality. They get 0 score.

# rad_scores.extend([0]*len(rad_no_score))
# rad_list.extend(rad_no_score)   # Add radiologists with score 0

# train_df["rad_score"] = train_df["rad_id"].replace(rad_list, rad_scores) # Add new column with that Radiologist score
# train_df.head()

In [None]:
# # Get images size - img_w, img_h 
# train_df = train_df.assign(img_w=np.nan, img_h=np.nan)

# from tqdm.auto import tqdm

# list_of_images = train_df['image_id'].unique().tolist()

# # Read dicom files and fetch image size(columns,rows) - this is time consuming!
# for image_id in tqdm(list_of_images):
#     dicom_path = os.path.join(path,'train',image_id+'.dicom')
#     data_file = dicom.dcmread(dicom_path)   
#     train_df.loc[train_df.image_id==image_id, 'img_w'] = data_file.Columns
#     train_df.loc[train_df.image_id==image_id, 'img_h'] = data_file.Rows

# train_df.head()

In [None]:
# # add relative window width, relative window height and relative area to df
# train_df['w_r'] = round(train_df['w'] / train_df['img_w'], 3)
# train_df['h_r'] = round(train_df['h'] / train_df['img_h'], 3)
# train_df['area_r'] = round(train_df['h_r'] * train_df['w_r'], 3)

# # add window relative center c_x, c_y
# train_df['c_x'] = round(((train_df['w'] / 2 ) + train_df['x_min'])/train_df['img_w'], 3)
# train_df['c_y'] = round(((train_df['h'] / 2 ) + train_df['y_min'])/train_df['img_h'], 3)

In [None]:
# # Write new CSV file 
# from os import makedirs
# working_path = '/kaggle/working'
# csv_new_path = join(working_path,'csv_new')
# makedirs(csv_new_path, exist_ok = True)

# train_new_file = join(csv_new_path,'train_new.csv')
# detected_df_file = join(csv_new_path,'detected_df.csv')
# #train_df.to_csv(train_new_file, index=False)
# detected_df.to_csv(detected_df_file)

# Read DICOM image files

## Define Dicom class
- Read dicom file
- Get image
- Get bounding boxes
- Create augmentation
- Save image

In [None]:
import cv2
import imgaug as ia
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
from imgaug import augmenters as iaa
from matplotlib import colors
import random
from pydicom.pixel_data_handlers.util import apply_voi_lut
from os.path import join


In [None]:
class Dicom_image:
    '''
    One X-ray image read from a DICOM file, together with its bounding boxes.
    Supports reading the image, adding/removing bounding boxes, resizing, augmenting,
    displaying, and saving the image (jpg) and its labels (txt).
    '''

    def __init__(self, path, image_name, classes=None):
        '''
        Import image
        :param path: Path to the directory that holds the image
        :param image_name: image name, with or without the '.dicom' extension
        :param classes: list of class names, indexed by class number (default: empty list)
        If the file does not exist an error is printed and the image attributes stay None.
        '''
        self.path = path
        if image_name.endswith('.dicom'):
            self.file_name = image_name
            self.image_name = image_name[:-len('.dicom')]
        else:
            self.file_name = image_name + '.dicom'
            self.image_name = image_name
        self.file_path = join(self.path, self.file_name)

        # A fresh list per instance - a mutable default argument would be shared between instances
        self.classes = [] if classes is None else classes

        # Set every attribute up-front, so the object is consistent even when the file is missing
        self.dicom_file = None
        self.image_orig = None
        self.orig_size = None # Original image size before resizing - (h,w)
        self.bbs = [] # Initial bounding boxes
        self.image_aug = None # Augmented image
        self.bbs_aug = []  # bounding boxes after augmentation
        self.image_resize = None # image after resize
        self.bbs_resize = [] # bounding boxes after resize

        if not exists(self.file_path):
            print (f"Error! {self.file_path} does not exists!")
            return

        self.dicom_file = dicom.dcmread(self.file_path)
        self.image_orig = self.read_xray()
        self.orig_size = (self.image_orig.shape[0], self.image_orig.shape[1])

    @staticmethod
    def _bbox_list(bounding_boxes):
        '''
        Return the plain sequence of bounding boxes.
        An object with a 'bounding_boxes' attribute (BoundingBoxesOnImage) is unwrapped,
        anything else is returned unchanged.
        '''
        return getattr(bounding_boxes, 'bounding_boxes', bounding_boxes)

    def read_xray(self, voi_lut = True, fix_monochrome = True):
        '''
        Read image data from the dicom file
        (reference: https://www.kaggle.com/raddar/vinbigdata-competition-jpg-data-2x-downsampled)
        :param voi_lut: apply the VOI LUT of the dicom file to the pixel data (default=True)
        :param fix_monochrome: invert the image when it is stored as MONOCHROME1 (default=True)
        :return: contrast-enhanced uint8 image with 3 identical channels
        '''
        data_file = self.dicom_file
        # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
        if voi_lut:
            data = apply_voi_lut(data_file.pixel_array, data_file)
        else:
            data = data_file.pixel_array

        # depending on this value, X-ray may look inverted - fix that:
        if fix_monochrome and data_file.PhotometricInterpretation == "MONOCHROME1":
            data = np.amax(data) - data

        # Scale to the 0-255 range. A constant image has a peak of 0 after the shift;
        # skip the division in that case to avoid dividing by zero.
        data = data - np.min(data)
        peak = np.max(data)
        if peak > 0:
            data = data / peak
        data = (data * 255.0).astype(np.uint8)

        # Use CLAHE (Contrast Limited Adaptive Histogram Equalization) to improve contrast
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        data = clahe.apply(data)
        data = cv2.cvtColor(data,cv2.COLOR_GRAY2RGB) # convert to 3 channel grayscale
        return data

    def add_bbox(self, xmin, ymin, xmax, ymax, cl_label):
        '''
        Add a bounding box to list of bounding boxes in the image
        :param xmin, ymin, xmax, ymax: bounding box coordinates
        :param cl_label: string label class of the bounding box
        '''
        self.bbs.append(BoundingBox(x1=xmin, y1=ymin, x2=xmax, y2=ymax, label=cl_label))

    def remove_bbox(self, bbox):
        '''
        Remove a bounding box from the list of bounding boxes
        :param bbox: either a bounding box object that is in the list, or a list of bbox
                     coordinates and label in the form [xmin, ymin, xmax, ymax, cl_label]
        :return: True if a bounding box was removed, False otherwise
        '''
        if bbox in self.bbs:
            self.bbs.remove(bbox)
            return True

        # Otherwise treat bbox as [xmin, ymin, xmax, ymax, cl_label] and remove the first match
        try:
            xmin, ymin, xmax, ymax, cl_label = bbox
        except (TypeError, ValueError):
            return False
        wanted = (xmin, ymin, xmax, ymax, cl_label)
        for pos, bb in enumerate(self.bbs):
            if (bb.x1, bb.y1, bb.x2, bb.y2, bb.label) == wanted:
                del self.bbs[pos]
                return True
        return False

    def minmax_bbox(self, bbs=None):
        '''
        Find min and max values of bounding boxes (so cropping will not cut the bboxes)
        :param bbs (default: self.bbs): list of bounding boxes, or a BoundingBoxesOnImage
        :return: (xmin, ymin, xmax, ymax) over all the bounding boxes
        :raises ValueError: if there are no bounding boxes
        '''
        bbs = self.bbs if bbs is None else bbs
        boxes = list(self._bbox_list(bbs))
        if not boxes:
            raise ValueError('minmax_bbox needs at least one bounding box')
        bbs_xmax = max(bb.x2_int for bb in boxes)
        bbs_ymax = max(bb.y2_int for bb in boxes)
        bbs_xmin = min(bb.x1_int for bb in boxes)
        bbs_ymin = min(bb.y1_int for bb in boxes)
        return bbs_xmin, bbs_ymin, bbs_xmax, bbs_ymax

    def sort_bbs(self):
        '''
        Sort the bbs list of bounding boxes (in place) by the class label
        :return: the sorted list
        '''
        self.bbs.sort(key=lambda bb: bb.label)
        return self.bbs

    def resize (self, img=None, bounding_boxes=None, downscale_factor = 1, max_dim = None):
        '''
        Resize image and bounding boxes. The result is stored in image_resize / bbs_resize.
        :param img: image to be processed (default is the original image)
        :param bounding_boxes: bounding boxes of img (default is the original image bboxes)
        :param downscale_factor: downscale factor (default = 1). Used when it is larger than 1.
        :param max_dim: resize so that largest dimension (height or width) is equal max_dim. The other dimension changes proportionately.
        :return: (resized image, resized bounding boxes)
        '''
        img = self.image_orig if img is None else img
        img_size = (img.shape[0], img.shape[1])
        bounding_boxes = self.bbs if bounding_boxes is None else bounding_boxes
        if isinstance(bounding_boxes, BoundingBoxesOnImage):
            bbs = bounding_boxes
        else:
            bbs = BoundingBoxesOnImage(bounding_boxes, shape=img.shape)

        if downscale_factor > 1:  # Resize by a constant factor
            new_shape = tuple(int(x / downscale_factor) for x in img_size)
        elif max_dim is not None: # Resize so that maximum length (width or height) will be max_dim
            # Only height and width are considered, not the number of channels
            downscale_factor = max(img_size) / max_dim
            new_shape = tuple(round(x / downscale_factor) for x in img_size)
        else:
            new_shape = img_size # Don't resize

        self.image_resize = ia.imresize_single_image(img, new_shape)
        self.bbs_resize = bbs.on(self.image_resize)
        return self.image_resize, self.bbs_resize

    def augment(self, img=None, bounding_boxes=None, keep_size=False):
        '''
        Augment the image. The result is stored in image_aug / bbs_aug.
        :param img: Image to be processed
        :param bounding_boxes: Bounding boxes (default is the original image bboxes).
        :param keep_size: Keep the size of the image after cropping (default is False - don't resize to original size)
        :return: (augmented image, augmented bounding boxes)
        '''
        img = self.image_orig if img is None else img
        bounding_boxes = self.bbs if bounding_boxes is None else bounding_boxes
        if isinstance(bounding_boxes, BoundingBoxesOnImage):
            bbs = bounding_boxes
        else:
            bbs = BoundingBoxesOnImage(bounding_boxes, shape=img.shape)

        # Do basic augmentation - brightness, scale and shear
        seq = iaa.Sequential([
            iaa.MultiplyBrightness((0.7, 1.2)),
            iaa.Affine(scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}),
            iaa.Affine(shear=(-10, 10))
        ])
        image_aug, bbs_aug = seq(image=img, bounding_boxes=bbs)

        # Do random cropping of all sides, up to max_crop_percent.
        # NOTE(review): the effective cap has always been 0.3; an older comment and an
        # unused variable mentioned 20% - confirm which value is intended.
        max_crop_percent = 0.3
        if not bbs_aug.empty:
            # Find safe margins for cropping so that bboxes won't be cropped out
            w = image_aug.shape[1]
            h = image_aug.shape[0]
            bbs_xmin, bbs_ymin, bbs_xmax, bbs_ymax = self.minmax_bbox(bbs=bbs_aug)
            marg_left = (bbs_xmin -2) / w
            marg_right = (w - bbs_xmax -2) / w
            marg_top = (bbs_ymin -2) / h
            marg_bottom = (h - bbs_ymax -2) / h
            # A negative margin means a bbox already touches or crosses the border: don't crop
            max_crop_percent = max(0, min(max_crop_percent, marg_left, marg_right, marg_top, marg_bottom))
        crop = iaa.Crop(percent=(0, max_crop_percent), keep_size=keep_size) # crop image
        image_aug = crop.augment_image(image_aug)
        bbs_aug = bbs_aug.on(image_aug)

        self.image_aug, self.bbs_aug = image_aug, bbs_aug
        return image_aug, bbs_aug

    def parse_label(self, bb):
        '''
        Convert label marked as 'c_r' (c=class number, r=radiologist rank) to label title and color
        :param bb: bounding box with label
        :return: (color as an RGB tuple in the 0-255 range, class name)
        If the label cannot be parsed, or the class number is not in self.classes,
        a random color and an empty class name are returned.
        '''
        color_pallete = ['lightcoral','brown','red','darksalmon','chocolate',
                  'darkorange','darkgoldenrod','gold','olive','yellow',
                  'green','lime','blue','darkorchid']
        try:
            class_code = int(bb.label.split('_')[0])
            class_name = self.classes[class_code]
        except (AttributeError, TypeError, ValueError, LookupError):
            class_code = random.randrange(len(color_pallete))
            class_name = ''
        # Wrap around, so that class numbers beyond the palette still get a color
        color_name = color_pallete[class_code % len(color_pallete)]
        cl_color = tuple(np.array(colors.to_rgb(color_name))*255)
        return cl_color, class_name

    def show_image(self, img=None, bounding_boxes=None, showbbs=True, showlabel=True):
        '''
        Show the image with bounding boxes
        :param img: image to be processed
        :param bounding_boxes: list of bounding boxes
        :param showbbs: flag to show the bounding boxes, if there are any (default=True)
        :param showlabel: flag to show the labels of the bounding boxes, if there are any (default=True).
                          When set, the boxes are drawn together with their labels regardless of showbbs.
        '''
        img = self.image_orig if img is None else img
        bounding_boxes = self.bbs if bounding_boxes is None else bounding_boxes

        for bb in self._bbox_list(bounding_boxes):
            bbox_color, bbox_label = self.parse_label(bb)
            if showlabel:
                # Draw a copy whose label is the class name instead of the 'c_r' code
                bbl = bb.copy(label=bbox_label)
                img = bbl.draw_on_image(image=img, color=bbox_color)
            elif showbbs:
                img = bb.draw_box_on_image(image=img, color=bbox_color)
        ia.imshow(img)

    def save_image_label (self, fname=None, img_path='', label_path='', img=None):
        '''
        Save the image as jpg and bounding boxes as txt
        :param fname: file name (without extension - jpg and txt will be added to fname automatically)
        :param img_path: directory for the jpg file
        :param label_path: directory for the txt file
        :param img: what image to save - 'orig' or None = original image; 'aug' = augmented image; 'resize' = resized image
        '''
        fname = self.image_name if fname is None else fname
        img = 'orig' if img is None else img  # None selects the original image

        if ('aug' in img):
            image = self.image_aug
            bounding_boxes = self.bbs_aug
        elif ('resize' in img):
            image = self.image_resize
            bounding_boxes = self.bbs_resize
        else:
            image = self.image_orig
            bounding_boxes = self.bbs
        if image is None:  # img='aug' or 'resize' but augmentation/resize was not performed before.
            image = self.image_orig
            bounding_boxes = self.bbs
            print (f'Warning: Saving original image. {img} was not performed on image!')

        self.save_image(fname = fname, img_path=img_path, img=image)
        self.save_label(img_size=image.shape, fname = fname, label_path=label_path, bounding_boxes=bounding_boxes)

    def save_image(self, fname = None, img_path=None, img=None):
        '''
        Save the image as jpg
        :param fname: file name (without extension - jpg will be added to fname automatically)
        :param img_path: directory for the jpg file (default: current directory)
        :param img: image to be saved
        '''
        fname = self.image_name if fname is None else fname
        img = self.image_orig if img is None else img
        img_path = '' if img_path is None else img_path  # join() does not accept None
        image_path = join(img_path, fname+'.jpg')
        status = cv2.imwrite(image_path, img)
        if not status:
            print (f'Error: Could not save {fname}.jpg to {img_path}!')

    def save_label(self, img_size, fname = None, label_path='', bounding_boxes=None):
        '''
        Save the bounding boxes as txt, one line per bounding box:
        class number, center x, center y, width, height (all relative to the image size)
        :param img_size: shape of the image the bounding boxes belong to - (h, w, ...)
        :param fname: file name (without extension - txt will be added to fname automatically)
        :param label_path: directory for the txt file
        :param bounding_boxes: list of bounding boxes
        '''
        fname = self.image_name if fname is None else fname
        bounding_boxes = self.bbs if bounding_boxes is None else bounding_boxes
        bbs = self._bbox_list(bounding_boxes)

        lbl_path = join(label_path, fname+'.txt')

        with open(lbl_path, 'w') as file:
            for bb in bbs:
                bb = bb.clip_out_of_image(img_size)
                xc = round(bb.center_x / img_size[1], 5)
                yc = round(bb.center_y / img_size[0], 5)
                w = round(bb.width / img_size[1], 5)
                h = round(bb.height / img_size[0], 5)
                class_id = bb.label.split('_')[0]  # the label is 'c_r' - keep only the class number
                line = ' '.join((str(class_id), str(xc), str(yc), str(w), str(h))) + '\n'
                file.write(line)
        
        

In [None]:
def show_dicom(fpath, image_id, df):
    '''
    Show one image, resized so that its largest dimension is 1024, with its labeled bounding boxes
    :param fpath: directory that holds the dicom files
    :param image_id: image name, as it appears in df['image_id']
    :param df: DataFrame with the columns image_id, x_min, y_min, x_max, y_max and class_id
    '''
    viewer = Dicom_image(path=fpath, image_name=image_id, classes=classes)
    rows_of_image = df.loc[df['image_id'] == image_id]
    for _, rec in rows_of_image.iterrows():
        viewer.add_bbox(xmin=rec.x_min, ymin=rec.y_min, xmax=rec.x_max, ymax=rec.y_max,
                        cl_label=str(rec.class_id))
    # resize() returns the same objects it stores in image_resize / bbs_resize
    resized_img, resized_bbs = viewer.resize(max_dim=1024)
    viewer.show_image(img=resized_img, bounding_boxes=resized_bbs, showlabel=True)

In [None]:
# Example: show one training image with the bounding boxes listed for it in detected_df
show_dicom(
    fpath=path_train,
    image_id='051132a778e61a86eb147c7c6f564dfe',
    df=detected_df,
)

In [None]:
%%script echo skipping
# test timing of augment:
#1. Augment full size, Resize to 512
#2. Resize half size. Augment. Resize to 512.
#3. Resize to 512. Augment with keep_size=True (lower quality)

import timeit

dic = Dicom_image(path=path+'train', image_name='051132a778e61a86eb147c7c6f564dfe', classes=classes)
dic.add_bbox(500,700,1000,2000,'9')
dic.add_bbox(900,500,1200,1900,'2')
dic.add_bbox(100,800,1100,1200,'4')
dic.add_bbox(1500,2000,2200,2700,'5')
 
# Test 1 - Augment full size, Resize to 512
starttime = timeit.default_timer()
dic.augment()
dic.resize(img=dic.image_aug, bounding_boxes=dic.bbs_aug, max_dim=512)
dic.show_image(img=dic.image_resize, bounding_boxes=dic.bbs_resize)
print("Test1 time difference is :", timeit.default_timer() - starttime)

# Test 2 - Resize half size. Augment. Resize to 512.
starttime = timeit.default_timer()
dic.resize(downscale_factor = 2)
dic.augment(img=dic.image_resize, bounding_boxes=dic.bbs_resize)
dic.resize(img=dic.image_aug, bounding_boxes=dic.bbs_aug, max_dim=512)
dic.show_image(img=dic.image_resize, bounding_boxes=dic.bbs_resize)

print("Test2 time difference is :", timeit.default_timer() - starttime)

from os import makedirs
working_path = '/kaggle/working'
testpath = join(working_path,'test')
makedirs(testpath, exist_ok = True)

dic.save_image_label (fname='test_aug', img_path=testpath, label_path=testpath, img='aug')
dic.save_image_label (fname='test_resize', img_path=testpath, label_path=testpath, img='resize')

# Test 3 - Resize to 512. Augment with keep_size=True
starttime = timeit.default_timer()
dic.resize(max_dim=512)
dic.augment(img=dic.image_resize, bounding_boxes=dic.bbs_resize, keep_size=True)
dic.show_image(img=dic.image_aug, bounding_boxes=dic.bbs_aug)
print("Test2 time difference is :", timeit.default_timer() - starttime)


## Show examples of each class

In [None]:
%%script echo skipping
# NOTE: the "%%script echo skipping" cell magic above means this cell is not run as Python.

import warnings
warnings.filterwarnings("ignore")

def plot_example(df, idx_list):
    '''
    Plot three example images side by side, each titled with its class name.
    A red rectangle is drawn for every example whose class name is not 'No finding'.
    :param df: DataFrame with at least the columns image_id and class_name
    :param idx_list: index labels of df; only the first three are used
    NOTE(review): read_xray() and get_bbox() are called here as module-level functions
    that are not defined in the visible part of this notebook - presumably helpers
    from an earlier version; confirm before re-enabling this cell.
    '''
    fig, axs = plt.subplots(1, 3, figsize=(15, 10))
    fig.subplots_adjust(hspace = .1, wspace=.1)
    axs = axs.ravel()
    for i in range(3):
        image_id = df.loc[idx_list[i], 'image_id']
        # read_xray is expected to return a pair here; only the first item (the image) is used
        img,_ = read_xray(os.path.join(path,'train',image_id+'.dicom'))
        axs[i].imshow(img, cmap='gray')
        axs[i].set_title(df.loc[idx_list[i], 'class_name'])
        # Hide the axis tick labels
        axs[i].set_xticklabels([])
        axs[i].set_yticklabels([])
        if df.loc[idx_list[i], 'class_name'] != 'No finding':
            # anchor, w and h are passed to Rectangle as its anchor point, width and height
            bbox, anchor, w, h = get_bbox(df, idx_list[i])
            p = matplotlib.patches.Rectangle(anchor, w,h,ec='r', fc='none', lw=2.)
            axs[i].add_patch(p)

# Show examples: uncomment the lines below to plot the first three rows of each class id (0-14)
#for num in range(15):
    #idx_list = train_df[train_df['class_id']==num][0:3].index.values
    #plot_example(train_df, idx_list)

# Setting the environment for YOLOV5
## Set up the directories
YOLOv5 expects a very specific layout of data folders in order to work:

![](https://miro.medium.com/max/472/1*XupA8TGTSGdZdjsrs16hkw.png)

## Set up the data
### The images
The images have to be placed directly in the image folders: training images in the data/images/train folder and validation images in the data/images/valid folder.
Each image needs a unique file name with a .jpg extension (or another supported image format).
### The labels
The labels have to be in the data/labels/train/ or in the data/labels/valid.
The name of the labels file has to be the same name as the image, but with “.txt” instead of “.jpg”.
Each bounding box is listed on its own line, and that line contains, in this order:
* the class number of the object in the bounding box (always 0 if there is only one class)
* the x-coordinate of the bounding box center, normalized by the image width (0-1)
* the y-coordinate of the bounding box center, normalized by the image height (0-1)
* the width of the bounding box, normalized by the image width (0-1)
* the height of the bounding box, normalized by the image height (0-1)

In [None]:
%%script echo skipping
# Calculate IOU (Intersection Over Union) of two bounding boxes
#   box[0] - x_min
#   box[1] - y_min
#   box[2] - x_max
#   box[3] - y_max

def bb_iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        boxA: Sequence (x_min, y_min, x_max, y_max).
        boxB: Sequence (x_min, y_min, x_max, y_max).

    Returns:
        The integer 0 when the boxes do not overlap (touching edges count as
        no overlap), otherwise a float: intersection area / union area.
    """
    # Corners of the intersection rectangle
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Negative extents mean the boxes are disjoint on that axis; clamp to 0
    overlap_w = max(right - left, 0)
    overlap_h = max(bottom - top, 0)
    overlap = abs(overlap_w * overlap_h)
    if overlap == 0:
        return 0

    area_a = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
    area_b = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))
    # Union = sum of both areas minus the part counted twice
    union = float(area_a + area_b - overlap)
    return overlap / union
    

In [None]:

# Report the size of each image list (all / unlabeled / labeled)
for caption, image_list in (
    ('Total number of images: ', list_of_images),
    ('Number of images without labels: ', list_of_images_no_label),
    ('Number of images with labels before augmentation: ', list_of_images_labled),
):
    print(caption, len(image_list))

## Prepare a dataframe to calculate the number of augmentations for each image
### The purpose is to create a balanced dataset

In [None]:

# Keep only rows whose bbox_type does not contain "Group".
# .copy() makes no_group_df an independent frame: it is modified later with
# in-place drops, which on a filtered selection of detected_df can trigger
# pandas' SettingWithCopyWarning.
no_group_df = detected_df[~detected_df['bbox_type'].str.contains("Group")].copy()

# Label names used as indicator columns.
# NOTE(review): classes[:-1] presumably leaves out a trailing 'No finding' entry - confirm.
label_names = list(classes[:-1])

# One 0/1 indicator column per label, set when the label text occurs in class_name.
labeled_df = no_group_df[['image_id','class_name']].copy()
for label in label_names:
    labeled_df[label] = labeled_df['class_name'].apply(lambda x: int(label in x))

#newdf = labeled_df[labeled_df['image_id']=='ffceb71a80efba3b83c88e11f4b9694b']
# Per-image detection counts. Summing only the indicator columns keeps the
# string column class_name out of the result regardless of the pandas
# version's numeric_only default for groupby sums.
images_df = labeled_df.groupby('image_id')[label_names].sum()
images_df.head()

In [None]:
# Visual check of one image using the non-group annotations.
# NOTE(review): show_dicom is defined elsewhere; presumably it renders the DICOM with the boxes found in `df`.
show_dicom(fpath=path_train, image_id='051132a778e61a86eb147c7c6f564dfe',df=no_group_df)

In [None]:
# 'numaug' becomes the first column, initialised to 1 for every image
images_df.insert(loc=0, column='numaug', value=1)
# One "<label>_" column per label, zeroed here and filled in by later cells
for label in classes[:-1]:
    images_df[label + '_'] = 0
images_df.head()

In [None]:
# "<label>_" = per-image count of the label multiplied by the image's numaug
for label in classes[:-1]:
    generated_col = label + '_'
    images_df[generated_col] = images_df[label] * images_df['numaug']

### Checking the distribution of labels. Looking for the smallest labels to duplicate (using augmentation)

In [None]:
# Column totals over all images (axis=0 sums down the rows)
images_df.sum(axis=0)

In [None]:
# True for images with a zero count in each of the four listed classes
# (the "large" classes, going by the variable name)
no_large_classes = (
    images_df[['Pleural thickening', 'Pulmonary fibrosis',
               'Aortic enlargement', 'Cardiomegaly']] == 0
).all(axis=1)
# Label counts of images that have both Pneumothorax and Atelectasis and none of the above
images_df[
    (images_df['Pneumothorax'] > 0)
    & (images_df['Atelectasis'] > 0)
    & no_large_classes
][classes[:-1]]

### Trying to reach a target of ~3000 detections per label. Need to add ~2800 to 'Atelectasis' and 'Pneumothorax'
### I don't want to inflate 'Other lesion', so I augment image af201da8a5f8354c4c3291995d5cbafd 300 times and each of the remaining images (2800-300)/3 ≈ 834 times

In [None]:
# Set numaug for every image that has both Pneumothorax and Atelectasis and
# passes the no_large_classes mask.
images_df.loc[(images_df['Pneumothorax']>0) & (images_df['Atelectasis']>0) & no_large_classes, 'numaug'] = 901 # was 835
# Order matters: this assignment runs second, so this image ends up with 101
# even if the mask above also selected it.
images_df.loc['af201da8a5f8354c4c3291995d5cbafd', 'numaug'] = 101 # was 301

# Display the affected rows to check the result
images_df[(images_df['Pneumothorax']>0) & (images_df['Atelectasis']>0) & no_large_classes]

In [None]:
# Recompute the generated-label counts ("<label>_" = label count * numaug)
for label in classes[:-1]:
    generated_col = label + '_'
    images_df[generated_col] = images_df[label] * images_df['numaug']

# Column totals, to compare the original and generated distributions
images_df.sum(axis=0)

### Now looking to duplicate 'Calcification', 'ILD', 'Infiltration'. Need to add ~2500 to them. 

In [None]:
# Labels targeted in this balancing step
check_cols = ['Calcification', 'ILD', 'Infiltration']
# NOTE(review): `labels` is not defined in the visible part of the notebook
# (other cells use `classes`) - confirm it exists before this cell runs.
remove_cols = [i for i in labels if i not in check_cols]


In [None]:
# Label counts of images that have both ILD and Calcification and pass the no_large_classes mask
images_df[(images_df['ILD']>0) & (images_df['Calcification']>0) & no_large_classes][classes[:-1]]


In [None]:
# Raise numaug for two hand-picked images
for boosted_id, added in (
    ('35e38672875ff60d5a131d91b4db5a6d', 400),
    ('9c314e403d3e6e3ed09e79a57019f9ad', 600),
):
    images_df.loc[boosted_id, 'numaug'] += added

In [None]:
# Refresh every "<label>_" column from the current numaug values
for label in classes[:-1]:
    label_counts = images_df[label]
    images_df[label + '_'] = label_counts * images_df['numaug']

# Column totals after the refresh
images_df.sum(axis=0)

In [None]:
# Same kind of mask as no_large_classes, with 'Lung Opacity' added to the excluded classes
no_large_classes2 = (
    images_df[['Pleural thickening', 'Pulmonary fibrosis', 'Aortic enlargement',
               'Cardiomegaly', 'Lung Opacity']] == 0
).all(axis=1)

# Label counts of images that have both Consolidation and Infiltration and pass the mask
images_df[
    (images_df['Consolidation'] > 0)
    & (images_df['Infiltration'] > 0)
    & no_large_classes2
][classes[:-1]]

In [None]:
# Label counts of images with Infiltration but no Consolidation that pass the no_large_classes2 mask
images_df[(images_df['Consolidation']==0) & (images_df['Infiltration']>0) & no_large_classes2][classes[:-1]]

In [None]:
# Raise numaug for two hand-picked images
for boosted_id, added in (
    ('82877be2465c084b0b9bc186fc7f158f', 300),
    ('cfbac484f94686cd93d564487d9e5a8a', 1000),
):
    images_df.loc[boosted_id, 'numaug'] += added

In [None]:
# Bring the "<label>_" columns in line with the latest numaug values
for label in classes[:-1]:
    images_df[label + '_'] = images_df[label] * images_df['numaug']

# Column totals after the update
images_df.sum(axis=0)

In [None]:
# Rows whose numaug was raised above the initial value of 1
images_df[images_df['numaug']>1]

In [None]:
# Visual check of an image whose numaug was raised by 1000 earlier in the notebook
show_dicom(fpath=path_train, image_id='cfbac484f94686cd93d564487d9e5a8a',df=no_group_df)

In [None]:
# Visual check of the image whose numaug was set to 101 earlier in the notebook
show_dicom(fpath=path_train, image_id='af201da8a5f8354c4c3291995d5cbafd',df=no_group_df)

In [None]:
# Remove unhelpful 'Other lesion'(9)
# Zero the per-image count and drop the matching class_id == 9 rows for this
# image, so both frames agree.
images_df.loc['af201da8a5f8354c4c3291995d5cbafd','Other lesion']=0
no_group_df.drop(no_group_df[(no_group_df['image_id']=='af201da8a5f8354c4c3291995d5cbafd') & (no_group_df['class_id']==9)].index, inplace=True)

In [None]:
# Visual check of an image whose numaug was raised by 600 earlier in the notebook
show_dicom(fpath=path_train, image_id='9c314e403d3e6e3ed09e79a57019f9ad',df=no_group_df)

In [None]:
# Remove 2 larger 'Calcification'(2)
# The rows are addressed by hard-coded index labels of no_group_df; running
# this cell a second time raises KeyError because the labels are already gone.
#print(no_group_df[no_group_df['image_id']=='9c314e403d3e6e3ed09e79a57019f9ad'])
no_group_df.drop(19138, inplace=True)
no_group_df.drop(16321, inplace=True)
# Set the per-image Calcification count by hand to match the remaining rows
images_df.loc['9c314e403d3e6e3ed09e79a57019f9ad','Calcification']=1
# Display the rows left for this image
no_group_df[no_group_df['image_id']=='9c314e403d3e6e3ed09e79a57019f9ad']

In [None]:
# Visual check of an image whose numaug was raised by 300 earlier in the notebook
show_dicom(fpath=path_train, image_id='82877be2465c084b0b9bc186fc7f158f',df=no_group_df)

In [None]:
# Remove 'Other lesion'(9) and medium 'Infiltration'(6) - index 72294
# Print the image's rows before the change, for reference
print(no_group_df[no_group_df['image_id']=='82877be2465c084b0b9bc186fc7f158f'])
# Hard-coded index label; re-running the cell raises KeyError once it is dropped
no_group_df.drop(72294, inplace=True)
no_group_df.drop(no_group_df[(no_group_df['image_id']=='82877be2465c084b0b9bc186fc7f158f') & (no_group_df['class_id']==9)].index, inplace=True)
# Set the per-image counts by hand to reflect the removals
images_df.loc['82877be2465c084b0b9bc186fc7f158f','Other lesion']=0
images_df.loc['82877be2465c084b0b9bc186fc7f158f','Infiltration']=3

In [None]:
# Visual check of this image using the non-group annotations
show_dicom(fpath=path_train, image_id='76339ec8c17dbcdd117914581cee59f5',df=no_group_df)

In [None]:
# Visual check of this image using the non-group annotations
show_dicom(fpath=path_train, image_id='6ce61a39f1e1bff629566de047ab8775',df=no_group_df)

In [None]:
# Annotation rows of this image in no_group_df
no_group_df[no_group_df['image_id']=='6ce61a39f1e1bff629566de047ab8775']

In [None]:
# Visual check of an image whose numaug was raised by 400 earlier in the notebook
show_dicom(fpath=path_train, image_id='35e38672875ff60d5a131d91b4db5a6d',df=no_group_df)

In [None]:
# NOTE(review): this call passes train_df, while the neighbouring cells pass
# no_group_df - confirm that using the unfiltered frame here is intended.
show_dicom(fpath=path_train, image_id='106a3da41d2e3d9f508c09b28e8abdaf',df=train_df)

In [None]:
# Remove 'Other lesion'(9)
# Print the image's rows before the change, for reference
print(no_group_df[no_group_df['image_id']=='106a3da41d2e3d9f508c09b28e8abdaf'])
# Hard-coded index label; re-running the cell raises KeyError once it is dropped
no_group_df.drop(33745, inplace=True)
# Keep the per-image count in images_df in sync with the removal
images_df.loc['106a3da41d2e3d9f508c09b28e8abdaf','Other lesion']=0
# Display the rows left for this image
no_group_df[no_group_df['image_id']=='106a3da41d2e3d9f508c09b28e8abdaf']

In [None]:
# Final class distribution.
# The "<label>_" columns are recomputed first so they reflect the manual
# count corrections and numaug values set in the cells above.
for label in classes[:-1]:
    generated_col = label + '_'
    images_df[generated_col] = images_df[label] * images_df['numaug']

# Column totals over all images
images_df.sum(axis=0)

# Save images from dicom
## Save images for un-labeled dicoms

In [None]:
from os import makedirs, listdir

# makedirs('data/images/train', exist_ok = True)
# makedirs('data/images/valid', exist_ok = True)
# makedirs('data/labels/train', exist_ok = True)
# makedirs('data/labels/valid', exist_ok = True)

# Output folders for the converted images and their label files
working_path = '/kaggle/working'
images_path, labels_path = (join(working_path, sub) for sub in ('images', 'labels'))
for out_dir in (images_path, labels_path):
    # exist_ok: re-running the cell does not fail on existing folders
    makedirs(out_dir, exist_ok=True)

In [None]:
from tqdm.auto import tqdm

# Images without annotations: load each DICOM, resize with max_dim=512 and
# save with img='resize'. No add_bbox calls are made for these images.
for image_id in tqdm(list_of_images_no_label):
    dic = Dicom_image(path=path_train, image_name=image_id, classes=classes)
    dic.resize(max_dim=512)
    # No fname is passed here. NOTE(review): presumably the output name defaults to the image name - confirm.
    dic.save_image_label (img_path=images_path, label_path=labels_path, img='resize')

## Save images for labeled dicoms, including augmentations

In [None]:

for image_id in tqdm(list_of_images_labled):
    # All non-group annotation rows that belong to this image
    image_rows = no_group_df[no_group_df['image_id'] == image_id]

    # Load the DICOM and attach every bounding box (class id passed as a string)
    dic = Dicom_image(path=path_train, image_name=image_id, classes=classes)
    for _, box in image_rows.iterrows():
        dic.add_bbox(box.x_min, box.y_min, box.x_max, box.y_max, str(box.class_id))

    # Save the un-augmented image, resized with max_dim=512
    dic.resize(max_dim=512)
    dic.save_image_label(img_path=images_path, label_path=labels_path, img='resize')

    # numaug counts the original too, so one fewer augmented copy is generated
    copies_needed = images_df.loc[image_id, 'numaug'] - 1
    if not copies_needed > 0:
        continue

    # Every augmentation starts from the same downscaled (factor 2) image and boxes
    base_img, base_bbs = dic.resize(downscale_factor=2)
    for copy_no in tqdm(range(copies_needed)):
        dic.augment(img=base_img, bounding_boxes=base_bbs)
        dic.resize(img=dic.image_aug, bounding_boxes=dic.bbs_aug, max_dim=512)
        # File name: "<image_id>_aug<k>"
        dic.save_image_label(
            fname=image_id + '_aug' + str(copy_no),
            img_path=images_path,
            label_path=labels_path,
            img='resize',
        )

In [None]:
## Count files
# Number of directory entries in the images output folder
print("No. of images", len(listdir(images_path)))

In [None]:
%%script echo skipping
txtfile = r'data/labels/train/051132a778e61a86eb147c7c6f564dfe.txt'
def print_txt_file(txtfile):
    """Print the full contents of a text file.

    Args:
        txtfile: Path of the file to read (opened in text mode).
    """
    # The context manager closes the handle even if read() raises
    with open(txtfile, 'r') as f:
        file_contents = f.read()
    print(file_contents)
    
print_txt_file(txtfile)