# Convert label files for use with YOLO

For use with YOLOv4 directly, or other YOLO versions through the CSV.

It is also possible to remove all "no finding" bounding boxes.

The given bboxes are in the format of (xmin, ymin, xmax, ymax) in pixels, but YOLO requires (x_center, y_center, width, height) in relative values, i.e. as fractions of the image width and height. Fortunately, this means that these labels can be used at any resolution or aspect ratio.

References:
- https://www.kaggle.com/pabloberhauser/creating-label-files-for-use-in-yolov4


In [None]:
import numpy as np
import pandas as pd

import pydicom
import glob

# Load the training annotations and drop every row labelled with class 14
# (presumably the "no finding" class mentioned in the intro -- confirm against
# the dataset description). The index is rebuilt so it is contiguous again.
EXCLUDED_CLASS_ID = 14
annotations = pd.read_csv("../input/vinbigdata-chest-xray-abnormalities-detection/train.csv")
keep_row = annotations["class_id"] != EXCLUDED_CLASS_ID
train_df = annotations[keep_row].reset_index(drop=True)

train_df.head()

In [None]:
# Attach each image's pixel dimensions to its annotation rows and convert the
# boxes from absolute pixels to values relative to the image size.

# An image usually has several annotation rows, so read every DICOM header
# only once per image id. stop_before_pixels=True skips the pixel data; only
# the Columns/Rows header attributes are needed here.
dicom_headers = {
    image_id: pydicom.dcmread(
        f"../input/vinbigdata-chest-xray-abnormalities-detection/train/{image_id}.dicom",
        stop_before_pixels=True,
    )
    for image_id in train_df['image_id'].unique()
}
# One header per annotation row, in row order.
dicom_metadata = [dicom_headers[image_id] for image_id in train_df['image_id']]

train_df['width'] = [i.Columns for i in dicom_metadata]
train_df['height'] = [i.Rows for i in dicom_metadata]

# Corner coordinates as fractions of the image width / height.
train_df['x_min'] = train_df['x_min'] / train_df['width']
train_df['y_min'] = train_df['y_min'] / train_df['height']

train_df['x_max'] = train_df['x_max'] / train_df['width']
train_df['y_max'] = train_df['y_max'] / train_df['height']

# Box centre (relative).
train_df['x_mid'] = (train_df['x_max'] + train_df['x_min']) / 2
train_df['y_mid'] = (train_df['y_max'] + train_df['y_min']) / 2

# Box size (relative).
train_df['w'] = train_df['x_max'] - train_df['x_min']
train_df['h'] = train_df['y_max'] - train_df['y_min']

# Relative box area, i.e. the fraction of the image covered by the box.
train_df['area'] = train_df['w'] * train_df['h']
train_df.head()

In [None]:
import os

# A YOLO (Darknet) label row is "<class> <x_center> <y_center> <width> <height>"
# with all four box values relative to the image size, so the box is anchored
# at its centre (x_mid, y_mid), not at its top-left corner.
train_df['yolo_box'] = train_df[['x_mid', 'y_mid', 'w', 'h']].values.tolist()

unique_img_ids = train_df.image_id.unique()
print("We have {} unique images with boxes.".format(len(unique_img_ids)))

folder_location = "vbd_train_data"
# exist_ok=True lets this cell be re-run without failing on the existing folder.
os.makedirs(folder_location, exist_ok=True)

# Write one label file per image. groupby hands over each image's rows in a
# single pass instead of re-filtering the whole frame for every image id.
for img_id, img_boxes in train_df.groupby('image_id', sort=False):
    file_name = "{}/{}.txt".format(folder_location, img_id)

    # 'w' truncates, so re-running the cell rewrites the file instead of
    # appending duplicate rows.
    with open(file_name, 'w') as file:
        for class_id, yolo_box in zip(img_boxes['class_id'], img_boxes['yolo_box']):
            # One row per box: the class id followed by the four box values.
            file.write("{} {} {} {} {} \n".format(class_id, *yolo_box))

In [None]:
# Write an empty label file for every training image that has no row in
# train_df (i.e. no bounding boxes).
# Delete this cell to train only on images that have a finding.
train_glob = "../input/vinbigdata-chest-xray-abnormalities-detection/train/*.dicom"
# Reduce each path to its bare image id: drop the directories and ".dicom".
all_imgs = [path.rsplit("/", 1)[-1].replace(".dicom", "") for path in glob.glob(train_glob)]
positive_imgs = train_df.image_id.unique()

negative_images = set(all_imgs).difference(positive_imgs)
print('All images:', len(all_imgs), 'Positive images:', len(positive_imgs))

for image_id in negative_images:
    # Opening for writing and closing straight away leaves a zero-byte file.
    open("{}/{}.txt".format(folder_location, image_id), 'w').close()

In [None]:
%%capture

# %%capture suppresses this cell's output (zip prints one line per file).
# Bundle the label folder into a single archive so it is easier to download.

!zip -r yolo_labels.zip vbd_train_data

# Remove the unzipped folder once it has been archived; only yolo_labels.zip
# is left behind.
!rm -r vbd_train_data