---
Downsize https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-jpg

to 608px and apply histogram normalization + CLAHE to the images

In [None]:
import numpy as np
import pandas as pd
import pydicom
import os
import cv2
from pydicom.pixel_data_handlers.util import apply_voi_lut
from PIL import Image
from skimage import exposure
from tqdm.auto import tqdm

In [None]:
def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Load a DICOM file and return its pixels as an 8-bit grayscale array.

    Parameters
    ----------
    path : str
        Location of the DICOM file.
    voi_lut : bool, default True
        Apply the VOI LUT stored in the file (when the device provides one)
        to turn raw data into a "human-friendly" view.
    fix_monochrome : bool, default True
        Invert the image when PhotometricInterpretation is "MONOCHROME1",
        which would otherwise make the X-ray look inverted.

    Returns
    -------
    numpy.ndarray
        uint8 array with the same shape as the DICOM pixel array, min-max
        scaled to 0..255. A constant image yields all zeros.
    """
    # dcmread is the supported reader; read_file was an alias that pydicom has retired.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift to start at zero, then scale to the full 8-bit range.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by the peak would be 0/0 (NaN), so return black.
        return np.zeros(data.shape, dtype=np.uint8)

    data = data / peak
    return (data * 255).astype(np.uint8)

In [None]:
def resize(img, size, padColor=0):
    """Scale an image to fit a target box while keeping its aspect ratio.

    The longer side of the image is scaled to the matching target side and the
    other side follows from the aspect ratio. No padding is added, so the
    result can be smaller than ``size`` along one axis.

    Parameters
    ----------
    img : numpy.ndarray
        Image whose first two dimensions are (height, width).
    size : tuple
        Target ``(height, width)``.
    padColor : int or sequence, default 0
        Kept for backward compatibility. Padding is disabled, so this value
        is currently unused.

    Returns
    -------
    numpy.ndarray
        The rescaled image as returned by ``cv2.resize``.
    """
    h, w = img.shape[:2]
    sh, sw = size

    # INTER_AREA when shrinking, INTER_CUBIC when stretching.
    interp = cv2.INTER_AREA if (h > sh or w > sw) else cv2.INTER_CUBIC

    aspect = w / h

    if aspect > 1:  # horizontal image: width fills the target
        new_w = sw
        new_h = max(1, int(round(new_w / aspect)))  # never collapse to 0 px
    elif aspect < 1:  # vertical image: height fills the target
        new_h = sh
        new_w = max(1, int(round(new_h * aspect)))  # never collapse to 0 px
    else:  # square image
        new_h, new_w = sh, sw

    # Aspect ratio is kept and no border is added (letterbox padding is
    # intentionally disabled).
    return cv2.resize(img, (new_w, new_h), interpolation=interp)

In [None]:
# test 1 img: run the full pipeline on a single training image
image_id = []
orig_height = []
orig_width = []
re_height = []
re_width = []

split = 'train'
load_dir = f'../input/vinbigdata-chest-xray-abnormalities-detection/{split}/'
save_dir = f'/kaggle/working/{split}/'

os.makedirs(save_dir, exist_ok=True)

# Sorted so the same file is picked on every run; only the first one is processed.
dicom_files = sorted(f for f in os.listdir(load_dir) if f.endswith('.dicom'))

for file in dicom_files[:1]:
    stem = os.path.splitext(file)[0]  # image id = file name without extension
    xray = read_xray(os.path.join(load_dir, file))
    im = resize(xray, (608,608))  # yolov4 default 608
    im = exposure.equalize_hist(im) # histogram normalization
    im = exposure.equalize_adapthist(im/np.max(im)) #clahe

    # Convert to 8-bit explicitly instead of handing a float array to imwrite.
    im_u8 = np.clip(np.round(im * 255), 0, 255).astype(np.uint8)
    if not cv2.imwrite(os.path.join(save_dir, stem + '.jpg'), im_u8):
        raise IOError(f'failed to write {stem}.jpg to {save_dir}')

    # shape[0] = height, 1 = width
    image_id.append(stem)
    re_height.append(im.shape[0])
    re_width.append(im.shape[1])
    orig_height.append(xray.shape[0])
    orig_width.append(xray.shape[1])

In [None]:
# List /kaggle/working/train/ to check what has been written there.
!ls /kaggle/working/train/

In [None]:
# One row per processed image: id plus resized and original dimensions.
resized_columns = {
    'image_id': image_id,
    're_height': re_height,
    're_width': re_width,
    'orig_height': orig_height,
    'orig_width': orig_width,
}
df_resized = pd.DataFrame(resized_columns)
df_resized

In [None]:
# resize: convert every train/test DICOM to a contrast-normalized JPEG
image_id = []
orig_height = []
orig_width = []
re_height = []
re_width = []

for split in ['train', 'test']:
    load_dir = f'../input/vinbigdata-chest-xray-abnormalities-detection/{split}/'
    save_dir = f'/kaggle/tmp/{split}/'
#     save_dir = f'/kaggle/working/{split}/'

    os.makedirs(save_dir, exist_ok=True)

    # Skip anything in the folder that is not a DICOM file.
    dicom_files = [f for f in os.listdir(load_dir) if f.endswith('.dicom')]

    for file in tqdm(dicom_files):
        stem = os.path.splitext(file)[0]  # image id = file name without extension
        xray = read_xray(os.path.join(load_dir, file))
        im = resize(xray, (608,608))  # yolov4 default 608
        im = exposure.equalize_hist(im) # histogram normalization
        im = exposure.equalize_adapthist(im/np.max(im)) #clahe

        # Convert to 8-bit explicitly instead of handing a float array to imwrite.
        im_u8 = np.clip(np.round(im * 255), 0, 255).astype(np.uint8)
        if not cv2.imwrite(os.path.join(save_dir, stem + '.jpg'), im_u8):
            raise IOError(f'failed to write {stem}.jpg to {save_dir}')

        # Dimensions are only recorded for the training split.
        # shape[0] = height, 1 = width
        if split == 'train':
            image_id.append(stem)
            re_height.append(im.shape[0])
            re_width.append(im.shape[1])
            orig_height.append(xray.shape[0])
            orig_width.append(xray.shape[1])

In [None]:
# Collect the per-image dimension lists into one table keyed by image_id.
resized_columns = {
    'image_id': image_id,
    're_height': re_height,
    're_width': re_width,
    'orig_height': orig_height,
    'orig_width': orig_width,
}
df_resized = pd.DataFrame(resized_columns)


---
clean up

In [None]:
# Move into /kaggle/tmp/ and list its contents.
%cd /kaggle/tmp/
!ls

In [None]:
%cd /kaggle/tmp/train/
dir = f'/kaggle/tmp/train/'

for file in os.listdir(dir):
    im_id = file[:-4]
    print(file)
    break
im_id

In [None]:
# Re-measure the resized training JPEGs on disk.
# shape[0] = height, 1 = width
image_id = []
# orig_height = []
# orig_width = []
re_height = []
re_width = []

# Named train_dir rather than `dir` so the builtin dir() is not shadowed.
train_dir = '/kaggle/tmp/train/'

# Only JPEGs are images; other files in the folder (e.g. label .txt files
# written there later in this notebook) cannot be read by cv2.imread.
jpg_files = [f for f in os.listdir(train_dir) if f.lower().endswith('.jpg')]

for file in tqdm(jpg_files):
    im_id = os.path.splitext(file)[0]
    # Use the full path so this does not depend on the current working directory.
    im = cv2.imread(os.path.join(train_dir, file))
    if im is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise IOError(f'could not read image {file} from {train_dir}')

    image_id.append(im_id)
    re_height.append(im.shape[0])
    re_width.append(im.shape[1])

In [None]:
# Resized dimensions per image id (original dimensions are not included here).
resized_columns = {
    'image_id': image_id,
    're_height': re_height,
    're_width': re_width,
}
df_resized = pd.DataFrame(resized_columns)

---
Create yolov4 txt files

https://www.kaggle.com/jackpodkim/vbd-convert-labels-to-yolo-yolov4/edit

In [None]:
# Work from /kaggle/working/; the relative ../input/... paths used below are resolved from the current directory.
%cd /kaggle/working/

In [None]:
import numpy as np
import pandas as pd

import pydicom
import glob

# Load the training annotations and preview the first rows.
train_csv_path = "../input/vinbigdata-chest-xray-abnormalities-detection/train.csv"
df = pd.read_csv(train_csv_path)

df.head()

In [None]:
# Read each DICOM header (no pixel data) once per unique image id, then expand
# back to one entry per df row so dicom_metadata stays aligned with df.
# image_id values can repeat across rows, so reading per row would re-open the
# same file several times.
_metadata_by_image = {
    image_id: pydicom.dcmread(
        f"../input/vinbigdata-chest-xray-abnormalities-detection/train/{image_id}.dicom",
        stop_before_pixels=True,
    )
    for image_id in df['image_id'].unique()
}
dicom_metadata = [_metadata_by_image[image_id] for image_id in df['image_id']]

In [None]:
# Original image size from the DICOM headers: Columns = width, Rows = height.
orig_dims = [(meta.Columns, meta.Rows) for meta in dicom_metadata]
df['orig_width'] = [n_cols for n_cols, _ in orig_dims]
df['orig_height'] = [n_rows for _, n_rows in orig_dims]

In [None]:
# Keep only rows whose class_id is not 14.
# NOTE(review): 14 presumably marks images without a finding; confirm against the dataset description.
has_box = df.class_id != 14
df = df.loc[has_box].reset_index(drop=True)

unique_img_ids = df.image_id.unique()
print("We have {} unique images with boxes.".format(len(unique_img_ids)))

In [None]:
# Attach the resized dimensions to every annotation row (left join on image_id).
df = pd.merge(df, df_resized, how='left', on='image_id')

In [None]:
# Show the first rows transposed so each column is displayed as a row.
df.head().T

In [None]:
# # resized reindex (disabled): rescale box corners from original to resized pixel coordinates
# df['x'] = df.apply(lambda row: row.x_min*(row.re_width/row.orig_width), axis =1)
# df['y'] = df.apply(lambda row: row.y_min*(row.re_height/row.orig_height), axis =1)

# df['x_re_max'] = df.apply(lambda row: row.x_max*(row.re_width/row.orig_width), axis =1)
# df['y_re_max'] = df.apply(lambda row: row.y_max*(row.re_height/row.orig_height), axis =1)

In [None]:
# # resized reindex (disabled): same rescaling, using the merge-suffixed re_width_x / re_height_x columns
# df['x_re_min'] = df.apply(lambda row: row.x_min*(row.re_width_x/row.orig_width), axis =1)
# df['y_re_min'] = df.apply(lambda row: row.y_min*(row.re_height_x/row.orig_height), axis =1)

# df['x_re_max'] = df.apply(lambda row: row.x_max*(row.re_width_x/row.orig_width), axis =1)
# df['y_re_max'] = df.apply(lambda row: row.y_max*(row.re_height_x/row.orig_height), axis =1)

In [None]:
# yolov4 format: each box is x_center, y_center, width, height, expressed as
# fractions of the image size.
# Dividing the original-pixel coordinates by the original image size gives those
# fractions directly. They are unchanged by resizing the image, so the resized
# dimensions are not needed here.
df['x_mid'] = (df['x_min'] + df['x_max']) / 2 / df['orig_width']
df['y_mid'] = (df['y_min'] + df['y_max']) / 2 / df['orig_height']

df['w'] = (df['x_max'] - df['x_min']) / df['orig_width']
df['h'] = (df['y_max'] - df['y_min']) / df['orig_height']

df.head()

In [None]:
# Pack the four box values of each row into a single list-valued column.
box_columns = ['x_mid', 'y_mid', 'w', 'h']
df['yolo_box'] = df[box_columns].to_numpy().tolist()

unique_img_ids = df.image_id.unique()
print("We have {} unique images with boxes.".format(len(unique_img_ids)))

In [None]:
# Change into /kaggle/tmp/; later cells that use relative paths (e.g. the zip output) run from here.
%cd /kaggle/tmp/

In [None]:
folder_location = "/kaggle/tmp/train/"
os.makedirs(folder_location, exist_ok=True)  # make sure the target folder exists

# Group once up front instead of scanning the whole frame for every image id.
rows_by_image = df.groupby('image_id', sort=False)

for img_id in tqdm(unique_img_ids): # one label file per image id
    filt_df = rows_by_image.get_group(img_id) # all boxes of this image
    file_name = os.path.join(folder_location, "{}.txt".format(img_id))

    # 'w' starts from an empty file, so re-running the cell never duplicates lines.
    with open(file_name, 'w') as file:
        for class_id, box in zip(filt_df['class_id'], filt_df['yolo_box']):
            x_mid, y_mid, box_w, box_h = box
            # The first number is the class id, followed by the four box values.
            file.write(f"{class_id} {x_mid} {y_mid} {box_w} {box_h} \n")

In [None]:
# List /kaggle/tmp/train/ to check the files written there.
!ls /kaggle/tmp/train/

In [None]:
# Training images with no bounding box still get a label file, left empty.
# Remove this cell to train only on images that have a finding.
dicom_paths = glob.glob("../input/vinbigdata-chest-xray-abnormalities-detection/train/*.dicom")
all_imgs = [path.split("/")[-1].replace(".dicom", "") for path in dicom_paths]
positive_imgs = df.image_id.unique()

# Ids present on disk that never appear in the annotation frame.
negative_images = set(all_imgs).difference(positive_imgs)
print('All images:', len(all_imgs), 'Positive images:', len(positive_imgs))

for negative_id in tqdm(list(negative_images)):
    file_name = "{}/{}.txt".format(folder_location, negative_id)
    # Opening in 'w' mode and closing right away leaves a zero-byte file.
    open(file_name, 'w').close()

In [None]:
%%capture

# zip to make files easier to download
# The archive name is relative, so it is created in the current working directory.
# NOTE(review): this archives everything under /kaggle/tmp/, not only the label
# .txt files; confirm that is intended given the archive name.

!zip -r yolo_labels.zip /kaggle/tmp/

In [None]:
# Move the archive from /kaggle/tmp/ into /kaggle/working/.
!mv /kaggle/tmp/yolo_labels.zip /kaggle/working/

---
clean up

In [None]:
# resize: convert the test split to contrast-normalized JPEGs under /kaggle/working/test/
# These lists are reset as in the original cell; nothing is appended to them here.
image_id = []
orig_height = []
orig_width = []
re_height = []
re_width = []

#test
load_dir = '../input/vinbigdata-chest-xray-abnormalities-detection/test/'
save_dir = '/kaggle/working/test/'

os.makedirs(save_dir, exist_ok=True)

# Skip anything in the folder that is not a DICOM file.
dicom_files = [f for f in os.listdir(load_dir) if f.endswith('.dicom')]

for file in tqdm(dicom_files):
    stem = os.path.splitext(file)[0]  # file name without extension
    xray = read_xray(os.path.join(load_dir, file))
    im = resize(xray, (608,608))  # yolov4 default 608
    im = exposure.equalize_hist(im) # histogram normalization
    im = exposure.equalize_adapthist(im/np.max(im)) #clahe

    # Convert to 8-bit explicitly instead of handing a float array to imwrite.
    im_u8 = np.clip(np.round(im * 255), 0, 255).astype(np.uint8)
    if not cv2.imwrite(os.path.join(save_dir, stem + '.jpg'), im_u8):
        raise IOError(f'failed to write {stem}.jpg to {save_dir}')

In [None]:
%%capture

# zip to make files easier to download

!zip -r yolo_test.zip /kaggle/working/test

In [None]:
# List the current working directory.
!ls