In [None]:
import ast
import sys
import pydicom
from pydicom import dcmread
from pydicom.pixel_data_handlers.util import apply_voi_lut
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import hashlib
import os
from io import BytesIO
from PIL import Image, ImageFont, ImageDraw
import cv2
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn import model_selection
from tqdm.auto import tqdm
import shutil

In [None]:
# Load the combined training annotations, discard any row that has a missing
# value, and renumber the remaining rows from zero.
df = (
    pd.read_csv('../input/updated-csv/combined_train_data.csv')
    .dropna()
    .reset_index(drop=True)
)
df.head()

In [None]:
# Shuffled 90/10 train/validation split with a fixed seed; each resulting
# frame gets a fresh 0..n-1 index.
df_train, df_valid = (
    split.reset_index(drop=True)
    for split in model_selection.train_test_split(
        df, test_size=0.1, random_state=42, shuffle=True
    )
)

In [None]:
# Build the images/ and labels/ trees, each with a train/ and validation/
# subfolder. os.makedirs also creates the missing parent directories.
for content_kind in ('images', 'labels'):
    for split_name in ('train', 'validation'):
        os.makedirs(
            f'/kaggle/tmp/siim_640x640_yolo/{content_kind}/{split_name}/',
            exist_ok=True,
        )

In [None]:
def process_data(data, data_type='train',
                 image_dir='../input/siim-train-image',
                 output_dir='/kaggle/tmp/siim_640x640_yolo'):
    """Write YOLO label files and copy the matching images for one split.

    For every row of ``data`` this writes
    ``<output_dir>/labels/<data_type>/<image_name>.txt`` with one line per
    finding in the form ``class x_center y_center width height`` (class is
    always 0, the other four values are divided by the image width/height),
    and copies ``<image_dir>/<image_name>.jpg`` to
    ``<output_dir>/images/<data_type>/``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``id``, ``label``, ``No_of_findings``,
        ``width`` and ``height``. ``label`` is a whitespace-separated string
        with six tokens per finding; tokens 2..5 of each group are read as
        ``x_min y_min x_max y_max``. The first two tokens of each group are
        ignored (presumably class name and confidence -- TODO confirm).
    data_type : str
        Name of the split sub-directory (e.g. ``'train'``, ``'validation'``).
    image_dir : str
        Directory holding the source ``.jpg`` files.
    output_dir : str
        Root of the YOLO dataset tree to write into.

    Raises
    ------
    ValueError
        If ``label`` holds fewer than ``No_of_findings`` complete groups.
    FileNotFoundError
        If the source image for a row does not exist.
    """
    labels_dir = os.path.join(output_dir, 'labels', data_type)
    images_dir = os.path.join(output_dir, 'images', data_type)
    # Create the split directories here so the function works on its own.
    os.makedirs(labels_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    for _, row in tqdm(data.iterrows(), total=len(data)):
        image_name = row['id'].replace('_image', '')
        tokens = row['label'].split()
        # The column may be read as float (e.g. 2.0); range() needs an int.
        n_boxes = int(row['No_of_findings'])
        # NOTE(review): the previous code assigned 'width' to H and 'height'
        # to W, so x was normalised by the height column and y by the width
        # column. Fixed on the assumption that the CSV columns mean what
        # their names say -- confirm against how the CSV was built.
        img_w = float(row['width'])
        img_h = float(row['height'])

        yolo_rows = []
        for box_idx in range(n_boxes):
            start = box_idx * 6  # six tokens per finding
            x_min, y_min, x_max, y_max = (
                float(tok) for tok in tokens[start + 2:start + 6]
            )
            box_w = x_max - x_min
            box_h = y_max - y_min
            yolo_rows.append([
                0,
                (x_min + box_w / 2) / img_w,
                (y_min + box_h / 2) / img_h,
                box_w / img_w,
                box_h / img_h,
            ])

        # reshape keeps the array 2-D with 5 columns even when there are no
        # boxes, so savetxt writes an empty file instead of raising.
        yolo_data = np.array(yolo_rows, dtype=float).reshape(-1, 5)

        np.savetxt(
            os.path.join(labels_dir, f'{image_name}.txt'),
            yolo_data,
            fmt=["%d", "%f", "%f", "%f", "%f"],
        )

        shutil.copyfile(
            os.path.join(image_dir, f'{image_name}.jpg'),
            os.path.join(images_dir, f'{image_name}.jpg'),
        )

In [None]:
# Export both splits: the training frame first, then the validation frame.
for split_name, split_frame in (('train', df_train), ('validation', df_valid)):
    process_data(split_frame, data_type=split_name)

In [None]:
%%time
# Pack the contents of the dataset directory into a gzip-compressed tarball
# (siim_640x640_yolo.tar.gz) in the current working directory. `-C` makes tar
# change into the dataset directory first, so archive paths are relative (`.`).
!tar -zcf siim_640x640_yolo.tar.gz -C "/kaggle/tmp/siim_640x640_yolo/" .
# Disabled: separate archive of a standalone labels directory.
#!tar -zcf labels.tar.gz -C "/kaggle/tmp/labels/" .