## Get filenames

In [1]:
import os

# Everything in the annotation folder (images and XML side by side),
# ordered by name from highest to lowest.
files = os.listdir('./all_border_dataset_2000_annotations')
files.sort(reverse=True)
len(files)

2185

In [7]:
# Relative paths of the entries in `files` whose name ends in '.xml'.
XML_files = [
    f'./all_border_dataset_2000_annotations/{name}'
    for name in files
    if name.endswith('.xml')
]
len(XML_files)

183

In [33]:
import xml.etree.ElementTree as ET

# Repair degenerate boxes (max coordinate <= min coordinate) in every annotation file.
# The max coordinate is bumped to min + 1 and stored as a whole number, so that
# int()-based readers -- including a re-run of this very cell -- can still parse it.
# NOTE(review): min + 1 is not clamped to the image size; confirm that is acceptable.
for file in XML_files:
    tree = ET.parse(file)
    root = tree.getroot()
    modified = False

    for bndbox in root.findall('./object/bndbox'):
        # Children are read by position and treated as xmin, ymin, xmax, ymax.
        xmin, ymin, xmax, ymax = bndbox[0], bndbox[1], bndbox[2], bndbox[3]
        for low, high in ((xmin, xmax), (ymin, ymax)):
            # int(float(...)) also accepts decimal text such as "10.1" that an
            # earlier run may have left behind; plain int() would raise on it.
            low_value = int(float(low.text))
            if low_value >= int(float(high.text)):
                high.text = str(low_value + 1)
                modified = True

    # Leave untouched files exactly as they are on disk.
    if modified:
        tree.write(file)

## Split into border and negative

In [34]:
# Names (without the 4-character '.xml' suffix) of every annotated sample.
border_filenames = [
    name[:-4]
    for name in files
    if name.endswith('.xml')
]
len(border_filenames)

183

In [35]:
# Every remaining file whose name (minus its last four characters) has no XML
# annotation counts as a negative. A set makes each membership test O(1)
# instead of scanning the whole border list for every file.
border_stems = set(border_filenames)
negative_filenames = [f[:-4] for f in files if f[:-4] not in border_stems]
len(negative_filenames)

1819

In [36]:
# sanity check: a border sample is counted twice (once per file carrying its
# name: image and XML), a negative once, so the sum must match the listing.
expected_total = len(negative_filenames) + 2*len(border_filenames)
assert expected_total == len(files), (
    f'{expected_total} files accounted for, but the directory lists {len(files)}'
)
expected_total

2185

## Split border and negative into train-val-test

In [37]:
# Fraction of each class that goes to the train / val / test subset.
train = 0.8
val = 0.1
test = 0.1

In [38]:
import numpy as np

# Subset names, in the same order as the chunks produced below.
splits = ['train', 'val', 'test']

# Each class is cut at the same two fractions (end of train, end of val).
# np.split keeps the incoming order, so no shuffling happens here.
n_border = len(border_filenames)
border_cuts = [int(train*n_border), int((train+val)*n_border)]
border_splits = np.split(border_filenames, border_cuts)

n_negative = len(negative_filenames)
negative_cuts = [int(train*n_negative), int((train+val)*n_negative)]
negative_splits = np.split(negative_filenames, negative_cuts)

In [39]:
# Peek at the first (train) chunk of the negatives. The names come out in
# descending order because `files` was reverse-sorted and np.split keeps order.
negative_splits[0]

array(['Img_002002', 'Img_002001', 'Img_002000', ..., 'Img_000390',
       'Img_000389', 'Img_000388'], dtype='<U10')

In [40]:
import shutil
# Copy every sample into its split folder: negatives contribute only a .jpg,
# border samples contribute the .jpg plus the .xml annotation of the same name.
data_dir = './all_border_dataset_2000_annotations'
for split_name, negatives, borders in zip(splits, negative_splits, border_splits):
    save_dir = f'./all_border_dataset_2000_annotations_split/{split_name}'
    # shutil.copy does not create missing folders, so a fresh run would fail
    # without this; exist_ok keeps re-runs harmless.
    os.makedirs(save_dir, exist_ok=True)

    for filename in negatives:
        shutil.copy(f'{data_dir}/{filename}.jpg', f'{save_dir}/{filename}.jpg')

    for filename in borders:
        shutil.copy(f'{data_dir}/{filename}.jpg', f'{save_dir}/{filename}.jpg')
        shutil.copy(f'{data_dir}/{filename}.xml', f'{save_dir}/{filename}.xml')

In [41]:
# Print the number of directory entries found in each split folder.
for split_name in splits:
    save_dir = f'./all_border_dataset_2000_annotations_split/{split_name}'
    entries = os.listdir(save_dir)
    print(f'{split_name}: {len(entries)}')

train: 1747
val: 218
test: 220


In [42]:
# Total number of entries across the split folders, computed from disk rather
# than typed in by hand, and checked against the original directory listing.
total_copied = sum(
    len(os.listdir(f'./all_border_dataset_2000_annotations_split/{split_name}'))
    for split_name in splits
)
assert total_copied == len(files), (
    f'split folders hold {total_copied} files, expected {len(files)}'
)
total_copied

2185