UPDATE ON 2023/04/02

1. This notebook prepares the data for classification and detection training.
2. It splits the original data according to the image lists provided by train.txt, val.txt and test.txt.

In [18]:
import os
import uuid
import copy
import math
import numpy as np

from marco import *
from visualise import *
from prepare_data import *

In [19]:
# The original "VOCdevkit" folders were renamed on disk to
# "VOCdevkit_<year>_<subset>", where <subset> is "trainval" or "test"
# (for example "VOCdevkit_2007_trainval").

# Dataset roots used for the training split.
TRAIN_ROOTS = [
    'VOCdevkit_2007_trainval/VOC2007',
    'VOCdevkit_2007_test/VOC2007',
    'VOCdevkit_2012_trainval/VOC2012',
]

# Dataset roots used for the validation split.
VALID_ROOTS = [
    'VOCdevkit_2007_trainval/VOC2007',
    'VOCdevkit_2012_trainval/VOC2012',
]

# Dataset roots used for the test split.
TEST_ROOTS = [
    'VOCdevkit_2012_test/VOC2012',
]

In [20]:
# Build the image-path and label-path lists of each split by handing the
# split's dataset roots and its target name to load_data_path.
train_img_path, train_lab_path = load_data_path(TRAIN_ROOTS, target='train')
valid_img_path, valid_lab_path = load_data_path(VALID_ROOTS, target='valid')
test_img_path, test_lab_path = load_data_path(TEST_ROOTS, target='test')

In [21]:
# Report the number of images found for each split.
print('N_train:', len(train_img_path))
print('N_valid:', len(valid_img_path))
print('N_test:', len(test_img_path))

# Expected split sizes. Each term presumably corresponds to one dataset root,
# in the order the roots are listed (e.g. 2501 + 4952 + 5717 for the three
# training roots) -- TODO confirm against the official VOC image-set lists.
assert len(train_img_path) == 2501 + 4952 + 5717
assert len(train_img_path) == len(train_lab_path)

assert len(valid_img_path) == 2510 + 5823
assert len(valid_img_path) == len(valid_lab_path)

assert len(test_img_path) == 10991
assert len(test_img_path) == len(test_lab_path)

def check_id(img_path, lab_path):
    """Assert that every image path is paired with the label of the same id.

    The id of a path is its file name with the directory part and the final
    extension removed (``'a/b/000012.jpg'`` -> ``'000012'``). It is taken from
    the base name only, so dots elsewhere in the path (a leading ``./``, a
    dotted directory name) cannot truncate the id and make unrelated files
    compare equal.

    Args:
        img_path: list of image file paths.
        lab_path: list of label file paths, in the same order as ``img_path``.

    Raises:
        AssertionError: if the two lists differ in length, or if any pair of
            paths at the same position has different ids.
    """
    # zip() stops at the shorter list, so unpaired trailing entries would
    # otherwise go unnoticed.
    assert len(img_path) == len(lab_path), (
        f'number of images ({len(img_path)}) and labels ({len(lab_path)}) differ'
    )

    for img_file, lab_file in zip(img_path, lab_path):
        img_id = os.path.splitext(os.path.basename(img_file))[0]
        lab_id = os.path.splitext(os.path.basename(lab_file))[0]
        assert img_id == lab_id, f'id mismatch: {img_file} vs {lab_file}'
    
# Run the id check on the image/label path lists of every split.
for img_paths, lab_paths in (
    (train_img_path, train_lab_path),
    (valid_img_path, valid_lab_path),
    (test_img_path, test_lab_path),
):
    check_id(img_paths, lab_paths)

N_train: 13170
N_valid: 8333
N_test: 10991


In [22]:
# Show the first image path and the first label path of each split.
for header, img_paths, lab_paths in (
    ('[TRAIN]', train_img_path, train_lab_path),
    ('[VALID]', valid_img_path, valid_lab_path),
    ('[TEST]', test_img_path, test_lab_path),
):
    print(header)
    print(img_paths[0])
    print(lab_paths[0])

[TRAIN]
VOCdevkit_2007_trainval/VOC2007/JPEGImages/000012.jpg
VOCdevkit_2007_trainval/VOC2007/Annotations/000012.xml
[VALID]
VOCdevkit_2007_trainval/VOC2007/JPEGImages/000005.jpg
VOCdevkit_2007_trainval/VOC2007/Annotations/000005.xml
[TEST]
VOCdevkit_2012_test/VOC2012/JPEGImages/2008_000001.jpg
VOCdevkit_2012_test/VOC2012/Annotations/2008_000001.xml


# Split data into train, valid and test

In [14]:
FOLDERS = ['train', 'valid', 'test']

# Each output folder is paired with its own (image paths, label paths) lists.
# The 'test' entry must use the test lists: the earlier if/elif chain passed
# the validation lists in its 'test' branch, so split_data received validation
# paths for data/test.
SPLIT_PATHS = {
    'train': (train_img_path, train_lab_path),
    'valid': (valid_img_path, valid_lab_path),
    'test': (test_img_path, test_lab_path),
}

# Hand every split to split_data with data/<folder> as its target root.
for folder in FOLDERS:
    img_paths, lab_paths = SPLIT_PATHS[folder]
    split_data(img_paths,
               lab_paths,
               target_root = os.path.join('data', folder))