In [22]:
import os
import shutil
import random
from tqdm import tqdm

import logging
logging.basicConfig(level=logging.INFO)

In [23]:
from global_config import DATASET_DIR_PATH, ORIGIN_IMG_DIR_PATH, ORIGIN_ANNO_DIR_PATH,\
    IMG_DIR_PATH, ANNO_DIR_PATH,\
    IMG_TRAIN_DIR_PATH, ANNO_TRAIN_DIR_PATH, IMG_VAL_DIR_PATH, ANNO_VAL_DIR_PATH

In [24]:
# The four output directories of the train/validation split (images and annotations
# for each split). The cells below delete, recreate and fill exactly these paths.
DIRS = [
    IMG_TRAIN_DIR_PATH,
    ANNO_TRAIN_DIR_PATH,
    IMG_VAL_DIR_PATH,
    ANNO_VAL_DIR_PATH
]

In [25]:
# Remove every split directory together with its contents.
# ignore_errors=True makes a missing directory (or any other removal failure) pass silently.
for dir_ in DIRS:
    shutil.rmtree(dir_, ignore_errors=True)

In [26]:
# Reuse an existing split when all four directories are present; otherwise create
# whichever of them are missing.
if all(os.path.exists(d) for d in DIRS):
    logging.critical("Follow the dataset configuration from the previous training")
    # NOTE(review): inside a Jupyter kernel `exit` is IPython's exit hook, which appears to
    # stop the kernel rather than only skipping the remaining cells — confirm this is intended.
    exit(0)
else:
    for d in DIRS:
        # exist_ok=True: only some of the directories may already exist (for example when an
        # earlier best-effort removal failed), and a plain makedirs would then raise
        # FileExistsError.
        os.makedirs(d, exist_ok=True)

## Split train and validation datasets

In [27]:
logging.info("Start splitting train and validation datasets")

# Collect the .jpg file names in sorted order first, so that the seeded shuffle below
# starts from the same sequence regardless of the order os.listdir returns them in.
img_files = sorted(
    name for name in os.listdir(ORIGIN_IMG_DIR_PATH) if name.endswith(".jpg")
)

# Fixed seed -> the same permutation on every run.
random.seed(42)
random.shuffle(img_files)

# Show the number of images and a sample of the shuffled names.
len(img_files), img_files[:10]

INFO:root:Start splitting train and validation datasets


(1000,
 ['c73231653c8c43b19565757ec495dba1.jpg',
  '85e26c7dea8245d0971c66b5385c0733.jpg',
  'e5312db3d9a54d499f18dbc80c049298.jpg',
  'ecafb678fe744d43a2e38e789d1e7efb.jpg',
  '09d3c22bd1ed45f29bd4b5ca5d025b6d.jpg',
  '803d3d8e202748bbac790ddd25ff2f07.jpg',
  '162c3e6540ef4302bdf4722747336aa6.jpg',
  'c14d9f490e4146b3a60e68f24a697741.jpg',
  '60c3133016134029afb08c63e14b4d76.jpg',
  '88b87d143d1647b98dd84ea8e7a0b706.jpg'])

In [28]:
# Fraction of the shuffled images held out for validation.
val_frac = 0.2

# The first `num_train` shuffled names form the training set, the remainder the validation set.
num_train = int(len(img_files)*(1-val_frac))
train_img_files, val_img_files = img_files[:num_train], img_files[num_train:]

# Show the size of each split.
len(train_img_files), len(val_img_files)

(800, 200)

In [29]:
# Copy each training image and the annotation file sharing its stem (.png) into the
# training split directories.
for file in train_img_files:
    shutil.copy(os.path.join(ORIGIN_IMG_DIR_PATH, file), os.path.join(IMG_TRAIN_DIR_PATH, file))

    # Swap only the extension: str.replace(".jpg", ".png") would also rewrite any
    # ".jpg" that happens to occur inside the file stem.
    anno_file = os.path.splitext(file)[0] + ".png"
    shutil.copy(os.path.join(ORIGIN_ANNO_DIR_PATH, anno_file), os.path.join(ANNO_TRAIN_DIR_PATH, anno_file))

In [30]:
# Entry counts of the training image and annotation directories, in that order.
tuple(len(os.listdir(split_dir)) for split_dir in (IMG_TRAIN_DIR_PATH, ANNO_TRAIN_DIR_PATH))

(800, 800)

In [31]:
# Copy each validation image and the annotation file sharing its stem (.png) into the
# validation split directories.
for file in val_img_files:
    shutil.copy(os.path.join(ORIGIN_IMG_DIR_PATH, file), os.path.join(IMG_VAL_DIR_PATH, file))

    # Swap only the extension: str.replace(".jpg", ".png") would also rewrite any
    # ".jpg" that happens to occur inside the file stem.
    anno_file = os.path.splitext(file)[0] + ".png"
    shutil.copy(os.path.join(ORIGIN_ANNO_DIR_PATH, anno_file), os.path.join(ANNO_VAL_DIR_PATH, anno_file))

In [32]:
# Entry counts of the validation image and annotation directories, in that order.
tuple(len(os.listdir(split_dir)) for split_dir in (IMG_VAL_DIR_PATH, ANNO_VAL_DIR_PATH))

(200, 200)

## Delete redundant files that may disturb the dataset

In [33]:
# Make DATASET_DIR_PATH the working directory so that the relative `find .` shell
# commands in the following cells search that directory tree.
os.chdir(DATASET_DIR_PATH)
# Display the resulting working directory as the cell output.
os.getcwd()

'/home/featurize/work/AI6126project1/dev-public-fixed'

In [34]:
# List every entry named __MACOSX (case-insensitive) below the current directory.
!find . -iname '__MACOSX' -print

In [35]:
# List every entry named .DS_Store (case-insensitive) below the current directory.
!find . -iname '.DS_Store' -print

In [36]:
# List every entry named .ipynb_checkpoints (case-insensitive) below the current directory.
!find . -iname '.ipynb_checkpoints' -print

In [37]:
# Delete every __MACOSX entry below the current directory.
# -prune stops find from descending into a match that is about to be removed, and the
# NUL-separated hand-off (-print0 / xargs -0) keeps paths containing spaces or glob
# characters intact, which the previous backtick loop with an unquoted variable did not.
!find . -iname '__MACOSX' -prune -print0 | xargs -0 rm -rf

In [38]:
# Delete every .DS_Store entry below the current directory.
# The NUL-separated hand-off (-print0 / xargs -0) keeps paths containing spaces or glob
# characters intact, which the previous backtick loop with an unquoted variable did not.
!find . -iname '.DS_Store' -prune -print0 | xargs -0 rm -rf

In [39]:
# Delete every .ipynb_checkpoints entry below the current directory.
# -prune stops find from descending into a match that is about to be removed, and the
# NUL-separated hand-off (-print0 / xargs -0) keeps paths containing spaces or glob
# characters intact, which the previous backtick loop with an unquoted variable did not.
!find . -iname '.ipynb_checkpoints' -prune -print0 | xargs -0 rm -rf

In [40]:
# Re-check for __MACOSX entries; no output means none are left.
!find . -iname '__MACOSX' -print

In [41]:
# Re-check for .DS_Store entries; no output means none are left.
!find . -iname '.DS_Store' -print

In [42]:
# Re-check for .ipynb_checkpoints entries; no output means none are left.
!find . -iname '.ipynb_checkpoints' -print