In [1]:
# Folder names of the source dataset and of the re-split copy to create.
original, new = 'livestalk-16', 'livestalk-resplit-v2-720'

In [2]:
from shutil import copyfile
import os
import glob
import pandas as pd

In [3]:
def make_folder(path):
    """Create the train/valid/test dataset skeleton under ``path``.

    Every split receives an ``images`` and a ``labels`` sub-folder. Missing
    parent folders and missing sub-folders are created, so calling this on a
    root that already exists (for example an empty one, or one left half-built
    by an interrupted run) completes the layout instead of leaving it unusable.

    Args:
        path: Root folder of the dataset layout.
    """
    already_there = os.path.exists(path)

    for split in ('train', 'valid', 'test'):
        for kind in ('images', 'labels'):
            # makedirs also creates missing parents; exist_ok makes re-runs safe.
            os.makedirs(os.path.join(path, split, kind), exist_ok=True)

    if already_there:
        print(f'a folder already exists here: {path}')
    else:
        print(f'folders created: {path}')

### Identify folders for the original and resplit datasets

In [4]:
# Absolute locations of the source dataset and of the re-split copy, both under the CWD.
orig_path, new_path = (os.path.join(os.getcwd(), folder) for folder in (original, new))

# Split folders of the original dataset.
train_path, valid_path, test_path = (
    os.path.join(orig_path, split) for split in ('train', 'valid', 'test')
)

# The destination tree has to exist before data.yaml can be written into it.
make_folder(new_path)

# Dataset description: one class (cow), train/val pointing at the new image folders.
yaml_txt = (
    "names:\n"
    "- cow\n"
    "nc: 1\n"
    f"train: {new_path}/train/images\n"
    f"val: {new_path}/valid/images\n"
)

with open(os.path.join(new_path, 'data.yaml'), 'w') as yaml_file:
    yaml_file.write(yaml_txt)

folders created: /home/omar/datasci/w251-DL/livestalk/livestalk-resplit-v2-720


### Get counts in old folder

In [5]:
# Image (*.jpg) and label (*.txt) files of every split in the original dataset.
train_images = glob.glob(os.path.join(orig_path, 'train', 'images', '*.jpg'))
train_labels = glob.glob(os.path.join(orig_path, 'train', 'labels', '*.txt'))

valid_images = glob.glob(os.path.join(orig_path, 'valid', 'images', '*.jpg'))
valid_labels = glob.glob(os.path.join(orig_path, 'valid', 'labels', '*.txt'))

test_images = glob.glob(os.path.join(orig_path, 'test', 'images', '*.jpg'))
test_labels = glob.glob(os.path.join(orig_path, 'test', 'labels', '*.txt'))

all_files = train_images + valid_images + test_images
print('Counts in old location:')
print(f'Training: {len(train_images)}')
print(f'Validation: {len(valid_images)}')
print(f'Test: {len(test_images)}')

# File name without directory or extension. os.path is used rather than
# splitting on '/', so this also works where the path separator is not '/'.
all_names = [os.path.splitext(os.path.basename(x))[0] for x in all_files]
# Drop the last '_'-separated token, then keep the second one (e.g. 'DJI_0044_...' -> '0044').
mov_names = [x.rsplit('_', 1)[0].split('_')[1] for x in all_names]
names = pd.Series(mov_names)
# Cell output: number of images per token across all splits.
names.value_counts()

Counts in old location:
Training: 731
Validation: 214
Test: 192


0044    444
0045    262
0043    137
0047    136
0046    120
0048     38
dtype: int64

In [6]:
def get_counts(files):
    """Print how many files share each id token embedded in their file name.

    For every path the directory and the extension are removed, the last
    ``_``-separated token is dropped, and the second remaining token is used
    as the id (e.g. ``DJI_0044_<rest>.jpg`` -> ``0044``). The per-id counts
    are printed; nothing is returned.

    Args:
        files: Iterable of file paths whose names contain at least two ``_``.

    Raises:
        IndexError: If a file name contains fewer than two ``_`` characters.
    """
    ids = []
    for path in files:
        # os.path handles the platform's separator; splitting on '/' does not.
        stem = os.path.splitext(os.path.basename(path))[0]
        ids.append(stem.rsplit('_', 1)[0].split('_')[1])
    print(pd.Series(ids).value_counts())
    
print('Old Distribution')
# Same report for each original split; the leading '\n' separates the sections.
for _heading, _split_files in (
    ('Training images:', train_images),
    ('\nValidation images:', valid_images),
    ('\nTest images:', test_images),
):
    print(_heading)
    get_counts(_split_files)

Old Distribution
Training images:
0044    264
0045    157
0043    103
0047     93
0046     83
0048     31
dtype: int64

Validation images:
0044    74
0045    56
0047    28
0046    27
0043    22
0048     7
dtype: int64

Test images:
0044    106
0045     49
0047     15
0043     12
0046     10
dtype: int64


### Copy files to new location

In [7]:

def copy_files(images,labels,new_path,split_map=None):
    """Copy image/label pairs into the split folder assigned to their file-name tag.

    Both lists are sorted and paired position by position. All pairs are
    validated before the first file is copied, so a bad input does not leave a
    partially populated destination.

    Args:
        images: Paths of the image files.
        labels: Paths of the label files; after sorting, each must share its
            base name (without extension) with the image at the same position.
        new_path: Root of the destination dataset. ``<split>/images`` and
            ``<split>/labels`` must already exist under it.
        split_map: Optional mapping from a tag contained in the image file
            name (e.g. ``'DJI_0043'``) to the destination split folder.
            Defaults to the assignment hard-coded for this dataset. If several
            tags match one name, the last matching entry wins.

    Raises:
        AssertionError: If the lists differ in length or do not pair up by name.
        ValueError: If an image file name matches no tag in ``split_map``.
    """
    if split_map is None:
        split_map = {
            'DJI_0043': 'valid',
            'DJI_0044': 'train',
            'DJI_0045': 'train',
            'DJI_0046': 'test',
            'DJI_0047': 'train',
            'DJI_0048': 'train',
        }

    assert len(images) == len(labels)

    planned = []
    for image, label in zip(sorted(images), sorted(labels)):
        image_name = os.path.basename(image)
        label_name = os.path.basename(label)

        assert os.path.splitext(image_name)[0] == os.path.splitext(label_name)[0], 'lists not aligned'

        # Match on the file name only, so a tag that happens to appear in a
        # directory name cannot decide the split.
        location = None
        for tag, split in split_map.items():
            if tag in image_name:
                location = split

        # Without this check an unmatched file would inherit the split chosen
        # for the previous file (or crash on the very first one).
        if location is None:
            raise ValueError(f'no split assigned for file: {image}')

        planned.append((
            image,
            os.path.join(new_path, location, 'images', image_name),
            label,
            os.path.join(new_path, location, 'labels', label_name),
        ))

    for image, new_image_path, label, new_label_path in planned:
        copyfile(image, new_image_path)
        copyfile(label, new_label_path)
    
                                      
# Run the copy once per original split, pairing each image list with its label list.
for _split_images, _split_labels in (
    (train_images, train_labels),
    (valid_images, valid_labels),
    (test_images, test_labels),
):
    copy_files(_split_images, _split_labels, new_path)

### Get counts in new location

In [8]:
# Images found in each split of the new dataset.
new_train, new_valid, new_test = (
    glob.glob(os.path.join(new_path, split, 'images', '*.jpg'))
    for split in ('train', 'valid', 'test')
)

# Label files found in each split of the new dataset.
new_train_labels, new_valid_labels, new_test_labels = (
    glob.glob(os.path.join(new_path, split, 'labels', '*.txt'))
    for split in ('train', 'valid', 'test')
)

print('Totals')
print('\n'.join(
    f'Number of {split_name} images in new location: {len(found)}'
    for split_name, found in (
        ('training', new_train),
        ('validation', new_valid),
        ('test', new_test),
    )
))

Totals
Number of training images in new location: 880
Number of validation images in new location: 137
Number of test images in new location: 120


In [9]:
print('NEW Distribution')
# Per-id counts of the label files in each new split.
for _heading, _split_files in (
    ('Training LABELS:', new_train_labels),
    ('\nValidation LABELS:', new_valid_labels),
    ('\nTest LABELS:', new_test_labels),
):
    print(_heading)
    get_counts(_split_files)

NEW Distribution
Training LABELS:
0044    444
0045    262
0047    136
0048     38
dtype: int64

Validation LABELS:
0043    137
dtype: int64

Test LABELS:
0046    120
dtype: int64


In [10]:
print('NEW Distribution')
# Per-id counts of the image files in each new split.
for _heading, _split_files in (
    ('Training images:', new_train),
    ('\nValidation images:', new_valid),
    ('\nTest images:', new_test),
):
    print(_heading)
    get_counts(_split_files)

NEW Distribution
Training images:
0044    444
0045    262
0047    136
0048     38
dtype: int64

Validation images:
0043    137
dtype: int64

Test images:
0046    120
dtype: int64
