## Preparing the train/validation/test image folders

See below for the source and target data directory structures and number of files in each subfolder/category. 

There are 3 target subfolders: train, validation, test (data split will be 80/10/10). There are 3 categories of images: coast, forest, mountain.

### Source sun09 dataset structure
```
data_sun09
|
|- coast (405 images)
|- forest (351 images)
|- mountain (402 images)
```

### Data folder structure

```
data
|
|-train
|    |- coast (325 files)
|    |- forest ( 281 files )
|    |- mountain (322 files )
|
|-validation
|    |- coast (40 files)
|    |- forest (35 files )
|    |- mountain (40 files )
|
|-test
|    |- coast (40 files)
|    |- forest (35 files )
|    |- mountain (40 files ) 
```

coast total files: 325+40+40 = 405  
forest total files: 281+35+35 = 351  
mountain total files: 322+40+40 = 402 


In [12]:
import os
import shutil
import math

In [13]:
original_dataset_dir = "../data_sun09/"  # image file source directory
cnn_dataset_dir = "../data/"    # we are creating this directory with train, val, test subfolders

In [14]:
# Create the train, validation, test subfolders within data/

base_dir = cnn_dataset_dir

if not os.path.exists(base_dir):
    os.mkdir(base_dir)

train_dir = os.path.join(base_dir,'train')
if not os.path.exists(train_dir):
    os.mkdir(train_dir)
    
validation_dir = os.path.join(base_dir,'validation')
if not os.path.exists(validation_dir):
    os.mkdir(validation_dir)
    
test_dir = os.path.join(base_dir,'test')
if not os.path.exists(test_dir):
    os.mkdir(test_dir)

In [15]:
!ls -R ../data

[34mtest[m[m       [34mtrain[m[m      [34mvalidation[m[m

../data/test:

../data/train:

../data/validation:


In [16]:
# Create the category directories within the train/test/validation split subfolders

if not os.path.exists(base_dir):
    os.mkdir(base_dir)

# create the category subdirs within the split folder
def create_splitdir(split_dir_name):
    
    split_dir = os.path.join(base_dir, split_dir_name)
    if not os.path.exists(split_dir):
        os.mkdir(split_dir)  
        
    split_coast_dir = os.path.join(split_dir,'coast')
    if not os.path.exists(split_coast_dir):
        os.mkdir(split_coast_dir)
        
    split_forest_dir = os.path.join(split_dir,'forest')
    if not os.path.exists(split_forest_dir):
        os.mkdir(split_forest_dir)
        
    split_mountain_dir = os.path.join(split_dir,'mountain')
    if not os.path.exists(split_mountain_dir):
        os.mkdir(split_mountain_dir)  

        
# create the train folder and its subfolders
create_splitdir("train")
# create the validation folder
create_splitdir("validation")
# create the test folder and its subfolders
create_splitdir("test")


In [17]:
!ls -R ../data

[34mtest[m[m       [34mtrain[m[m      [34mvalidation[m[m

../data/test:
[34mcoast[m[m    [34mforest[m[m   [34mmountain[m[m

../data/test/coast:

../data/test/forest:

../data/test/mountain:

../data/train:
[34mcoast[m[m    [34mforest[m[m   [34mmountain[m[m

../data/train/coast:

../data/train/forest:

../data/train/mountain:

../data/validation:
[34mcoast[m[m    [34mforest[m[m   [34mmountain[m[m

../data/validation/coast:

../data/validation/forest:

../data/validation/mountain:


## Split each category and copy to respective subfolders

The train/val/test split is 80/10/10.

In [18]:
def copy_files(category_name):

    # Get the list of file names in each category folder of the original dataset
    cat_fnames = os.listdir(os.path.join(original_dataset_dir, category_name))
    cat_fnames.sort()
    
    # Get the number of files in the category 
    print(f"Number of files in {category_name} dataset: {len(cat_fnames)}")
    print(*cat_fnames, sep="\n")

    # We'll do a 80-10-10 train-val-test split for each category
    val_test_size = round( 0.10*len(cat_fnames) )  # size of val and test set

    # Select fnames evenly from the sequence list of file names for 'coast' since files are named 'semantically'
    gap = math.floor( len(cat_fnames)/val_test_size )
    print(f"Gap: {gap}")

    # select file names for validation folder
    cat_val_fnames = [ cat_fnames[i*gap] for i in range( round( 0.10*len(cat_fnames) ))]
    print(f"Number of {category_name} files for validation: { len(cat_val_fnames) }")

    cat_test_fnames = [ cat_fnames[i*gap+1] for i in range( round( 0.10*len(cat_fnames) ))]
    print(f"Number of {category_name} files for test: { len(cat_test_fnames) }")

    # copy the coast files to their designated respetive subfolders 
          
          
    original_cat_dir = os.path.join(original_dataset_dir, category_name)
        
    # validation files
    validation_cat_dir = os.path.join(validation_dir, category_name)
    for fname in cat_val_fnames:
        src = os.path.join(original_cat_dir, fname)
        dst = os.path.join(validation_cat_dir, fname)
        shutil.copyfile(src, dst)
    print(f"Number of files in {category_name} validation: {len(os.listdir(validation_cat_dir))}" )

    # test files
    test_cat_dir = os.path.join(test_dir, category_name)
    for fname in cat_test_fnames:
        src = os.path.join(original_cat_dir, fname)
        dst = os.path.join(test_cat_dir, fname)
        shutil.copyfile(src, dst)
    print(f"Number of files in {category_name} test: {len(os.listdir(test_cat_dir))}" )
          
    # train files
    train_cat_dir = os.path.join(train_dir, category_name)
    for fname in cat_fnames:
        if fname in cat_test_fnames or fname in cat_val_fnames:
            pass
        else:
            src = os.path.join(original_cat_dir, fname)
            dst = os.path.join(train_cat_dir, fname)
            shutil.copyfile(src, dst)
    print(f"Number of files in {category_name} train: {len(os.listdir(train_cat_dir))}" )

In [19]:
# copy the coast files
copy_files("coast")

Number of files in coast dataset: 405
b_beach_coast_bea1.jpg
b_beach_coast_bea10.jpg
b_beach_coast_bea14.jpg
b_beach_coast_bea2.jpg
b_beach_coast_bea26.jpg
b_beach_coast_bea27.jpg
b_beach_coast_bea29.jpg
b_beach_coast_bea3.jpg
b_beach_coast_bea38.jpg
b_beach_coast_bea39.jpg
b_beach_coast_bea4.jpg
b_beach_coast_bea5.jpg
b_beach_coast_bea9.jpg
b_beach_coast_cdmc1000.jpg
b_beach_coast_cdmc123.jpg
b_beach_coast_cdmc821.jpg
b_beach_coast_cdmc851.jpg
b_beach_coast_cdmc862.jpg
b_beach_coast_cdmc866.jpg
b_beach_coast_cdmc871.jpg
b_beach_coast_cdmc873.jpg
b_beach_coast_cdmc906.jpg
b_beach_coast_cdmc916.jpg
b_beach_coast_cdmc922.jpg
b_beach_coast_cdmc933.jpg
b_beach_coast_cdmc934.jpg
b_beach_coast_cdmc940.jpg
b_beach_coast_cdmc942.jpg
b_beach_coast_cdmc948.jpg
b_beach_coast_cdmc954.jpg
b_beach_coast_cdmc976.jpg
b_beach_coast_cdmc977.jpg
b_beach_coast_cdmc988.jpg
b_beach_coast_cdmc997.jpg
b_beach_coast_land112.jpg
b_beach_coast_land114.jpg
b_beach_coast_land277.jpg
b_beach_coast_land334.jpg
b_bea

Number of files in coast test: 40
Number of files in coast train: 325


In [20]:
# copy the forest files
copy_files("forest")

Number of files in forest dataset: 351
b_backwater_24_19a_flooded_forest.jpg
b_bambouserie_2983517_xitou_bamboo_forest_taiwan.jpg
b_bambouserie_44200_bamboo_forest2.jpg
b_bambouserie_456892_a_bamboo_forest_1.jpg
c_country_road_forest_land222.jpg
c_country_road_forest_land224.jpg
f_forest_150056.jpg
f_forest_151008.jpg
f_forest_151051.jpg
f_forest_266095.jpg
f_forest_36072.jpg
f_forest_482008.jpg
f_forest_482082.jpg
f_forest_50058.jpg
f_forest_536_n_123_x4909.jpg
f_forest_95041.jpg
f_forest_aa048712.jpg
f_forest_e003764.jpg
f_forest_forest_000042.jpg
f_forest_forest_000099.jpg
f_forest_forest_11_20_yahoo_80.jpg
f_forest_forest_36_05_altavista_87.jpg
f_forest_forest_art114.jpg
f_forest_forest_bost101.jpg
f_forest_forest_bost102.jpg
f_forest_forest_bost103.jpg
f_forest_forest_bost190.jpg
f_forest_forest_bost98.jpg
f_forest_forest_cdmc101.jpg
f_forest_forest_cdmc12.jpg
f_forest_forest_cdmc271.jpg
f_forest_forest_cdmc277.jpg
f_forest_forest_cdmc278.jpg
f_forest_forest_cdmc280.jpg
f_forest_f

Number of files in forest validation: 35
Number of files in forest test: 35
Number of files in forest train: 281


In [21]:
copy_files("mountain")

Number of files in mountain dataset: 402
c_canyon_mountain_land4.jpg
c_canyon_mountain_sharp22.jpg
c_country_road_mountain_land225.jpg
m_mountain_142053.jpg
m_mountain_19083.jpg
m_mountain_38038.jpg
m_mountain_38041.jpg
m_mountain_671080.jpg
m_mountain_mountain_gre242.jpg
m_mountain_mountain_land11.jpg
m_mountain_mountain_land131.jpg
m_mountain_mountain_land150.jpg
m_mountain_mountain_land153.jpg
m_mountain_mountain_land197.jpg
m_mountain_mountain_land198.jpg
m_mountain_mountain_land201.jpg
m_mountain_mountain_land260.jpg
m_mountain_mountain_land27.jpg
m_mountain_mountain_land278.jpg
m_mountain_mountain_land280.jpg
m_mountain_mountain_land281.jpg
m_mountain_mountain_land286.jpg
m_mountain_mountain_land30.jpg
m_mountain_mountain_land465.jpg
m_mountain_mountain_land471.jpg
m_mountain_mountain_land475.jpg
m_mountain_mountain_land644.jpg
m_mountain_mountain_land879.jpg
m_mountain_mountain_land880.jpg
m_mountain_mountain_land886.jpg
m_mountain_mountain_land917.jpg
m_mountain_mountain_moun1.

Number of files in mountain validation: 40
Number of files in mountain test: 40
Number of files in mountain train: 322


In [22]:
325+40+40 

405

In [23]:
281+35+35

351

In [24]:
322+40+40

402