# Split Data into folders 

In [1]:
import glob
import xml.etree.ElementTree as ET
from tqdm import tqdm
import os
import shutil
import sys

<b>Initial dir structure</b> 

    \DATA
    ├───images
    └───labels
   
- images folder contains all samples
- labels folder contains the mapping of images to labels (in our case it is stored as XML, so we will parse it)

In [3]:
# Dataset root and the glob patterns for the annotation and image files
path = "./data"
label_path = path + "/labels/*.xml"        # Label path
image_path = path + "/images/*.tif"        # Image Path

# glob.glob() returns its matches in arbitrary (OS / file-system dependent) order.
# The images and labels are later paired purely by position, so both lists are
# sorted here to make that pairing deterministic across runs and machines.
# NOTE(review): this assumes an image and its XML share the same base file name - confirm.
label_paths = sorted(glob.glob(label_path))
image_paths = sorted(glob.glob(image_path))        # If the labels live in a CSV instead, load it with read_csv and convert the column to a list

In [4]:
# This block is specific to our case: we extract the image labels from the XML files and store them in a list

# Function to read the label from an XML file (specific to this dataset)
def get_label(file):
    """Read the integer class label stored in one XML annotation file.

    Parameters
    ----------
    file : str or path-like
        Path of the XML annotation file to parse.

    Returns
    -------
    int
        The text of the first child of the root's fourth child, converted to int.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    IndexError
        If the document has fewer than four top-level children, or the fourth
        one has no children.
    ValueError, TypeError
        If the label element's text is not an integer (or is empty).
    """
    # Give the path straight to ElementTree instead of opening it in text mode:
    # the parser then reads raw bytes and honours the encoding declared in the
    # XML header, rather than decoding with the platform's default encoding.
    # It also avoids shadowing the `file` argument with a file handle.
    root = ET.parse(file).getroot()
    # root[3][0] is the dataset-specific location of the label element
    return int(root[3][0].text)
    
# Resolve every annotation path to its integer label (tqdm shows the progress)
labels = list(map(get_label, tqdm(label_paths)))
total_len = len(labels)

100%|██████████████████████████████████████████████████████████████████████████| 65918/65918 [00:09<00:00, 7091.02it/s]


In [5]:
# Copy every image into a sub-folder of split_dir named after its class label

split_dir = "./samples"  # Folder that will hold one sub-folder per class label
os.makedirs(split_dir, exist_ok=True)  # Create it if needed; no error when it already exists

# zip() stops silently at the shorter sequence, so a count mismatch would otherwise
# go unnoticed even though it means images and labels are not aligned one-to-one.
if len(image_paths) != len(labels):
    print('Warning: found {} images but {} labels; only the first {} pairs will be copied'
          .format(len(image_paths), len(labels), min(len(image_paths), len(labels))))

for image, label in tqdm(zip(image_paths, labels), total=total_len):
    # One folder per label, created the first time that label is seen
    label_dir = os.path.join(split_dir, str(label))
    os.makedirs(label_dir, exist_ok=True)

    # Keep the original file name. os.path.basename replaces the former image[13:]
    # slice, which only worked while the image folder path was exactly 13 characters long.
    src_path = image
    dst_path = os.path.join(label_dir, os.path.basename(image))

    try:
        shutil.copy(src_path, dst_path)
    except OSError as e:
        # Best effort: report the failure (with its reason) and carry on with the next file
        print('Unable to copy file {} to {}: {}'
              .format(src_path, dst_path, e))
    except Exception as e:
        # Exception (not a bare except) so that Ctrl-C / kernel interrupt still stops the loop
        print('When try copy file {} to {}, unexpected error: {!r}'
              .format(src_path, dst_path, e))

100%|███████████████████████████████████████████████████████████████████████████| 65918/65918 [05:43<00:00, 191.78it/s]


<b> Result </b>

        \SAMPLES
        ├───1
        ├───2
        ├───3
        ├───4
        ├───5
        └───6
        
Now we will split this into train and validation folders (a test folder is optional) using the split-folders library

In [None]:
!pip install split-folders

In [None]:
import splitfolders

input_folder = './samples/'

# splitfolders.ratio() reads `ratio` as (train, val), or (train, val, test) when three
# values are given. The earlier value (.1, .9) therefore put only 10 % of the images
# into train/ and 90 % into val/; the order is corrected to a 90 % / 10 % split.
# NOTE(review): adjust the fractions if a different train/val proportion is wanted.
splitfolders.ratio(input_folder, output="split",  # name of output folder
                   seed=42,                       # seed for the random shuffle, keeps the split reproducible
                   ratio=(.9, .1),                # (train, val) fractions
                   group_prefix=None)

<b> End Result </b>

        \SPLIT
        ├───train
        │   ├───1
        │   ├───2
        │   ├───3
        │   ├───4
        │   ├───5
        │   └───6
        └───val
            ├───1
            ├───2
            ├───3
            ├───4
            ├───5
            └───6
            
Now we can load this for ML stuff :)