# Prepare Data for Train-Test

In [1]:
def housekeeping(files):
    """
    Checks and removes unwanted files or folders
    1. .ipynb_checkpoints/
    2. .DS_Store
    3. train/
    4. test/
    """
    
    if ".ipynb_checkpoints" in files:
        print("Removing .ipynb_checkpoints...")
        shutil.rmtree(path + ".ipynb_checkpoints")
    else:
        print("No action taken")
        
    if ".DS_Store" in files:
        os.remove(path + ".DS_Store")
    else:
        print("No action taken")
        
    if 'train' or 'test' in files:
        try:
            shutil.rmtree(path + 'train')
        except OSError as e:
            print ("Note: %s - %s." % (e.filename, e.strerror))
        try:
            shutil.rmtree(path + 'test')
        except OSError as e:
            print ("Note: %s - %s." % (e.filename, e.strerror))
    
    return


def randomize_files(files):
    """
    Shuffles files in a random order
    """
    return shuffle(files)


def prepare_split(files):
    """
    Identifies the ratios for files to be split into
    
    **Note**
    Current setting is for 80-20
    """
    eighty = int(0.8 * len(files))
    twenty = int(len(files) - eighty)
    files = np.array(files)
    
    return eighty, twenty, files


def generate_ids(eighty, twenty):
    """
    Assigns 1 to 80% of the files and 0 to 20% of the files
    """
    idx = np.hstack((np.ones(eighty),
                     np.zeros(twenty)))
    return idx


def train_test_split(files, idx):
    """
    Files tagged as 1 are categorised as training files
    Files tagged as 0 are categorised as test files
    """
    train = files[idx == 1]
    test = files[idx == 0]
    print("TRAIN SET: {0}".format(train))
    print("TEST SET: {0}".format(test))
    return train, test


def save_mixed_files():
    """
    Saves all mixed groups in the respective train and test folders
    """
    dir_mixed_paths = [PATH_MIXED + "train/", PATH_MIXED + "test/"]

    for paths, category in zip(dir_mixed_paths, categories):
        if not os.path.exists(paths):
            os.makedirs(paths)
        
        if os.path.exists(paths):
            for file in category:
                file_path = PATH_MIXED + file
                print(f'file_path: {file_path}')
                shutil.move(file_path, paths)
                
                
def save_noise_files():
    """
    Saves all noise files in its respective train and test folders
    """
    dir_noise_paths = [PATH_NOISE + "train/", PATH_NOISE + "test/"]

    for paths, category in zip(dir_noise_paths, categories):
        if not os.path.exists(paths):
            os.makedirs(paths)

        if os.path.exists(paths):
            for noise_file in category:
                file_path = PATH_NOISE + noise_file
                print(f'file_path: {file_path}')
                shutil.move(file_path, paths)

In [2]:
import pandas as pd
import numpy as np
import os
import random
from random import shuffle
import shutil

PATH_MIXED = "../../../data/test/xyt/meshes/mixed/"
PATH_NOISE = "../../../data/test/xyt/meshes/noise/"

dir_paths = [PATH_MIXED, PATH_NOISE]
save_types = [save_mixed_files, save_noise_files]


for path, types in zip(dir_paths, save_types):
    print("Working on {}".format(path))
    
    # Cleanup unwanted files/folders    
    files = os.listdir(path)
    housekeeping(files)
    print (files)
    
    # First: Randomize data 
    randomize_files(files)
    
    # Calculate Split
    eighty, twenty, files = prepare_split(files)

    # Generate 1s and 0s as IDs
    idx = generate_ids(eighty, twenty)

    # Split data 
    train, test = train_test_split(files, idx) 
    categories = [train, test]
    types()

Working on ../../../data/test/xyt/meshes/mixed/
No action taken
No action taken
Note: ../../../data/test/xyt/meshes/mixed/train - No such file or directory.
Note: ../../../data/test/xyt/meshes/mixed/test - No such file or directory.
['group_2795_mesh.off', 'group_32_mesh.off', 'group_6493_mesh.off', 'group_941_mesh.off', 'group_1918_mesh.off', 'group_491_mesh.off', 'group_3514_mesh.off', 'group_4620_mesh.off', 'group_1357_mesh.off', 'group_2098_mesh.off', 'group_1038_mesh.off', 'group_4243_mesh.off', 'group_4717_mesh.off', 'group_2517_mesh.off', 'group_3310_mesh.off', 'group_2211_mesh.off', 'group_3670_mesh.off', 'group_3778_mesh.off', 'group_440_mesh.off', 'group_4336_mesh.off', 'group_1637_mesh.off', 'group_1825_mesh.off', 'group_3923_mesh.off', 'group_3212_mesh.off', 'group_1172_mesh.off', 'group_4532_mesh.off', 'group_2759_mesh.off', 'group_1496_mesh.off', 'group_5030_mesh.off', 'group_4104_mesh.off', 'group_1276_mesh.off', 'group_2711_mesh.off', 'group_554_mesh.off', 'group_4793_m