# Prepare Data for Train-Test

In [7]:
def housekeeping(files):
    """
    Checks and removes unwanted files or folders
    1. .ipynb_checkpoints/
    2. train/
    3. test/
    """
    
    if ".ipynb_checkpoints" in files:
        shutil.rmtree(path + ".ipynb_checkpoints")
        
    if 'train' or 'test' in files:
        try:
            shutil.rmtree(path + 'train')
        except OSError as e:
            print ("Note: %s - %s." % (e.filename, e.strerror))
        try:
            shutil.rmtree(path + 'test')
        except OSError as e:
            print ("Note: %s - %s." % (e.filename, e.strerror))
    
    return


def randomize_files(files):
    """
    Shuffles files in a random order
    """
    return shuffle(files)


def prepare_split(files):
    """
    Identifies the ratios for files to be split into
    
    **Note**
    Current setting is for 80-20
    """
    eighty = int(0.8 * len(files))
    twenty = int(len(files) - eighty)
    files = np.array(files)
    
    return eighty, twenty, files


def generate_ids(eighty, twenty):
    """
    Assigns 1 to 80% of the files and 0 to 20% of the files
    """
    idx = np.hstack((np.ones(eighty),
                     np.zeros(twenty)))
    return idx


def train_test_split(files, idx):
    """
    Files tagged as 1 are categorised as training files
    Files tagged as 0 are categorised as test files
    """
    train = files[idx == 1]
    test = files[idx == 0]
    print("TRAIN SET: {0}".format(train))
    print("TEST SET: {0}".format(test))
    return train, test


def save_mixed_files():
    dir_mixed_paths = [PATH_MIXED + "train/", PATH_MIXED + "test/"]

    # Save MIXED data
    for paths, category in zip(dir_mixed_paths, categories):
        if not os.path.exists(paths):
            os.makedirs(paths)
        
        if os.path.exists(paths):
            for file in category:
                file_path = PATH_MIXED + file
                print(f'file_path: {file_path}')
                shutil.move(file_path, paths)
                
                
def save_noise_files():
    dir_noise_paths = [PATH_NOISE + "train/", PATH_NOISE + "test/"]

    # Save NOISE data
    for paths, category in zip(dir_noise_paths, categories):
        if not os.path.exists(paths):
            os.makedirs(paths)

        if os.path.exists(paths):
            for noise_file in category:
                file_path = PATH_NOISE + noise_file
                print(f'file_path: {file_path}')
                shutil.move(file_path, paths)

In [10]:
import pandas as pd
import numpy as np
import os
import random
from random import shuffle
import shutil

PATH_MIXED = "../data/points/mixed/"
PATH_NOISE = "../data/points/noise/"

dir_paths = [PATH_MIXED, PATH_NOISE]
save_types = [save_mixed_files, save_noise_files]


for path, types in zip(dir_paths, save_types):
    print("Working on {}".format(path))
    
    # Cleanup unwanted files/folders    
    files = os.listdir(path)
    housekeeping(files)
    print (files)
    
    # First: Randomize data 
    randomize_files(files)
    
    # Calculate Split
    eighty, twenty, files = prepare_split(files)

    # Generate 1s and 0s as IDs
    idx = generate_ids(eighty, twenty)

    # Split data 
    train, test = train_test_split(files, idx) 
    categories = [train, test]
    types()

Working on ../data/points/mixed/
Note: ../data/points/mixed/train - No such file or directory.
Note: ../data/points/mixed/test - No such file or directory.
['group_1869.xyz', 'group_941.xyz', 'group_3924.xyz', 'group_6109.xyz', 'group_6337.xyz', 'group_5010.xyz', 'group_1276.xyz', 'group_809.xyz', 'group_6054.xyz', 'group_607.xyz', 'group_3845.xyz', 'group_6530.xyz', 'group_4327.xyz', 'group_2024.xyz', 'group_2795.xyz', 'group_3514.xyz', 'group_6493.xyz', 'group_997.xyz', 'group_6134.xyz', 'group_1671.xyz', 'group_3700.xyz', 'group_4735.xyz', 'group_2185.xyz', 'group_3310.xyz', 'group_375.xyz', 'group_1275.xyz', 'group_5211.xyz', 'group_639.xyz', 'group_2231.xyz', 'group_1658.xyz', 'group_3063.xyz', 'group_5823.xyz', 'group_4532.xyz', 'group_1516.xyz', '.DS_Store', 'group_6084.xyz', 'group_5215.xyz', 'group_615.xyz', 'group_3670.xyz', 'group_4679.xyz', 'group_6495.xyz', 'group_4282.xyz', 'group_1488.xyz', 'group_4080.xyz', 'group_3923.xyz', 'group_1887.xyz', 'group_4243.xyz', 'group_65

file_path: ../data/points/noise/group_63.xyz
file_path: ../data/points/noise/group_151.xyz
file_path: ../data/points/noise/group_364.xyz
file_path: ../data/points/noise/group_407.xyz
file_path: ../data/points/noise/group_376.xyz
file_path: ../data/points/noise/group_89.xyz
file_path: ../data/points/noise/group_302.xyz
file_path: ../data/points/noise/group_347.xyz
file_path: ../data/points/noise/group_370.xyz
file_path: ../data/points/noise/group_315.xyz
file_path: ../data/points/noise/group_3.xyz
file_path: ../data/points/noise/group_227.xyz
file_path: ../data/points/noise/group_153.xyz
file_path: ../data/points/noise/group_100.xyz
file_path: ../data/points/noise/group_96.xyz
file_path: ../data/points/noise/group_95.xyz
file_path: ../data/points/noise/group_262.xyz
file_path: ../data/points/noise/group_155.xyz
file_path: ../data/points/noise/group_210.xyz
file_path: ../data/points/noise/group_479.xyz
file_path: ../data/points/noise/group_448.xyz
file_path: ../data/points/noise/group_23