Create "permanent" folders with the train, validation, and test data

Train the model on this data ONCE, and then keep using the same test data until everything is working. This also means the center detector only needs to be trained once.

Once everything is working, we can work on retraining the model to check repeatability.

In [1]:
# import libraries
import numpy as np
import PIL
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn import utils # used to shuffle data

from code_files.imagePreprocessing import * 

In [2]:
# build a dataframe of image path locations and their associated labels, then preview the first rows
dataset_root = 'datasets/Spiral_DataSet1_relabelled'
df = importImages(dataset_root)
df.head()

total number of labels:  102
total number of images:  102


Unnamed: 0,images,label
61,datasets/Spiral_DataSet1_relabelled\parkinsons...,1
72,datasets/Spiral_DataSet1_relabelled\parkinsons...,1
51,datasets/Spiral_DataSet1_relabelled\healthy\V5...,0
81,datasets/Spiral_DataSet1_relabelled\healthy\V5...,0
53,datasets/Spiral_DataSet1_relabelled\healthy\V0...,0


In [7]:
# Split the image paths and labels into train (~70%), validation (~15%) and test (~15%) subsets.
import pandas as pd  # needed for the test dataframe below; not imported explicitly elsewhere in this notebook

# Fixed seed so that re-running this cell reproduces the same "permanent" split.
SPLIT_SEED = 42

# first cut: 70% train, 30% held out
# stratify keeps the ratio of healthy (0) to PD (1) labels approximately equal in each subset
train_feature, holdout_feature, train_label, holdout_label = train_test_split(
    df['images'], df['label'],
    test_size=0.30, stratify=df['label'], random_state=SPLIT_SEED,
)

# shuffle data (features and labels are shuffled together, so the pairs stay aligned)
train_feature, train_label = utils.shuffle(train_feature, train_label, random_state=SPLIT_SEED)
holdout_feature, holdout_label = utils.shuffle(holdout_feature, holdout_label, random_state=SPLIT_SEED)

## (OPTIONAL) split the held-out data in half: validation and testing data
val_feature, test_feature, val_label, test_label = train_test_split(
    holdout_feature, holdout_label,
    test_size=0.5, stratify=holdout_label, random_state=SPLIT_SEED,
)

# sort the test data so that all healthy images are first and PD images are last
# this is useful for later when plotting
# (a stable sort keeps the order within each class deterministic)
testDF = pd.DataFrame({'img': test_feature, 'lbl': test_label})
testDF = testDF.sort_values('lbl', kind='stable')

test_feature = testDF['img']
test_label   = testDF['lbl']

print("total validation samples: ", len(val_label))
print("total testing samples: ", len(test_label))
print('total training samples: ', len(train_label))

total validation samples:  15
total testing samples:  16
total training samples:  71


In [8]:
def img2array(dataset, size=(128, 128)):
    """Load every image in *dataset* as a resized grayscale pixel array.

    Each image is opened, converted to 8-bit grayscale (PIL mode "L"),
    resized with ``cv2.resize`` and given a trailing channel axis, so with the
    default size each returned array has shape (128, 128, 1).  The batch
    dimension expected by tensorflow is NOT added here; callers that need it
    must stack or expand the arrays themselves.

    Args:
        dataset: iterable of image file paths.
        size: target ``(width, height)`` passed to ``cv2.resize``.  The default
            of 128x128 was chosen for faster image processing and a more
            efficient model.

    Returns:
        A list with one array per input path, in input order.  An empty
        *dataset* yields an empty list.
    """
    storage_array = []
    for img_path in dataset:
        # the context manager closes the underlying file as soon as the
        # pixels have been read, instead of leaving it to garbage collection
        with PIL.Image.open(img_path) as openImg:
            image = openImg.convert("L")  # convert to grayscale (L); for color use (P)
            imgArray = np.array(image)

        imgArray = cv2.resize(imgArray, size)
        # add the channel axis: (height, width) -> (height, width, 1)
        imgArray = np.expand_dims(imgArray, axis=2)

        # store in array
        storage_array.append(imgArray)

    return storage_array

# convert the image paths of each subset (train, val, test) into lists of pixel arrays
train_array, val_array, test_array = [
    img2array(paths) for paths in (train_feature, val_feature, test_feature)
]

In [9]:
# save the data into folders in the dataset directory
import os

# NOTE(review): these four variables are not used by the save logic below, and
# they point at 'datasets/<split>/...' rather than the
# 'datasets/spiral_data/<split>/...' folders that are actually written to.
# They are kept unchanged in case other cells rely on them -- confirm and remove.
val_pd_folder = 'datasets/val/pd/'
val_h_folder = 'datasets/val/healthy/'
test_pd_folder = 'datasets/test/pd/'
test_h_folder = 'datasets/test/healthy/'


def _save_split(split_name, arrays, labels):
    """Write one subset's images as PNG files, sorted into class folders.

    Images with label 0 go to 'datasets/spiral_data/<split_name>/healthy/' as
    '<split_name>_h_<i>.png'; all other labels go to
    'datasets/spiral_data/<split_name>/pd/' as '<split_name>_pd_<i>.png'.
    ``i`` is the position of the image within the whole subset (not a
    per-class counter), so the numbering within a folder can have gaps.

    Args:
        split_name: 'train', 'val' or 'test'; used in folder and file names.
        arrays: sequence of pixel arrays, one per image.
        labels: labels in the same order as *arrays* (iterated positionally).
    """
    split_dir = 'datasets/spiral_data/' + split_name + '/'

    # create the destination folders up front so saving does not fail on a fresh checkout
    for class_dir in ('healthy/', 'pd/'):
        os.makedirs(split_dir + class_dir, exist_ok=True)

    for i, (pixels, label) in enumerate(zip(arrays, labels)):
        if label == 0:  # if healthy, place in healthy folder
            filename = split_dir + 'healthy/' + split_name + '_h_' + str(i) + '.png'
        else:  # place in pd folder
            filename = split_dir + 'pd/' + split_name + '_pd_' + str(i) + '.png'

        # drop the trailing channel axis so PIL receives a 2-D grayscale array
        Image.fromarray(np.squeeze(pixels)).save(filename)


_save_split('train', train_array, train_label)
_save_split('val', val_array, val_label)
_save_split('test', test_array, test_label)
