In [10]:
import os
import numpy as np
import json
import pickle
import random

## Create Lists of Train, Test, and Val Folders

In [7]:
# Build the lists of train / val / test subject folders and count the usable
# frames of each split, then persist the three folder lists as pickle files.

# list the subject folders in the data directory; anything that is not a
# directory (e.g. LICENSE.md) is skipped, and the names are sorted so the
# resulting lists do not depend on the order os.listdir happens to return
folders = sorted(name for name in os.listdir('Data_MIT/')
                 if os.path.isdir(os.path.join('Data_MIT', name)))

# folder names recorded per split, keyed by the value of info['Dataset']
subset_folders = {'train': [], 'val': [], 'test': []}
# number of usable frames/samples per split
subset_samples = {'train': 0, 'val': 0, 'test': 0}

# loop over all folders available in the folders list
for folder in folders:
    # set the path to the folder on my system
    path = 'Data_MIT/' + folder
    # read the info json
    with open(path + '/info.json', 'r') as file:
        info = json.load(file)
    # if there is at least one usable frame in the folder,
    if info['NumFaceDetections'] and info['NumEyeDetections']:
        # read whether it is a train, test, or validation folder
        subset = info['Dataset']
        # the value comes from a data file, so look it up in a dict instead
        # of executing it as code, and fail loudly on anything unexpected
        if subset not in subset_folders:
            raise ValueError('Unknown dataset %r in %s/info.json' % (subset, path))
        # append the name of the folder to the corresponding list
        subset_folders[subset].append(str(folder))
        # and add the number of usable frames of the folder (the smaller of
        # the face and eye detection counts) to the corresponding counter
        subset_samples[subset] += min(info['NumFaceDetections'],
                                      info['NumEyeDetections'])

# expose the results under the names used by the rest of the notebook
train = subset_folders['train']
val = subset_folders['val']
test = subset_folders['test']

train_sp = subset_samples['train']
val_sp = subset_samples['val']
test_sp = subset_samples['test']

# check the number of folders/subjects in each set
train_sb = len(train)
val_sb = len(val)
test_sb = len(test)

# print it out
print(' We have', train_sb, 'train folders/subjects, containing',
      train_sp, 'frames/samples\n and',
      val_sb, 'validate folders/subjects, containing',
      val_sp, 'frames/samples\n and',
      test_sb, 'test folders/subjects, containing',
      test_sp, 'frames/samples.')

# save each folder list next to the notebook
with open('train.pkl', 'wb') as f:
    pickle.dump(train, f)

with open('val.pkl', 'wb') as f:
    pickle.dump(val, f)

with open('test.pkl', 'wb') as f:
    pickle.dump(test, f)



 We have 1271 train folders/subjects, containing 1251983 frames/samples
 and 50 validate folders/subjects, containing 59480 frames/samples
 and 150 test folders/subjects, containing 179496 frames/samples.


## Read Random Folders and Create a Dataset

In [8]:
# number of samples (frames) to draw for each split; every value has to stay
# below the frame count printed for that split by the previous cell

train_size = 10_000  # train split holds 1251983 frames
val_size = 1_000     # val split holds 59480 frames
test_size = 2_000    # test split holds 179496 frames


In [None]:
# small helper used by read_folder to load one metadata file
def _load_json(json_path):
    '''Return the parsed content of the JSON file at json_path.'''
    with open(json_path, 'r') as file:
        return json.load(file)


# define a function to read one folder, pick random valid frames from it,
# and (eventually) add those frames to a dataset

def read_folder(folder_name, samples, dataset):
    '''
    Select random valid frames of one folder.

    Parameters:
        folder_name (string): name of the target folder inside 'Data_MIT/'.
        samples (integer): number of samples still needed before the target
            set gets full; at most this many frames are picked.
        dataset (string='train'/'val'/'test'): the target dataset.
            Currently unused.

    The function reads the folder's metadata files (appleFace.json,
    appleLeftEye.json, appleRightEye.json, faceGrid.json, dotInfo.json),
    keeps the frame indices whose 'IsValid' flag is 1 in all four of the
    face / left eye / right eye / face grid files, and randomly picks
    min(samples, number of valid frames) of them.

    TODO: reading the images (face, left eye, right eye patches and the
    face grid) and appending the label values to the target set is not
    implemented yet, so nothing is returned or stored.

    Raises the usual file/JSON/key errors when a metadata file is missing
    or malformed.
    '''
    # set the path to the target folder
    path = 'Data_MIT/' + folder_name

    # ----------------------------------------------------------------
    # reading the json files
    # ----------------------------------------------------------------
    face = _load_json(path + '/appleFace.json')       # Face Crop
    eye_l = _load_json(path + '/appleLeftEye.json')   # Left Eye
    eye_r = _load_json(path + '/appleRightEye.json')  # Right Eye
    grid = _load_json(path + '/faceGrid.json')        # Face Grid
    # dot: loaded for the label step that is still to be written
    dot = _load_json(path + '/dotInfo.json')

    # ----------------------------------------------------------------
    # make a list of the indices of the valid frames in the folder.
    # A frame is usable only when it is valid in ALL four files, so the
    # flags are combined per frame (a union would also let through frames
    # with, e.g., a detected face but no detected eyes).
    # ----------------------------------------------------------------
    validity = zip(face['IsValid'], eye_l['IsValid'],
                   eye_r['IsValid'], grid['IsValid'])
    indices = [i for i, flags in enumerate(validity)
               if all(flag == 1 for flag in flags)]

    # never ask for more frames than are available, nor for a negative count
    pick = max(0, min(samples, len(indices)))

    idx = random.sample(indices, k=pick)

    # nothing to read for this folder
    if not idx:
        return

    # ----------------------------------------------------------------
    # iterate over the indices in idx list, read images and save data
    # ----------------------------------------------------------------
    # the listing does not depend on the frame, so it is done once, and on
    # the frames directory of the folder passed in (not a global variable)
    path_frames = path + '/frames/'
    files = os.listdir(path_frames)

    for ind in idx:
        # TODO: read the frame for `ind` and add it to the target dataset
        pass
    # ----------------------------------------------------------------
    

In [None]:
# function meant to handle a single frame/image of a folder
# (for now it only builds the path to that frame)

def read_file(folder_name, file_name, dataset):
    '''
    Arguments:
        folder_name (string): name of the target folder,
        file_name (string): name of the target file inside its 'frames' folder,
        dataset (string='train'/'val'/'test'): the target dataset.

    Intended behaviour: save 4 images (face, left eye, right eye patches,
    and the face grid) in the target dataset and append 4 positional float
    values to the label np.array of the same dataset.

    Current state: only the path to the frame is assembled; nothing is
    read, stored or returned.
    '''
    # location of the frame: Data_MIT/<folder_name>/frames/<file_name>
    frame_path = ''.join(('Data_MIT/', folder_name, '/frames/', file_name))
    
    

In [None]:
# NOTE(review): this cell uses `cv2`, `img`, `face` and a scalar frame index
# `idx`, none of which is defined or imported in the cells above; they have
# to exist before this cell can run.

# first we have to check to make sure the face bounding box
# didn't fall out of the frame
# X,Y is the coordinate of the top-left corner of the bounding box,
# so four offside cases are possible:

# 1. X is negative
if round(face['X'][idx]) < 0:
    # so we fill the shortage by replicating the left edge
    # (pad sizes are converted to plain ints for OpenCV)
    img = cv2.copyMakeBorder(img, 0, 0, int(abs(round(face['X'][idx]))), 0, cv2.BORDER_REPLICATE)
    # and reset the X value regarding the new frame
    face['X'][idx] = 0
    print('Negative X\n')

# 2. Y is negative
if round(face['Y'][idx]) < 0:
    # so we fill the shortage by replicating the top edge
    img = cv2.copyMakeBorder(img, int(abs(round(face['Y'][idx]))), 0, 0, 0, cv2.BORDER_REPLICATE)
    # and reset the Y value regarding the new frame
    face['Y'][idx] = 0
    print('Negative Y\n')

# 3. X+W is larger than the frame width
if round(face['X'][idx])+round(face['W'][idx]) > img.shape[1]:
    # so we fill the shortage (diff) by replicating the right edge
    diff = int(round(face['X'][idx])+round(face['W'][idx]) - img.shape[1])
    img = cv2.copyMakeBorder(img, 0, 0, 0, diff, cv2.BORDER_REPLICATE)
    print('Face box out of the right border\n')

# 4. Y+H is larger than the frame height
if round(face['Y'][idx])+round(face['H'][idx]) > img.shape[0]:
    # so we fill the shortage (diff) by replicating the bottom edge
    diff = int(round(face['Y'][idx])+round(face['H'][idx]) - img.shape[0])
    # argument order is (src, top, bottom, left, right, borderType); the
    # previous call passed one extra 0, which shifted borderType out of place
    img = cv2.copyMakeBorder(img, 0, diff, 0, 0, cv2.BORDER_REPLICATE)
    print('Face box out of the lower border\n')

In [None]:
    # ----------------------------------------------------------------
    # randomly pick 'min' number of samples
    # ----------------------------------------------------------------
    # ----------------------------------------------------------------
    


# NOTE(review): `folder`, `idx` and `label` are taken from the notebook's
# global state; they must be defined before this cell runs.

# set the path of the pickle file inside the test folder,
# named <folder>_<idx>.pkl
path_test = 'test/' + str(folder) + '_' + str(idx) + '.pkl'

# make sure the output folder exists, otherwise open() below fails
os.makedirs('test', exist_ok=True)

# save the label in a pickle file
with open(path_test,'wb') as f:
    pickle.dump(label, f)