In [10]:
import os
import numpy as np
import json
import pickle
import random

## Create Lists of Train, Test, and Val Folders

In [7]:
# make a sorted list of the recording folders in the data directory;
# keeping directories only drops stray files such as LICENSE.md, and
# sorting makes the pickled lists reproducible between runs
folders = sorted(name for name in os.listdir('Data_MIT/')
                 if os.path.isdir('Data_MIT/' + name))

# per-subset accumulators: the folder names and the number of usable
# frames/samples recorded for the training, validating, and testing sets.
# Plain dictionaries replace exec() on a string read from info.json, so
# an unexpected 'Dataset' value can no longer run arbitrary code.
split_folders = {'train': [], 'val': [], 'test': []}
split_samples = {'train': 0, 'val': 0, 'test': 0}

# loop over all folders available in the folders list
for folder in folders:
    # set the path to the folder on my system
    path = 'Data_MIT/' + folder
    # read the info json
    with open(path + '/info.json', 'r') as file:
        info = json.load(file)
    # if there is at least one useable frame in the folder,
    if info['NumFaceDetections'] and info['NumEyeDetections']:
        # read whether it is a train, test, or validation folder
        subset = info['Dataset']
        if subset not in split_folders:
            raise ValueError('unexpected Dataset value ' + repr(subset) +
                             ' in ' + path + '/info.json')
        # and append the name of the folder to the corresponding list
        split_folders[subset].append(str(folder))
        # and also add the number of useable/valid frames available in the
        # folder (the smaller of the two detection counts) to the counter
        split_samples[subset] += min(info['NumFaceDetections'],
                                     info['NumEyeDetections'])

# expose the results under the notebook-level names used by later cells
train, val, test = (split_folders['train'], split_folders['val'],
                    split_folders['test'])
train_sp, val_sp, test_sp = (split_samples['train'], split_samples['val'],
                             split_samples['test'])

# check the number of folders/subjects in each set
train_sb = len(train)
val_sb = len(val)
test_sb = len(test)

# print it out
print(' We have', train_sb, 'train folders/subjects, containing',
      train_sp, 'frames/samples\n and',
      val_sb, 'validate folders/subjects, containing',
      val_sp, 'frames/samples\n and',
      test_sb, 'test folders/subjects, containing',
      test_sp, 'frames/samples.')

# save each folder list to <subset>.pkl (train.pkl, val.pkl, test.pkl)
for subset_name, subset_members in split_folders.items():
    with open(subset_name + '.pkl', 'wb') as f:
        pickle.dump(subset_members, f)



 We have 1271 train folders/subjects, containing 1251983 frames/samples
 and 50 validate folders/subjects, containing 59480 frames/samples
 and 150 test folders/subjects, containing 179496 frames/samples.


## Read Random Folders and Create a Dataset

In [8]:
# manually set the size of test, train, and validation sets (by samples);
# the upper bounds are the valid-sample counts printed by the cell above

train_size = 10000 # must be less than 1251983 (train samples)
val_size = 1000 # must be less than 59480 (validation samples)
test_size = 2000 # must be less than 179496 (test samples)

DPI = 96 # resolution (dots per inch) used when saving the images


In [None]:
# define a function to read one frame/image, save its face, eye and
# face-grid images, and record its label
#
# NOTE(review): this cell uses `cv2` and `plt` (matplotlib.pyplot), which
# the visible import cell does not import -- confirm they are imported
# before this cell runs.

def _save_patch(image, path, **kwargs):
    '''
    Write the array `image` to `path` pixel-for-pixel (the PNG gets
    exactly the array's rows and columns), creating the target folder
    first if it does not exist yet. Extra keyword arguments are
    forwarded to plt.imsave (e.g. cmap/vmin/vmax for 2-D arrays).
    '''
    os.makedirs(os.path.dirname(path), exist_ok=True)
    plt.imsave(path, image, dpi=DPI, **kwargs)


def read_file(file, coords, idx, dataset):
    '''
    The function takes the file (string), the coordinates info
    (np.array of 20 values), the name 'idx' to save under (integer or
    string), and the target dataset (string='train'/'val'/'test'),
    and will save 4 images under the 'idx' name in the target dataset:
        <dataset>/face/<idx>.png  face patch       (coords 0-3:   X, Y, W, H)
        <dataset>/eyel/<idx>.png  left eye patch   (coords 4-7:   X, Y, W, H)
        <dataset>/eyer/<idx>.png  right eye patch  (coords 8-11:  X, Y, W, H)
        <dataset>/grid/<idx>.png  25x25 face grid  (coords 12-15: X, Y, W, H)
    and also the 4 positional float values coords[16:20] will be
    appended as one row to the notebook-level np.array named
    '<dataset>_labels' (created with shape (0, 4) if it does not exist).

    The eye boxes are cut out of the face patch, i.e. their X/Y are
    taken relative to the face box. 'coords' itself is not modified.

    Raises FileNotFoundError if the image file cannot be read.
    '''
    name = str(idx)

    # read the image file; cv2.imread returns None instead of raising
    img = cv2.imread(file)
    if img is None:
        raise FileNotFoundError('could not read image file: ' + str(file))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # coords is a float array (it also carries the label values), so the
    # box values are converted to plain ints before they are used as
    # slice bounds and border widths
    face_x, face_y, face_w, face_h = (int(round(v)) for v in coords[0:4])

    # ----------------------------------------------------------------
    # make sure the face bounding box does not fall out of the frame:
    # any shortage is filled by replicating the edge pixels
    # ----------------------------------------------------------------
    # negative X / Y: pad on the left / top and move the box origin to 0
    pad_left = max(0, -face_x)
    pad_top = max(0, -face_y)
    face_x = max(0, face_x)
    face_y = max(0, face_y)
    # X+W / Y+H beyond the (already padded) frame: pad on the right / bottom
    pad_right = max(0, face_x + face_w - (img.shape[1] + pad_left))
    pad_bottom = max(0, face_y + face_h - (img.shape[0] + pad_top))
    if pad_top or pad_bottom or pad_left or pad_right:
        img = cv2.copyMakeBorder(img, pad_top, pad_bottom, pad_left,
                                 pad_right, cv2.BORDER_REPLICATE)
    # ----------------------------------------------------------------

    # ----------------------------------------------------------------
    # save the face patch
    # ----------------------------------------------------------------
    crop = img[face_y:face_y + face_h, face_x:face_x + face_w]
    _save_patch(crop, dataset + '/face/' + name + '.png')
    # ----------------------------------------------------------------

    # ----------------------------------------------------------------
    # save the left and right eye patches (cut out of the face patch)
    # ----------------------------------------------------------------
    for eye_dir, first in (('eyel', 4), ('eyer', 8)):
        eye_x, eye_y, eye_w, eye_h = (int(round(v))
                                      for v in coords[first:first + 4])
        eye = crop[eye_y:eye_y + eye_h, eye_x:eye_x + eye_w]
        _save_patch(eye, dataset + '/' + eye_dir + '/' + name + '.png')
    # ----------------------------------------------------------------

    # ----------------------------------------------------------------
    # save the face grid: a white 25x25 canvas with the face box in black
    # ----------------------------------------------------------------
    grid_x, grid_y, grid_w, grid_h = (int(round(v)) for v in coords[12:16])
    canvas = np.ones((25, 25))
    # clamp at 0 so a negative origin cannot wrap around in the slice
    canvas[max(0, grid_y):max(0, grid_y + grid_h),
           max(0, grid_x):max(0, grid_x + grid_w)] = 0
    _save_patch(canvas, dataset + '/grid/' + name + '.png',
                cmap='gray', vmin=0, vmax=1)
    # ----------------------------------------------------------------

    # ----------------------------------------------------------------
    # append the label row to the '<dataset>_labels' array
    # ----------------------------------------------------------------
    label = np.array([[coords[16], coords[17], coords[18], coords[19]]],
                     dtype=float)
    # an assignment made through exec() inside a function does not
    # rebind the notebook-level array, so it is updated via globals()
    namespace = globals()
    labels_name = dataset + '_labels'
    labels = namespace.get(labels_name)
    if labels is None:
        labels = np.empty((0, 4))
    namespace[labels_name] = np.append(labels, label, axis=0)
    # ----------------------------------------------------------------
    


In [None]:
# define a function to read one recording folder, pick random valid
# frames from it, and add those frames to the target dataset

def read_folder(folder, samples, dataset):
    '''
    The function takes the name of the target folder (string),
    the samples (integer) which is the number of needed samples
    before the target set gets full, and
    the target dataset (string='train'/'val'/'test'), and
    will add the desired number of samples (4 images: face, left eye,
    right eye patches, and the face grid, plus 4 positional float values
    appended to the labels array) to the target set by calling
    read_file once per picked frame. Each sample is saved under the
    name '<folder>_<frame index>'.

    A frame is only eligible when the face, left eye, right eye and
    face grid entries are ALL marked valid (IsValid == 1).

    Returns the number of samples that were actually added, which is
    min(samples, number of eligible frames) and never negative.
    '''
    # set the path to the target folder
    path = 'Data_MIT/' + folder

    # ----------------------------------------------------------------
    # reading the json files
    # ----------------------------------------------------------------
    def _load(json_name):
        with open(path + '/' + json_name, 'r') as file:
            return json.load(file)

    face = _load('appleFace.json')       # Face Crop
    eye_l = _load('appleLeftEye.json')   # Left Eye
    eye_r = _load('appleRightEye.json')  # Right Eye
    grid = _load('faceGrid.json')        # Face Grid
    dot = _load('dotInfo.json')          # dot
    # ----------------------------------------------------------------

    # ----------------------------------------------------------------
    # make a list of indices of the frames that are valid in every
    # detection file (the intersection, not the union), check if the
    # requested number of samples is greater than the number of valid
    # frames in the folder, take the min value and randomly pick
    # 'min' number of samples
    # ----------------------------------------------------------------
    per_frame_flags = zip(face['IsValid'], eye_l['IsValid'],
                          eye_r['IsValid'], grid['IsValid'])
    indices = [i for i, flags in enumerate(per_frame_flags)
               if all(flag == 1 for flag in flags)]

    # clamp at 0 so a non-positive request simply adds nothing
    pick = max(0, min(samples, len(indices)))
    if pick == 0:
        return 0

    idx = random.sample(indices, k=pick)
    # ----------------------------------------------------------------

    # ----------------------------------------------------------------
    # iterate over the indices in idx list, read images and save data
    # ----------------------------------------------------------------
    path_frames = path + '/frames/'
    # NOTE(review): assumes the sorted file names in frames/ line up
    # one-to-one with the positions in the json lists -- confirm
    files = sorted(os.listdir(path_frames))

    for ind in idx:
        # set the path to the picked frame
        frame_path = path_frames + files[ind]

        coords = np.array([round(face['X'][ind]), round(face['Y'][ind]),  #0-1
                           round(face['W'][ind]), round(face['H'][ind]),  #2-3
                           round(eye_l['X'][ind]), round(eye_l['Y'][ind]),#4-5
                           round(eye_l['W'][ind]), round(eye_l['H'][ind]),#6-7
                           round(eye_r['X'][ind]), round(eye_r['Y'][ind]),#8-9
                           round(eye_r['W'][ind]), round(eye_r['H'][ind]),#10-11
                           round(grid['X'][ind]), round(grid['Y'][ind]),  #12-13
                           round(grid['W'][ind]), round(grid['H'][ind]),  #14-15
                           dot['XPts'][ind], dot['YPts'][ind],            #16-17
                           dot['XCam'][ind], dot['YCam'][ind]])           #18-19

        # save the images and the label under a name unique to this frame
        read_file(frame_path, coords, str(folder) + '_' + str(ind), dataset)

    return pick
    # ----------------------------------------------------------------
    
    

In [None]:
    # ----------------------------------------------------------------
    # randomly pick 'min' number of samples
    # ----------------------------------------------------------------
    # ----------------------------------------------------------------
    


# build the name of the pickle file inside the test folder
# NOTE(review): `idx` and `label` are not assigned at notebook level in
# the visible cells (they only appear inside functions) -- confirm they
# are defined before this cell is run
path_test = 'test/{}_{}.pkl'.format(str(folder), str(idx))

# serialise the label array and write it to that pickle file
with open(path_test, 'wb') as f:
    f.write(pickle.dumps(label))