In [10]:
import numpy as np
from helper_functions import resize_and_keep_ratio, crop_image
from paths import dataset_paths
import random

In [194]:
def data_generator(batch_size=256, height=224, width=224, path=None):
    """
    Endless batch generator, intended to be handed to a training loop such as .fit.

    Each `next()` call builds one fresh batch via `create_batch` and yields it;
    the generator never terminates on its own.

    Arguments:
        batch_size: desired batch size in int format
        height: image height forwarded to `create_batch` (default 224)
        width: image width forwarded to `create_batch` (default 224)
        path: value forwarded to `create_batch` as its `path` argument.
            When None (default) the module-level `dataset_paths` is used.

    Yields:
        Tuple (x, y) exactly as returned by `create_batch`: x is the list of
        the two image arrays of the siamese pairs, y the label array
    """
    # Resolve the default lazily (on the first next()) so the generator object
    # can be created before `dataset_paths` is needed.
    if path is None:
        path = dataset_paths
    while True:
        x, y = create_batch(batch_size, height, width, path)
        yield x, y

In [12]:
def create_list(folder):
    """
    Wraps a single object in a new one-element list.

    Arguments:
        folder: the object to wrap (it is stored as-is, not copied)

    Returns:
        A fresh list whose only element is `folder`
    """
    return [folder]

In [271]:
def _is_failed_load(image):
    """Return True when `image` is the scalar 0 failure sentinel instead of image data."""
    # np.isscalar is False for arrays and image objects, so the `== 0`
    # comparison only ever runs on plain scalars and can never produce an
    # ambiguous elementwise truth value.
    return bool(np.isscalar(image) and image == 0)


def create_batch(batch_size, height, width, path):
    """
    Creates one shuffled batch of siamese image pairs.

    Half of the batch are genuine pairs (both images requested from the same
    randomly chosen entry of `path`, label 1), the other half are opposite
    pairs (images requested from two different entries of `path`, label 0).
    Every image is loaded with `resize_and_keep_ratio`, cropped with
    `crop_image` and divided by 255.

    Arguments:
        batch_size: desired batch size in int format. The batch holds
            2 * (batch_size // 2) pairs, so an odd value yields one pair less.
        height: desired height of the cropped images
        width: desired width of the cropped images
        path: sequence of data directories to sample from; needs at least two
            entries because opposite pairs draw two distinct ones

    Returns:
        Tuple ([x_1, x_2], y). x_1 and x_2 have shape (n, height, width, 1)
        and hold the first and second image of every pair, y has shape (n, 1)
        and holds the labels. All three share the same random row order.

    Note:
        An iteration in which any of the four loads returns the sentinel 0 is
        discarded and repeated, so the loop does not end while loads keep
        failing.
    """
    # Half of the batch per class: every iteration adds one genuine and one
    # opposite pair.
    half = int(batch_size) // 2
    total = 2 * half

    # Allocate directly in the final channels-last layout. Rows [0, half) hold
    # genuine pairs, rows [half, total) hold opposite pairs; labels start at 0.
    x_1 = np.zeros([total, height, width, 1])
    x_2 = np.zeros([total, height, width, 1])
    y = np.zeros([total, 1])

    i = 0
    while i < half:
        # Pick the folders from the `path` argument (not from a module global).
        folder_genuine = create_list(random.choice(path))
        folder_a, folder_b = random.sample(path, 2)
        folder_opposite_1 = create_list(folder_a)
        folder_opposite_2 = create_list(folder_b)

        # Load and rescale the four images. The helper receives a one-element
        # folder list - presumably it picks an image from it; confirm in
        # helper_functions.
        image_1_genuine = resize_and_keep_ratio(folder_genuine, height)
        image_2_genuine = resize_and_keep_ratio(folder_genuine, height)
        image_1_opposite = resize_and_keep_ratio(folder_opposite_1, height)
        image_2_opposite = resize_and_keep_ratio(folder_opposite_2, height)

        # Retry the whole iteration if any load reported the 0 sentinel.
        loaded = (image_1_genuine, image_2_genuine, image_1_opposite, image_2_opposite)
        if any(_is_failed_load(image) for image in loaded):
            continue

        # Crop, scale by 1/255 and store the genuine pair with label 1.
        x_1[i, :, :, 0] = crop_image(image_1_genuine, width) / 255.
        x_2[i, :, :, 0] = crop_image(image_2_genuine, width) / 255.
        y[i] = 1

        # Same for the opposite pair; its label stays at the initial 0.
        x_1[half + i, :, :, 0] = crop_image(image_1_opposite, width) / 255.
        x_2[half + i, :, :, 0] = crop_image(image_2_opposite, width) / 255.

        i += 1

    # One shared permutation keeps images and labels aligned after shuffling.
    order = np.random.permutation(total)
    return [x_1[order], x_2[order]], y[order]

In [272]:
%%time
# Build one 256-sample batch of 224x224 crops directly; the surrounding %%time
# cell magic reports how long a single batch takes to create.
x, y = create_batch(256, 224, 224, dataset_paths)

CPU times: user 2.51 s, sys: 1.48 s, total: 3.99 s
Wall time: 4.06 s


In [219]:
# Create the endless batch generator with its default batch size (256).
gen = data_generator()

In [247]:
# Pull a single batch: x is the [x_1, x_2] list of image arrays, y the labels.
x, y = next(gen)