In [1]:
import os
import random

import numpy as np
from numpy import load
from numpy import savez_compressed

from helper_functions import create_list
from helper_functions import crop_image
from helper_functions import resize_and_keep_ratio
from paths import dataset_paths

In [4]:
def create_dataset(dataset_size, height, width, path):
    """
    Build a shuffled NumPy dataset of image pairs with binary labels.

    Every accepted iteration produces one "genuine" pair (two images drawn
    from the same folder, label 1) and one "opposite" pair (one image from
    each of two different folders, label 0).

    Arguments:
        dataset_size: total number of image pairs (int). Half of them are
            genuine and half opposite, so an odd value is rounded down to
            the next even number.
        height: desired height of the cropped images
        width: desired width of the cropped images
        path: sequence of data directories to sample folders from; it must
            contain at least two entries as soon as any pair is requested

    Returns:
        A tuple ``([x_1, x_2], y)``:
            x_1, x_2: arrays of shape (n_pairs, height, width, 1) holding the
                first and second image of each pair, divided by 255
            y: array of shape (n_pairs, 1) with 1 for genuine and 0 for
                opposite pairs
        All three arrays share the same random ordering.

    Note:
        The loop retries until enough valid pairs were collected, so it does
        not terminate if the images can never be loaded.
    """
    # Each accepted iteration yields one genuine AND one opposite pair,
    # hence only half as many iterations as requested pairs.
    pairs_per_class = int(dataset_size / 2)

    # Pre-allocate in (pair, image-in-pair, channel, height, width) layout
    x_genuine = np.zeros([pairs_per_class, 2, 1, height, width])
    y_genuine = np.zeros([pairs_per_class, 1])
    x_opposite = np.zeros([pairs_per_class, 2, 1, height, width])
    y_opposite = np.zeros([pairs_per_class, 1])

    filled = 0
    while filled < pairs_per_class:
        # Sample from the `path` argument (the original silently ignored it
        # and always read the module-level `dataset_paths`).
        genuine_files = create_list(random.choice(path))
        folder_a, folder_b = random.sample(path, 2)
        opposite_files_a = create_list(folder_a)
        opposite_files_b = create_list(folder_b)

        # Load and resize the four candidate images
        # (presumably each call picks one file from the list - TODO confirm)
        candidates = [
            resize_and_keep_ratio(genuine_files, height),
            resize_and_keep_ratio(genuine_files, height),
            resize_and_keep_ratio(opposite_files_a, height),
            resize_and_keep_ratio(opposite_files_b, height),
        ]

        # Start a new cycle if any of the four loads failed
        if any(_load_failed(image) for image in candidates):
            continue

        first_genuine, second_genuine, first_opposite, second_opposite = candidates

        # Crop to the target width and scale by 1/255
        # (assumes crop_image returns a (height, width) array - TODO confirm)
        x_genuine[filled, 0, 0, :, :] = crop_image(first_genuine, width) / 255.
        x_genuine[filled, 1, 0, :, :] = crop_image(second_genuine, width) / 255.
        y_genuine[filled] = 1

        x_opposite[filled, 0, 0, :, :] = crop_image(first_opposite, width) / 255.
        x_opposite[filled, 1, 0, :, :] = crop_image(second_opposite, width) / 255.
        y_opposite[filled] = 0

        filled += 1

    # Stack genuine and opposite pairs into one dataset
    x = np.concatenate([x_genuine, x_opposite], axis=0)
    y = np.concatenate([y_genuine, y_opposite], axis=0)
    # Move the channel axis to the end: (n, 2, 1, h, w) -> (n, 2, h, w, 1)
    x = np.einsum("abcde->abdec", x)
    x_1 = x[:, 0]
    x_2 = x[:, 1]

    # Apply one shared permutation so images and labels stay aligned
    randomize = np.arange(len(x_1))
    np.random.shuffle(randomize)
    y = y[randomize]
    x_1 = x_1[randomize]
    x_2 = x_2[randomize]

    return [x_1, x_2], y


def _load_failed(image):
    """Return True if a loader result is the failure sentinel (None or scalar 0)."""
    # np.isscalar guards the comparison: `array == 0` is element-wise and its
    # truth value is ambiguous, which would raise inside a boolean test.
    return image is None or (np.isscalar(image) and image == 0)

In [5]:
%%time
sample_size = 10000
x, y = create_dataset(sample_size, 224, 224, dataset_paths)

CPU times: user 1min 38s, sys: 56 s, total: 2min 34s
Wall time: 2min 36s


In [6]:
def save_arrays(x, y, sample_size, out_dir='npz_datasets'):
    """
    Save the x and y data as compressed NumPy .npz files on disk.

    The files are written to ``<out_dir>/data_x_<sample_size>.npz`` and
    ``<out_dir>/data_y_<sample_size>.npz``. Each archive holds its data under
    the default key ``'arr_0'``.

    Arguments:
        x: x data (array-like)
        y: y data (array-like)
        sample_size: number of image pairs in int format; only used to build
            the file names
        out_dir: target directory, created if it does not exist
            (default: 'npz_datasets')

    Returns:
        None. Prints a confirmation message once both files are written.
    """
    # savez_compressed does not create missing directories itself
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    savez_compressed(os.path.join(out_dir, f'data_x_{sample_size}.npz'), x)
    savez_compressed(os.path.join(out_dir, f'data_y_{sample_size}.npz'), y)
    print("saved successfully")

In [None]:
# Persist the freshly generated dataset to disk
save_arrays(x=x, y=y, sample_size=sample_size)

In [19]:
def load_arrays(path1, path2):
    """
    Load the x and y data from two .npz files on disk.

    Arguments:
        path1: Path to file 1 in string format (x data)
        path2: Path to file 2 in string format (y data)

    Returns:
        Tuple (x, y) of NumPy arrays, each read from the 'arr_0' entry of
        its archive.
    """
    # numpy.load on an .npz returns a lazy archive that keeps the file open;
    # the context managers close both files once the arrays are read.
    with load(path1) as archive_x, load(path2) as archive_y:
        x = archive_x['arr_0']
        y = archive_y['arr_0']
    return x, y

In [22]:
# Reload a previously saved 5000-pair dataset from disk
x, y = load_arrays(
    path1="npz_datasets/data_x_5000.npz",
    path2="npz_datasets/data_y_5000.npz",
)