In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from numpy import savez_compressed
from numpy import load
import random

from paths import dataset_paths
from helper_functions import resize_and_keep_ratio
from helper_functions import crop_image
from helper_functions import create_list
from helper_functions import crop_image_triplet_loss

In [4]:
def create_dataset(dataset_size, height, width, path, rgb=False):
    """
    Function for creating a shuffled numpy dataset of image pairs with associated labels

    Half of the pairs are "genuine" (both images loaded from the file list of one
    randomly chosen folder, label 1), the other half are "opposite" (the two images
    are loaded from the file lists of two different folders, label 0).

    Arguments:
        dataset_size: Total number of image pairs in int format. Pairs are produced
            two at a time, so an odd value is rounded down to the next even number
        height: desired height of the cropped images
        width: desired width of the cropped images
        path: accepted for interface compatibility only - the folders are read from
            the module-level `dataset_paths`, this argument is not used
        rgb: Images should be loaded in grayscale or RGB - default is False, means grayscale is default

    Returns:
        A tuple ([x_1, x_2], y). x_1 and x_2 are arrays of shape
        (n, height, width, channels) with channels = 3 for RGB and 1 for grayscale,
        holding the first and the second image of every pair divided by 255.
        y has shape (n, 1) and holds 1 for genuine and 0 for opposite pairs.
        All three arrays are shuffled with the same random permutation.
    """
    # dataset size / 2 because every iteration yields one genuine and one opposite pair
    half = int(dataset_size / 2)
    total = 2 * half
    channels = 3 if rgb else 1

    # Rows [0, half) hold the genuine pairs, rows [half, total) the opposite pairs.
    # Writing straight into the two output arrays (channels last) avoids the extra
    # full-size copy that concatenating two separate half-size arrays would need.
    x_1 = np.zeros([total, height, width, channels])
    x_2 = np.zeros([total, height, width, channels])
    y = np.zeros([total, 1])
    y[:half] = 1

    def store(target, row, image):
        # crop the loaded image, divide by 255 and write it into its slot
        cropped = crop_image(image, width) / 255.
        if rgb:
            target[row] = cropped
        else:
            # grayscale crops are written into the single channel plane
            target[row, :, :, 0] = cropped

    i = 0

    while i < half:

        # Select random folders for genuine and opposite pairs and save the paths to lists
        list_genuine = create_list(random.choice(dataset_paths))
        folder_opposite_1, folder_opposite_2 = random.sample(dataset_paths, 2)
        list_opposite_1 = create_list(folder_opposite_1)
        list_opposite_2 = create_list(folder_opposite_2)

        # load the genuine and opposite images from disk and preprocess it
        image_1_genuine = resize_and_keep_ratio(list_genuine, height, rgb)
        image_2_genuine = resize_and_keep_ratio(list_genuine, height, rgb)
        image_1_opposite = resize_and_keep_ratio(list_opposite_1, height, rgb)
        image_2_opposite = resize_and_keep_ratio(list_opposite_2, height, rgb)

        # new cycle if one of the 4 loaded images is the scalar 0.
        # np.isscalar guards the comparison: `array == 0` is element-wise and
        # cannot be used in a boolean context.
        # NOTE(review): assumes resize_and_keep_ratio signals a failed load by returning 0 - confirm
        loaded = (image_1_genuine, image_2_genuine, image_1_opposite, image_2_opposite)
        if any(np.isscalar(image) and image == 0 for image in loaded):
            continue

        store(x_1, i, image_1_genuine)
        store(x_2, i, image_2_genuine)
        store(x_1, half + i, image_1_opposite)
        store(x_2, half + i, image_2_opposite)

        i += 1

    # shuffle images and labels with one shared permutation so they stay aligned
    randomize = np.arange(total)
    np.random.shuffle(randomize)

    return [x_1[randomize], x_2[randomize]], y[randomize]

In [4]:
def create_triplet_dataset(dataset_size, height, width, path, rgb=False):
    """
    Function for creating a numpy dataset of image triplets - anchor, positive and negative image

    For every triplet two different folders are drawn at random. Anchor and positive
    are both cut out of one image loaded from the first folder's file list (via
    crop_image_triplet_loss), the negative is cropped from an image loaded from the
    second folder's file list.

    Arguments:
        dataset_size: Number of triplets in int format
        height: desired height of the cropped images
        width: desired width of the cropped images
        path: accepted for interface compatibility only - the folders are read from
            the module-level `dataset_paths`, this argument is not used
        rgb: Images should be loaded in grayscale or RGB - default is False, means grayscale is default

    Returns:
        A tuple ([anchors, positives, negatives], y). The three image arrays have
        shape (dataset_size, height, width, channels) with channels = 3 for RGB and
        1 for grayscale and hold the crops divided by 255. y is an empty placeholder
        array of shape (dataset_size, 0).
    """
    channels = 3 if rgb else 1
    sample_shape = (height, width, channels)

    anchors = np.zeros([dataset_size, height, width, channels])
    positives = np.zeros([dataset_size, height, width, channels])
    negatives = np.zeros([dataset_size, height, width, channels])
    # NOTE(review): the second dimension is 0, so y carries no values at all; the shape
    # is kept as-is because callers may already rely on it - confirm whether
    # (dataset_size, 1) was intended
    y = np.zeros([dataset_size, 0])

    def to_sample(cropped):
        # divide the crop by 255 before it is stored
        sample = cropped / 255.
        if rgb:
            return sample
        # a grayscale crop of shape (height, width) cannot be assigned to a
        # (height, width, 1) slot directly; reshape accepts both layouts
        # NOTE(review): the exact shape returned by the crop helpers is not visible here - confirm
        return np.reshape(sample, sample_shape)

    i = 0

    while i < dataset_size:
        # Select two random folders for anchor/positive and negative samples and save the paths to lists
        folder_anchor_positive, folder_negative = random.sample(dataset_paths, 2)
        list_anchor_positive = create_list(folder_anchor_positive)
        list_negative = create_list(folder_negative)

        # load two images for anchor/positive and negative from disk and preprocess it
        image_anchor_positive = resize_and_keep_ratio(list_anchor_positive, height, rgb)
        image_negative = resize_and_keep_ratio(list_negative, height, rgb)

        # new cycle if one of the 2 loaded images is the scalar 0.
        # np.isscalar guards the comparison: `array == 0` is element-wise and
        # cannot be used in a boolean context.
        # NOTE(review): assumes resize_and_keep_ratio signals a failed load by returning 0 - confirm
        loaded = (image_anchor_positive, image_negative)
        if any(np.isscalar(image) and image == 0 for image in loaded):
            continue

        anchor, positive = crop_image_triplet_loss(image_anchor_positive, width)
        anchors[i] = to_sample(anchor)
        positives[i] = to_sample(positive)
        negatives[i] = to_sample(crop_image(image_negative, width))

        i += 1

    return [anchors, positives, negatives], y

In [5]:
%%time
# Build 15000 RGB image pairs cropped to 224x224 with create_dataset.
# NOTE(review): the triplet cell below rebinds the same names (sample_size, x, y),
# so the save cells act on whichever build cell was executed last.
sample_size = 15000
x, y = create_dataset(sample_size, 224, 224, dataset_paths, rgb=True)

CPU times: user 2min 57s, sys: 1min 35s, total: 4min 32s
Wall time: 4min 38s


In [5]:
%%time
# Build 30000 RGB triplets cropped to 224x224 with create_triplet_dataset.
# NOTE(review): this rebinds sample_size, x and y from the pair-building cell above,
# so the save cells act on whichever build cell was executed last.
sample_size = 30000
x, y = create_triplet_dataset(sample_size, 224, 224, dataset_paths, rgb=True)

CPU times: user 5min 9s, sys: 2min 45s, total: 7min 54s
Wall time: 8min 2s


In [6]:
def save_arrays(x, y, sample_size, rgb=False):
    """
    Function for saving numpy arrays in compressed numpy .npz format on disc

    The files are written to the relative directory `npz_datasets/` (created if it
    does not exist yet) as `data_x_{sample_size}_{rgb|gray}.npz` and
    `data_y_{sample_size}_{rgb|gray}.npz`. Each archive holds its array under
    numpy's default key `arr_0`.

    Arguments:
        x: x data as numpy array (or array-like)
        y: y data as numpy array (or array-like)
        sample_size: Number of image pairs in int format, only used in the file name
        rgb: selects the `rgb` (True) or `gray` (False) file name suffix

    Returns:
        None - prints "saved successfully" once both files are written
    """
    # local import keeps the cell self-contained
    import os

    suffix = "rgb" if rgb else "gray"

    # savez_compressed does not create missing parent directories
    os.makedirs("npz_datasets", exist_ok=True)

    savez_compressed(f'npz_datasets/data_x_{sample_size}_{suffix}.npz', x)
    savez_compressed(f'npz_datasets/data_y_{sample_size}_{suffix}.npz', y)
    print("saved successfully")

In [7]:
def save_triplet_arrays(x, y, sample_size, rgb=False):
    """
    Function for saving triplet numpy arrays in compressed numpy .npz format on disc

    The files are written to the relative directory `npz_datasets/` (created if it
    does not exist yet) as `data_x_{sample_size}_{rgb|gray}_triplet.npz` and
    `data_y_{sample_size}_{rgb|gray}_triplet.npz`. Each archive holds its array
    under numpy's default key `arr_0`.

    Arguments:
        x: x data as numpy array (or array-like)
        y: y data as numpy array (or array-like)
        sample_size: Number of triplets in int format, only used in the file name
        rgb: selects the `rgb` (True) or `gray` (False) file name part

    Returns:
        None - prints "saved successfully" once both files are written
    """
    # local import keeps the cell self-contained
    import os

    colour_mode = "rgb" if rgb else "gray"

    # savez_compressed does not create missing parent directories
    os.makedirs("npz_datasets", exist_ok=True)

    savez_compressed(f'npz_datasets/data_x_{sample_size}_{colour_mode}_triplet.npz', x)
    savez_compressed(f'npz_datasets/data_y_{sample_size}_{colour_mode}_triplet.npz', y)
    print("saved successfully")

In [7]:
# Write the arrays currently bound to x / y to
# npz_datasets/data_x_{sample_size}_rgb.npz and npz_datasets/data_y_{sample_size}_rgb.npz.
# NOTE(review): assumes x / y come from the create_dataset cell - confirm before running.
save_arrays(x, y, sample_size, rgb=True)

saved successfully


In [None]:
# Write the arrays currently bound to x / y to
# npz_datasets/data_x_{sample_size}_rgb_triplet.npz and npz_datasets/data_y_{sample_size}_rgb_triplet.npz.
# NOTE(review): assumes x / y come from the create_triplet_dataset cell - confirm before running.
save_triplet_arrays(x, y, sample_size, rgb=True)

In [19]:
def load_arrays(path1, path2):
    """
    Function for loading two .npz files from disc

    Each archive is expected to hold its array under numpy's default key `arr_0`,
    which is what savez_compressed uses for a single positional array.

    Arguments:
        path1: Path to file 1 in string format (x data)
        path2: Path to file 2 in string format (y data)

    Returns:
        x and y data in numpy array format
    """
    # numpy.load returns a lazily-read archive that keeps the file open;
    # the with-blocks close it as soon as the array has been read into memory
    with load(path1) as archive_x:
        x = archive_x['arr_0']
    with load(path2) as archive_y:
        y = archive_y['arr_0']
    return x, y

In [22]:
# Load a previously stored x / y dataset from the npz_datasets directory.
# NOTE(review): these file names carry no _rgb / _gray suffix, so they do not match
# the names save_arrays writes - confirm the files exist.
x, y = load_arrays("npz_datasets/data_x_5000.npz", "npz_datasets/data_y_5000.npz")