In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local modules (e.g. paths.py) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from paths import dataset_paths
import random
import os
from PIL import Image, ImageOps, ImageDraw
import numpy as np

In [3]:
def create_image_pairs(height, width, path_anchors, path_positives, dataset_size, rgb=True):
    """
    Create anchor/positive crop pairs and save them as JPEG files on disk.

    Both crops of a pair are cut at independent random horizontal offsets from
    the same source image. The source image is a random file from a random
    folder in ``dataset_paths``. Only sources wider than 500 px are used; each
    one is resized to ``height`` rows (aspect ratio preserved) before cropping.
    Sources that are narrower than ``width`` after resizing are skipped.

    Pair ``i`` is written as ``<path_anchors>/<i>.jpg`` and
    ``<path_positives>/<i>.jpg``, with ``i`` zero-padded to the number of
    digits of ``dataset_size``.

    Note: the loop runs until ``dataset_size`` pairs have been written, so it
    does not terminate if no file in ``dataset_paths`` qualifies.

    Arguments:
        height: desired height of the cropped images - integer
        width: desired width of the cropped images - integer
        path_anchors: folder in which the anchor images are saved (created if missing) - string
        path_positives: folder in which the positive images are saved (created if missing) - string
        dataset_size: how many image pairs should be generated and saved on disk - integer
        rgb: True saves RGB crops, False saves grayscale crops, default True
    """
    # Saving fails if the target folders do not exist yet.
    os.makedirs(path_anchors, exist_ok=True)
    os.makedirs(path_positives, exist_ok=True)

    # Always convert explicitly: palette/RGBA sources cannot be written as JPEG.
    mode = "RGB" if rgb else "L"

    # Zero-padding width for the output file names.
    padding = len(str(dataset_size))

    i = 0
    while i < dataset_size:
        # Select a random folder and a random file inside it.
        path = random.choice(dataset_paths)
        file_name = random.choice(os.listdir(path))

        # The context manager closes the underlying file handle once the
        # resized copy has been produced.
        with Image.open(os.path.join(path, file_name)) as source:
            # Only sources wider than 500 px are used.
            if source.size[0] <= 500:
                continue

            # Resize to the requested height while keeping the aspect ratio.
            height_percent = height / float(source.size[1])
            resized_width = int(float(source.size[0]) * height_percent)

            # A crop of `width` columns must fit into the resized image.
            if resized_width < width:
                continue

            image = source.convert(mode).resize((resized_width, height), Image.NEAREST)

        # Cut two crops at independent random horizontal offsets.
        img_array = np.array(image)
        x_max = img_array.shape[1] - width
        # randint's upper bound is exclusive; +1 makes the right-most crop reachable
        # and keeps the call valid when the image is exactly `width` wide.
        x_1 = np.random.randint(0, x_max + 1)
        x_2 = np.random.randint(0, x_max + 1)

        # The array already has exactly `height` rows, so keep all of them.
        anchor = img_array[:, x_1:x_1 + width]
        positive = img_array[:, x_2:x_2 + width]

        # Save the crops on disk under the same zero-padded index.
        file_out = f"{i:0{padding}}.jpg"
        Image.fromarray(anchor).save(os.path.join(path_anchors, file_out))
        Image.fromarray(positive).save(os.path.join(path_positives, file_out))

        i += 1

In [4]:
%%time
# Build the RGB dataset (dataset_size=20000): 224x224 anchor/positive crops,
# both crops of a pair taken from the same source image.
create_image_pairs(height=224, width=224, path_anchors="npz_datasets/pairs_20k/anchor", path_positives="npz_datasets/pairs_20k/positive", dataset_size=20000, rgb=True)

CPU times: user 2min 30s, sys: 43.8 s, total: 3min 14s
Wall time: 3min 15s


In [3]:
def create_image_pairs_new(height, width, path_anchors, path_positives, dataset_size, rgb=True):
    """
    Create anchor/positive crop pairs and save them as JPEG files on disk.

    The anchor and the positive crop of a pair come from two different files
    of the same folder; the folder is picked at random from ``dataset_paths``.
    Only sources wider than 500 px are used; each one is resized to ``height``
    rows (aspect ratio preserved) before one crop is cut at a random
    horizontal offset. If either file of a candidate pair does not qualify
    (too narrow before or after resizing), the pair is skipped and a new one
    is drawn; skipped candidates do not consume an output index.

    Pair ``i`` is written as ``<path_anchors>/<i>.jpg`` and
    ``<path_positives>/<i>.jpg``, with ``i`` zero-padded to the number of
    digits of ``dataset_size``.

    Note: every folder in ``dataset_paths`` must contain at least two files
    (``random.sample`` raises ``ValueError`` otherwise), and the loop does not
    terminate if no qualifying pair exists.

    Arguments:
        height: desired height of the cropped images - integer
        width: desired width of the cropped images - integer
        path_anchors: folder in which the anchor images are saved (created if missing) - string
        path_positives: folder in which the positive images are saved (created if missing) - string
        dataset_size: how many image pairs should be generated and saved on disk - integer
        rgb: True saves RGB crops, False saves grayscale crops, default True
    """
    # Saving fails if the target folders do not exist yet.
    os.makedirs(path_anchors, exist_ok=True)
    os.makedirs(path_positives, exist_ok=True)

    # Always convert explicitly: palette/RGBA sources cannot be written as JPEG.
    mode = "RGB" if rgb else "L"

    # Zero-padding width for the output file names.
    padding = len(str(dataset_size))

    def _load_resized(file_path):
        # Return the image resized to `height` rows as an array, or None if it is unusable.
        with Image.open(file_path) as source:
            # Only sources wider than 500 px are used.
            if source.size[0] <= 500:
                return None

            # Resize to the requested height while keeping the aspect ratio.
            height_percent = height / float(source.size[1])
            resized_width = int(float(source.size[0]) * height_percent)

            # A crop of `width` columns must fit into the resized image.
            if resized_width < width:
                return None

            resized = source.convert(mode).resize((resized_width, height), Image.NEAREST)
        return np.array(resized)

    def _random_crop(img_array):
        # Cut a `width`-column crop at a random horizontal offset, keeping all rows.
        x_max = img_array.shape[1] - width
        # randint's upper bound is exclusive; +1 makes the right-most crop reachable
        # and keeps the call valid when the image is exactly `width` wide.
        x = np.random.randint(0, x_max + 1)
        return img_array[:, x:x + width]

    i = 0
    while i < dataset_size:
        # Select a random folder and two distinct files inside it.
        path = random.choice(dataset_paths)
        name_1, name_2 = random.sample(os.listdir(path), 2)

        img_array_1 = _load_resized(os.path.join(path, name_1))
        if img_array_1 is None:
            continue
        img_array_2 = _load_resized(os.path.join(path, name_2))
        if img_array_2 is None:
            continue

        anchor = _random_crop(img_array_1)
        positive = _random_crop(img_array_2)

        # Save the crops on disk under the same zero-padded index.
        file_out = f"{i:0{padding}}.jpg"
        Image.fromarray(anchor).save(os.path.join(path_anchors, file_out))
        Image.fromarray(positive).save(os.path.join(path_positives, file_out))

        # Count only pairs that were actually written, so exactly
        # `dataset_size` pairs are produced with gap-free numbering.
        i += 1

In [4]:
%%time
# Build the grayscale dataset (rgb=False, dataset_size=20000): 224x224 crops,
# anchor and positive taken from two different images of the same folder.
create_image_pairs_new(height=224, width=224, path_anchors="npz_datasets/pairs_20k_224_224_gray/anchor", path_positives="npz_datasets/pairs_20k_224_224_gray/positive", dataset_size=20000, rgb=False)

CPU times: user 2min 40s, sys: 43.8 s, total: 3min 23s
Wall time: 3min 26s
