In [1]:
import os
import random
import itertools
import numpy as np
import matplotlib.pyplot as plt

from utils import *
from tqdm import trange
from PIL import Image

import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms

%load_ext autoreload
%autoreload 2

In [2]:
# Root of the processed dataset; the two split folders hang off of it.
data_dir = './data/processed/'
train_dir, test_dir = (
    os.path.join(data_dir, 'background/'),
    os.path.join(data_dir, 'evaluation/'),
)

In [3]:
# ensuring reproducibility: seed every RNG this notebook draws from.
# - numpy drives the alphabet/character/drawer sampling and the shuffles
# - `random` picks which affine components are enabled during augmentation
# - torchvision's random transforms draw from torch's RNG (recent versions)
#   or from `random` (older versions), so both are seeded
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

## Train Data

The background set contains 40 alphabets, and every character was drawn by 20 drawers. The train data uses 30 of these alphabets and 12 of the drawers.

In each alphabet folder, there is 1 folder for each character in the alphabet. In each character folder, there are 20 images of this character, one drawn by each of the 20 drawers.

- To create a pair belonging to the same class, we need to select 2 drawers from the same character dir, for any character dir inside the permitted language dirs.
- To create a pair belonging to different classes, we need to select 2 drawers from different character dirs, for character dir inside the permitted language dirs.

I keep a counter that starts from 0 and runs for 15k iterations (30k / 2). At each iteration, if the iteration number is even, I sample a like pair. If it is odd, I sample a dissimilar pair.

- To sample a like pair, first I randomly select an alphabet from the list of training alphabets (uniform proba). Then, I randomly select a character from the list of characters in this alphabet (uniform proba). Then, I randomly select 2 drawers from the list of training drawers (uniform sampling) and select their corresponding image from the subdirectory.
- To sample a dissimilar pair, I can do 2 things. The first method is to uniformly sample 1 alphabet, then uniformly sample 2 characters in that alphabet, uniformly sample 1 drawer (or 2), and take that drawer's image of each character to obtain 2 different characters from the same alphabet. The second method is to uniformly sample 2 alphabets, uniformly sample a character in each alphabet, uniformly sample a drawer per alphabet (or the same drawer for both), and obtain 2 different characters from 2 different alphabets.

In [4]:
# get list of alphabets
# os.walk yields directory names in arbitrary, filesystem-dependent order;
# sorting makes the seeded sampling of alphabets reproducible across machines
original_alphabets = sorted(
    os.path.join(train_dir, x) for x in next(os.walk(train_dir))[1]
)

# total number of drawers (ids 1 through 20)
original_drawers = np.arange(1, 21)

print("There are {} alphabets.".format(len(original_alphabets)))

There are 40 alphabets.


In [5]:
# from 40 alphabets, randomly select 30
train_alphabets = np.random.choice(original_alphabets, size=30, replace=False)
remaining_alphabets = [x for x in original_alphabets if x not in train_alphabets]

# from 20 drawers, randomly select 12
# Sample from the real drawer ids (1..20). Sampling from np.arange(20) would
# draw from 0..19: drawer 0 has no image files and drawer 20 could never be
# picked for training.
train_drawers = np.random.choice(original_drawers, size=12, replace=False)
remaining_drawers = [x for x in original_drawers if x not in train_drawers]

In [6]:
# number of pairs to sample (15k)
num_iters = int(30e3 / 2)


def _list_subdirs(parent):
    """Return the sorted full paths of the immediate sub-directories of `parent`.

    Sorted because os.walk lists entries in arbitrary order, which would make
    the seeded sampling depend on the filesystem.
    """
    return sorted(os.path.join(parent, x) for x in next(os.walk(parent))[1])


def _drawer_files(char_dir, drawers):
    """Return the sorted image paths in `char_dir` drawn by any of `drawers`.

    The drawer id is parsed from the two characters that follow the first
    underscore of each file name. `drawers` may be a scalar or a sequence.
    """
    wanted = {int(d) for d in np.atleast_1d(drawers)}
    return sorted(
        os.path.join(char_dir, x)
        for x in next(os.walk(char_dir))[-1]
        if int(x.split("_")[1][0:2]) in wanted
    )


def _single_drawer_file(char_dir, drawer):
    """Return the one image of `char_dir` drawn by `drawer`.

    Raises:
        ValueError: if there is not exactly one matching file.
    """
    matches = _drawer_files(char_dir, drawer)
    if len(matches) != 1:
        raise ValueError(
            "expected exactly 1 image for drawer {} in {}, found {}".format(
                drawer, char_dir, len(matches)
            )
        )
    return matches[0]


def _load_pair(filenames):
    """Load two images and stack them along axis 0.

    Each image is read with `img2array(..., gray=True, expand=True)` and its
    axes are reordered with (0, 3, 1, 2) before stacking
    (presumably N,H,W,C -> N,C,H,W; confirm against `img2array`).

    Raises:
        ValueError: if `filenames` does not hold exactly two paths.
    """
    if len(filenames) != 2:
        raise ValueError(
            "a pair needs exactly 2 images, got {}: {}".format(len(filenames), filenames)
        )
    arrays = [
        np.transpose(img2array(name, gray=True, expand=True), (0, 3, 1, 2))
        for name in filenames
    ]
    return np.concatenate(arrays, axis=0)


img_pairs = []
label_pairs = []
for i in trange(num_iters):
    if i % 2 == 0:
        # like pair: 1 alphabet -> 1 character -> 2 distinct drawers
        alph = np.random.choice(train_alphabets)
        char = np.random.choice(_list_subdirs(alph))
        ds = np.random.choice(train_drawers, size=2, replace=False)
        filenames = _drawer_files(char, ds)
        lbl = 1
    elif (i // 2) % 2 == 0:
        # dissimilar pair, variant A (1st, 3rd, 5th... dissimilar pair):
        # 2 different characters of the same alphabet, same drawer
        alph = np.random.choice(train_alphabets)
        chars = np.random.choice(_list_subdirs(alph), size=2, replace=False)
        d = np.random.choice(train_drawers)
        filenames = [_single_drawer_file(c, d) for c in chars]
        lbl = 0
    else:
        # dissimilar pair, variant B (2nd, 4th, 6th... dissimilar pair):
        # 1 character from each of 2 different alphabets, same drawer
        alph = np.random.choice(train_alphabets, size=2, replace=False)
        d = np.random.choice(train_drawers)
        filenames = []
        for a in alph:
            char = np.random.choice(_list_subdirs(a))
            filenames.append(_single_drawer_file(char, d))
        lbl = 0

    # load pair as numpy array and store it with its ground truth label
    img_pairs.append(_load_pair(filenames))
    label_pairs.append(np.array([lbl], dtype=np.int64))

100%|██████████| 15000/15000 [00:16<00:00, 889.47it/s]


In [7]:
# Draw one random ordering and apply it to images and labels alike: this
# breaks the alternating (same, different) sequence while keeping every
# pair aligned with its label.
order = list(range(len(img_pairs)))
np.random.shuffle(order)

shuffled = [(img_pairs[k], label_pairs[k]) for k in order]
img_pairs = [img for img, _ in shuffled]
label_pairs = [lbl for _, lbl in shuffled]

In [8]:
# persist the shuffled training pairs and their labels under data_dir
for obj, fname in ((img_pairs, 'X_train.p'), (label_pairs, 'y_train.p')):
    pickle_dump(obj, data_dir + fname)

Writing 1323735142 total bytes
Writing bytes [0, 1073741824]
Writing bytes [1073741824, 1323735142]
Done!
Writing 570223 total bytes
Writing bytes [0, 570223]
Done!


## Train Data Augmentation

We add 8 transformed copies of each training example pair in `img_pairs`. The transformation is affine with the following constraints on its parameters:

- theta $\in$ [-10, 10] uniform (rotation)
- $\rho_x$ and $\rho_y$ $\in$ [-0.3, 0.3] uniform (shear)
- $s_x$ and $s_y$ $\in$ [0.8, 1.2] uniform (scale)
- $t_x$ and $t_y$ $\in$ [-2, 2] uniform (translation)

Each of these parameters is included with probability 0.5.

In [9]:
arr2pil = transforms.ToPILImage()

def pil2array(im):
    """Convert a PIL image to a float32 array divided by 255.

    Two singleton axes are added: one at the end and one at the front, so a
    single-channel (H, W) image comes back with shape (1, H, W, 1).
    """
    x = np.asarray(im, dtype=np.float32)
    x = np.expand_dims(x, axis=-1)
    x = np.expand_dims(x, axis=0)
    x = x / 255
    return x

augmented_img_pairs = []
augmented_label_pairs = []
for idx in trange(len(img_pairs)):
    # get gd truth label
    label = label_pairs[idx]

    # grab img pair, channels moved to the last axis
    pair = np.transpose(img_pairs[idx], (0, 2, 3, 1))
    # np.array copies, so the in-place scaling below leaves img_pairs untouched
    im1, im2 = np.array(pair)

    # convert back to [0, 255] range
    im1 *= 255
    im2 *= 255

    # transform to PIL image
    im1, im2 = arr2pil(im1), arr2pil(im2)

    # RandomAffine takes the translation as a fraction of the image size.
    # Derive it from the actual size so the maximum shift is 2 pixels, as
    # stated in the notes above (a hard-coded 2/150 is only right for
    # 150-pixel images).
    width, height = im1.size
    max_shift = [2.0 / width, 2.0 / height]

    # compose 8 transforms
    for i in range(8):
        # randomly select each transform component with proba 0.5
        rot = random.choice([0, [-10, 10]])
        # NOTE(review): torchvision interprets shear in degrees; confirm that
        # +/-0.3 degrees is the intended magnitude
        shear = random.choice([None, [-0.3, 0.3]])
        scale = random.choice([None, [0.8, 1.2]])
        trans = random.choice([None, max_shift])

        # apply affine transformation; each call draws its own random
        # parameters, so the two images are distorted independently
        aff = transforms.RandomAffine(rot, trans, scale, shear)
        aug_im1, aug_im2 = aff(im1), aff(im2)

        # convert to numpy array
        aug_im1 = pil2array(aug_im1)
        aug_im2 = pil2array(aug_im2)

        # move the channel axis back in front of H, W
        aug_im1 = np.transpose(aug_im1, (0, 3, 1, 2))
        aug_im2 = np.transpose(aug_im2, (0, 3, 1, 2))

        # add to list
        aug_pairs = np.concatenate([aug_im1, aug_im2], axis=0)
        augmented_img_pairs.append(aug_pairs)
        augmented_label_pairs.append(label)

100%|██████████| 15000/15000 [00:37<00:00, 402.57it/s]


## Merge Train and Augmented Train

In [10]:
# Apply a single random ordering to the augmented images and their labels
# so that both lists stay aligned.
order = list(range(len(augmented_img_pairs)))
np.random.shuffle(order)

shuffled = [(augmented_img_pairs[k], augmented_label_pairs[k]) for k in order]
augmented_img_pairs = [img for img, _ in shuffled]
augmented_label_pairs = [lbl for _, lbl in shuffled]

In [11]:
# original pairs first, augmented pairs appended after them
train_img_pairs = list(img_pairs)
train_img_pairs.extend(augmented_img_pairs)

train_label_pairs = list(label_pairs)
train_label_pairs.extend(augmented_label_pairs)

# reported size is twice the number of pairs
n_pairs = len(train_img_pairs)
print("Effective Train Size: {}".format(2 * n_pairs))

Effective Train Size: 270000


In [12]:
# persist the merged (original + augmented) training pairs and labels
for obj, fname in ((train_img_pairs, 'X_train_aug.p'), (train_label_pairs, 'y_train_aug.p')):
    pickle_dump(obj, data_dir + fname)

Writing 11913615382 total bytes
Writing bytes [0, 1073741824]
Writing bytes [1073741824, 2147483648]
Writing bytes [2147483648, 3221225472]
Writing bytes [3221225472, 4294967296]
Writing bytes [4294967296, 5368709120]
Writing bytes [5368709120, 6442450944]
Writing bytes [6442450944, 7516192768]
Writing bytes [7516192768, 8589934592]
Writing bytes [8589934592, 9663676416]
Writing bytes [9663676416, 10737418240]
Writing bytes [10737418240, 11811160064]
Writing bytes [11811160064, 11913615382]
Done!
Writing 1169584 total bytes
Writing bytes [0, 1169584]
Done!


## Validation Data

The validation data consists of 10 alphabets and 4 drawers.

The authors used 2 types of validation strategies for early-stopping of the model training. One of the methods consists of creating a validation one-shot scenario to test the model's ability to generalize. We pick an alphabet from among the 10 available, choose 16 characters uniformly at random and select 2 of the 4 available drawers. We then take each of the 16 characters produced by the first drawer and individually compare it against all 16 characters from the second drawer, with the goal of predicting the class of the character from among all of the second drawer's characters.

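This process is repeated twice for all alphabets (the second time we pick the 2 other drawers), so that there are 32 one-shot learning trials for each of the 10 validation alphabets, for a total of 320 one-shot trials. Note that the code below samples only 13 characters per alphabet (`pop = 13`), because some alphabets have fewer characters than others; this gives 26 trials per alphabet.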

In [13]:
# the alphabets that were not picked for training become the validation alphabets
valid_alphabets = remaining_alphabets

# reserve 4 of the drawers left over after the training split for validation;
# whatever is still unassigned stays in `remaining_drawers`
valid_drawers = np.random.choice(remaining_drawers, size=4, replace=False)
reserved = set(valid_drawers.tolist())
remaining_drawers = [d for d in remaining_drawers if d not in reserved]

In [14]:
num_iters = len(valid_alphabets)

# number of characters to sample in each alphabet
# I can't increase indefinitely since some alphabets
# have less characters than others
pop = 13

valid_img_pairs = []
valid_label_pairs = []
for alph in valid_alphabets:
    # character dirs of this alphabet; sorted so the seeded sampling below
    # does not depend on the filesystem's listing order
    alph_chars = sorted(os.path.join(alph, x) for x in next(os.walk(alph))[1])

    for j in range(2):
        # grab drawers: round 0 uses validation drawers 0 and 1, round 1 uses 2 and 3
        ds = [valid_drawers[2*j], valid_drawers[2*j + 1]]

        # sample `pop` characters uniformly
        chars = np.random.choice(alph_chars, size=pop, replace=False)

        # load the image of every sampled character once per drawer
        # (assumes img2array is a plain loader, so reusing its result is safe)
        images = []
        for d in ds:
            drawer_images = []
            for char in chars:
                names = [
                    os.path.join(char, x) for x in next(os.walk(char))[-1] if int(
                        x.split("_")[1][0:2]
                    ) == d
                ]
                if len(names) != 1:
                    raise ValueError(
                        "expected exactly 1 image for drawer {} in {}, found {}".format(
                            d, char, len(names)
                        )
                    )
                img_arr = img2array(names[0], gray=True, expand=True)
                drawer_images.append(np.transpose(img_arr, (0, 3, 1, 2)))
            images.append(drawer_images)
        d1, d2 = images

        # every image of the first drawer is paired with every image of the
        # second drawer; both lists follow the order of `chars`, so the label
        # `i` is also the position of the matching image in d2
        for i, left in enumerate(d1):
            for right in d2:
                # create img and label
                pair = np.concatenate([left, right], axis=0)
                label = np.array([i], dtype=np.int64)

                # store
                valid_img_pairs.append(pair)
                valid_label_pairs.append(label)

In [15]:
# Apply a single random ordering to the validation images and their labels.
# NOTE(review): the labels hold the left image's index within its trial, and
# the pairs of one trial are consecutive before this shuffle. Shuffling across
# trials discards that grouping; confirm the consumer of X_valid.p does not
# rely on it.
order = list(range(len(valid_img_pairs)))
np.random.shuffle(order)

shuffled = [(valid_img_pairs[k], valid_label_pairs[k]) for k in order]
valid_img_pairs = [img for img, _ in shuffled]
valid_label_pairs = [lbl for _, lbl in shuffled]

In [16]:
# persist the validation pairs and their labels under data_dir
for obj, fname in ((valid_img_pairs, 'X_valid.p'), (valid_label_pairs, 'y_valid.p')):
    pickle_dump(obj, data_dir + fname)

Writing 298281740 total bytes
Writing bytes [0, 298281740]
Done!
Writing 128578 total bytes
Writing bytes [0, 128578]
Done!


## Test Data

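The test set consists of 10 alphabets. As for the validation set, each alphabet is used for 2 rounds of one-shot trials with 2 drawers per round (so up to 4 drawers per alphabet); here the drawers are sampled at random from the 20 available, and 20 characters are used per round.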

In [17]:
# get list of alphabets
# os.walk yields directory names in arbitrary, filesystem-dependent order;
# sorting keeps the seeded sampling of test trials reproducible across machines
test_alphabets = sorted(
    os.path.join(test_dir, x) for x in next(os.walk(test_dir))[1]
)

# there are 20 drawers (ids 1 through 20)
test_drawers = np.arange(1, 21)

In [18]:
num_iters = len(test_alphabets)

# number of characters to sample in each alphabet
pop = 20

test_img_pairs = []
test_label_pairs = []
for alph in test_alphabets:
    # character dirs of this alphabet; sorted so the seeded sampling below
    # does not depend on the filesystem's listing order
    alph_chars = sorted(os.path.join(alph, x) for x in next(os.walk(alph))[1])

    for j in range(2):
        # sample a pair of drawers
        ds = np.random.choice(test_drawers, size=2, replace=False)

        # sample `pop` characters uniformly
        chars = np.random.choice(alph_chars, size=pop, replace=False)

        # load the image of every sampled character once per drawer
        # (assumes img2array is a plain loader, so reusing its result is safe)
        images = []
        for d in ds:
            drawer_images = []
            for char in chars:
                names = [
                    os.path.join(char, x) for x in next(os.walk(char))[-1] if int(
                        x.split("_")[1][0:2]
                    ) == d
                ]
                if len(names) != 1:
                    raise ValueError(
                        "expected exactly 1 image for drawer {} in {}, found {}".format(
                            d, char, len(names)
                        )
                    )
                img_arr = img2array(names[0], gray=True, expand=True)
                drawer_images.append(np.transpose(img_arr, (0, 3, 1, 2)))
            images.append(drawer_images)
        d1, d2 = images

        # every image of the first drawer is paired with every image of the
        # second drawer; both lists follow the order of `chars`, so the label
        # `i` is also the position of the matching image in d2
        for i, left in enumerate(d1):
            for right in d2:
                # create img and label
                pair = np.concatenate([left, right], axis=0)
                label = np.array([i], dtype=np.int64)

                # store
                test_img_pairs.append(pair)
                test_label_pairs.append(label)

In [19]:
# Apply a single random ordering to the test images and their labels.
# NOTE(review): the labels hold the left image's index within its trial, and
# the pairs of one trial are consecutive before this shuffle. Shuffling across
# trials discards that grouping; confirm the consumer of X_test.p does not
# rely on it.
order = list(range(len(test_img_pairs)))
np.random.shuffle(order)

shuffled = [(test_img_pairs[k], test_label_pairs[k]) for k in order]
test_img_pairs = [img for img, _ in shuffled]
test_label_pairs = [lbl for _, lbl in shuffled]

In [20]:
# persist the test pairs and their labels under data_dir
for obj, fname in ((test_img_pairs, 'X_test.p'), (test_label_pairs, 'y_test.p')):
    pickle_dump(obj, data_dir + fname)

Writing 705992128 total bytes
Writing bytes [0, 705992128]
Done!
Writing 304173 total bytes
Writing bytes [0, 304173]
Done!
