In [76]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt

from utils import *
from tqdm import trange

from torch.utils.data.dataset import Dataset
from torchvision import transforms

from PIL import Image

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
# Root of the Omniglot dataset, relative to this notebook, and the
# sub-directories derived from it (each path keeps its trailing slash so
# it can be prefixed onto file names directly).
data_dir = '../datasets/omniglot/'
train_dir = f"{data_dir}background/"
test_dir = f"{data_dir}evaluation/"
processed = f"{data_dir}processed/"

In [3]:
# Fix the seed of NumPy's global random generator so that every
# np.random.* call in the cells below is repeatable across runs.
SEED = 0
np.random.seed(SEED)

## Train Data

The training data consists of 40 alphabets.

In each alphabet folder, there is 1 folder for each character in the alphabet. In each character folder, there are 20 images of this character, one drawn by each of the 20 drawers.

- To create a pair belonging to the same class, we need to select 2 drawers from the same character dir, for any character dir inside the permitted language dirs.
- To create a pair belonging to different classes, we need to select 2 drawers from different character dirs, for any character dirs inside the permitted language dirs.

The plan is to keep a counter that starts from 0 and increments up to 30k. At each iteration, if the iteration number is even, I sample a like pair. If it is odd, I sample a dissimilar pair.

- To sample a like pair, first I randomly select an alphabet from the list of training alphabets (uniform proba). Then, I randomly select a character from the list of characters in this alphabet (uniform proba). Then, I randomly select 2 drawers from the list of training drawers (uniform sampling) and select their corresponding image from the subdirectory.
- To sample a dissimilar pair, I can do 2 things. The first method is to uniformly sample 1 alphabet, then uniformly sample 2 characters in that alphabet, uniformly sample 1 drawer (or 2), and index 1 drawer per character to obtain 2 different characters from the same alphabet. The second method is to uniformly sample 2 alphabets, uniformly sample a character in each alphabet, uniformly sample a drawer per alphabet (or the same drawer for both), and obtain 2 different characters from 2 different alphabets.

In [5]:
# Get the list of alphabet directories directly under train_dir.
# os.walk yields directory names in arbitrary (filesystem-dependent) order,
# so the listing is sorted: otherwise the seeded np.random.choice calls that
# index into this list would pick different alphabets on different machines.
original_alphabets = sorted(
    os.path.join(train_dir, alphabet) for alphabet in next(os.walk(train_dir))[1]
)

# Drawer ids, as encoded in the image file names. The sampling loop parses
# the two characters after the underscore with `lstrip("0")` + `int`, which
# can never yield 0, so ids must be 1-based; np.arange(20) (0..19) would
# include a drawer that matches no file and omit the last one.
# NOTE(review): assumes the ids run 01..20 on disk - confirm against the files.
original_drawers = np.arange(1, 21)

print("There are {} alphabets.".format(len(original_alphabets)))

There are 40 alphabets.


In [6]:
# From the 40 alphabets, randomly select 30 for training (without
# replacement); the alphabets that were not drawn are held out.
train_alphabets = np.random.choice(original_alphabets, size=30, replace=False)
_train_alphabet_set = set(train_alphabets.tolist())  # O(1) membership tests
remaining_alphabets = [x for x in original_alphabets if x not in _train_alphabet_set]

# From the 20 drawers, randomly select 12 for training; the other 8 are
# held out. Drawer ids are 1-based (1..20) to match the ids parsed from the
# image file names in the sampling loop: sampling from np.arange(20) could
# pick drawer 0, which matches no file, and could never pick drawer 20.
# Both lists are derived from the same pool so they always partition it.
# NOTE(review): assumes the ids run 01..20 on disk - confirm against the files.
_drawer_pool = np.arange(1, 21)
train_drawers = np.random.choice(_drawer_pool, size=12, replace=False)
remaining_drawers = [x for x in _drawer_pool if x not in train_drawers]

In [61]:
def _subdirs(parent):
    """Return the sorted paths of the immediate sub-directories of `parent`.

    Sorting makes the seeded sampling independent of the arbitrary order in
    which os.walk lists directory entries.
    """
    return sorted(os.path.join(parent, d) for d in next(os.walk(parent))[1])


def _drawer_id(filename):
    """Parse the drawer id from an image file name.

    The id is read from the two characters that follow the first underscore.
    Returns None for names that do not follow that pattern, so stray files
    in a character directory are skipped instead of crashing the loop.
    """
    try:
        return int(filename.split("_")[1][:2])
    except (IndexError, ValueError):
        return None


def _drawer_images(char_dir, drawers):
    """Return the sorted image paths in `char_dir` drawn by `drawers`.

    Raises:
        ValueError: if the directory does not hold exactly one image per
            requested drawer. A missing or duplicated drawer would otherwise
            silently produce a malformed "pair".
    """
    wanted = {int(d) for d in np.atleast_1d(drawers)}
    paths = sorted(
        os.path.join(char_dir, f)
        for f in next(os.walk(char_dir))[-1]
        if _drawer_id(f) in wanted
    )
    if len(paths) != len(wanted):
        raise ValueError(
            "expected {} image(s) for drawer(s) {} in {}, found {}".format(
                len(wanted), sorted(wanted), char_dir, len(paths)
            )
        )
    return paths


def _load_pair(paths):
    """Load the images at `paths` and concatenate them along axis 0.

    NOTE(review): presumably `expand=True` makes img2array add a leading
    axis so that the result stacks the two images - confirm in utils.
    """
    return np.concatenate(
        [img2array(p, gray=True, expand=True) for p in paths], axis=0
    )


# NOTE(review): the notes above mention a counter running up to 30k, but
# this produces 15,000 pairs in total - confirm which is intended.
num_iters = int(30e3 / 2)

# Alternates the two dissimilar-pair strategies: True -> two characters
# from the same alphabet, False -> one character from each of two alphabets.
cnt = True

img_pairs = []
label_pairs = []
for i in trange(num_iters):
    if i % 2 == 0:
        # like pair: 1 alphabet -> 1 character -> 2 distinct drawers
        alph = np.random.choice(train_alphabets)
        char = np.random.choice(_subdirs(alph))
        ds = np.random.choice(train_drawers, size=2, replace=False)
        filenames = _drawer_images(char, ds)
        label = 1
    elif cnt:
        cnt = False
        # dissimilar pair: 1 alphabet -> 2 distinct characters, same drawer
        alph = np.random.choice(train_alphabets)
        chars = np.random.choice(_subdirs(alph), size=2, replace=False)
        d = np.random.choice(train_drawers)
        filenames = [_drawer_images(c, [d])[0] for c in chars]
        label = 0
    else:
        cnt = True
        # dissimilar pair: 2 distinct alphabets -> 1 character each, same drawer
        alph = np.random.choice(train_alphabets, size=2, replace=False)
        d = np.random.choice(train_drawers)
        filenames = []
        for a in alph:
            char = np.random.choice(_subdirs(a))
            filenames.extend(_drawer_images(char, [d]))
        label = 0

    # load the pair as a numpy array and store it with its ground-truth label
    img_pairs.append(_load_pair(filenames))
    label_pairs.append(np.array([label], dtype=np.int8))

100%|██████████| 15000/15000 [00:15<00:00, 948.96it/s]


In [83]:
# Persist the sampled training pairs and their labels as pickles.
# The output directory is created if missing, and each file is written
# inside a `with` block so the handle is flushed and closed even if
# pickling fails part-way (the bare open(...) calls were never closed).
os.makedirs(processed, exist_ok=True)
for _filename, _payload in (('X_train.p', img_pairs), ('y_train.p', label_pairs)):
    with open(processed + _filename, "wb") as _fh:
        pickle.dump(_payload, _fh)

## Validation Data

The validation data consists of 10 alphabets, from 4 drawers out of the remaining 8.

The authors used 2 types of validation strategies for early-stopping of the model training. One of the methods consists of creating a validation one-shot scenario to test the model's ability to generalize.

In [88]:
# The alphabets that were not sampled for training are used for validation.
valid_alphabets = remaining_alphabets

# Draw 4 validation drawers (without replacement) from the drawers held out
# of training, then drop them from the held-out list.
# Note: this rebinds remaining_drawers, so re-running the cell shrinks it again.
valid_drawers = np.random.choice(remaining_drawers, size=4, replace=False)
remaining_drawers = [
    drawer for drawer in remaining_drawers if drawer not in valid_drawers
]