# MNIST templates

In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to library code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [2]:
from fastai.vision import *
from fastai import datasets

# Address of the pickled MNIST archive; get_data() hands it to
# datasets.download_data together with ext='.gz'
MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'

def get_data():
    """Download the pickled MNIST archive and return its train/validation splits.

    Returns `x_train, y_train, x_valid, y_valid` exactly as stored in the
    pickle; the third split in the archive is discarded.

    NOTE(review): `pickle.load` runs on a file fetched from `MNIST_URL`, and
    unpickling can execute arbitrary code - only use this with a trusted source.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(archive, 'rb') as fh:
        train_split, valid_split, _ = pickle.load(fh, encoding='latin-1')
    x_train, y_train = train_split
    x_valid, y_valid = valid_split
    return x_train, y_train, x_valid, y_valid

Let's get the data and training interface from where we left off in the last notebook.

In [7]:
# Load the train/validation inputs and labels, then show how many entries each split has
x_train,y_train,x_valid,y_valid = get_data()
len(x_train), len(x_valid)

(50000, 10000)

## n-class subset

Get a subset containing only a few classes — those with the lowest labels (e.g. for faster calculations during initial development).

In [13]:
def get_subset(x, y, n_classes):
    """Keep only the (x, y) pairs whose label is below `n_classes`.

    With integer labels, `n_classes=2` keeps classes 0 and 1. Surviving
    entries stay in their original order.

    Returns a two-element list `[xs, ys]` of equal-length tuples, so the
    result can always be unpacked as `xs, ys = get_subset(...)`. When no
    entry matches, both tuples are empty; a bare `list(zip(*[]))` would give
    `[]` and break that unpacking.
    """
    kept = [(x[i], label) for i, label in enumerate(y) if label < n_classes]
    # zip(*[]) yields nothing, so the no-match case needs an explicit result
    return list(zip(*kept)) if kept else [(), ()]

# 0, 1 subset - y's are integers, so `y < n_classes` selects classes 0 and 1
n_classes = 2
x2_train,y2_train = get_subset(x_train, y_train, n_classes)
x2_valid,y2_valid = get_subset(x_valid, y_valid, n_classes)
# By default a cell only echoes its last expression, so show the label peeks explicitly
display(y2_train[:5])
display(y2_valid[:5])
len(x2_train), len(x2_valid)

(0, 1, 1, 1, 1)

(1, 0, 1, 0, 0)

(10610, 2055)

## all-class sub-sample

Get a sample of up to N entries from each class (the first N encountered, in dataset order).

In [16]:
def get_sample(x, y, n_classes, sample_size):
    """Take at most `sample_size` entries from each class, in encounter order.

    Labels in `y` are used as indices into a per-class counter of length
    `n_classes`, so they must be integers valid for that length (an
    out-of-range label raises IndexError).

    Returns a two-element list `[xs, ys]` of equal-length tuples, so the
    result can always be unpacked as `xs, ys = get_sample(...)`. When nothing
    is selected (empty input or `sample_size <= 0`), both tuples are empty; a
    bare `list(zip(*[]))` would give `[]` and break that unpacking.
    """
    # Plain integer counters: no tensor op per element, and no in-place
    # mutation hidden inside a comprehension condition
    seen = [0] * n_classes
    picked = []
    for i, label in enumerate(y):
        if seen[label] < sample_size:
            seen[label] += 1
            picked.append((x[i], label))
    # zip(*[]) yields nothing, so the nothing-selected case needs an explicit result
    return list(zip(*picked)) if picked else [(), ()]

sample_size = 50
n_classes = 10 # y_train.max()+1 

x3_train,y3_train = get_sample(x_train, y_train, n_classes, sample_size)
x3_valid,y3_valid = get_sample(x_valid, y_valid, n_classes, sample_size)

# By default a cell only echoes its last expression, so show the label peeks explicitly
display(y3_train[:5])
display(y3_valid[:5])
len(x3_train), len(x3_valid)

(5, 0, 4, 1, 9)

(3, 8, 6, 9, 6)

(500, 500)

## n-class sub-sample

Get a subset of only a few classes, then take a sub-sample from each of those classes.

In [15]:
sample_size = 50
n_classes = 2
# stage 1 - keep only the entries whose label is below n_classes
x5_train,y5_train = get_subset(x_train, y_train, n_classes)
x5_valid,y5_valid = get_subset(x_valid, y_valid, n_classes)
# stage 2 - cap each remaining class at sample_size entries
x6_train,y6_train = get_sample(x5_train, y5_train, n_classes, sample_size)
x6_valid,y6_valid = get_sample(x5_valid, y5_valid, n_classes, sample_size)

# By default a cell only echoes its last expression, so show the label peeks explicitly
display(y6_train[:5])
display(y6_valid[:5])

len(x6_train), len(x6_valid)

(0, 1, 1, 1, 1)

(1, 0, 1, 0, 0)

(100, 100)