In [4]:
using Flux, Flux.Data.MNIST, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle
using Base.Iterators: repeated, partition
using Printf, BSON

# This cell imports the MNIST dataset, then groups and divides the training images
# and labels into small batches. It also groups the images and labels of the
# testing set into a single batch.

# Fetch the MNIST training split: the digit images and their matching labels
@info "Loading MNIST training set"
train_imgs = MNIST.images()
train_labels = MNIST.labels()

"""
    make_minibatch(img, label, idxs)

Assemble the samples selected by `idxs` into one batch.

Returns the tuple `(img_batch, label_batch)`. `img_batch` is a `Float32` array whose
leading dimensions are `size(img[1])`, followed by a singleton dimension and one slot
per entry of `idxs`. `label_batch` is the one-hot encoding of `label[idxs]` over the
classes `0:9`.
"""
function make_minibatch(img, label, idxs)
    # Uninitialised storage; every slot along the last axis is overwritten below
    batch_shape = (size(img[1])..., 1, length(idxs))
    img_batch = Array{Float32}(undef, batch_shape)
    # Convert each selected image to Float32 and copy it into its slot
    for (slot, sample_idx) in enumerate(idxs)
        img_batch[:, :, :, slot] = Float32.(img[sample_idx])
    end
    # One column per sample, one row per class 0 to 9
    label_batch = onehotbatch(label[idxs], 0:9)
    return (img_batch, label_batch)
end

@info "Dividing and grouping training set into batches"
# Number of samples per mini-batch; the final batch holds whatever is left over
batch_size = 128
# `partition` walks the index range in consecutive chunks of `batch_size`
# (60000 samples -> 469 batches, the last one being shorter than the rest)
mb_idxs = partition(1:length(train_imgs), batch_size)
train_set = map(mb_idxs) do idxs
    make_minibatch(train_imgs, train_labels, idxs)
end

# The test split is kept as one single batch instead of being chunked
@info "Loading MNIST testing set"
test_labels = MNIST.labels(:test)
test_imgs = MNIST.images(:test)
test_set = make_minibatch(test_imgs, test_labels, 1:length(test_imgs))

# Every batch is now a tuple: (4-d Float32 image array, one-hot label matrix)
println("Loading dataset...Done.")

┌ Info: Loading MNIST training set
└ @ Main In[4]:11
┌ Info: Dividing and grouping training set into batches
└ @ Main In[4]:28


Loading dataset...Done.


┌ Info: Loading MNIST testing set
└ @ Main In[4]:38


In [23]:
# Sanity-check the data structures built in the previous cell

# A single training image is a 2-d array of grayscale pixels
println("The training image is of type: ", typeof(first(train_imgs)))
# A single training label is an integer in 0-9 naming the digit in the image
println("The training label is of type: ", typeof(first(train_labels)))

# The training set is a vector of batches (469 of them); each batch is a tuple of
# a 4-d image array and a one-hot label matrix
println("The training batch is of type: ", typeof(train_set))

# Look at the two halves of the first batch
let first_batch = first(train_set)
    # Image part is 28*28*1*128:
    #   28*28 - pixels of one image
    #   1     - channels (grayscale; an RGB image would have 3)
    #   128   - samples in the batch
    println("The dimension of the image part in one batch: ", size(first_batch[1]))
    # Label part is 10*128:
    #   10  - one boolean row per digit 0-9, true only in the row of the actual digit
    #   128 - samples in the batch
    println("The dimension of the label part in one batch: ", size(first_batch[2]))
end

The training image is of type: Array{ColorTypes.Gray{FixedPointNumbers.Normed{UInt8,8}},2}
The training label is of type: Int64
The training batch is of type: Array{Tuple{Array{Float32,4},Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}},1}
The dimension of the image part in one batch: (28, 28, 1, 128)
The dimension of the label part in one batch: (10, 128)


In [25]:
# CNN for MNIST, following the usual example layout: three convolution + ReLU stages,
# each followed by 2*2 max-pooling, then a flatten step and a dense layer whose
# outputs are normalised with softmax.
# Shapes in the comments are for a full 28*28*1*128 training batch.

@info "CNN model construction"
# Chain applies the listed layers/functions one after another to its input
model = Chain(
    # Stage 1: 3*3 filters taking the single input channel to 16 feature channels.
    # pad=(1,1) surrounds the input with one ring of zeros so the convolution keeps
    # the 28*28 extent; the stride is left at its default.
    # relu is max(0, x) applied element-wise.
    # Output: 28*28*16*128
    Conv((3, 3), 1=>16, pad=(1,1), relu),
    # Keep the largest value of every 2*2 window, halving both pixel dimensions.
    # Output: 14*14*16*128
    feat -> maxpool(feat, (2,2)),

    # Stage 2: same layout, widening 16 channels to 32.
    # Output: 14*14*32*128
    Conv((3, 3), 16=>32, pad=(1,1), relu),
    # Output: 7*7*32*128
    feat -> maxpool(feat, (2,2)),

    # Stage 3: channel count stays at 32; pooling halves the pixel dimensions again.
    # Output: 3*3*32*128
    Conv((3, 3), 32=>32, pad=(1,1), relu),
    feat -> maxpool(feat, (2,2)),

    # Flatten everything except the batch dimension: 3*3*32*128 -> 288*128, i.e. one
    # 288-element feature column per image.
    feat -> reshape(feat, :, size(feat, 4)),

    # Fully connected layer mapping the 288 features to one score per digit class,
    # for every sample in the batch.
    Dense(288, 10),

    # softmax turns the 10 scores of each sample into probabilities that sum to 1
    # (it is not a ReLU-style max(0, x)).
    softmax,
)

┌ Info: CNN model construction
└ @ Main In[25]:6


Chain(Conv((3, 3), 1=>16, relu), #17, Conv((3, 3), 16=>32, relu), #18, Conv((3, 3), 32=>32, relu), #19, #20, Dense(288, 10), softmax)

In [None]:
# This cell applies the CNN model defined above: it performs the actual training
# and also computes the loss and the accuracy.

# Pass the model and both data sets through Flux's `gpu`, as is common practice
model = gpu(model)
test_set = map(gpu, test_set)
train_set = map(gpu, train_set)

# Warm-up call on the first training batch so the model is compiled before training
model(first(train_set)[1])

# Training loss: cross-entropy between the model's prediction on a noise-perturbed
# copy of `x` and the true labels `y`.
function loss(x, y)
    # Standard-normal noise scaled by 0.1 is added to the input. The reference this
    # follows does so for robustness, and it does give better results here.
    noise = gpu(randn(eltype(x), size(x)))
    x_aug = x .+ 0.1f0 * noise
    return crossentropy(model(x_aug), y)
end

# Fraction of samples whose predicted class matches the true class.
# `onecold` converts one-hot / probability columns back to class indices, and `.==`
# compares the two index vectors element by element.
function accuracy(x, y)
    predicted = onecold(model(x))
    actual = onecold(y)
    return mean(predicted .== actual)
end

# Adam optimizer with a learning rate of 0.001
opt = ADAM(0.001)

@info("Beginning training loop...")
# Best test accuracy observed so far
best_acc = 0.0
# Epoch at which the accuracy last improved (or the learning rate was last dropped)
last_improvement = 0
# Run at most 100 epochs; one epoch is one full pass over all training data
for epoch_idx in 1:100
    # The loop body assigns to these globals, so they are declared as such
    global best_acc, last_improvement
    # Train for a single epoch
    Flux.train!(loss, params(model), train_set, opt)

    # Evaluate on the test batch
    acc = accuracy(test_set...)
    @info(@sprintf("[%d]: Test accuracy: %.4f", epoch_idx, acc))

    # If this is the best accuracy we've seen so far, save the model in BSON.
    # This runs BEFORE the target-accuracy check so that the epoch which reaches the
    # target is also written to disk instead of being lost by the early `break`.
    if acc >= best_acc
        @info("New best accuracy! Saving model out to mnist_conv.bson")
        # Save a CPU copy under the same `model` key: weights stored on the GPU
        # cannot be loaded later on a machine without GPU support.
        let model = cpu(model)
            BSON.@save "mnist_conv.bson" model epoch_idx acc
        end
        best_acc = acc
        last_improvement = epoch_idx
    end

    # If our accuracy is good enough, quit out.
    if acc >= 0.999
        @info("Reached our target accuracy of 99.9%...terminate.")
        break
    end

    # If we haven't seen improvement in 5 epochs, drop our learning rate by a factor of 10
    if epoch_idx - last_improvement >= 5 && opt.eta > 1e-6
        opt.eta /= 10.0
        @warn("Haven't improved in a while, dropping learning rate to $(opt.eta)!")
        # After dropping learning rate, give it a few epochs to improve
        last_improvement = epoch_idx
    end

    # No improvement for 10 epochs: stop. Because the branch above resets
    # `last_improvement` every 5 stale epochs, this is only reachable once the
    # learning rate is no longer above 1e-6.
    if epoch_idx - last_improvement >= 10
        @warn("Terminate to prevent overfitting")
        break
    end
end

┌ Info: Beginning training loop...
└ @ Main In[28]:28
  likely near /home/send_fuze/.julia/packages/IJulia/e8kqU/src/kernel.jl:53
  likely near /home/send_fuze/.julia/packages/IJulia/e8kqU/src/kernel.jl:53
