# Setup

Note: you may have to add, clone, or check out some of these packages.
This notebook is based on Tom Breloff's work: http://www.breloff.com

In [12]:
# this re-exports Transformations, StochasticOptimization, Penalties, and ObjectiveFunctions
using Learn
using MLPlots

# my version of ML iteration.  Hopefully will be replaced with what's currently in MLDataUtils dev branch
using StochasticOptimization.Iteration

import MLDataUtils: rescale!

# for loading the data
import MNIST

# for plotting
# (MLPlots is already loaded by the `using MLPlots` above; it is repeated here)
using StatPlots, MLPlots
# alternative call kept for reference; it additionally passes legendfont and fmt=:png
#gr(leg=false, linealpha=0.5, legendfont=font(7), fmt=:png)
# call gr with plot defaults: legend off (leg=false) and linealpha=0.5
gr(leg=false, linealpha=0.5)

Plots.GRBackend()

# Helper functions

In [13]:
# create a one-hot matrix given class labels
# TODO: this should be added as a utility in MLDataUtils
#
# Labels are expected to be 0-based: label `k` (rounded to the nearest integer)
# sets row `k + 1` of its column to 1.0; every other entry is 0.0.
#
# Arguments:
#   y        - vector of class labels, one per observation
#   nclasses - (keyword, optional) number of rows of the result.  When omitted it
#              is inferred from the largest label in `y`, exactly as before.  Pass
#              it explicitly when several label vectors (e.g. train and test) must
#              produce matrices with the same number of rows even if one of them
#              happens not to contain the largest class.
#
# Returns a `Float64` matrix of size (nclasses, length(y)).  An empty `y` gives a
# matrix with zero columns instead of failing inside `maximum`.
#
# Throws an `ArgumentError` if `nclasses` is smaller than the largest label
# requires.  Negative labels still fail with a `BoundsError`, as before.
function to_one_hot(y::AbstractVector; nclasses = nothing)
    # shift by one: labels start at 0, matrix rows start at 1
    yint = map(yi -> round(Int, yi) + 1, y)

    # `maximum` throws on an empty collection, so guard that case explicitly
    needed = isempty(yint) ? 0 : maximum(yint)
    nrows = nclasses === nothing ? needed : Int(nclasses)
    if nrows < needed
        throw(ArgumentError(
            "nclasses = $(nrows) is too small for the largest label $(needed - 1)"))
    end

    hot = zeros(Float64, nrows, length(y))
    for (i, yi) in enumerate(yint)
        hot[yi, i] = 1.0
    end
    return hot
end

# Evaluate `obj` on `totcount` observations drawn at random from `testdata`.
# Returns a tuple: (summed loss over the sample, fraction classified correctly).
function my_test_loss(obj, testdata, totcount = 500)
    loss_sum = 0.0
    nhits = 0

    # draw the random sample first, then walk over it one observation at a time
    sample = rand(each_obs(testdata), totcount)
    for (x, y) in each_obs(sample)
        # transform! returns the loss for this observation
        loss_sum += transform!(obj, y, x)

        # logistic version:
        # ŷ = output_value(obj.transformation)[1]
        # correct = (ŷ > 0.5 && y > 0.5) || (ŷ <= 0.5 && y < 0.5)

        # softmax version: the prediction is the index of the largest output,
        # and it is a hit when the target vector is positive at that index
        ŷ = output_value(obj.transformation)
        predicted = indmax(ŷ)
        if y[predicted] > 0
            nhits += 1
        end
    end

    return loss_sum, nhits / totcount
end



my_test_loss (generic function with 2 methods)

# Set up the dataset

In [14]:
# our data: load the MNIST training and test splits as (inputs, labels) pairs
x_train, y_train = MNIST.traindata()
x_test, y_test = MNIST.testdata()

# normalize the input data given μ/σ for the input training data
# note: scale both train and test sets using the train data
# (rescale! modifies its first argument in place; the μ, σ returned for x_train
#  are passed on so that x_test is scaled with the very same statistics)
μ, σ = rescale!(x_train)
rescale!(x_test, μ, σ)
# disabled alternative: min/max scaling of both sets to [-1, 1] using the
# extrema of the training inputs
# xmin, xmax = extrema(x_train)
# x_train .= 2 .* (x_train .- xmin) ./ (xmax - xmin) .- 1
# x_test .= 2 .* (x_test .- xmin) ./ (xmax - xmin) .- 1

# convert y data to one-hot
# (to_one_hot is applied to each label vector separately, so each matrix gets
#  its row count from the largest label present in that vector)
y_train, y_test = map(to_one_hot, (y_train, y_test))

# optional: limit to only 0/1 digits for easier training
# NOTE(review): these lines compare y against scalars, so they presumably expect
# the raw label vectors rather than the one-hot matrices built above — confirm
# before enabling.
# to_isone(y::AbstractVector) = (z = Array(eltype(y), 1, length(y)); map!(yi->float(yi==1.0), z, y))
# y_train, y_test = map(to_isone, (y_train, y_test))
# train = filterobs(i -> y_train[i] < 1.5, x_train, y_train)
# test = filterobs(i -> y_test[i] < 1.5, x_test, y_test)

# store as tuples to make it easier
# (inputs first, targets second; the trailing `;` suppresses the cell output)
train = (x_train, y_train)
test = (x_test, y_test);

# Construct our model and objective function

In [15]:
# Activation Functions => :logistic, :tanh, :softsign, :relu, :softplus, :sinusoid, :gaussian, :threshold, :sign

# layer sizes: 784 inputs, two hidden layers of 50 units each, 10 outputs
nin, nh, nout = 784, [50,50], 10

# this is our gradient calculation method
# (:dfa is the active choice; :backprop is kept commented out as the alternative.
#  NOTE(review): :dfa presumably means direct feedback alignment — confirm in Transformations)

#grad_calc = :backprop
grad_calc = :dfa

# create a feedforward neural net with softplus activations and softmax output
t = nnet(nin, nout, nh, :softplus, :softmax, grad_calc=grad_calc)

# create an objective function with no penalty (NoPenalty; swap in another
# penalty here to regularize) and an implicit cross entropy loss layer
penalty = NoPenalty()
obj = objective(t, penalty)

ObjectiveFunctions.RegularizedObjective{Transformations.Chain{Float64,Transformations.Params{SubArray{Float64,1,Array{Float64,1},Tuple{UnitRange{Int64}},true},Tuple{},Tuple{}},Transformations.NoPreprocessing},ObjectiveFunctions.CrossEntropy{Float64},PenaltyFunctions.NoPenalty}(Chain{Float64}(
   Affine{784-->50}
   softplus{50}
   Affine{50-->50}
   softplus{50}
   Affine{50-->10}
   softmax{10}
) ,ObjectiveFunctions.CrossEntropy{Float64}(10,Transformations.InputNode{:+,Float64,1}(Transformations.Node[Transformations.OutputNode{Float64,1}(Transformations.Node[Transformations.InputNode{:+,Float64,1}(#= circular reference @-4 =#)],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],Dict{Transformations.OutputNode{Float64,1},Int64}()),Transformations.InputNode{:+,Float64,1}(Transformations.Node[],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.

# optional: set up plotting

In [16]:
# the parts of the plot
# chainplt visualizes the network `t`; the two TracePlots each hold two series
# labelled "Train" and "Test"
chainplt = ChainPlot(t, maxn=10, tickfont=font(5))
lossplt = TracePlot(2, title="Loss", ylim=(0,Inf), leg=true, lab=["Train" "Test"])
accuracyplt = TracePlot(2, title="Accuracy", ylim=(0.4,1), leg=true, lab=["Train" "Test"])
# the heatmaps start out with random 28x28 placeholder data; the tracer callback
# below overwrites their z values with real data
hmplt = heatmap(rand(28,28), ratio=1, title="outgoing wgt")
hmplt2 = heatmap(rand(28,28), ratio=1, title="input grad")

# put together the full plot... a ChainPlot with loss, accuracy, and the heatmap
# NOTE(review): the layout presumably puts the chain plot in a top row at 80% of
# the height, with the trace plots (75% width) and the heatmaps below — confirm
plot(
    chainplt.plt,
    lossplt.plt,
    accuracyplt.plt,
    hmplt,
    hmplt2,
    size = (1200,800),
    layout=@layout([a{0.8h}; grid(1,2){0.75w} grid(1,2)])
)

# animation is off while anim is nothing; switch to the commented line to have
# the tracer callback record a frame on every call
anim = nothing
#anim = Animation()

# this is our custom callback which will be called on every 100 iterations
# note: we do the plotting here.
#
# Arguments of the callback: `obj` is the objective being trained and `i` the
# current iteration number.  It also reads and updates these globals defined in
# earlier cells: t, train, test, chainplt, lossplt, accuracyplt, hmplt, hmplt2, anim.
tracer = IterFunction((obj, i) -> begin
    @show i

    # on every multiple of 500 (mod1(0,500) == 500, so this includes the initial
    # call with i = 0): sample 200 points from each of the train and test sets
    # and compute/save the loss and accuracy
    if mod1(i,500)==500
        # training loss
        trainloss, trainaccuracy = my_test_loss(obj, train, 200)
        @show trainloss, trainaccuracy

        # test loss
        testloss, testaccuracy = my_test_loss(obj, test, 200)
        @show testloss, testaccuracy

        push!(lossplt, i, [trainloss, testloss])
        push!(accuracyplt, i, [trainaccuracy, testaccuracy])
    end

    # add transformation data
    update!(chainplt)

    # update the heatmap of the total outgoing weight from each pixel
    # (skip a leading InputNorm layer, if there is one, to reach the first layer
    #  with parameters; its first parameter view is summed over dimension 1 and
    #  laid out as a 28x28 image)
    t1 = isa(t[1], InputNorm) ? t[2] : t[1]
    pixel_importance = reshape(sum(t1.params.views[1],1), 28, 28)
    hmplt[1][1][:z].surf[:] = pixel_importance

    # second heatmap: magnitude of the gradient with respect to the input
    pixel_importance = reshape(abs(input_grad(t)),28,28)  # another possible metric
    hmplt2[1][1][:z].surf[:] = pixel_importance

    # handle animation frames/output
    # (identity comparison is the idiomatic test for `nothing`; it cannot be
    #  affected by a custom `==` method)
    anim === nothing || frame(anim)

    # update the plot display
    #gui()
    inline()
end, every=100)

# trace once before we start learning to see initial values
tracer.f(obj, 0)

# Create a MetaLearner

In [17]:
learner = make_learner(
    # averages the gradient over minibatches, updating params using the SGD
    # updater (not Adam) with a fixed learning rate of 1e-3
    # NOTE(review): the 0.7 passed to SGD is presumably the momentum — confirm
    GradientLearner(1e-3, SGD(0.7)),

    # our custom iteration method
    tracer,

    # shorthand to add a MaxIter(1500)
    maxiter = 1500
)

LearningStrategies.MetaLearner{Tuple{StochasticOptimization.GradientLearner{StochasticOptimization.FixedLR,StochasticOptimization.SGD{Float64},StochasticOptimization.GradientAverager},LearningStrategies.IterFunction,LearningStrategies.MaxIter}}((StochasticOptimization.GradientLearner{StochasticOptimization.FixedLR,StochasticOptimization.SGD{Float64},StochasticOptimization.GradientAverager}(StochasticOptimization.FixedLR(0.001),StochasticOptimization.SGD{Float64}(0.7,#undef),StochasticOptimization.GradientAverager(#undef)),LearningStrategies.IterFunction(#5,100),LearningStrategies.MaxIter(1500)))

# Learn!

In [18]:
# do the learning for maxiter iterations
# disabled alternative: average over minibatches of size 5
# learn!(obj, learner, infinite_batches(train, size=5))
# active call: feed the learner from infinite_obs(train)
# NOTE(review): presumably this yields one observation per iteration — confirm
learn!(obj, learner, infinite_obs(train))