# MLP on MINST Dataset

In [1]:
Pkg.update()
Pkg.add("MNIST")

INFO: Updating METADATA...
INFO: Computing changes...
INFO: No packages to install, update or remove
INFO: Nothing to be done


In [4]:
using MNIST
features = trainfeatures(1)
label = trainlabel(1)

trainX, trainY = traindata()
testX, testY = testdata()

trainX = trainX'
ttl = 64
trainX, trainY = trainX[1:ttl,:], trainY[1:ttl,:]

(
[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0],

[5.0; 0.0; … ; 6.0; 0.0])

In [32]:
size(trainX), size(trainY)

((64,784),(64,1))

In [33]:
abstract Layer
abstract Nonlinearity <: Layer
abstract LossCriteria <: Layer

In [142]:
# Define the Fully Connected layers
type FCLayer <: Layer
    W           :: Array{Float64}
    last_input  :: Array{Float64}
    last_output :: Array{Float64}
    last_loss   :: Array{Float64}

    function FCLayer(i, o)
        return new(rand(o,i), zeros(i), zeros(o), zeros(o))
    end
end

function forward(l::FCLayer, x::Array{Float64,1})
    @assert ndims(x) == 1 && size(x) == (size(l.W)[2],)
    l.last_input  = x
    l.last_output = l.W * x # matrix multiplication
    l.last_output
end

function backward(l::FCLayer, loss::Array{Float64,1})
    @assert size(loss) == (size(l.W)[1],)
    l.last_loss = loss
    l.W'*loss
end

function gradient(l::FCLayer)
    @assert size(l.last_loss) == (size(l.W)[1],)
    l.last_loss * l.last_input'
end

function getParam(l::FCLayer)
    l.W
end

function setParam(l::FCLayer, theta::Array{Float64})
    @assert size(l.W) == size(theta)
    l.W = theta
end

l = FCLayer(10,20)
forward(l, rand(10))



20-element Array{Float64,1}:
 1.35371 
 1.28055 
 0.943183
 1.70016 
 1.85538 
 1.43631 
 1.67573 
 1.14401 
 2.14995 
 1.44026 
 1.61698 
 0.652001
 1.7755  
 1.17284 
 1.29214 
 1.95913 
 1.43071 
 1.11893 
 1.43883 
 1.18264 

In [143]:
# Define the ReLu layers
type ReLu <: Nonlinearity
    alpha       :: Float64
    last_input  :: Array{Float64}
    last_output :: Array{Float64}
    last_loss   :: Array{Float64}
    function ReLu(alpha::Float64 = 1.0)
        @assert alpha >= 0.
        return new(alpha, Float64[], Float64[], Float64[])
    end
end

function forward(l::ReLu, x::Array{Float64})
    l.last_input  = x
    l.last_output = map(y -> max(0., y*l.alpha), x)
    l.last_output
end

function backward(l::ReLu, loss::Array{Float64})
    @assert size(l.last_input) == size(loss)
    l.last_loss = loss
    map(idx -> l.last_input[idx]>=0 ? l.last_input[idx]*l.alpha*loss[idx] : 0., 1:length(l.last_input))
end

function gradient(l::ReLu)
    0
end

function getParam(l::ReLu)
    0
end

function setParam(l::ReLu, theta)
    nothing
end

l = ReLu()
#println(forward(l, [1.,0.,-1.,2.]))
#println(backward(l, [3.0,2.0,1.,1.0]))



ReLu(1.0,Float64[],Float64[],Float64[])

In [144]:
type CrossEntropyLoss <: LossCriteria
    last_loss  :: Array{Float64}
    last_input :: Array{Float64}
    function CrossEntropyLoss()
        return new(Float64[], Float64[])
    end
end    

function forward(l::CrossEntropyLoss, y::Array{Float64,1}, label::Array{Float64, 1})
    """
    [label]  label[i] == 1 iff the data is classified to class i
    [y]      final input to the loss layer
    """
    local class = convert(Int64,label[1]) + 1
    local ysubt = y - maximum(y)
    local ynorm = (e .^ ysubt) / sum(e .^ ysubt)
    local loss  = (-log(ynorm))[class]
    if loss > e^3
#         print("Loss:$(loss); y=$(y); Y-subtract:$(ysubt); Y-normalized:$(ynorm)")
        loss = e^3
    end
    println("Loss layer:$(loss)")
    return loss
end

function backward(l::CrossEntropyLoss, x::Array{Float64,1}, label::Array{Float64, 1})
    """
    [label]  label[i] == 1 iff the data is classified to class i
    [y]      final input to the loss layer
    """
    local class = convert(Int64,label[1]) + 1
    local t = zeros(length(x))
    t[class] = 1.
    @assert sum(t) == 1 && minimum(t) >= 0
    local max = maximum(x)
    local y = e.^(x-max) / sum(e.^(x-max))
    local dldy = y - t
    return dldy
end
l = CrossEntropyLoss()
println(forward(l, [1.,2.,0.], [2.]))
println(backward(l, [1.,2.,0.], [2.]))

Loss layer:2.40760596444438
2.40760596444438
[0.244728,0.665241,-0.909969]




In [145]:
abstract NN
type SequentialNet <: NN
    layers :: Array{Layer}
    lossfn :: LossCriteria
    function SequentialNet(layers::Array{Layer}, lossfn::LossCriteria)
        return new(layers, lossfn)
    end
end

function forward(net::SequentialNet, x::Array{Float64}, label::Array)
    local inp = x
    for i = 1:length(net.layers)
        inp = forward(net.layers[i], inp)
    end
    local loss = forward(net.lossfn, inp, label)
    println("Network bastract loss:$(loss)")
    return loss
end

function backward(net::SequentialNet, label)
    local dldy = backward(net.lossfn, net.layers[end].last_output, label)
    for i = length(net.layers):-1:1
        dldy = backward(net.layers[i], dldy)
    end
    return dldy
end



backward (generic function with 4 methods)

In [146]:
layers = [
    FCLayer(784, 196),
    ReLu(),
    FCLayer(196, 49),
    ReLu(),
    FCLayer(49, 10)
]
criteria = CrossEntropyLoss()
net = SequentialNet(layers, criteria)



SequentialNet(Layer[FCLayer([0.458559 0.711546 … 0.389605 0.883198; 0.280016 0.322319 … 0.918423 0.623072; … ; 0.420409 0.351594 … 0.381774 0.195997; 0.791072 0.0800652 … 0.480632 0.936868],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]),ReLu(1.0,Float64[],Float64[],Float64[]),FCLayer([0.267457 0.930258 … 0.459908 0.895927; 0.202104 0.0665172 … 0.810728 0.878629; … ; 0.022329 0.0029283 … 0.788282 0.911252; 0.458837 0.656028 … 0.509612 0.525306],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]),ReLu(1.0,Float64[],Float64[],Float64[]),FCLayer([0.776261 0.0900316 … 0.731216 0.

In [141]:
function sgd(net::SequentialNet, batch_X, batch_Y, lr::Float64 = 0.0001)
    local batch_size = size(batch_X)[1]
    local ttl_loss   = 0.
    local gradients  = []
    for i = 1:length(net.layers)
        local layer = net.layers[i]
        append!(gradients,zeros(size(getParam(layer))))
    end
    for b = 1:batch_size
        local X = batch_X[b,:] 
        local Y = batch_Y[b,:]
        local loss = forward(net, X, Y) # Propogate the input and output, calculate the loss
        #println(net.layers[1].last_output)
        backward(net, Y) # Propagate the dldy
        for i = 1:length(net.layers)
            gradients[i] += gradient(net.layers[i]) 
        end
        ttl_loss += loss
    end
    for i = 1:length(net.layers)
        local layer = net.layers[i]
        setParam(layer, getParam(layer) - lr * gradients[i] / batch_size )
    end

    return ttl_loss
end

function train(net::SequentialNet, X, Y)
    local batch_size = 64
    local N = size(Y)[1]
    local batch=0
    for epo = 1:2
        println("Epo $(epo):")
        local num_batch = ceil(length(X)/batch_size)-1
        for bid = 0:num_batch
            batch += 1
            local sidx::Int = convert(Int64, bid*batch_size+1)
            local eidx::Int = convert(Int64, min(N, (bid+1)*batch_size))
            local batch_X = X[sidx:eidx,:]
            local batch_Y = Y[sidx:eidx,:]
            local loss = sgd(net, batch_X, batch_Y)
            println("[$(bid)/$(num_batch)]Loss is: $(loss)")
        end
    end
end

@assert size(trainX)[1] == size(trainY)[1]
println(size(trainX), size(trainY))

train(net, trainX, trainY)

(64,784)(64,1)
Epo 1:
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN




Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastract loss:NaN
Loss layer:NaN
Network bastrac

In [92]:
minimum(trainX[1,:])

0.0