# MLP on MINST Dataset

In [1]:
Pkg.update()
Pkg.add("MNIST")

INFO: Updating METADATA...
INFO: Computing changes...
INFO: No packages to install, update or remove
INFO: Nothing to be done


In [4]:
using MNIST
features = trainfeatures(1)
label = trainlabel(1)

trainX, trainY = traindata()
testX, testY = testdata()

trainX = trainX'
ttl = 64
trainX, trainY = trainX[1:ttl,:], trainY[1:ttl,:]

(
[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0],

[5.0; 0.0; … ; 6.0; 0.0])

In [5]:
size(trainX), size(trainY)

((64,784),(64,1))

In [6]:
abstract Layer
abstract Nonlinearity <: Layer
abstract LossCriteria <: Layer

In [7]:
# Define the Fully Connected layers
type FCLayer <: Layer
    W           :: Array{Float64}
    last_input  :: Array{Float64}
    last_output :: Array{Float64}
    last_loss   :: Array{Float64}

    function FCLayer(i, o)
        return new(rand(o,i), zeros(i), zeros(o), zeros(o))
    end
end

function forward(l::FCLayer, x::Array{Float64,1})
    @assert ndims(x) == 1 && size(x) == (size(l.W)[2],)
    l.last_input  = x
    l.last_output = l.W * x # matrix multiplication
    l.last_output
end

function backward(l::FCLayer, loss::Array{Float64,1})
    @assert size(loss) == (size(l.W)[1],)
    l.last_loss = loss
    println("At FC loss is:")
    println(loss)
    l.W'*loss
end

function gradient(l::FCLayer)
    @assert size(l.last_loss) == (size(l.W)[1],)
    l.last_loss * l.last_input'
end

function getParam(l::FCLayer)
    l.W
end

function setParam(l::FCLayer, theta::Array{Float64})
    @assert size(l.W) == size(theta)
    l.W = theta
end

l = FCLayer(10,20)
forward(l, rand(10))

20-element Array{Float64,1}:
 3.67384
 3.51317
 2.74777
 2.86112
 3.48544
 2.14243
 3.03312
 3.33919
 4.01502
 3.40085
 2.67561
 2.94025
 3.85563
 3.68353
 3.78292
 3.3746 
 3.18694
 2.92315
 2.61781
 2.47392

In [8]:
# Define the ReLu layers
type ReLu <: Nonlinearity
    alpha       :: Float64
    last_input  :: Array{Float64}
    last_output :: Array{Float64}
    last_loss   :: Array{Float64}
    function ReLu(alpha::Float64 = 1.0)
        @assert alpha >= 0.
        return new(alpha, Float64[], Float64[], Float64[])
    end
end

function forward(l::ReLu, x::Array{Float64})
    l.last_input  = x
    l.last_output = map(y -> max(0., y*l.alpha), x)
    l.last_output
end

function backward(l::ReLu, loss::Array{Float64})
    @assert size(l.last_input) == size(loss)
    println("At ReLu loss is:")
    println(loss)
    l.last_loss = loss
    map(idx -> l.last_input[idx]>=0 ? l.last_input[idx]*l.alpha*loss[idx] : 0., 1:length(l.last_input))
end

function gradient(l::ReLu)
    0
end

function getParam(l::ReLu)
    0
end

function setParam(l::ReLu, theta)
    nothing
end

l = ReLu()
#println(forward(l, [1.,0.,-1.,2.]))
#println(backward(l, [3.0,2.0,1.,1.0]))

ReLu(1.0,Float64[],Float64[],Float64[])

In [22]:
type CrossEntropyLoss <: LossCriteria
    last_loss  :: Array{Float64}
    last_input :: Array{Float64}
    function CrossEntropyLoss()
        return new(Float64[], Float64[])
    end
end    

function forward(l::CrossEntropyLoss, y::Array{Float64,1}, label::Array{Float64, 1})
    """
    [label]  label[i] == 1 iff the data is classified to class i
    [y]      final input to the loss layer
    """
    class = convert(Int64,label[1]) + 1
    #println("y is:")
    #println(y)
    #println("y - max : ")
    #println(y-maximum(y))
    #println("after: ")
    #println( -log(e .^ (y-maximum(y)) ./ sum(e .^ (y-maximum(y))))[class])
    return ( -log(e .^ (y-maximum(y)) ./ sum(e .^ (y-maximum(y))))[class])
end

function backward(l::CrossEntropyLoss, x::Array{Float64,1}, label::Array{Float64, 1})
    """
    [label]  label[i] == 1 iff the data is classified to class i
    [y]      final input to the loss layer
    """
    class = convert(Int64,label[1]) + 1
    l = zeros(length(x))
    l[class] = 1.
    @assert sum(l) == 1 && minimum(l) >= 0
    max = maximum(x)
    y = e.^(x-max) / sum(e.^(x-max))
    dldy = y - l
    println("y normalized is $(y)")
    println("dldy is $(dldy)")
    return dldy
end
l = CrossEntropyLoss()
#println(forward(l, [1.,2.,0.], [2.]))
#println(backward(l, [1.,2.,0.], [2.]))



CrossEntropyLoss(Float64[],Float64[])

In [23]:
abstract NN
type SequentialNet <: NN
    layers :: Array{Layer}
    lossfn :: LossCriteria
    function SequentialNet(layers::Array{Layer}, lossfn::LossCriteria)
        return new(layers, lossfn)
    end
end

function forward(net::SequentialNet, x::Array{Float64}, label::Array)
    local inp = x
    for i = 1:length(net.layers)
        inp = forward(net.layers[i], inp)
    end
    forward(net.lossfn, inp, label)
end

function backward(net::SequentialNet, label)
    dldy = backward(net.lossfn, net.layers[end].last_output, label)
    for i = length(net.layers):-1:1
        dldy = backward(net.layers[i], dldy)
    end
    dldy
end



backward (generic function with 4 methods)

In [24]:
layers = [
    FCLayer(784, 196),
    ReLu(),
    FCLayer(196, 49),
    ReLu(),
    FCLayer(49, 10)
]
criteria = CrossEntropyLoss()
net = SequentialNet(layers, criteria)



SequentialNet(Layer[FCLayer([0.61838 0.254285 … 0.809475 0.42235; 0.688391 0.541867 … 0.828351 0.653934; … ; 0.963204 0.888338 … 0.240882 0.706058; 0.781338 0.961596 … 0.990359 0.309834],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]),ReLu(1.0,Float64[],Float64[],Float64[]),FCLayer([0.039385 0.847113 … 0.862584 0.8009; 0.49094 0.299299 … 0.0599353 0.590931; … ; 0.400136 0.493112 … 0.780808 0.646728; 0.659503 0.427899 … 0.640264 0.257329],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]),ReLu(1.0,Float64[],Float64[],Float64[]),FCLayer([0.320568 0.501893 … 0.794415 0.886375; 

In [25]:
function sgd(net::SequentialNet, batch_X, batch_Y, lr::Float64 = 0.0001)
    batch_size = size(batch_X)[1]
    ttl_loss   = 0.
    for b = 1:batch_size
        X, Y = batch_X[b,:], batch_Y[b,:]
        loss = forward(net, X, Y) # Propogate the input and output, calculate the loss
        #println(net.layers[1].last_output)
        backward(net, Y) # Propagate the dldy
        for l = 1:length(net.layers)
            layer = net.layers[l]
            #println(layer.last_loss)
            setParam(layer, getParam(layer) - lr * gradient(layer) / batch_size )
        end
        ttl_loss += loss
    end
    ttl_loss
end

function train(net::SequentialNet, X, Y)
    batch_size, N = 64, size(Y)[1]
    batch=0
    for epo = 1:2
        println("Epo $(epo):")
        for bid = 0:ceil(length(X)/batch_size)-1
            batch += 1
            sidx::Int = convert(Int64, bid*batch_size+1)
            eidx::Int = convert(Int64, min(N, (bid+1)*batch_size))
            batch_X = X[sidx:eidx,:]
            batch_Y = Y[sidx:eidx,:]
            loss = sgd(net, batch_X, batch_Y)
            println("Loss is:")
            println(loss)
            #println("[Epo $(epo) : batch $(batch)]: loss = $(loss)")
        end
    end
end

@assert size(trainX)[1] == size(trainY)[1]
println(size(trainX), size(trainY))

train(net, trainX, trainY)

(64,784)(64,1)
Epo 1:




y normalized is [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]
dldy is [0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0]
At FC loss is:
[0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0]
At ReLu loss is:
[0.277587,0.505294,0.475789,0.122522,0.544657,0.479699,0.642427,0.493569,0.404607,-0.669819,0.0732646,0.328313,0.506599,0.158754,-0.0534308,0.037522,-0.0552032,0.276901,-0.682483,0.0764058,-0.071843,-0.539555,-0.556893,-0.449544,0.321734,-0.106337,-0.166696,-0.152032,0.335478,-0.285311,0.462932,-0.277164,-0.19997,-0.527903,-0.071115,-0.632826,0.614966,-0.289084,-0.095916,-0.562261,0.0549359,0.11703,-0.0449488,0.697334,-0.552127,-0.331971,0.170789,-0.323272,0.514054]
At FC loss is:
[3.86801e5,658114.0,6.09247e5,1.65344e5,7.2527e5,6.65659e5,8.90814e5,6.90358e5,5.20009e5,-859286.0,1.00423e5,451014.0,7.04361e5,207727.0,-70382.1,50875.5,-78185.8,3.82249e5,-8.8736e5,1.10641e5,-98145.1,-7.06129e5,-7.72491e5,-5.91817e5,4.38446e5,-1.50927e5,-2.31649e5,-2.00229e5,438070.0,-3.86269e5,5.99783e5,-3.83007e5,-270565.0,

In [170]:
minimum(trainX[1,:])

0.0