In [60]:
"""
    train_two_layer!(W1, b1, W2, x_in, y; α=1e-6, epochs=1000, report_every=100)

Train the one-hidden-layer ReLU network `out = W2 * max.(W1 * x_in .+ b1, 0)` with
full-batch gradient descent on the summed squared error `sum(abs2, out .- y)`.

`W1`, `b1` and `W2` are updated in place. Whenever the epoch number is a multiple of
`report_every`, the loss (truncated to an integer) is printed.

# Arguments
- `W1`: `[n_h, n_in]` input-to-hidden weights (mutated).
- `b1`: `[n_h, M]` additive hidden-layer offset, one column per example (mutated).
- `W2`: `[n_out, n_h]` hidden-to-output weights (mutated).
- `x_in`: `[n_in, M]` inputs, one example per column.
- `y`: `[n_out, M]` targets, one example per column.

# Keywords
- `α`: learning rate.
- `epochs`: number of gradient steps to take.
- `report_every`: reporting period in epochs; must be positive.

# Throws
- `ArgumentError` if `report_every` is not positive.

Returns `nothing`.
"""
function train_two_layer!(W1, b1, W2, x_in, y; α=1e-6, epochs=1000, report_every=100)
    report_every > 0 ||
        throw(ArgumentError("report_every must be positive, got $report_every"))

    for t in 1:epochs
        # forward pass
        hidden_1 = W1 * x_in .+ b1           # [n_h, n_in] * [n_in, M] = [n_h, M]
        activated_1 = max.(hidden_1, 0)      # ReLU, [n_h, M]
        out = W2 * activated_1               # [n_out, n_h] * [n_h, M] = [n_out, M]

        # loss: summed squared error over all outputs and examples
        residual = out .- y                  # [n_out, M]
        loss = sum(abs2, residual)

        # backprop (every gradient is computed before any parameter is touched)
        ∇out = 2 .* residual                 # [n_out, M]
        ∇W2 = ∇out * activated_1'            # [n_out, M] * [M, n_h] = shape of W2
        ∇hidden_1 = W2' * ∇out               # [n_h, n_out] * [n_out, M] = [n_h, M]
        # ReLU passes no gradient where the pre-activation was negative. The
        # broadcasting `.=` is required to write one scalar to many locations.
        ∇hidden_1[hidden_1 .< 0] .= 0
        ∇W1 = ∇hidden_1 * x_in'              # [n_h, M] * [M, n_in] = shape of W1

        # In-place updates: the caller's arrays are mutated rather than rebound, so
        # the result does not depend on top-level loop scoping rules.
        W1 .-= α .* ∇W1
        W2 .-= α .* ∇W2
        b1 .-= α .* ∇hidden_1

        if t % report_every == 0
            println("loss $(trunc(Int,loss)) at run $t")
        end
    end
    return nothing
end

@time begin

n_in, n_h, n_out = 5, 4, 2
M = 1000  # number of training examples

# `rand` already returns `Array{Float64}`; no conversion wrapper is needed.
W1 = rand(n_h, n_in)
# NOTE(review): b1 has one column per training example, so it is not shared across
# examples the way a conventional [n_h, 1] bias would be -- confirm this is intended.
b1 = rand(n_h, M)
W2 = rand(n_out, n_h)

x_in = rand(n_in, M)
y = rand(n_out, M)

α = 1e-6  # learning rate

train_two_layer!(W1, b1, W2, x_in, y; α=α, epochs=1000)
end

loss 502 at run 100
loss 255 at run 200
loss 248 at run 300
loss 245 at run 400
loss 242 at run 500
loss 240 at run 600
loss 237 at run 700
loss 234 at run 800
loss 232 at run 900
loss 230 at run 1000
  0.274577 seconds (180.55 k allocations: 288.172 MiB, 6.80% gc time)
