# 誤差逆伝播法

❌ gradientの計算ができていない

## 単純なレイヤの実装

In [1]:
type MulLayer
    x::Number
    y::Number
    forward::Function
    backward::Function
    
    function MulLayer()
        instance = new()

        instance.forward = function(x, y) 
            instance.x = x
            instance.y = y
            x * y
        end
        
        instance.backward = function(dout)
            dx = dout * instance.y
            dy = dout * instance.x
            dx, dy
        end
        instance
    end    
end

In [2]:
apple = 100
apple_num = 2
tax = 1.1

# layer
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

println(price)

220.00000000000003


In [3]:
# backward
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

println("$dapple $dapple_num $dtax")

2.2 110.00000000000001 200


In [4]:
type AddLayer
    forward::Function
    backward::Function
    
    function AddLayer()
        forward = function(x, y)
            x+y
        end
        
        backward = function(dout)
            dx = dout * 1
            dy = dout * 1
            dx, dy
        end
        
        new(forward, backward)
    end
end

In [5]:
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# layer
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

# backward
dprice = 1
dall_price, dtax= mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

println(price)
println("$dapple_num $dapple $dorange $dorange_num $dtax")

715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


## 活性化関数レイヤの実装

In [22]:
type Relu
    mask::Array{Float64}
    forward::Function
    backward::Function
    
    function Relu()
        instance = new()
        instance.forward = function(x)
            instance.mask = (x .<=  0)[:]
            out = copy(x)
            out[instance.mask] = 0
            out
        end

        instance.backward = function(dout)
            dout[instance.mask] = 0
            dx = dout
            dx
        end
        instance
    end
end

In [7]:
x = [1.0 -0.5; -2.0 3.0]
println(x)
mask = x .<= 0
println(mask)

[1.0 -0.5; -2.0 3.0]
Bool[false true; true false]


In [8]:
type Sigmoid
    forward::Function
    backward::Function
    
    function Sigmoid()
        instance = new()
        instance.forward = function(x)
            out = 1 ./ (1 .+ exp.(-x))
            out
        end
        
        instance.backward = function(dout)
            dx = dout .* (1.0 .- instance.out) .* instance.out
            dx
        end
        instance
    end
end

## Affine/Softmaxレイヤの実装

### Affineレイヤ

In [9]:
X = rand(2)
W = rand(2, 3)
B = rand(3)

@show size(X)
@show size(W)
@show size(B)

Y = W'*X + B

size(X) = (2,)
size(W) = (2, 3)
size(B) = (3,)


3-element Array{Float64,1}:
 0.415242
 0.452299
 0.973252

### バッチ版Affineレイヤ

In [10]:
X_dot_W = [0 0 0; 10 10 10]
B = [1 2 3]
@show X_dot_W
@show X_dot_W .+ B

X_dot_W = [0 0 0; 10 10 10]
X_dot_W .+ B = [1 2 3; 11 12 13]


2×3 Array{Int64,2}:
  1   2   3
 11  12  13

In [11]:
dY = [1 2 3; 4 5 6]

2×3 Array{Int64,2}:
 1  2  3
 4  5  6

In [12]:
dB = sum(dY,1)

1×3 Array{Int64,2}:
 5  7  9

In [13]:
type Affine
    W::Array{Float64}
    b::Array{Float64}
    x::Array{Float64}
    dW::Array{Float64}
    db::Array{Float64}
    forward::Function
    backward::Function

    function Affine(W, b)
        instance = new()
        instance.W, instance.b = W, b
        
        instance.forward = function(x)
            instance.x = x
            out = instance.W' * x .+ instance.b
            out
        end
        
        instance.backward = function(dout)
            dx = instance.W' * dout
            instance.dW = dout * instance.x'
            instance.db = sum(dout, 1)
            dx
        end
        instance
    end
end

### Softmax-with-Lossレイヤ

In [14]:
type SoftmaxWithLoss
    loss::Float64
    y::Float64
    t::Array{Float64}
    forward::Function
    backward::Function
    
    function SoftmaxWithLoss()
        instance = new()
        instance.forward = function(x, t)
            instance.t = t
            instance.y = softmax(x)
            instance.loss = cross_entropy_error(instance.y, instance.t)
        end
        
        instance.backward = function(dout)
            batch_size = size(instance.t)[1]
            dx = (instance.y - instance.t) / batch_size
            dx
        end
        instance
    end
end

## 誤差逆伝播法の実装

In [15]:
function sigmoid(x)
    1 ./ (1 .+ exp.(-x))
end

function softmax(a)
    exp_a = exp.(a)
    sum_exp_a = sum(exp_a)
    y = exp_a / sum_exp_a
    y
end

function cross_entropy_error(y, t) # one-hot vector
    if ndims(y) == 0
        t = reshape(t, 1, length(t))
        y = reshpae(y, 1, length(y))
    end
    batch_size = size(y)[end]
    -sum(t.*log.(y))/batch_size
end

cross_entropy_error (generic function with 1 method)

In [23]:
using DataStructures

type TwoLayerNet
    params::Dict{String, Array{Float64}}
    predict::Function
    loss::Function
    accuracy::Function
    numerical_gradient::Function
    gradient::Function
    layers::OrderedDict
    lastLayer::SoftmaxWithLoss
    
    function TwoLayerNet(input_size, hidden_size, output_size; weight_init_std=0.01)
        params = Dict()
        params["W1"] = weight_init_std * randn(input_size, hidden_size)
        params["b1"] = zeros(hidden_size)
        params["W2"] = weight_init_std * randn(hidden_size, output_size)
        params["b2"] = zeros(output_size)
        
        layers = OrderedDict()
        layers["Affine1"] = Affine(params["W1"], params["b1"])
        layers["Relu1"] = Relu()
        layers["Affine2"] = Affine(params["W2"], params["b2"])
        lastLayer = SoftmaxWithLoss()
        
        predict = function(x)
            for layer in values(layers)
                x = layer.forward(x[:])
            end
            x
        end
        
        loss = function(x, t)
            y = predict(x)
            cross_entropy_error(y, t)
        end
        
        accuracy = function(x, t)
            y = predict(x)
            tmp = findmax(y, 1)[2]
            y = [tmp[j] - (i-1)*output_size for i in 1:length(tmp)]
            t = indmax(t)
            accuracy = sum(t[i:i+batch_size-1] .== p-1) / float(size(x)[1])
        end
        
        numerical_gradient = function(x, t)
            loss_W = W -> loss(x, t)
            grads = Dict()
            grads["W1"] = numerical_gradient(loss_W, params["W1"])
            grads["b1"] = numerical_gradient(loss_W, params["b1"])
            grads["W2"] = numerical_gradient(loss_W, params["W2"])
            grads["b2"] = numerical_gradient(loss_W, params["b2"])
            grads
        end
        
        gradient = function(x, t)
            # forward
            loss(x, t)
            
            # backward
            dout = 1
            dout = lastLayer.backward(dout)
            
            layers = values(instance.layers)
            layers = reverse(layers)
            for layer in layers
                dout = layer.backward(dout)
            end
            
            grads = Dict()
            grads["W1"] = layers["Affine1"].dW
            grads["b1"]  = layers["Affine1"].db
            grads["W2"] = layers["Affine2"].dW
            grads["b2"]  = layers["Affine2"].db
            grads
        end
        
        new(params, predict, loss, accuracy, numerical_gradient, gradient, layers, lastLayer)
    end    
end

In [17]:
using MLDatasets

x_train, t_train = MNIST.traindata()
x_test, t_test = MNIST.testdata()

network = TwoLayerNet(784, 50, 10)
x_batch = x_train[:, :, 3]
t_batch = t_train[3]

4

In [18]:
grad_numerical = network.numerical_gradient(x_batch, t_batch)

LoadError: [91mStackOverflowError:[39m

In [19]:
Affine(params["W1"], params["b1"]).forward(x_batch[:])

LoadError: [91mUndefVarError: params not defined[39m

In [24]:
Relu().forward(x_batch[:])

LoadError: [91mBoundsError: attempt to access 784-element Array{Float64,1} at index [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]][39m

In [25]:
grad_backprop = network.gradient(x_batch[:], t_batch)

LoadError: [91mBoundsError: attempt to access 50-element Array{Float64,1} at index [[1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]][39m

数値微分できなかったので比較できていない
``` julia
for key in keys(grad_numerical)
    diff = average(abs(grad_backprop[key] - grad_numerical[key]))
    println("$key:$diff")
end
```