In [157]:
using ReverseDiff
using ReverseDiff: @forward
using ForwardDiff
using DiffResults

In [234]:
"""
    relu(x)

Rectified linear unit: return `x` unchanged when `x >= 0`, otherwise `zero(x)`.
"""
function relu(x)
    x >= 0 && return x
    return zero(x)
end
"""
    leaky_relu(x, slope = 0.25)

Leaky rectified linear unit: return `x` when `x >= 0`, otherwise `slope * x`.

`slope` defaults to `0.25`, the value that used to be hard-coded, so existing
one-argument calls return exactly what they did before.
"""
leaky_relu(x, slope = 0.25) = x >= 0 ? x : slope * x
"""
    elu(x, α = 1)

Exponential linear unit: return `x` when `x > 0`, otherwise `α * (exp(x) - 1)`.

The negative branch is evaluated with `expm1`, which keeps full precision for
inputs close to zero, where computing `exp(x) - 1` directly loses digits to
cancellation.
"""
elu(x, α = 1) = x > 0 ? x : α * expm1(x)

elu (generic function with 2 methods)

In [245]:
# Single-unit model: scale `x` by the first parameter, shift by the second, and
# apply `elu` element-wise through ReverseDiff's `@forward` wrapper.
# The parameters are taken as length-1 slices (`params[1:1]`, `params[2:2]`), so
# they enter the broadcast as arrays rather than scalars.
# Without the activation this would be the affine map `params[1] .* x .+ params[2]`.
forward_pass = function (params, x)
    return @forward(elu).(x .* params[1:1] .+ params[2:2])
end
    

"""
    predict(params, x)

Evaluate the model at input `x` with parameters `params` by delegating to
`forward_pass`.
"""
function predict(params, x)
    return forward_pass(params, x)
end
"""
    predict_sensitivity(params, x)

Return the model output and its Jacobian with respect to `x`, concatenated
horizontally: `hcat(predict(params, x), J)` where `J` is computed by
`ForwardDiff.jacobian`.
"""
function predict_sensitivity(params, x)
    # Close over `params` so the model becomes a function of the input only.
    model = input -> predict(params, input)
    output = model(x)
    jac = ForwardDiff.jacobian(model, x)
    return hcat(output, jac)
end

predict_sensitivity (generic function with 2 methods)

In [246]:

# Single training input.
x = Float64[1.0]
# Target row compared against `predict_sensitivity(params, x)` in `loss`:
# the model output followed by its derivative with respect to `x`.
yJ = Float64[1.0 2.0]
"""
    loss(params, x, yJ; weights = [1.0 1.0])

Weighted sum of squared residuals between `predict_sensitivity(params, x)` and
the target `yJ`.

`weights` is broadcast against the residual. The default is a 1×2 *row*, so for
the length-1 `x` used here each weight scales exactly one residual column (model
output, then derivative). A 2-element column vector would instead broadcast the
1×2 residual into a 2×2 outer product, counting every residual once per weight
and preventing the weights from acting per column. With unit weights the value
is therefore half of what the column-vector form produced.
"""
loss(params, x, yJ; weights = [1.0 1.0]) =
    sum(abs2, weights .* (predict_sensitivity(params, x) .- yJ))

# Random starting point for the two model parameters.
params = randn(2)
# Second derivative of the (single-element) model output with respect to the
# input, evaluated at the starting parameters.
ForwardDiff.hessian(input -> predict(params, input)[], x)

1×1 Array{Float64,2}:
 0.220555

In [252]:
# Plain gradient descent: 1000 steps with a fixed step size of 1e-2, updating
# `params` in place.
for _ in 1:1000
    grad = ReverseDiff.gradient(p -> loss(p, x, yJ), params)
    @. params -= 1e-2 * grad
end

In [253]:
# Model output and input-derivative after training, for comparison with the target `yJ`.
predict_sensitivity(params, x)

1×2 Array{Float64,2}:
 1.0  2.0

In [254]:
# Gradient of the loss with respect to the parameters at the trained values.
ReverseDiff.gradient(candidate -> loss(candidate, x, yJ), params)

2-element Array{Float64,1}:
 -2.66169e-8
  4.30671e-8

In [255]:
# Second derivative of the model output with respect to the input, now at the
# trained parameters.
ForwardDiff.hessian(input -> predict(params, input)[], x)

1×1 Array{Float64,2}:
 0.0

In [256]:
# Display the parameter values reached by the gradient-descent loop.
params

2-element Array{Float64,1}:
  2.0
 -1.0

In [258]:
# Record `predict` on a gradient tape for the current `(params, x)` and compile it.
tape = ReverseDiff.GradientTape(predict, (params, x)) |> ReverseDiff.compile

ReverseDiff.CompiledTape(predict)

In [259]:
# Run ReverseDiff's `seeded_forward_pass!` on the compiled tape with the current inputs.
# NOTE(review): this does not appear to be a documented ReverseDiff entry point and its
# return value is not shown here — confirm it is the intended call.
ReverseDiff.seeded_forward_pass!(tape, (params, x))