Merge pull request #1 from pluskid/master
Catch up to master
zacsketches authored Oct 28, 2016
2 parents 94bc8d0 + 9494ecd commit 9a39c87
Showing 11 changed files with 296 additions and 8 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
@@ -4,8 +4,7 @@ os:
- linux
- osx
julia:
- - 0.3
- - 0.4
+ - 0.5
- nightly
notifications:
email: false
2 changes: 1 addition & 1 deletion deps/build.jl
@@ -5,7 +5,7 @@ flags = ["-fPIC", "-Wall", "-O3", "-shared"]
libname = "libmochaext.so"
openmp = "-fopenmp"

-@osx? begin
+@static is_apple() ? begin
if !haskey(ENV, "MOCHA_FORCE_OMP")
println("OpenMP is currently not officially supported by OS X Clang compiler yet.")
println("(see http://clang-omp.github.io/ to install OpenMP clang extension, or")
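
Note on this hunk and on the test/layers/hdf5-data.jl hunk further down: as part of the Julia 0.5 migration, the @osx? and @windows? conditional macros were replaced by the is_apple() and is_windows() predicates guarded by @static, which selects the branch at parse time. A minimal sketch of the new idiom (Julia 0.5-era Base, illustrative only, not part of this diff):

# Julia 0.4 form (removed):  @osx? osx_branch : other_branch
# Julia 0.5 form (added):
@static is_apple() ? println("configuring an OS X build") : println("configuring a Linux/other build")
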
2 changes: 2 additions & 0 deletions src/Mocha.jl
@@ -89,6 +89,8 @@ include("solvers/policies.jl")
include("solvers/sgd.jl")
include("solvers/nesterov.jl")
include("solvers/adam.jl")
include("solvers/adagrad.jl")
include("solvers/adadelta.jl")

if Config.use_cuda
include("cuda/solvers.jl")
4 changes: 2 additions & 2 deletions src/cuda/layers/binary-cross-entropy-loss.jl
@@ -6,7 +6,7 @@ function forward(backend::GPUBackend, state::BinaryCrossEntropyLossLayerState, i
num = get_num(pred)
dim = length(pred)

-x_block = int(ceil(convert(Float64, dim)/CUDA.THREADS_PER_BLOCK_X))
+x_block = Int(ceil(convert(Float64, dim)/CUDA.THREADS_PER_BLOCK_X))

loss_blob = make_zero_blob(backend, Float32, 1, 1, 1, 1)

@@ -39,7 +39,7 @@ function backward(backend::GPUBackend, state::BinaryCrossEntropyLossLayerState,
num = get_num(pred)
dim = length(pred)

-x_block = int(ceil(convert(Float64, dim)/CUDA.THREADS_PER_BLOCK_X))
+x_block = Int(ceil(convert(Float64, dim)/CUDA.THREADS_PER_BLOCK_X))

if data_type == Float32
kernel = backend.mocha.binary_cross_entropy_loss_backward_float
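
Similarly, the lowercase int(x) conversion was removed in Julia 0.5; Int(ceil(x)) keeps the old behaviour, and ceil(Int, x) is an equivalent, more idiomatic spelling. A small sketch of the block-count computation with an illustrative thread count (the real constant lives in Mocha's CUDA module):

threads_per_block = 128                                        # illustrative value
dim = 1000                                                     # elements the kernel must cover
x_block = Int(ceil(convert(Float64, dim)/threads_per_block))   # form used in this diff
@assert x_block == ceil(Int, dim/threads_per_block) == 8       # same result without the Float64 round trip
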
2 changes: 1 addition & 1 deletion src/layers/random-mask.jl
@@ -18,7 +18,7 @@ function setup(backend::Backend, layer::RandomMaskLayer, inputs::Vector{Blob}, d
dropouts = Array(DropoutLayerState, length(inputs))
for i = 1:length(inputs)
dropout_layer = DropoutLayer(name="$(layer.name)-dropout-$i", auto_scale=false, ratio=layer.ratio,
-bottoms=Symbol[symbol("$(layer.bottoms[i])-$i")])
+bottoms=Symbol[Symbol("$(layer.bottoms[i])-$i")])
dropouts[i] = setup(backend, dropout_layer, Blob[inputs[i]], Blob[diffs[i]])
end
return RandomMaskLayerState(layer, dropouts)
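
The lowercase symbol("...") constructor was likewise deprecated in Julia 0.5 in favor of the type constructor Symbol("..."); the same rename appears in test/layers/random-mask.jl below. A tiny sketch with a hypothetical layer name:

layer_name = "random-mask"                 # hypothetical value for illustration
i = 2
Symbol("$(layer_name)-dropout-$i")         # => Symbol("random-mask-dropout-2")
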
91 changes: 91 additions & 0 deletions src/solvers/adadelta.jl
@@ -0,0 +1,91 @@
# An implementation of Adadelta: An Adaptive Learning Rate Method
# in Mocha.jl
# CREATED BY: ALEXANDER AMINI
#################################################################

export Adadelta

immutable Adadelta <: SolverMethod
end

make_solver_parameters(method::Adadelta; kwargs...) =
merge( make_solver_parameters(rho=0.95, eps=1e-6), SolverParameters(kwargs) )

validate_parameters(method::Adadelta, params::SolverParameters) = validate_parameters(params, :rho, :eps)

type AdadeltaSolverState <: InternalSolverState
param_states :: Vector{LayerState}
gradients_sq :: Vector{Vector{Blob}}
deltas_sq :: Vector{Vector{Blob}}
end

type AdadeltaSolverSnapshot <: SolverStateSnapshot
iteration :: Int
obj_val :: Float64
end

function snapshot(state::SolverState{AdadeltaSolverState})
AdadeltaSolverSnapshot(state.iter, state.obj_val)
end

solver_state(net::Net, snapshot::AdadeltaSolverSnapshot) = begin
SolverState{AdadeltaSolverState}(snapshot.iteration, snapshot.obj_val,
Dict(), AdadeltaSolverState(net))
end

solver_state(method::Adadelta, net::Net, params::SolverParameters) = begin
SolverState(AdadeltaSolverState(net))
end


AdadeltaSolverState(net::Net) = begin
param_states = updatable_layer_states(net)

gradients_sq = Array(Vector{Blob}, length(param_states))
deltas_sq = Array(Vector{Blob}, length(param_states))

for i = 1:length(param_states)
state = param_states[i]
gradients_sq[i] = [make_zero_blob(net.backend, eltype(x.blob),size(x.blob)...) for x in state.parameters]
deltas_sq[i] = [make_zero_blob(net.backend, eltype(x.blob),size(x.blob)...) for x in state.parameters]
end
return AdadeltaSolverState(param_states, gradients_sq, deltas_sq)
end


function shutdown(state::SolverState{AdadeltaSolverState})
map(x -> map(destroy, x), state.internal.gradients_sq)
map(x -> map(destroy, x), state.internal.deltas_sq)
end

function update(solver::Solver{Adadelta}, net::Net, state::SolverState{AdadeltaSolverState})

for i = 1:length(state.internal.param_states)
layer_state = state.internal.param_states[i]
gradients_sq = state.internal.gradients_sq[i]
deltas_sq = state.internal.deltas_sq[i]
for j = 1:length(layer_state.parameters)
gradSq = gradients_sq[j]
deltaSq = deltas_sq[j]
gradient = layer_state.parameters[j].gradient
data_type = eltype(gradSq)

update_parameters!(net, solver.method, solver.params[:rho], solver.params[:eps],
layer_state.parameters[j].blob, gradSq, deltaSq, gradient, data_type)
end
end
end

function update_parameters!(net::Net{CPUBackend}, method::Adadelta, rho, eps,
param_blob, gradSq, deltaSq, gradient, data_type)

BLAS.scal!(length(gradSq), convert(data_type, rho), gradSq.data, 1)
BLAS.axpy!(length(gradSq), convert(data_type, 1-rho), pointer(gradient.data.^2), 1, gradSq.data, 1)

deltas = (sqrt(deltaSq.data+eps) ./ sqrt(gradSq.data+eps)) .* gradient.data
BLAS.scal!(length(gradSq), convert(data_type, rho), deltaSq.data, 1)
BLAS.axpy!(length(gradSq), convert(data_type, 1-rho), pointer(deltas.^2), 1, deltaSq.data, 1)

BLAS.axpy!(length(gradSq), convert(data_type, -1), pointer(deltas), 1, param_blob.data, 1)

end
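
For reference, update_parameters! above implements the Adadelta rule described in the paper named in the header comment: keep running averages of squared gradients and squared updates, scale each gradient by sqrt(E[dx^2] + eps) / sqrt(E[g^2] + eps), and subtract the result from the parameters. A minimal per-element sketch of the same arithmetic without the BLAS calls (adadelta_step! is an illustrative helper, not part of the Mocha API):

function adadelta_step!(theta, gradSq, deltaSq, grad; rho=0.95, eps=1e-6)
    for k in eachindex(theta)
        # accumulate squared gradients: E[g^2] <- rho*E[g^2] + (1-rho)*g^2
        gradSq[k] = rho * gradSq[k] + (1 - rho) * grad[k]^2
        # scale the gradient by RMS of past updates over RMS of gradients
        delta = sqrt(deltaSq[k] + eps) / sqrt(gradSq[k] + eps) * grad[k]
        # accumulate squared updates: E[dx^2] <- rho*E[dx^2] + (1-rho)*dx^2
        deltaSq[k] = rho * deltaSq[k] + (1 - rho) * delta^2
        # descend
        theta[k] -= delta
    end
    return theta
end
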
83 changes: 83 additions & 0 deletions src/solvers/adagrad.jl
@@ -0,0 +1,83 @@
####################################################################
#### An implementation of Adagrad: Adaptive Subgradient Methods ####
###########for Online Learning and Stochastic Optimization##########
########################### in Mocha.jl ############################
################### CREATED BY: ALEXANDER AMINI ####################
####################################################################

export Adagrad

immutable Adagrad <: SolverMethod
end

make_solver_parameters(method::Adagrad; kwargs...) =
merge( make_solver_parameters(gamma=1.0, epsilon=1e-8), SolverParameters(kwargs))

validate_parameters(method::Adagrad, params::SolverParameters) = validate_parameters(params, :gamma, :epsilon)

type AdagradSolverState <: InternalSolverState
param_states :: Vector{LayerState}
param_history :: Vector{Vector{Blob}}
end

type AdagradSolverSnapshot <: SolverStateSnapshot
iteration :: Int
obj_val :: Float64
end

function snapshot(state::SolverState{AdagradSolverState})
AdagradSolverSnapshot(state.iter, state.obj_val)
end

solver_state(net::Net, snapshot::AdagradSolverSnapshot) = begin
SolverState{AdagradSolverState}(snapshot.iteration, snapshot.obj_val,
Dict(), AdagradSolverState(net))
end

solver_state(method::Adagrad, net::Net, params::SolverParameters) = begin
SolverState(AdagradSolverState(net))
end

AdagradSolverState(net::Net) = begin
param_states = updatable_layer_states(net)

param_history = Array(Vector{Blob}, length(param_states))

for i = 1:length(param_states)
state = param_states[i]
param_history[i] = [make_zero_blob(net.backend, eltype(x.blob),size(x.blob)...) for x in state.parameters]
end
return AdagradSolverState(param_states, param_history)
end

function shutdown(state::SolverState{AdagradSolverState})
map(x -> map(destroy, x), state.internal.param_history)
end

function update(solver::Solver{Adagrad}, net::Net, state::SolverState{AdagradSolverState})
for i = 1:length(state.internal.param_states)
layer_state = state.internal.param_states[i]
history = state.internal.param_history[i]
for j = 1:length(layer_state.parameters)
hist_blob = history[j]
gradient = layer_state.parameters[j].gradient
data_type = eltype(hist_blob)

update_parameters!(net, solver.method, solver.params[:gamma], solver.params[:epsilon],
layer_state.parameters[j].blob, hist_blob, gradient, data_type)
end
end
end

function update_parameters!(net::Net{CPUBackend}, method::Adagrad, gamma, epsilon,
param_blob, hist_blob, gradient, data_type)

g2 = gradient.data .^ 2;

# hist_blob += 1* g2 (update with vt-1)
BLAS.axpy!(length(hist_blob), convert(data_type, 1), pointer(g2), 1, pointer(hist_blob.data), 1)
adj_learning_rate = gamma / (epsilon + sqrt(sum(hist_blob.data)))

# param_blob += -adj_learning_rate * gradient
BLAS.axpy!(length(hist_blob), convert(data_type, -adj_learning_rate), gradient.data, 1, param_blob.data, 1)
end
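
For reference, the CPU update above accumulates the squared gradient into hist_blob and then applies a single adjusted learning rate, gamma / (epsilon + sqrt(sum(hist_blob))), to the whole blob; textbook Adagrad instead scales each coordinate by its own accumulated RMS. A minimal sketch of both variants, the first mirroring the arithmetic of this implementation (helper names are illustrative, not Mocha API):

# Blob-wide form, as in update_parameters! above
function adagrad_step_blobwise!(theta, hist, grad; gamma=1.0, epsilon=1e-8)
    for k in eachindex(theta)
        hist[k] += grad[k]^2                    # G <- G + g.^2
    end
    lr = gamma / (epsilon + sqrt(sum(hist)))    # one scalar rate for the whole blob
    for k in eachindex(theta)
        theta[k] -= lr * grad[k]
    end
    return theta
end

# Per-coordinate Adagrad, for comparison
function adagrad_step!(theta, hist, grad; gamma=1.0, epsilon=1e-8)
    for k in eachindex(theta)
        hist[k] += grad[k]^2
        theta[k] -= gamma * grad[k] / (epsilon + sqrt(hist[k]))
    end
    return theta
end
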
2 changes: 1 addition & 1 deletion test/layers/hdf5-data.jl
@@ -128,7 +128,7 @@ end
function test_hdf5_data_layer_shuffle(backend::Backend, batch_size, n, T)
# do not run (non-async) HDF5 data layer shuffling on windows, because it is implemented
# with memmap, which is not working properly on Windows.
-@windows? nothing : test_hdf5_data_layer_shuffle(backend, batch_size, false, n, T)
+@static is_windows() ? nothing : test_hdf5_data_layer_shuffle(backend, batch_size, false, n, T)

test_hdf5_data_layer_shuffle(backend, batch_size, true, n, T)
end
2 changes: 1 addition & 1 deletion test/layers/random-mask.jl
@@ -12,7 +12,7 @@ function test_random_mask_layer(backend, T, eps)
diff_blobs = Blob[make_blob(backend, inputs[i]) for i = 1:n_inputs]

println(" > Setup")
-layer = RandomMaskLayer(bottoms=[symbol("inputs-$i") for i = 1:n_inputs], ratio=ratio)
+layer = RandomMaskLayer(bottoms=[Symbol("inputs-$i") for i = 1:n_inputs], ratio=ratio)
state = setup(backend, layer, input_blobs, diff_blobs)

println(" > Forward")
57 changes: 57 additions & 0 deletions test/solvers/test-adadelta-solver.jl
@@ -0,0 +1,57 @@
function test_adadelta_solver(backend)
println("-- Testing simple Adadelta solver call")
registry_reset(backend)
srand(12345678)
############################################################
# Prepare Random Data
############################################################
N = 5 # works with arbitrary minibatch size as long as
# N == batch_size in MemoryDataLayer so it cycles through
# and gets the same data during forward()
M = 10
P = 4

X = rand(M, N)
W = rand(M, P)
B = rand(P, 1)

Y = (W'*X .+ B)
Y = Y + 0.01*randn(size(Y))

############################################################
# Define network
############################################################
data_layer = MemoryDataLayer(batch_size=N, data=Array[X,Y])

w1 = InnerProductLayer(neuron=Neurons.Sigmoid(), name="ip1",output_dim=20, tops=[:a], bottoms=[:data])
w2 = InnerProductLayer(neuron=Neurons.Identity(), name="ip2",output_dim=4, tops=[:b], bottoms=[:a])
loss_layer = SquareLossLayer(name="loss", bottoms=[:b, :label] )


net = Net("TEST", backend, [w1, w2, loss_layer, data_layer])

# Make a Solver with max iterations 2
method = Adadelta()
params = make_solver_parameters(method,
max_iter=2,
rho=0.95,
eps=1e-6,
load_from="")
solver = Solver(method, params)
solve(solver, net)
# TODO check gradient updates
# TODO check snapshot loading
# TODO check statistic saving
# TODO check other lr policies
# TODO check other mom policies
destroy(net)
end

if test_cpu
test_adadelta_solver(backend_cpu)
end

if test_gpu
test_adadelta_solver(backend_gpu)
end
56 changes: 56 additions & 0 deletions test/solvers/test-adagrad-solver.jl
@@ -0,0 +1,56 @@
function test_adagrad_solver(backend)
println("-- Testing simple Adagrad solver call")
registry_reset(backend)
srand(12345678)
############################################################
# Prepare Random Data
############################################################
N = 5 # works with arbitrary minibatch size as long as
# N == batch_size in MemoryDataLayer so it cycles through
# and gets the same data during forward()
M = 10
P = 4

X = rand(M, N)
W = rand(M, P)
B = rand(P, 1)

Y = (W'*X .+ B)
Y = Y + 0.01*randn(size(Y))

############################################################
# Define network
############################################################
data_layer = MemoryDataLayer(batch_size=N, data=Array[X,Y])

w1 = InnerProductLayer(neuron=Neurons.Sigmoid(), name="ip1",output_dim=20, tops=[:a], bottoms=[:data])
w2 = InnerProductLayer(neuron=Neurons.Identity(), name="ip2",output_dim=4, tops=[:b], bottoms=[:a])
loss_layer = SquareLossLayer(name="loss", bottoms=[:b, :label] )


net = Net("TEST", backend, [w1, w2, loss_layer, data_layer])

# Make a Solver with max iterations 2
method = Adagrad()
params = make_solver_parameters(method,
max_iter=2,
gamma=1.0,
epsilon=1e-6,
load_from="")
solver = Solver(method, params)
solve(solver, net)
# TODO check gradient updates
# TODO check snapshot loading
# TODO check statistic saving
# TODO check other lr policies
# TODO check other mom policies
destroy(net)
end

if test_cpu
test_adagrad_solver(backend_cpu)
end

if test_gpu
test_adagrad_solver(backend_gpu)
end
