-
Notifications
You must be signed in to change notification settings - Fork 254
/
mnist_dropout_fc.jl
106 lines (90 loc) · 4.71 KB
/
mnist_dropout_fc.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Backend selection. The environment variable is set before `using Mocha`
# (presumably Mocha reads it while the package is loaded -- confirm against
# the Mocha documentation). CUDA is requested here because this script
# constructs a GPUBackend further down.
#
# Disabled alternative settings, kept for reference:
#ENV["MOCHA_USE_NATIVE_EXT"] = "true"
#ENV["OMP_NUM_THREADS"] = 1
#blas_set_num_threads(1)
ENV["MOCHA_USE_CUDA"] = "true"
using Mocha
############################################################
# This is an example script for training a fully connected
# network with dropout on mnist.
#
# The network size is 784-1200-1200-10 with ReLU units
# in the hidden layers and a softmax output layer.
# The parameters for training the network were chosen
# to reproduce the results from the original dropout paper:
# http://arxiv.org/abs/1207.0580
# and the corresponding newer JMLR paper:
# http://jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf
#
# Our parameters slightly differ. This is mainly due to the
# fact that in the original dropout paper the weights are scaled
# by 0.5 after training whereas we scale them by 2 during training.
#
# The settings in this script should currently produce a model that
# gets 94 errors (or 99.06 % accuracy) on the test set
# if you run it for the whole 2000 epochs (=600*2000 steps).
# This is slightly better than the results of the JMLR paper.
# This difference is likely due to slight differences in the
# learning parameters. Also note that our hyperparameters
# are not chosen using a validation set, as one would do
# for a paper. If your hardware and CUDA versions differ
# from the setup we used for initial testing, your results might
# also vary slightly due to floating point inaccuracies.
############################################################
# Seed the random number generator so that runs are reproducible.
srand(12345678)

# Training data, read in mini-batches of 100.
data_layer = HDF5DataLayer(
    name = "train-data",
    source = "data/train.txt",
    batch_size = 100)

# Hidden layers: 1200 ReLU units each, weights initialized from a Gaussian
# with std 0.01.
# NOTE(review): unlike fc2 and the output layer, fc1 has NO L2 norm constraint
# on its weights -- `weight_cons = L2Cons(4.5)` is disabled for this layer.
# Confirm whether that is intentional.
fc1_layer = InnerProductLayer(
    name = "fc1",
    bottoms = [:data],
    tops = [:fc1],
    output_dim = 1200,
    neuron = Neurons.ReLU(),
    weight_init = GaussianInitializer(std = 0.01))

fc2_layer = InnerProductLayer(
    name = "fc2",
    bottoms = [:fc1],
    tops = [:fc2],
    output_dim = 1200,
    neuron = Neurons.ReLU(),
    weight_init = GaussianInitializer(std = 0.01),
    weight_cons = L2Cons(4.5))

# Output layer: 10 units, weights initialized to zero, L2 norm constraint on
# the weights. No neuron is specified here; the softmax belongs to the loss
# layer below.
fc3_layer = InnerProductLayer(
    name = "out",
    bottoms = [:fc2],
    tops = [:out],
    output_dim = 10,
    weight_init = ConstantInitializer(0),
    weight_cons = L2Cons(4.5))

# Softmax loss computed from the network output and the labels.
loss_layer = SoftmaxLossLayer(name = "loss", bottoms = [:out, :label])
# Dropout configuration: 20% dropout on the inputs and 50% dropout on each
# hidden layer, values that were previously found to be good defaults.
# Each dropout layer names only a bottom blob; no tops are given
# (presumably the layer operates in place -- confirm against the Mocha docs).
drop_input = DropoutLayer(
    name = "drop_in",
    ratio = 0.2,
    bottoms = [:data])
drop_fc1 = DropoutLayer(
    name = "drop_fc1",
    ratio = 0.5,
    bottoms = [:fc1])
drop_fc2 = DropoutLayer(
    name = "drop_fc2",
    ratio = 0.5,
    bottoms = [:fc2])
# Create and initialize the GPU backend before any net is built on it.
backend = GPUBackend()
init(backend)

# Layer groups: the fully connected layers, and the dropout layers.
common_layers = [fc1_layer, fc2_layer, fc3_layer]
drop_layers   = [drop_input, drop_fc1, drop_fc2]

# Assemble the training net. The layers may be listed in any order; the
# correct ordering is established automatically by the constructor.
net = Net(
    "MNIST-train",
    backend,
    [data_layer, common_layers..., drop_layers..., loss_layer])
# Directory that receives snapshots and the statistics file; it is also
# handed to the solver as `load_from` (presumably so that training can resume
# from existing snapshots -- confirm against the Mocha docs).
base_dir = "snapshots_dropout_fc"

# One epoch corresponds to 600 mini-batches of size 100; training runs for
# 2000 epochs.
# Momentum increases linearly from 0.5 to 0.9 over 500 epochs, which is
# equivalent to an increase step of 0.0008 per epoch.
mom_schedule = MomPolicy.Linear(0.5, 0.0008, 600, 0.9)
# The learning rate decreases by a factor of 0.998 in each epoch.
lr_schedule = LRPolicy.Step(0.1, 0.998, 600)

params = SolverParameters(
    max_iter = 600 * 2000,
    regu_coef = 0.0,
    mom_policy = mom_schedule,
    lr_policy = lr_schedule,
    load_from = base_dir)
solver = SGD(params)

# Statistics are saved into a file inside base_dir every 5000 iterations.
setup_coffee_lounge(solver, save_into = "$base_dir/statistics.jld", every_n_iter = 5000)

# Report training progress every 100 iterations.
add_coffee_break(solver, TrainingSummary(), every_n_iter = 100)

# Save a snapshot every 5000 iterations.
add_coffee_break(solver, Snapshot(base_dir), every_n_iter = 5000)
# Evaluate on the test data every 600 iterations (one epoch). The test net is
# built from the test data layer, the layers in `common_layers` and an
# accuracy layer; it contains no dropout layers and no loss layer.
data_layer_test = HDF5DataLayer(
    name = "test-data",
    source = "data/test.txt",
    batch_size = 100)
acc_layer = AccuracyLayer(
    name = "test-accuracy",
    bottoms = [:out, :label],
    report_error = true)
test_net = Net(
    "MNIST-test",
    backend,
    [data_layer_test, common_layers..., acc_layer])
add_coffee_break(solver, ValidationPerformance(test_net), every_n_iter = 600)
# Run the training. The cleanup lives in a `finally` clause so that both nets
# are destroyed and the backend is shut down even when `solve` throws or is
# interrupted (for example when this script is include()d from a REPL
# session). An exception raised by `solve` still propagates after the cleanup.
try
    solve(solver, net)

    # To profile the training run, replace the `solve` call above with:
    #Profile.init(int(1e8), 0.001)
    #@profile solve(solver, net)
    #open("profile.txt", "w") do out
    #    Profile.print(out)
    #end
finally
    # Release the nets first, then shut down the backend they were built on.
    destroy(net)
    destroy(test_net)
    shutdown(backend)
end