-
Notifications
You must be signed in to change notification settings - Fork 254
/
async-hdf5-data.jl
174 lines (140 loc) · 4.97 KB
/
async-hdf5-data.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
using HDF5
# Asynchronous HDF5 data layer: streams minibatches from a list of HDF5 files
# (one filename per line in `source`) using a background IO task, so disk reads
# overlap with computation.
# NOTE(review): written in a legacy Julia dialect (pre-0.5 `type`,
# `produce`/`consume` coroutines); kept as-is for that dialect.
@defstruct AsyncHDF5DataLayer Layer (
  name :: String = "hdf5-data",                 # layer name
  (source :: String = "", source != ""),        # text file listing HDF5 filenames; must be non-empty
  (batch_size :: Int = 0, batch_size > 0),      # samples per produced minibatch; must be positive
  (chunk_size :: Int = 2^20, chunk_size > 0),   # samples loaded from an HDF5 dataset per read
  (tops :: Vector{Symbol} = Symbol[:data,:label], length(tops) > 0), # HDF5 dataset names, one output blob each
  shuffle :: Bool = false,                      # shuffle file order, chunk order, and samples within a chunk
  transformers :: Vector = [],                  # (top-name, DataTransformerType) pairs applied in forward()
)
# Mark the layer as a data source: it takes no bottom (input) blobs and only
# produces tops — consistent with the `length(inputs) == 0` assertion in setup().
@characterize_layer(AsyncHDF5DataLayer,
  is_source => true
)
# Runtime state for AsyncHDF5DataLayer.
type AsyncHDF5DataLayerState <: LayerState
  layer :: AsyncHDF5DataLayer
  blobs :: Vector{Blob}        # output blobs, one per entry in layer.tops (allocated in setup)
  epoch :: Int                 # completed passes over all source files (incremented by the IO task)
  trans :: Vector{Vector{DataTransformerState}}  # per-top transformer states (built in setup)

  sources :: Vector{String}    # HDF5 filenames parsed from layer.source

  io_task :: Task              # background producer task (assigned in setup, not here)
  stop_task :: Bool            # set by shutdown() to ask the IO task to exit

  AsyncHDF5DataLayerState(backend::Backend, layer::AsyncHDF5DataLayer) = begin
    # Partial construction: only `layer` is set via new(); remaining fields are
    # filled below and in setup() (io_task stays unassigned until then).
    state = new(layer)

    # Read the list of HDF5 filenames, one per line, dropping blank /
    # whitespace-only lines and trimming surrounding whitespace.
    # NOTE(review): relies on legacy `isspace(::String)` ("all chars are
    # whitespace") to skip blank lines — confirm against the target dialect.
    sources = open(layer.source, "r") do s
      map(strip, filter(l -> !isspace(l), readlines(s)))
    end
    @assert(length(sources) > 0)
    state.sources = sources
    state.epoch = 0

    # empty array, will be constructed in setup
    state.blobs = Array(Blob, length(layer.tops))
    state.trans = Array(Vector{DataTransformerState}, length(layer.tops))

    return state
  end
end
# Set up the layer: allocate output blobs sized from the first HDF5 file,
# instantiate per-top data transformers, and start the background IO task
# that produces minibatches via the legacy produce/consume coroutine protocol.
function setup(backend::Backend, layer::AsyncHDF5DataLayer, inputs::Vector{Blob}, diffs::Vector{Blob})
  # Data-source layer: no bottom blobs expected.
  @assert length(inputs) == 0
  state = AsyncHDF5DataLayerState(backend, layer)

  # Peek at the first file to determine each dataset's shape and element type.
  # Assumes the last dimension of every dataset is the sample dimension and
  # that all files share the same per-sample shape — TODO confirm upstream.
  h5_file = h5open(state.sources[1], "r")
  dsets = [h5_file[string(x)] for x in layer.tops]

  # setup blob shapes and data transformers
  transformers = convert(Vector{(Symbol, DataTransformerType)}, state.layer.transformers)
  for i = 1:length(state.blobs)
    dims = size(dsets[i])
    # Replace the sample (last) dimension with the configured batch size.
    dims = tuple(dims[1:end-1]..., state.layer.batch_size)

    dset = dsets[i]
    state.blobs[i] = make_blob(backend, eltype(dset), dims)

    # Keep only the transformers registered for this particular top.
    state.trans[i] = [setup(backend, convert(DataTransformerType, t), state.blobs[i])
        for (k,t) in filter(kt -> kt[1] == state.layer.tops[i], transformers)]
  end
  state.stop_task = false
  close(h5_file)

  # Producer coroutine. Repeatedly fills `data_blocks` (host-side staging
  # arrays shaped like the blobs) with batch_size samples and yields them via
  # produce(). After each produce() returns, it checks stop_task and, if set,
  # yields `nothing` as a termination marker (consumed by shutdown()'s drain
  # loop) and returns. Runs forever otherwise; epoch counts full passes.
  function io_task_impl()
    # data blocks to produce
    data_blocks = Array[Array(eltype(x), size(x)) for x in state.blobs]
    n_done = 0  # samples already staged into the current batch

    while true
      # Optionally shuffle the order in which source files are visited.
      if layer.shuffle
        shuffle_src = randperm(length(state.sources))
      else
        shuffle_src = collect(1:length(state.sources))
      end

      for i_file = 1:length(shuffle_src)
        h5_file = h5open(state.sources[shuffle_src[i_file]], "r")
        dsets = [h5_file[string(x)] for x in layer.tops]
        n_total = size(dsets[1])[end]  # sample count, taken from the first top

        # Chunk start indices along the sample dimension; chunk visiting
        # order is optionally shuffled too.
        chunk_idx = 1:layer.chunk_size:n_total
        if layer.shuffle
          shuffle_chunk = randperm(length(chunk_idx))
        else
          shuffle_chunk = collect(1:length(chunk_idx))
        end

        # load data in chunks
        for i_chunk = 1:length(chunk_idx)
          idx_start = chunk_idx[shuffle_chunk[i_chunk]]
          idx_end = min(idx_start+layer.chunk_size-1, n_total)

          # Read one chunk per dataset: full ranges over every non-sample
          # dimension, [idx_start, idx_end] over the sample dimension.
          data_chunks = map(1:length(dsets)) do i_dset
            idx = map(x -> 1:x, size(state.blobs[i_dset])[1:end-1])
            dsets[i_dset][idx..., idx_start:idx_end]
          end
          n_chunk = idx_end - idx_start+1

          if layer.shuffle
            # shuffle within chunk
            shuffle_idx = randperm(n_chunk)
          end

          # Copy chunk samples into the staging batch, yielding each time a
          # full batch accumulates. Note the produce happens lazily, at the
          # start of the next fill step rather than immediately on completion.
          curr_idx = 1
          while curr_idx <= n_chunk
            if n_done == layer.batch_size
              produce(data_blocks)

              # Check the stop flag only after a produce(), i.e. after the
              # consumer has had a chance to set it.
              if state.stop_task
                @info("AsyncHDF5DataLayer: IO Task reaching the end...")
                close(h5_file)
                produce(nothing)  # termination marker for shutdown()'s drain
                return
              end
              n_done = 0
            end

            n_todo = layer.batch_size - n_done
            n_take = min(n_todo, n_chunk-curr_idx+1)

            for i_dset = 1:length(dsets)
              idx = map(x -> 1:x, size(state.blobs[i_dset])[1:end-1])
              idx_take = curr_idx:curr_idx+n_take-1
              if layer.shuffle
                idx_take = shuffle_idx[idx_take]  # within-chunk permutation
              end
              data_blocks[i_dset][idx...,n_done+1:n_done+n_take] = data_chunks[i_dset][idx..., idx_take]
            end
            curr_idx += n_take
            n_done += n_take
          end
        end
      end

      # update epoch
      # (a partially staged batch may span the epoch boundary)
      state.epoch += 1
    end
  end

  # start the IO task
  state.io_task = Task(io_task_impl)

  return state
end
# Tear down the layer: request the IO task to stop, drain its remaining
# produced batches until the `nothing` termination marker arrives, then
# release the output blobs and all data-transformer states.
function shutdown(backend::Backend, state::AsyncHDF5DataLayerState)
  @info("AsyncHDF5DataLayer: Stopping IO task...")
  state.stop_task = true

  # Drain the coroutine; it yields `nothing` once it has observed the flag.
  while true
    if consume(state.io_task) == nothing
      break
    end
  end

  for blob in state.blobs
    destroy(blob)
  end
  for trans_states in state.trans
    for trans_state in trans_states
      shutdown(backend, trans_state)
    end
  end
end
# Forward pass: pull the next staged minibatch from the IO task, copy each
# data block into the corresponding output blob, then run that blob through
# its registered data transformers in order.
function forward(backend::Backend, state::AsyncHDF5DataLayerState, inputs::Vector{Blob})
  batch = consume(state.io_task)
  for (i, blob) in enumerate(state.blobs)
    copy!(blob, batch[i])
    for trans_state in state.trans[i]
      forward(backend, trans_state, blob)
    end
  end
end
# Backward pass: intentionally a no-op — a data-source layer has no inputs
# to propagate gradients into.
function backward(backend::Backend, state::AsyncHDF5DataLayerState, inputs::Vector{Blob}, diffs::Vector{Blob})
end