import torch
from torch import nn
from torch.autograd import Variable

if __name__ == '__main__':
    from forget_mult import ForgetMult
else:
    from .forget_mult import ForgetMult


class QRNNLayer(nn.Module):
r"""Applies a single layer Quasi-Recurrent Neural Network (QRNN) to an input sequence.
Args:
input_size: The number of expected features in the input x.
hidden_size: The number of features in the hidden state h. If not specified, the input size is used.
save_prev_x: Whether to store previous inputs for use in future convolutional windows (i.e. for a continuing sequence such as in language modeling). If true, you must call reset to remove cached previous values of x. Default: False.
window: Defines the size of the convolutional window (how many previous tokens to look when computing the QRNN values). Supports 1 and 2. Default: 1.
zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) to the hidden state updates. Default: 0.
output_gate: If True, performs QRNN-fo (applying an output gate to the output). If False, performs QRNN-f. Default: True.
use_cuda: If True, uses fast custom CUDA kernel. If False, uses naive for loop. Default: True.
Inputs: X, hidden
- X (seq_len, batch, input_size): tensor containing the features of the input sequence.
- hidden (batch, hidden_size): tensor containing the initial hidden state for the QRNN.
Outputs: output, h_n
- output (seq_len, batch, hidden_size): tensor containing the output of the QRNN for each timestep.
- h_n (batch, hidden_size): tensor containing the hidden state for t=seq_len
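
    Example (an illustrative sketch; the sizes are arbitrary and a CUDA device is
    assumed since ``use_cuda`` defaults to True)::

        >>> layer = QRNNLayer(input_size=128, hidden_size=256, window=2, save_prev_x=True).cuda()
        >>> X = Variable(torch.rand(35, 8, 128)).cuda()  # (seq_len, batch, input_size)
        >>> output, h_n = layer(X)                       # output: (35, 8, 256)
        >>> layer.reset()                                # clear cached x before starting a new sequence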
"""
    def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, window=1, output_gate=True, use_cuda=True):
        super(QRNNLayer, self).__init__()

        assert window in [1, 2], "This QRNN implementation currently only handles convolutional window of size 1 or size 2"
        self.window = window
        self.input_size = input_size
        self.hidden_size = hidden_size if hidden_size else input_size
        self.zoneout = zoneout
        self.save_prev_x = save_prev_x
        self.prevX = None
        self.output_gate = output_gate
        self.use_cuda = use_cuda

        # One large matmul with concat is faster than N small matmuls and no concat
        self.linear = nn.Linear(self.window * self.input_size, 3 * self.hidden_size if self.output_gate else 2 * self.hidden_size)
    def reset(self):
        # If you are saving the previous value of x, you should call this when starting with a new state
        self.prevX = None
    def forward(self, X, hidden=None):
        seq_len, batch_size, _ = X.size()

        source = None
        if self.window == 1:
            source = X
        elif self.window == 2:
            # Construct the x_{t-1} tensor with optional x_{-1}, otherwise a zeroed out value for x_{-1}
            Xm1 = [self.prevX if self.prevX is not None else X[:1, :, :] * 0, X[:-1, :, :]]
            Xm1 = torch.cat(Xm1, 0)
            # Convert two (seq_len, batch_size, hidden) tensors to (seq_len, batch_size, 2 * hidden)
            source = torch.cat([X, Xm1], 2)
        # Matrix multiplication for the three outputs: Z, F, O
        Y = self.linear(source)
        # Convert the tensor back to (seq_len, batch_size, len([Z, F, O]) * hidden_size)
        if self.output_gate:
            Y = Y.view(seq_len, batch_size, 3 * self.hidden_size)
            Z, F, O = Y.chunk(3, dim=2)
        else:
            Y = Y.view(seq_len, batch_size, 2 * self.hidden_size)
            Z, F = Y.chunk(2, dim=2)

        Z = torch.tanh(Z)
        F = torch.sigmoid(F)
        # If zoneout is specified, we perform dropout on the forget gates in F
        # If an element of F is zero, that means the corresponding neuron keeps the old value
        if self.zoneout:
            if self.training:
                mask = Variable(F.data.new(*F.size()).bernoulli_(1 - self.zoneout), requires_grad=False)
                F = F * mask
            else:
                F *= 1 - self.zoneout

        # Ensure the memory is laid out as expected for the CUDA kernel
        # This is a null op if the tensor is already contiguous
        Z = Z.contiguous()
        F = F.contiguous()
        # The O gate doesn't need to be contiguous as it isn't used in the CUDA kernel

        # Forget Mult
        # For testing QRNN without the ForgetMult CUDA kernel, C = Z * F may be useful
        C = ForgetMult()(F, Z, hidden, use_cuda=self.use_cuda)
        # Apply (potentially optional) output gate
        if self.output_gate:
            H = torch.sigmoid(O) * C
        else:
            H = C

        # In an optimal world we may want to backprop to x_{t-1} but ...
        if self.window > 1 and self.save_prev_x:
            self.prevX = Variable(X[-1:, :, :].data, requires_grad=False)

        return H, C[-1:, :, :]


class QRNN(torch.nn.Module):
r"""Applies a multiple layer Quasi-Recurrent Neural Network (QRNN) to an input sequence.
Args:
input_size: The number of expected features in the input x.
hidden_size: The number of features in the hidden state h. If not specified, the input size is used.
num_layers: The number of QRNN layers to produce.
layers: List of preconstructed QRNN layers to use for the QRNN module (optional).
save_prev_x: Whether to store previous inputs for use in future convolutional windows (i.e. for a continuing sequence such as in language modeling). If true, you must call reset to remove cached previous values of x. Default: False.
window: Defines the size of the convolutional window (how many previous tokens to look when computing the QRNN values). Supports 1 and 2. Default: 1.
zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) to the hidden state updates. Default: 0.
output_gate: If True, performs QRNN-fo (applying an output gate to the output). If False, performs QRNN-f. Default: True.
use_cuda: If True, uses fast custom CUDA kernel. If False, uses naive for loop. Default: True.
Inputs: X, hidden
- X (seq_len, batch, input_size): tensor containing the features of the input sequence.
- hidden (layers, batch, hidden_size): tensor containing the initial hidden state for the QRNN.
Outputs: output, h_n
- output (seq_len, batch, hidden_size): tensor containing the output of the QRNN for each timestep.
- h_n (layers, batch, hidden_size): tensor containing the hidden state for t=seq_len
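
    Example (an illustrative sketch; the sizes are arbitrary and a CUDA device is
    assumed since the underlying layers default to ``use_cuda=True``)::

        >>> qrnn = QRNN(input_size=128, hidden_size=256, num_layers=2, dropout=0.4).cuda()
        >>> X = Variable(torch.rand(35, 8, 128)).cuda()  # (seq_len, batch, input_size)
        >>> output, h_n = qrnn(X)                        # output: (35, 8, 256), h_n: (2, 8, 256)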
"""
    def __init__(self, input_size, hidden_size,
                 num_layers=1, bias=True, batch_first=False,
                 dropout=0, bidirectional=False, layers=None, **kwargs):
        assert bidirectional == False, 'Bidirectional QRNN is not yet supported'
        assert batch_first == False, 'Batch first mode is not yet supported'
        assert bias == True, 'Removing underlying bias is not yet supported'
        super(QRNN, self).__init__()

        # The first layer consumes input_size features; every subsequent layer consumes hidden_size
        self.layers = torch.nn.ModuleList(layers if layers else [QRNNLayer(input_size if l == 0 else hidden_size, hidden_size, **kwargs) for l in range(num_layers)])
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = len(layers) if layers else num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = dropout
        self.bidirectional = bidirectional
    def reset(self):
        r'''If your convolutional window is greater than 1, you must reset at the beginning of each new sequence'''
        [layer.reset() for layer in self.layers]
    def forward(self, input, hidden=None):
        next_hidden = []

        for i, layer in enumerate(self.layers):
            input, hn = layer(input, None if hidden is None else hidden[i])
            next_hidden.append(hn)

            if self.dropout != 0 and i < len(self.layers) - 1:
                input = torch.nn.functional.dropout(input, p=self.dropout, training=self.training, inplace=False)

        next_hidden = torch.cat(next_hidden, 0).view(self.num_layers, *next_hidden[0].size()[-2:])

        return input, next_hidden


if __name__ == '__main__':
    seq_len, batch_size, hidden_size = 2, 2, 16
    seq_len, batch_size, hidden_size = 35, 8, 32
    size = (seq_len, batch_size, hidden_size)
    X = Variable(torch.rand(size), requires_grad=True).cuda()
    print(X.size())

    qrnn = QRNNLayer(hidden_size, hidden_size)
    qrnn.cuda()
    Y, _ = qrnn(X)

    qrnn.use_cuda = False
    Z, _ = qrnn(X)

    diff = (Y - Z).sum().item()
    print('Total difference between QRNN(use_cuda=True) and QRNN(use_cuda=False) results:', diff)
    assert diff < 1e-5, 'CUDA and non-CUDA QRNN layers return different results'

    from torch.autograd import gradcheck
    inputs = [X,]
    test = gradcheck(QRNNLayer(hidden_size, hidden_size).cuda(), inputs)
    print(test)
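
    # Illustrative extra check: exercise the multi-layer QRNN wrapper as well
    # (assumes the same CUDA device as the checks above; sizes are arbitrary).
    qrnn_stack = QRNN(hidden_size, hidden_size, num_layers=2, dropout=0.4)
    qrnn_stack.cuda()
    output, next_hidden = qrnn_stack(X)
    # output is (seq_len, batch, hidden_size); next_hidden is (num_layers, batch, hidden_size)
    print(output.size(), next_hidden.size())
    # reset() clears any cached x_{t-1}; required between sequences when window > 1
    qrnn_stack.reset()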