## Recurrent neural network for word-level language modelling

Replicating the results from:

Zaremba, Wojciech, Ilya Sutskever, and Oriol Vinyals. "Recurrent neural network regularization." arXiv preprint arXiv:1409.2329 (2014).

on the Penn Treebank (PTB) dataset (downloaded from http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz)

using Theano and Lasagne.

In [1]:
import numpy as np
import collections
import theano
import theano.tensor as T
from theano.ifelse import ifelse
import lasagne
from lasagne import nonlinearities
from lasagne.utils import unroll_scan

from lasagne.layers.input import InputLayer

from lasagne.layers.recurrent import Gate

# LSTMLayerWithState is an LSTM layer that inherits from the Lasagne
# implementation and additionally gives access to the cell state.
from recurrent_extend import LSTMLayerWithState

import sys

Using gpu device 0: GeForce GT 640 (CNMeM is disabled, CuDNN 4007)


In [2]:
# Seed NumPy's global RNG so that the np.random.uniform draws used for
# parameter initialisation (see init_fcn) are repeatable across runs.
np.random.seed(11071988)

In [3]:
# Directory containing ptb.train.txt, ptb.valid.txt and ptb.test.txt
datapath = './data/'
# Selects the hyperparameter set below: small, medium, large or test
model_size = 'small' # small, medium or large
# Initial learning rate; the training loop divides it by `decay` in later epochs
learning_rate = 1
# sgd or adadelta (the adadelta update function is only compiled for 'adadelta')
optim_type = 'sgd' # sgd or adadelta
# Bias initialisation:
#   constant    - forget-gate bias starts at one, all other biases at zero
#   constant_f0 - every bias, including the forget gate, starts at zero
#   uniform     - every bias is drawn uniformly from [-init_range, init_range]
b_init = 'constant_f0' # constant, constant_f0 or uniform

# Log the chosen configuration at the top of the run
print('word_lstm_ptb: model size %s, learning rate %.3f, optimization %s, b %s' %(model_size,learning_rate,optim_type,b_init))

word_lstm_ptb: model size small, learning rate 1.000, optimization sgd, b constant_f0


### Data

In [4]:
def _read_tokens(filename):
    """Read one corpus file below `datapath` and return its list of tokens.

    Newlines are replaced by the '<eos>' marker, the text is split on single
    spaces, and the first element of the split is discarded.
    """
    with open(datapath+filename, 'r') as corpus_file:
        text = corpus_file.read()
    return text.replace('\n','<eos>').split(' ')[1:]

tokens = _read_tokens('/ptb.train.txt')
tokens_valid = _read_tokens('/ptb.valid.txt')
tokens_test = _read_tokens('/ptb.test.txt')

def build_vocabulary(tokens):
    """Map every distinct token to an integer id.

    Ids are assigned by decreasing frequency; ties are broken by the token's
    natural (alphabetical) order so the mapping is deterministic.

    Parameters
    ----------
    tokens : iterable of hashable
        The token stream to index. May be empty.

    Returns
    -------
    dict
        ``{token: id}`` with ids ``0 .. n_distinct - 1``; an empty dict when
        `tokens` is empty.
    """
    counter = collections.Counter(tokens)
    # Sort key: highest count first, then the token itself.
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    # Build the mapping directly from the sorted pairs; unpacking
    # zip(*count_pairs) would raise on an empty corpus.
    return {word: idx for idx, (word, _) in enumerate(count_pairs)}

# The vocabulary is built from the training tokens only.
vocab = build_vocabulary(tokens)

# Encode each corpus as a list of word ids, dropping tokens missing from vocab.
data, data_valid, data_test = (
    [vocab[word] for word in corpus if word in vocab]
    for corpus in (tokens, tokens_valid, tokens_test)
)

# `words[i]` is the token whose id is i.
vocab_ids = list(vocab.values())
vocab_words = np.array(list(vocab.keys()))
sortindex = np.argsort(vocab_ids)
words = vocab_words[sortindex]
vocab_size = len(vocab)

In [5]:
def data_producer(data, batch_size, n_batch=None, seq_len=None):
    """Yield consecutive (input, target, progress) mini-batches from a word-id stream.

    The stream is truncated to a multiple of `batch_size` and reshaped to
    ``(batch_size, batch_len)`` so each row is a contiguous slice of the
    corpus. Successive batches take adjacent, non-overlapping windows of
    `seq_len` columns, so row r of batch k+1 continues row r of batch k.

    Parameters
    ----------
    data : sequence of int
        Word ids of the corpus.
    batch_size : int
        Number of parallel rows.
    n_batch : int, optional
        If given, stop after at most this many batches.
    seq_len : int, optional
        Window length. Defaults to the notebook-level `seq_length`, looked up
        when iteration starts.

    Yields
    ------
    x : int32 array, shape (batch_size, seq_len)
        Input word ids.
    y : int32 array, shape (batch_size, seq_len)
        `x` shifted one position to the right (next-word targets).
    progress : float
        Fraction of batches already yielded, in [0, 1).
    """
    if seq_len is None:
        # Fall back to the global configured in the hyperparameter cell.
        seq_len = seq_length
    datanp = np.array(data).astype('int32')
    batch_len = datanp.shape[0] // batch_size
    datanp = np.reshape(datanp[0 : batch_size * batch_len],
                        [batch_size, batch_len])
    # Window starts; stopping before batch_len - seq_len guarantees that the
    # shifted target window still fits inside the row.
    pointers = np.arange(0, batch_len - seq_len, seq_len)
    if n_batch is not None:
        pointers = pointers[:n_batch]
    n_pointers = len(pointers)
    for (ii, pointer) in enumerate(pointers):
        x = datanp[:, pointer:pointer + seq_len]
        y = datanp[:, pointer + 1:pointer + seq_len + 1]
        yield x, y, (ii / n_pointers)

In [6]:
# len(data)

### Hyperparameters

In [7]:
# Settings shared by every model size
batch_size = 20   # number of parallel sequences per mini-batch
n_layer = 2       # number of stacked LSTM layers
grad_clip = 5     # threshold on the global gradient norm
# grad_clip = grad_clip*batch_size

# Per-size settings:
#   embed_size   - width of the word embedding and of every LSTM layer
#   seq_length   - number of time steps per mini-batch
#   init_range   - parameters are drawn uniformly from [-init_range, init_range]
#   dropout_rate - dropout probability on the input and after each LSTM layer
#   num_epoch    - total number of training epochs
#   max_epoch    - epoch index compared against in the training loop before
#                  the learning rate starts being decayed
#   decay        - divisor applied to the learning rate at each decay step
if model_size=='large':
    # Large
    embed_size = 1500
    seq_length = 35
    init_range = .04
    dropout_rate = .65
    num_epoch = 55
    max_epoch = 14
    decay = 1.15
elif model_size=='medium':
    # Medium
    embed_size = 650
    seq_length = 35
    init_range = .05
    dropout_rate = .5
    num_epoch = 39
    max_epoch = 6
    decay = 1.25
elif model_size=='small':
    # Small
    embed_size = 200
    seq_length = 20
    init_range = .1
    dropout_rate = 0.
    num_epoch = 13
    max_epoch = 4
    decay = 2
elif model_size=='test':
    # Tiny configuration for quick smoke tests
    embed_size = 30
    seq_length = 35
    init_range = .1
    dropout_rate = 0.
    num_epoch = 1
    # The training loop reads max_epoch and decay for every model size, so
    # they must exist here too; these values leave the learning rate untouched.
    max_epoch = num_epoch
    decay = 1.
    batch_size = 2
else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError("unknown model_size %r (expected 'small', 'medium', 'large' or 'test')"
                     % (model_size,))

In [8]:
# Run the batch generator over the first three test batches; afterwards
# test_x / test_y / p stay bound to the last batch for ad-hoc inspection.
for test_x, test_y, p in data_producer(data_test, batch_size, 3):
    pass

### Parameters

In [9]:
def init_fcn(val_shape, init_range, init_range_upper=None):
    """Draw a floatX array of shape `val_shape` from a uniform distribution.

    With only `init_range` given the interval is [-init_range, init_range);
    when `init_range_upper` is supplied the interval is
    [init_range, init_range_upper).
    """
    if init_range_upper is not None:
        low, high = init_range, init_range_upper
    else:
        low, high = -init_range, init_range
    return np.random.uniform(low, high, val_shape).astype(theano.config.floatX)

def zero_fcn(val_shape):
    """Return an all-zeros floatX array of shape `val_shape`."""
    return np.zeros(val_shape, dtype=theano.config.floatX)

def ones_fcn(val_shape):
    """Return an all-ones floatX array of shape `val_shape`."""
    return np.ones(val_shape, dtype=theano.config.floatX)

In [10]:
# word embedding: one row of length embed_size per vocabulary entry
embedding_values = init_fcn((vocab_size,embed_size),init_range)
embed = theano.shared(name='embedding variable', value=embedding_values)
# cell and hidden state initial values: one zero-filled shared matrix per layer
state_shape = (batch_size,embed_size)
c_init = [theano.shared(zero_fcn(state_shape)) for _ in range(n_layer)]
h_init = [theano.shared(zero_fcn(state_shape)) for _ in range(n_layer)]
# parameters for ingate and outgate:
def io_gate_parameters():
    """Build a fresh Gate for an input or output gate.

    Both weight matrices are drawn uniformly within init_range, the bias is
    zero, no peephole weights are given, and the Gate's default nonlinearity
    is kept.
    """
    w_in = init_fcn((embed_size,embed_size),init_range)
    w_hid = init_fcn((embed_size,embed_size),init_range)
    bias = zero_fcn((embed_size))
    return Gate(W_in=w_in, W_hid=w_hid, W_cell=None, b=bias)
# parameters for forgetgate:
# 'constant_f0' starts the forget-gate bias at zero; every other b_init
# value starts it at one.
_forget_bias_fcn = zero_fcn if b_init=='constant_f0' else ones_fcn

def f_gate_parameters():
    """Build a fresh Gate for the forget gate.

    The weights are drawn uniformly within init_range; the bias is all zeros
    or all ones depending on the b_init setting in effect when this cell ran.
    """
    w_in = init_fcn((embed_size,embed_size),init_range)
    w_hid = init_fcn((embed_size,embed_size),init_range)
    bias = _forget_bias_fcn((embed_size))
    return Gate(W_in=w_in, W_hid=w_hid, W_cell=None, b=bias)
# parameters for cell:
def cell_parameters():
    """Build a fresh Gate for the cell input.

    Same initialisation as the input/output gates (uniform weights, zero
    bias, no peephole weights) but with a tanh nonlinearity.
    """
    w_in = init_fcn((embed_size,embed_size),init_range)
    w_hid = init_fcn((embed_size,embed_size),init_range)
    bias = zero_fcn((embed_size))
    return Gate(W_in=w_in, W_hid=w_hid, W_cell=None, b=bias,
                nonlinearity=nonlinearities.tanh)
# gates[layer] holds that layer's four Gate objects in the fixed order
# [ingate, cell, forgetgate, outgate].
gates = []
for iLayer in range(n_layer):
    gates.append([io_gate_parameters(),
                  cell_parameters(),
                  f_gate_parameters(),
                  io_gate_parameters()])
# output embeddings: projection from the LSTM output to vocabulary logits
W_hy = theano.shared(init_fcn((embed_size,vocab_size),init_range),name='output w')
b_y = theano.shared(zero_fcn(vocab_size),name='output b')

# With b_init == 'uniform' every bias is replaced by a uniform draw.
if b_init=='uniform':
    for layer_gates in gates:
        for gate in layer_gates:
            gate.b = init_fcn((embed_size),init_range)
    b_y.set_value(init_fcn(vocab_size,init_range))

### Input and embeddings

In [11]:
# Symbolic int32 matrix of word ids; the reshape below treats it as
# (rows, seq_length).
input_var = T.imatrix('input variable').astype('int32')
# Dropout probability as a symbolic floatX value, backed by a shared variable
# initialised from dropout_rate.
drate = theano.shared(dropout_rate).astype(T.config.floatX)
# Look up one embedding row per word id and arrange the result as
# (rows, seq_length, embed_size).
input_embed = embed[input_var].reshape((input_var.shape[0], seq_length, embed_size))
# Wrap the embedded input as the entry point of the Lasagne layer stack.
l_in = InputLayer(shape=(batch_size, seq_length, embed_size),
                                 input_var=input_embed, name='input layer')
# Dropout on the embedded input, with probability drate.
l_in_drop = lasagne.layers.dropout(l_in, p=drate, name='input dropout layer')

In [12]:
# f_input_embed = theano.function([input_var], input_embed, allow_input_downcast=True)
# x_eval = f_input_embed(test_x)
# len(data_test)

### LSTM layers

In [13]:
# Per-layer bookkeeping: the LSTM layers, the dropout layer placed after each
# one, the InputLayers wrapping the initial cell/hidden states, and the layer
# that feeds each LSTM (the first LSTM reads the dropped-out embeddings).
l_lstm = []; l_lstm_drop = [];
l_c_init = []; l_h_init = []
l_lstm_in = [l_in_drop]

for iLayer in range(n_layer):
    # Expose the shared initial states as layers so they can be passed to the
    # LSTM through cell_init / hid_init.
    l_c_init.append(InputLayer(shape=(batch_size, embed_size), input_var=c_init[iLayer]))
    l_h_init.append(InputLayer(shape=(batch_size, embed_size), input_var=h_init[iLayer])) 
    # Gates are passed in the order they were stored:
    # [0] ingate, [1] cell, [2] forgetgate, [3] outgate. No peephole connections.
    l_lstm.append(LSTMLayerWithState(l_lstm_in[-1], embed_size, ingate = gates[iLayer][0], 
                                     cell = gates[iLayer][1], forgetgate = gates[iLayer][2],
                                     outgate = gates[iLayer][3], peepholes=False,
                                     unroll_scan=True, name='lstm%d layer' %(iLayer+1),
                                     nonlinearity=nonlinearities.tanh,
                                     cell_init=l_c_init[-1], hid_init=l_h_init[-1]))
    # Dropout on this layer's output, with the same probability drate.
    l_lstm_drop.append(lasagne.layers.dropout(l_lstm[-1], p=drate, 
                                              name='lstm%d dropout layer'  %(iLayer+1)))       
    # Every layer except the last feeds its dropped-out output to the next LSTM.
    if iLayer<n_layer-1:
        l_lstm_in.append(l_lstm_drop[-1])
# Symbolic output of the top dropout layer; this is what the softmax consumes.
lstm_out = lasagne.layers.get_output(l_lstm_drop[-1])
# lstm_out_det = lasagne.layers.get_output(l_lstm[-1],deterministic=True)

In [14]:
# Collect the parameters of the top LSTM layer and of every layer feeding it.
p_lstm = lasagne.layers.get_all_params(l_lstm[-1])
# p_lstm

### Get LSTM cell and hidden states

In [15]:
# cellsc is handed to each layer's get_output_for and then read back keyed by
# the layer object, so the layer is expected to store its cell-state sequence
# there — behaviour of LSTMLayerWithState assumed from this usage; confirm in
# recurrent_extend.
cellsc = dict()
hid_out = []; cell_out = []

for iLayer in range(n_layer):
    # Inputs in the order [layer input, initial hidden state, initial cell state].
    # NOTE(review): these get_output calls are separate from the one that
    # built lstm_out; with a non-zero dropout rate the carried states may be
    # computed with different dropout masks than the cost — confirm.
    lstm_ins = [lasagne.layers.get_output(l_lstm_in[iLayer]),
                lasagne.layers.get_output(l_h_init[iLayer]), 
                lasagne.layers.get_output(l_c_init[iLayer])]
    hid_out.append(l_lstm[iLayer].get_output_for(lstm_ins,cellsc))
    cell_out.append(cellsc[l_lstm[iLayer]])

### Softmax layer

In [16]:
# Collapse the first two axes of the LSTM output so every position becomes one
# row, then project each row onto the vocabulary and normalise with a softmax.
n_rows = lstm_out.shape[0] * lstm_out.shape[1]
lstm_out_resh = T.reshape(lstm_out, (n_rows, -1))
y_logit = b_y + T.dot(lstm_out_resh, W_hy)
y_hat = T.nnet.softmax(y_logit)

### Cost

In [17]:
# Symbolic int32 matrix of target word ids, flattened so that it lines up with
# the rows of y_hat.
target_var = T.imatrix('target variable').astype('int32')
target_var_f = target_var.flatten(ndim=1)
# Negative log-probability assigned to the target word at every position.
seq_cost = -T.log(y_hat)[T.arange(y_hat.shape[0]),target_var_f]
# Back to the two leading axes of lstm_out.
resh_cost = T.reshape(seq_cost, (lstm_out.shape[0], lstm_out.shape[1]))
# Mean over axis 1, then summed over axis 0. Callers divide the result by
# batch_size before exponentiating it into a perplexity.
cost = T.sum(T.mean(resh_cost,axis=1))

In [18]:
# f_seq_cost = theano.function([input_var,target_var,drate],cost)

In [19]:
# test_xx = np.repeat([test_x[0]],batch_size,0)
# test_yy = np.repeat([test_y[0]],batch_size,0)
# np.exp(f_seq_cost(test_xx,test_yy,0.)/batch_size)

### Gradients

In [20]:
# Every trainable parameter: embeddings, LSTM weights, output projection.
p_all = [embed] + p_lstm + [W_hy, b_y]
grads_all = T.grad(cost, p_all)

# Global gradient norm: square root of the summed squared entries of all gradients.
squared_sums = [T.sum(g ** 2) for g in grads_all]
norm_grads_all = T.sqrt(sum(squared_sums))

In [21]:
# p_all

### Optimization

Apply gradient clipping: rescale all gradients by a common factor whenever their global norm exceeds `grad_clip`.

In [22]:
# One shared buffer per parameter, with the parameter's shape and filled with
# zeros; the clipped gradients are stored here between the two SGD steps.
gshared = [theano.shared(p.get_value() * 0.) for p in p_all]
# grad_clip / norm when the global norm exceeds grad_clip, otherwise 1.
shrink_factor = ifelse(T.gt(norm_grads_all,grad_clip),grad_clip/norm_grads_all,1.)
# Updates that write each rescaled gradient into its buffer.
gup = [(gs,g*shrink_factor) for gs,g in zip(gshared,grads_all)] # gradient clipping

Carry over hidden and cell states for the next batch:

In [23]:
# Turn the gradient updates into an ordered dict and add, per layer, updates
# that overwrite the initial states with the states at the end of the batch.
gup = theano.compat.OrderedDict(gup)
for iLayer in range(n_layer):
    # Last entry along the first axis of the cell-state sequence
    # (assumes that axis is time — TODO confirm in recurrent_extend).
    gup[c_init[iLayer]] = cell_out[iLayer][-1]
    # Swap the first two axes of the hidden output, then take the last entry,
    # i.e. the last index along the original second axis.
    gup[h_init[iLayer]] = hid_out[iLayer].dimshuffle(1, 0, 2)[-1]

In [24]:
# First half of an SGD step: returns the cost for (input, target) and, through
# the gup updates, stores the clipped gradients in gshared and carries the
# LSTM states over to the next call.
f_grad_shared = theano.function([input_var, target_var],cost,updates=gup, allow_input_downcast=True)

Update the parameters with the specified learning rate

In [25]:
# Second half of an SGD step: p <- p - lrate * g, using the clipped gradients
# that f_grad_shared stored in gshared.
# The learning rate is declared directly as a floatX scalar input (it is
# decayed to fractional values during training) rather than as an integer
# scalar that is then cast.
lrate = T.scalar('learning rate', dtype=T.config.floatX)
pup = [(p,p - lrate * g) for p,g in zip(p_all,gshared)]
f_update = theano.function([lrate],[],updates=pup)

Alternative optimization procedure: adadelta

In [26]:
# Only built when adadelta is selected; f_ada_update does not exist otherwise.
if optim_type=='adadelta':
    # Rescale the gradients so their total norm does not exceed grad_clip.
    scaled_grads = lasagne.updates.total_norm_constraint(grads_all, grad_clip)
    # learning_rate is read here, when the update graph is built, so later
    # changes to the Python variable do not affect f_ada_update.
    adaup = lasagne.updates.adadelta(scaled_grads, p_all, learning_rate, rho=.9)
    adaup = theano.compat.OrderedDict(adaup)
    # Same state carry-over as in the SGD path.
    for iLayer in range(n_layer):
        adaup[c_init[iLayer]] = cell_out[iLayer][-1]
        adaup[h_init[iLayer]] = hid_out[iLayer].dimshuffle(1, 0, 2)[-1]
    # One call returns the cost and applies the parameter and state updates.
    f_ada_update = theano.function([input_var, target_var],cost,updates=adaup)

### Additional functions

Function to reset the hidden and cell states to zero

In [27]:
# Updates that overwrite every layer's initial cell and hidden state with a
# zero matrix of shape (batch_size, embed_size).
rup = theano.compat.OrderedDict()
for c_state, h_state in zip(c_init, h_init):
    for state in (c_state, h_state):
        rup[state] = theano.shared(zero_fcn((batch_size,embed_size)))
# Takes no inputs and returns nothing; calling it only applies the reset.
f_reset_state = theano.function([],[],updates=rup)

In [28]:
# f_grad_norm = theano.function([input_var, target_var], norm_grads_all)
# Evaluation function: returns the cost for (input, target) with the dropout
# probability supplied explicitly as the third argument.
# It only needs to carry the LSTM states over to the next batch, so it gets
# its own update dict: reusing the training updates would also compute every
# gradient and overwrite the gradient buffers on each validation/test batch.
eval_updates = theano.compat.OrderedDict()
for iLayer in range(n_layer):
    eval_updates[c_init[iLayer]] = cell_out[iLayer][-1]
    eval_updates[h_init[iLayer]] = hid_out[iLayer].dimshuffle(1, 0, 2)[-1]
f_cost = theano.function([input_var, target_var, drate], cost, updates=eval_updates)

### Train

In [29]:
# Print the running training perplexity every n_look batches
n_look = 200

import time
# loss_val accumulates the per-batch cost (already divided by batch_size) and
# i_batch counts batches; both are reset at the end of every epoch.
loss_val = 0; i_batch = 0; i_epoch = 0
a = time.time()
gnorm = []

for i_epoch in range(num_epoch):

    for (x,y,p) in data_producer(data,batch_size):

        if optim_type=='sgd':
            # Two-step SGD: store the clipped gradients, then apply them.
            loss_val += f_grad_shared(x,y)/batch_size
            f_update(learning_rate)
        else:
            # The cost is summed over the batch, so normalise it exactly as in
            # the SGD branch; otherwise the reported perplexity is not per word.
            loss_val += f_ada_update(x,y)/batch_size
        i_batch += 1

        if ((i_batch%n_look)==0):
            print('epoch %d, %.2f percent: training perplexity %.3f' 
                  %(i_epoch,(p*100),np.exp(loss_val/i_batch)))
            print('time: '+str(time.time()-a))
            a = time.time()

    # Validation pass: start from zero states and evaluate with dropout 0.
    f_reset_state()
    valid_cost = 0; i_batch = 0
    for (x,y,p) in data_producer(data_valid,batch_size):
        valid_cost += f_cost(x,y,0.)/batch_size # dropout-rate: 0.
        i_batch += 1
    tp = np.exp(valid_cost/i_batch)
    print('')
    print('======================================')
    print('valid perplexity: %.3f' %(tp))
    print('======================================')
    # Zero the states again so the next training epoch does not start from
    # the validation states, and restart the running averages.
    f_reset_state()
    i_batch = 0; loss_val = 0

#     if is_sgd:
    # NOTE(review): i_epoch is 0-based and this check runs at the end of an
    # epoch, so the first decay happens once epoch index max_epoch+1 has
    # finished, i.e. max_epoch+2 epochs run at the initial rate — confirm this
    # is the intended schedule.
    # The decayed value only reaches f_update; f_ada_update was built with the
    # learning rate it saw at compile time.
    if i_epoch > max_epoch:
        learning_rate /= decay # adjust learning rate

epoch 0, 8.57 percent: training perplexity 930.969
time: 34.69767618179321
epoch 0, 17.18 percent: training perplexity 683.096
time: 34.643112897872925
epoch 0, 25.79 percent: training perplexity 555.802
time: 34.69820022583008
epoch 0, 34.40 percent: training perplexity 477.477
time: 34.667715549468994
epoch 0, 43.00 percent: training perplexity 424.574
time: 34.658427000045776
epoch 0, 51.61 percent: training perplexity 387.441
time: 34.71850514411926
epoch 0, 60.22 percent: training perplexity 354.260
time: 34.639967918395996
epoch 0, 68.83 percent: training perplexity 330.121
time: 34.670252323150635
epoch 0, 77.44 percent: training perplexity 312.332
time: 34.914453744888306
epoch 0, 86.05 percent: training perplexity 294.283
time: 34.917240858078
epoch 0, 94.66 percent: training perplexity 277.532
time: 34.907180309295654

valid perplexity: 179.057
epoch 1, 8.57 percent: training perplexity 152.177
time: 88.91546082496643
epoch 1, 17.18 percent: training perplexity 158.776
time: 

KeyboardInterrupt: 

### Test

In [31]:
# Evaluate on the test set: start from zero states, average the per-batch cost
# (dropout probability 0.) and report its exponential as the perplexity.
f_reset_state()
test_cost = 0
i_batch = 0
for x, y, p in data_producer(data_test, batch_size):
    i_batch += 1
    test_cost += f_cost(x, y, 0.)/batch_size
tp = np.exp(test_cost/i_batch)
separator = '======================================'
print('')
print(separator)
print('test perplexity: %.3f' %(tp))
print(separator)


test perplexity: 118.384
