In [19]:
#import necessary dependencies
import math
import os
import time
import numpy as np
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import nn, rnn
import mxnet.ndarray as F
import logging

In [3]:
#load cpu context, if using cpu mx.gpu(0)
context= mx.gpu(0)

In [4]:
# loading https://s3.amazonaws.com/text-datasets/nietzsche.txt nietzsche- You can load anyother text you want (https://cs.stanford.edu/people/karpathy/char-rnn/)
with open("../data/nlp/nietzsche.txt", errors='ignore') as f:
    text = f.read()
print(len(text))

600901


In [5]:
#total of characters in dataset
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 86


In [6]:
#zeros for padding
chars.insert(0, "\0")

In [7]:
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

In [8]:
#maps character to unique index e.g. {a:1,b:2....}
char_indices = dict((c, i) for i, c in enumerate(chars))
#maps indices to character (1:a,2:b ....)
indices_char = dict((i, c) for i, c in enumerate(chars))

In [9]:
#mapping the dataset into index
idx = [char_indices[c] for c in text]

In [10]:
print(len(idx))

600901


In [11]:
#testing the mapping
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Our unrolled RNN

In this model we map 3 inputs to one output. Later we will design rnn with n inputs to n inputs (sequence to sequence)

In [12]:
#input for neural network( our basic rnn has 3 inputs, n samples)
cs=3
c1_dat = [idx[i] for i in range(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-1-cs, cs)]
#the output of rnn network (single vector)
c4_dat = [idx[i+3] for i in range(0, len(idx)-1-cs, cs)]

In [13]:
#stacking the inputs to form (3 input features )
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

In [14]:
# the output (1 X N data points)
y = np.stack(c4_dat[:-2])

In [15]:
col_concat = np.array([x1,x2,x3])
t_col_concat = col_concat.T
print(t_col_concat.shape)


(200297, 3)


In [16]:
# our sample inputs for the model
x1_nd = mx.nd.array(x1)
x2_nd = mx.nd.array(x2)
x3_nd = mx.nd.array(x3)
sample_input = mx.nd.array([ [x1[0],x2[0],x3[0]] ,[x1[1],x2[1],x3[1] ] ])

simple_train_data = mx.nd.array(t_col_concat)
simple_label_data = mx.nd.array(y)

In [17]:
#Set the batchsize as 32, so input is of form 32 X 3
#output is 32 X 1
batch_size = 32
def get_batch(source,label_data, i,batch_size=32):
    bb_size = min(batch_size, source.shape[0] - 1 - i)
    data = source[i : i + bb_size]
    target = label_data[i: i + bb_size]
    #print(target.shape)
    return data, target.reshape((-1,))

In [18]:
test_bat,test_target = get_batch(simple_train_data,simple_label_data,5,batch_size)
print(test_bat.shape)
print(test_target.shape)

(32, 3)
(32,)


<img src="images/unRolled_rnn.png">

In [20]:
#simple UnRollredRNN_Model
from mxnet.gluon import Block, nn
from mxnet import ndarray as F

class UnRolledRNN_Model(Block):
    def __init__(self,vocab_size, num_embed, num_hidden,**kwargs):
        super(UnRolledRNN_Model, self).__init__(**kwargs)
        self.num_embed = num_embed
        self.vocab_size = vocab_size
        
        # use name_scope to give child Blocks appropriate names.
        # It also allows sharing Parameters between Blocks recursively.
        with self.name_scope():
            self.encoder = nn.Embedding(self.vocab_size, self.num_embed)
            self.dense1 = nn.Dense(num_hidden,activation='relu',flatten=True)
            self.dense2 = nn.Dense(num_hidden,activation='relu',flatten=True)
            self.dense3 = nn.Dense(vocab_size,flatten=True)

    def forward(self, inputs):
        emd = self.encoder(inputs)
        #print(emd.shape)
        #since the input is shape(batch_size,input(3 characters))
        # we need to extract 0th,1st,2nd character from each batch
        chararcter1 = emd[:,0,:]
        chararcter2 = emd[:,1,:]
        chararcter3 = emd[:,2,:]
        c1_hidden = self.dense1(chararcter1) # green arrow in diagram for character 1
        c2_hidden = self.dense1(chararcter2) # green arrow in diagram for character 2
        c3_hidden = self.dense1(chararcter3) # green arrow in diagram for character 3
        c1_hidden_2 = self.dense2(c1_hidden)  # yellow arrow in diagram
        addition_result = F.add(c2_hidden,c1_hidden_2) # Total c1 + c2
        addition_hidden = self.dense2(addition_result) # the yellow arrow
        addition_result_2 = F.add(addition_hidden,c3_hidden) # Total c1 + c2
        final_output = self.dense3(addition_result_2)      
        return final_output
    
vocab_size = len(chars)+1 # the vocabsize
num_embed = 30
num_hidden = 256
#model creatings
simple_model = UnRolledRNN_Model(vocab_size, num_embed, num_hidden)
#model initilisation
simple_model.collect_params().initialize(mx.init.Xavier(), ctx=context)
trainer = gluon.Trainer(simple_model.collect_params(), 'adam')
loss = gluon.loss.SoftmaxCrossEntropyLoss()
#sample input shape is of size (2x3)
#output = simple_model(sample_input)
#sample out shape should be(3*87). 87 is our vocab size
#print('the output shape',output.shape)

In [20]:
#check point file
os.makedirs('checkpoints', exist_ok=True)
filename_unrolled_rnn = "checkpoints/rnn_gluon_abc.params" 

In [21]:
#the actual training 
def UnRolledRNNtrain(train_data,label_data,batch_size=32,epochs=10):
    epochs = epochs
    smoothing_constant = .01
    for e in range(epochs):
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, batch_size)):
            data, target = get_batch(train_data,label_data, i,batch_size)
            data = data.as_in_context(context)
            target = target.as_in_context(context)
            with autograd.record():
                output = simple_model(data)
                L = loss(output, target)
            L.backward()
            trainer.step(data.shape[0])

            ##########################
            #  Keep a moving average of the losses
            ##########################
            if ibatch == 128:
                curr_loss = mx.nd.mean(L).asscalar()
                moving_loss = 0
                moving_loss = (curr_loss if ((i == 0) and (e == 0)) 
                           else (1 - smoothing_constant) * moving_loss + (smoothing_constant) * curr_loss)
                print("Epoch %s. Loss: %s, moving_loss %s" % (e,curr_loss,moving_loss))   
    simple_model.save_params(filename_unrolled_rnn)          

In [22]:
epochs = 10
UnRolledRNNtrain(simple_train_data,simple_label_data,batch_size,epochs)

Epoch 0. Loss: 2.57893, moving_loss 0.0257892537117
Epoch 1. Loss: 2.11341, moving_loss 0.0211340808868
Epoch 2. Loss: 2.16297, moving_loss 0.0216296601295
Epoch 3. Loss: 2.11876, moving_loss 0.0211876249313
Epoch 4. Loss: 2.05449, moving_loss 0.0205448579788
Epoch 5. Loss: 2.05682, moving_loss 0.0205682468414
Epoch 6. Loss: 2.06466, moving_loss 0.020646572113
Epoch 7. Loss: 2.06384, moving_loss 0.02063839674
Epoch 8. Loss: 2.04167, moving_loss 0.0204166507721
Epoch 9. Loss: 2.03729, moving_loss 0.0203729319572


In [23]:
#loading the model back
simple_model.load_params(filename_unrolled_rnn, ctx=context)

In [24]:
#evaluating the model
def evaluate(input_string):
    idx = [char_indices[c] for c in input_string]
    sample_input = mx.nd.array([[ idx[0],idx[1],idx[2] ]],ctx=context)
    output = simple_model(sample_input)
    index = mx.nd.argmax(output, axis=1)
    return index.asnumpy()[0]

In [25]:
#predictions
begin_char = 'lov'
answer = evaluate(begin_char)
print('the predicted answer is ',indices_char[answer])


the predicted answer is  e


## Character RNN using gluon/lstm api

Training sequence 2 sequence models using Gluon API

In [17]:
# Class to create model objects.
class GluonRNNModel(gluon.Block):
    """A model with an encoder, recurrent layer, and a decoder."""

    def __init__(self, mode, vocab_size, num_embed, num_hidden,
                 num_layers, dropout=0.5, **kwargs):
        super(GluonRNNModel, self).__init__(**kwargs)
        with self.name_scope():
            self.drop = nn.Dropout(dropout)
            self.encoder = nn.Embedding(vocab_size, num_embed,
                                        weight_initializer = mx.init.Uniform(0.1))
               
            if mode == 'lstm':
                self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout,
                                    input_size=num_embed)
            elif mode == 'gru':
                self.rnn = rnn.GRU(num_hidden, num_layers, dropout=dropout,
                                   input_size=num_embed)
            else:
                self.rnn = rnn.RNN(num_hidden, num_layers, activation='relu', dropout=dropout,
                                   input_size=num_embed)
            self.decoder = nn.Dense(vocab_size, in_units = num_hidden)
            self.num_hidden = num_hidden
    #define the forward pass of the neural network
    def forward(self, inputs, hidden):
        emb = self.drop(self.encoder(inputs))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        #print('output forward',output.shape)
        decoded = self.decoder(output.reshape((-1, self.num_hidden)))
        return decoded, hidden
    #Initial state of netork
    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

In [18]:
# define the lstm
mode = 'lstm'
vocab_size = len(chars)+1 # number of characters in vocab_size
embedsize = 500
hididen_units = 1000
number_layers = 2
clip = 0.2
epochs = 2 # use 200 epochs for good result
batch_size = 32
seq_length = 100 # sequence length
dropout = 0.4
log_interval = 64
rnn_save = 'checkpoints/gluonlstm_abc' #checkpoints/gluonlstm_2 (prepared for seq_lenght 100, 200 epochs)


In [19]:
# GluonRNNModel 
model = GluonRNNModel(mode, vocab_size, embedsize, hididen_units,
                       number_layers, dropout)
# initalise the weights of models to random weights
model.collect_params().initialize(mx.init.Xavier(), ctx=context)
# Adam trainer
trainer = gluon.Trainer(model.collect_params(), 'adam')
#softmax cros entropy loss
loss = gluon.loss.SoftmaxCrossEntropyLoss()

In [14]:


# prepares rnn batches
# The batch will be of shape is (num_example * batch_size) because of RNN uses sequences of input     x
# for example if we use (a1,a2,a3) as one input sequence , (b1,b2,b3) as another input sequence and (c1,c2,c3)
# if we have batch of 3, then at timestep '1'  we only have (a1,b1.c1) as input, at timestep '2' we have (a2,b2,c2) as input...
# hence the batchsize is of order 
# In feedforward we use (batch_size, num_example)
def rnn_batch(data, batch_size):
    """Reshape data into (num_example, batch_size)"""
    nbatch = data.shape[0] // batch_size
    data = data[:nbatch * batch_size]
    data = data.reshape((batch_size, nbatch)).T
    return data

idx_nd = mx.nd.array(idx)
# convert the idex of characters
train_data_rnn_gluon = rnn_batch(idx_nd, batch_size).as_in_context(context)


In [15]:
#get the batch
def get_batch(source, i,seq):
    seq_len = min(seq, source.shape[0] - 1 - i)
    data = source[i : i + seq_len]
    target = source[i + 1 : i + 1 + seq_len]
    return data, target.reshape((-1,))

# detach the hidden state, so we dont accidentally compute gradients
def detach(hidden):
    if isinstance(hidden, (tuple, list)):
        hidden = [i.detach() for i in hidden]
    else:
        hidden = hidden.detach()
    return hidden

In [73]:
def trainGluonRNN(epochs,train_data,seq=seq_length):
    for epoch in range(epochs):
        total_L = 0.0
        hidden = model.begin_state(func = mx.nd.zeros, batch_size = batch_size, ctx = context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, seq_length)):
            data, target = get_batch(train_data, i,seq)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target) # this is total loss associated with seq_length
                L.backward()

            grads = [i.grad(context) for i in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and seq_length to balance it.
            gluon.utils.clip_global_norm(grads, clip * seq_length * batch_size)

            trainer.step(batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % log_interval == 0 and ibatch > 0:
                cur_L = total_L /  seq_length / batch_size / log_interval
                print('[Epoch %d Batch %d] loss %.2f',epoch + 1, ibatch, cur_L)
                total_L = 0.0
        model.save_params(rnn_save)

In [74]:
print('the train data shape is',train_data_rnn_gluon.shape)

the train data shape is (18778, 32)


In [75]:
#The train data shape
trainGluonRNN(epochs,train_data_rnn_gluon,seq=seq_length)


In [20]:
model.load_params(rnn_save, context)

In [77]:
#evaluates a seqtoseq model over input string
def evaluate_seq2seq(model,input_string,seq_length,batch_size):
    idx = [char_indices[c] for c in input_string]
    if(len(input_string) != seq_length):
        raise ValueError("input string should be equal to sequence length")
    hidden = model.begin_state(func = mx.nd.zeros, batch_size = batch_size, ctx=context)
    sample_input = mx.nd.array(np.array([idx[0:seq_length]]).T
                                ,ctx=context)
    output,hidden = model(sample_input,hidden)
    index = mx.nd.argmax(output, axis=1)
    index = index.asnumpy()
    return [indices_char[char] for char in index]

In [78]:
#maps the input sequence to output sequence
def mapInput(input_str,output_str):
    for i,_ in enumerate(input_str):
        partial_input = input_str[:i+1]
        partial_output = output_str[i:i+1]
        print(partial_input + "->" + partial_output[0])

In [79]:
test_input = 'probably the time is at hand when it will be once and again understood WHAT has actually sufficed an'
print(len(test_input))
result= evaluate_seq2seq(model,test_input,seq_length,1)
mapInput(test_input,result)

100
p->h
pr->o
pro->v
prob->a
proba->b
probab->l
probabl->y
probably-> 
probably ->d
probably t->h
probably th->e
probably the-> 
probably the ->b
probably the t->i
probably the ti->m
probably the tim->e
probably the time-> 
probably the time ->i
probably the time i->n
probably the time is-> 
probably the time is ->a
probably the time is a->t
probably the time is at-> 
probably the time is at ->a
probably the time is at h->a
probably the time is at ha->n
probably the time is at han->d
probably the time is at hand-> 
probably the time is at hand ->w
probably the time is at hand w->h
probably the time is at hand wh->e
probably the time is at hand whe->n
probably the time is at hand when-> 
probably the time is at hand when ->i
probably the time is at hand when i->t
probably the time is at hand when it-> 
probably the time is at hand when it ->i
probably the time is at hand when it w->a
probably the time is at hand when it wi->l
probably the time is at hand when it wil->l
probably the tim

In [80]:
# a nietzsche like text generator
import sys
def generate_random_text(model,input_string,seq_length,batch_size,sentence_length):
    count = 0
    new_string = ''
    cp_input_string = input_string
    hidden = model.begin_state(func = mx.nd.zeros, batch_size = batch_size, ctx=context)
    while count < sentence_length:
        idx = [char_indices[c] for c in input_string]
        if(len(input_string) != seq_length):
            print(len(input_string))
            raise ValueError('there was a error in the input ')
        sample_input = mx.nd.array(np.array([idx[0:seq_length]]).T
                                ,ctx=context)
        output,hidden = model(sample_input,hidden)
        index = mx.nd.argmax(output, axis=1)
        index = index.asnumpy()
        count = count + 1
        new_string = new_string + indices_char[index[-1]]
        input_string = input_string[1:] + indices_char[index[-1]]
    print(cp_input_string + new_string)

In [81]:
generate_random_text(model,"probably the time is at hand when it will be once and again understood WHAT has actually sufficed an",seq_length,1,200)

probably the time is at hand when it will be once and again understood WHAT has actually sufficed and
missed his eyes are open to things _near_. He is not punished by
the mind will the belief in them that there is a god who commands us
to be regarded as something "given." How far from the beginning,
