In [1]:
import mxnet as mx
print(mx.__version__)

1.0.0


  import OpenSSL.SSL


In [2]:
from __future__ import print_function
import mxnet as mx
from mxnet import nd, autograd
import numpy as np
mx.random.seed(1)
#ctx = mx.gpu(0)
ctx = mx.cpu(0)

## 1. DATASET

#### Dataset: “The Time Machine”
Now mess with some data. I grabbed a copy of the Time Machine, mostly because it’s available freely thanks to the good people at Project Gutenberg and a lot of people are tired of seeing RNNs generate Shakespeare. In case you prefer torturing Shakespeare to torturing H.G. Wells, I’ve also included Andrej Karpathy’s tinyshakespeare.txt in the data folder. Let’s get started by reading in the data.

In [3]:
with open("./data/timemachine.txt") as f:
    time_machine = f.read()

#### And you’ll probably want to get a taste for what the text looks like.

In [4]:
print(time_machine[0:1000])

﻿The Project Gutenberg EBook of Making the Nine, by Albertus T. Dudley

This eBook is for the use of anyone anywhere in the United States and most
other parts of the world at no cost and with almost no restrictions
whatsoever.  You may copy it, give it away or re-use it under the terms of
the Project Gutenberg License included with this eBook or online at
www.gutenberg.org.  If you are not located in the United States, you'll have
to check the laws of the country where you are located before using this ebook.

Title: Making the Nine

Author: Albertus T. Dudley

Illustrator: Charles Copeland

Release Date: January 22, 2018 [EBook #56415]

Language: English

Character set encoding: UTF-8

*** START OF THIS PROJECT GUTENBERG EBOOK MAKING THE NINE ***




Produced by Barry Abrahamsen and the Online Distributed
Proofreading Team at http://www.pgdp.net (This file was
produced from images generously made available by The
Internet Archive)









                            MAKING THE NINE



In [5]:
#time_machine = [x.strip() for x in time_machine.split('\n')]
time_machine = [x.strip() for x in time_machine.split('\n') if list(set(x.strip())) not in [[], ['-']]]
time_machine = '\n'.join(time_machine[time_machine.index('CHAPTER I'):])
print(time_machine[0:1000])

CHAPTER I
AN UNWELCOME PROPOSITION
“HOW they do yell! Where’s your patriotism, Phil, to be hanging round in
this gloomy crowd when all your friends are howling their heads off
outside? Don’t you know Yale won the game? Why aren’t you out there with
the rest?”
Philip Poole looked up with a smile, but did not reply.
“He’s comforting the afflicted,” said Dick Melvin, who shared with Poole
the ownership of the room. “You don’t want to gloat over us poor
Harvardites, do you, Phil? Thank you much for your sympathy.”
“That isn’t the reason,” said the lad, after a pause, with the sober
look in his big, wide-open eyes that made him seem serious even when his
feelings inclined in the opposite direction. “I just don’t see any cause
for such a racket. A Yale football victory over Harvard is too ordinary
an occurrence to get wild over.”
The chorus of hoots and groans that greeted this explanation brought a
smile of satisfaction to the boy’s face. He was the youngest of the
company, only in his seco

#### Numerical representations of characters
When we create numerical representations of characters, we’ll use one-hot representations. A one-hot is a vector that takes value 1 in the index corresponding to a character, and 0 elsewhere. Because this vector is as long as the vocab, let’s get a definitive list of characters in this dataset so that our representation is not longer than necessary.

In [6]:
character_list = list(set(time_machine))
vocab_size = len(character_list)
print(character_list)
print("Length of vocab: %s" % vocab_size)

['/', 'X', ']', ',', 'm', '╤', '\n', 'N', "'", '○', 'n', 'K', 'x', 'R', 'r', '[', '┼', '—', '%', 'L', 'A', 'q', '9', '┴', '1', 'F', '”', ';', 'S', 'u', 'É', '3', '’', '│', '"', ':', 'C', 'Z', 'h', 'V', 'k', '@', '(', 'f', '_', 's', 'ö', 'e', '.', 'D', 'w', '─', '!', 'H', 'g', 't', '4', 'l', '═', 'Q', 'i', 'b', 'é', '0', 'æ', 'M', '$', 'Æ', '-', 'I', 'T', 'P', 'Y', '‘', 'ê', '*', 'c', '7', '6', '“', ')', 'z', 'O', 'U', '–', 'G', '5', 'E', '●', ' ', 'J', '?', 'W', 'y', 'p', 'd', '8', '2', 'v', 'o', 'a', 'j', 'B']
Length of vocab: 103


#### We’ll often want to access the index corresponding to each character quickly so let’s store this as a dictionary.

In [7]:
character_dict = {}
for e, char in enumerate(character_list):
    character_dict[char] = e
print(character_dict)

{'/': 0, 'X': 1, ']': 2, ',': 3, 'm': 4, '╤': 5, '\n': 6, 'N': 7, "'": 8, '○': 9, 'n': 10, 'K': 11, 'x': 12, 'R': 13, 'r': 14, '[': 15, '┼': 16, '—': 17, '%': 18, 'L': 19, 'A': 20, 'q': 21, '9': 22, '┴': 23, '1': 24, 'F': 25, '”': 26, ';': 27, 'S': 28, 'u': 29, 'É': 30, '3': 31, '’': 32, '│': 33, '"': 34, ':': 35, 'C': 36, 'Z': 37, 'h': 38, 'V': 39, 'k': 40, '@': 41, '(': 42, 'f': 43, '_': 44, 's': 45, 'ö': 46, 'e': 47, '.': 48, 'D': 49, 'w': 50, '─': 51, '!': 52, 'H': 53, 'g': 54, 't': 55, '4': 56, 'l': 57, '═': 58, 'Q': 59, 'i': 60, 'b': 61, 'é': 62, '0': 63, 'æ': 64, 'M': 65, '$': 66, 'Æ': 67, '-': 68, 'I': 69, 'T': 70, 'P': 71, 'Y': 72, '‘': 73, 'ê': 74, '*': 75, 'c': 76, '7': 77, '6': 78, '“': 79, ')': 80, 'z': 81, 'O': 82, 'U': 83, '–': 84, 'G': 85, '5': 86, 'E': 87, '●': 88, ' ': 89, 'J': 90, '?': 91, 'W': 92, 'y': 93, 'p': 94, 'd': 95, '8': 96, '2': 97, 'v': 98, 'o': 99, 'a': 100, 'j': 101, 'B': 102}


In [8]:
time_numerical = [character_dict[char] for char in time_machine]
print(time_numerical[0:1000])

[36, 53, 20, 71, 70, 87, 13, 89, 69, 6, 20, 7, 89, 83, 7, 92, 87, 19, 36, 82, 65, 87, 89, 71, 13, 82, 71, 82, 28, 69, 70, 69, 82, 7, 6, 79, 53, 82, 92, 89, 55, 38, 47, 93, 89, 95, 99, 89, 93, 47, 57, 57, 52, 89, 92, 38, 47, 14, 47, 32, 45, 89, 93, 99, 29, 14, 89, 94, 100, 55, 14, 60, 99, 55, 60, 45, 4, 3, 89, 71, 38, 60, 57, 3, 89, 55, 99, 89, 61, 47, 89, 38, 100, 10, 54, 60, 10, 54, 89, 14, 99, 29, 10, 95, 89, 60, 10, 6, 55, 38, 60, 45, 89, 54, 57, 99, 99, 4, 93, 89, 76, 14, 99, 50, 95, 89, 50, 38, 47, 10, 89, 100, 57, 57, 89, 93, 99, 29, 14, 89, 43, 14, 60, 47, 10, 95, 45, 89, 100, 14, 47, 89, 38, 99, 50, 57, 60, 10, 54, 89, 55, 38, 47, 60, 14, 89, 38, 47, 100, 95, 45, 89, 99, 43, 43, 6, 99, 29, 55, 45, 60, 95, 47, 91, 89, 49, 99, 10, 32, 55, 89, 93, 99, 29, 89, 40, 10, 99, 50, 89, 72, 100, 57, 47, 89, 50, 99, 10, 89, 55, 38, 47, 89, 54, 100, 4, 47, 91, 89, 92, 38, 93, 89, 100, 14, 47, 10, 32, 55, 89, 93, 99, 29, 89, 99, 29, 55, 89, 55, 38, 47, 14, 47, 89, 50, 60, 55, 38, 6, 55, 38, 

In [9]:
#########################
#  Check that the length is right
#########################
print(len(time_machine))
print(len(time_numerical))

#########################
#  Check that the format looks right
#########################
print(time_numerical[:25])

#########################
#  Convert back to text
#########################
print("".join([character_list[idx] for idx in time_numerical[:25]]))

336118
336118
[36, 53, 20, 71, 70, 87, 13, 89, 69, 6, 20, 7, 89, 83, 7, 92, 87, 19, 36, 82, 65, 87, 89, 71, 13]
CHAPTER I
AN UNWELCOME PR


#### One-hot representations
We can use NDArray’s one_hot() operation to render a one-hot representation of each character. But frack it, since this is the from scratch tutorial, let’s write this ourselves.

In [10]:
def one_hots(numerical_list, vocab_size=vocab_size):
    result = nd.zeros((len(numerical_list), vocab_size), ctx=ctx)
    #for i, idx in enumerate(numerical_list):
    #    result[i, idx] = 1.0
    #Tutorial의 For문 비효율적 --> nd.arange로 수정 (17초 -> 0.2초)
    result[nd.arange(len(numerical_list)), numerical_list] = 1.0
    return result

In [11]:
print(one_hots(time_numerical))


[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
<NDArray 336118x103 @cpu(0)>


#### That looks about right. Now let’s write a function to convert our one-hots back to readable text.

In [12]:
def textify(embedding):
    result = ""
    indices = nd.argmax(embedding, axis=1).asnumpy()
    for idx in indices:
        result += character_list[int(idx)]
    return result

In [13]:
print(textify(one_hots(time_numerical[0:100])))

CHAPTER I
AN UNWELCOME PROPOSITION
“HOW they do yell! Where’s your patriotism, Phil, to be hanging r


## 2. PreProcessing

#### Preparing the data for training
Great, it’s not the most efficient implementation, but we know how it works. So we’re already doing better than the majority of people with job titles in machine learning. Now, let’s chop up our dataset into sequences that we could feed into our model.

You might think we could just feed in the entire dataset as one gigantic input and backpropagate across the entire sequence. When you try to backpropagate across thousands of steps a few things go wrong: (1) The time it takes to compute a single gradient update will be unreasonably long (2) The gradient across thousands of recurrent steps has a tendency to either blow up, causing NaN errors due to losing precision, or to vanish.

Thus we’re going to look at feeding in our data in reasonably short sequences. Note that this home-brew version is pretty slow; if you’re still running on a CPU, this is the right time to make dinner.

In [14]:
seq_length = 64
# -1 here so we have enough characters for labels later
num_samples = (len(time_numerical) - 1) // seq_length
dataset = one_hots(time_numerical[:seq_length*num_samples]).reshape((num_samples, seq_length, vocab_size))
print('Shape of dataset: ', dataset.shape) #(num_samples, seq_length, vocab_size)
print(textify(dataset[0]))

Shape of dataset:  (5251, 64, 103)
CHAPTER I
AN UNWELCOME PROPOSITION
“HOW they do yell! Where’s yo


#### Now that we’ve chopped our dataset into sequences of length seq_length, at every time step, our input is a single one-hot vector. This means that our computation of the hidden layer would consist of matrix-vector multiplications, which are not especially efficient on GPU. To take advantage of the available computing resources, we’ll want to feed through a batch of sequences at the same time. The following code may look tricky but it’s just some plumbing to make the data look like this.

In [15]:
batch_size = 32

In [16]:
print('Shape of dataset: ', dataset.shape) #(num_samples, seq_length, vocab_size)
print('# of sequences in dataset: ', len(dataset))

num_batches = len(dataset) // batch_size
print('Size of batch: ', batch_size)
print('# of batches: ', num_batches)

train_data = dataset[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, vocab_size))

# swap batch_size and seq_length axis to make later access easier
print('Shape of train_data: ', train_data.shape) #(batch_size, num_batches, seq_length, vocab_size)
train_data = nd.swapaxes(train_data, 0, 1)
print('Shape of train_data: ', train_data.shape) #(num_batches, batch_size, seq_length, vocab_size)
train_data = nd.swapaxes(train_data, 1, 2)
print('Shape of train_data: ', train_data.shape) #(num_batches, seq_length, batch_size, vocab_size)

Shape of dataset:  (5251, 64, 103)
# of sequences in dataset:  5251
Size of batch:  32
# of batches:  164
Shape of train_data:  (32, 164, 64, 103)
Shape of train_data:  (164, 32, 64, 103)
Shape of train_data:  (164, 64, 32, 103)


#### Let’s sanity check that everything went the way we hope. For each data_row, the second sequence should follow the first:

In [17]:
for i in range(3):
    print("***Batch %s:***\n\n%s\n\n%s\n\n" % (i, textify(train_data[i, :, 0]), textify(train_data[i, :, 1])))

***Batch 0:***

CHAPTER I
AN UNWELCOME PROPOSITION
“HOW they do yell! Where’s yo

 “He’s scared as death of the mile run. I guess I’ll land
him.”



***Batch 1:***

ur patriotism, Phil, to be hanging round in
this gloomy crowd wh

CHAPTER II
ON THE ICE
AS Dickinson foresaw, Melvin yielded to th


***Batch 2:***

en all your friends are howling their heads off
outside? Don’t y

e pressure brought to bear
upon him, and resigned himself to the




#### Preparing our labels
Now let’s repurpose the same batching code to create our label batches

In [18]:
# Preparing the data for training 과 동일한 방법
# (num_samples, seq_length, vocab_size)
# --> num_samples를 num_batches*batch_size 길이로 고정
# --> (batch_size, num_batches, seq_length, vocab_size)
# --> (num_batches, seq_length, batch_size, vocab_size)

labels = one_hots(time_numerical[1:seq_length*num_samples+1])
print('Shape of labels: ', labels.shape) #(num_samples*seq_length, vocab_size)

train_label_1 = labels.reshape((num_samples, seq_length, vocab_size))
print('Shape of train_label_1: ', train_label_1.shape) #(num_samples, seq_length, vocab_size)
train_label_1 = train_label_1[:num_batches*batch_size]
print('Shape of train_label_1: ', train_label_1.shape) #(num_batches*batch_size, seq_length, vocab_size)

train_label_1 = train_label_1.reshape((batch_size, num_batches, seq_length, vocab_size))
print('Shape of train_label_1: ', train_label_1.shape) #(batch_size, num_batches, seq_length, vocab_size)
train_label_1 = nd.swapaxes(train_label_1, 0, 1)
print('Shape of train_label_1: ', train_label_1.shape) #(num_batches, batch_size, seq_length, vocab_size)
train_label_1 = nd.swapaxes(train_label_1, 1, 2)
print('Shape of train_label_1: ', train_label_1.shape) #(num_batches, seq_length, batch_size, vocab_size)



Shape of labels:  (336064, 103)
Shape of train_label_1:  (5251, 64, 103)
Shape of train_label_1:  (5248, 64, 103)
Shape of train_label_1:  (32, 164, 64, 103)
Shape of train_label_1:  (164, 32, 64, 103)
Shape of train_label_1:  (164, 64, 32, 103)


In [19]:
labels = one_hots(time_numerical[1:seq_length*num_samples+1])
print('Shape of labels: ', labels.shape) #(num_samples*seq_length, vocab_size)

train_label_2 = labels.reshape((batch_size, num_batches, seq_length, vocab_size))
print('Shape of train_label_2: ', train_label_2.shape) #(batch_size, num_batches, seq_length, vocab_size)
train_label_2 = nd.swapaxes(train_label_2, 0, 1)
print('Shape of train_label_2: ', train_label_2.shape) #(num_batches, batch_size, seq_length, vocab_size)
train_label_2 = nd.swapaxes(train_label_2, 1, 2)
print('Shape of train_label_2: ', train_label_2.shape) #(num_batches, seq_length, batch_size, vocab_size)


Shape of labels:  (336064, 103)
Shape of train_label_2:  (32, 164, 64, 103)
Shape of train_label_2:  (164, 32, 64, 103)
Shape of train_label_2:  (164, 64, 32, 103)


In [20]:
np.array_equal(train_label_1.asnumpy(), train_label_2.asnumpy())

True

In [21]:
# (참고) mx.nd.array는 전체 길이가 다른 Shape으로 변경 가능

print('\nnp.array: Size 150 into (2,3,4,5)\n')
try:
    print(np.array(range(150)).reshape((2,3,4,5)))
except ValueError:
    print('np.array: Cannot reshape array into different shape')

print('\nmx.nd.array: Size 150 into (2,3,4,5)\n')
try:
    print(print(mx.nd.array(range(150)).reshape((2,3,4,5))))
except ValueError:
    print('mx.nd.array: Cannot reshape array into different shape')
    




np.array: Size 150 into (2,3,4,5)

np.array: Cannot reshape array into different shape

mx.nd.array: Size 150 into (2,3,4,5)


[[[[   0.    1.    2.    3.    4.]
   [   5.    6.    7.    8.    9.]
   [  10.   11.   12.   13.   14.]
   [  15.   16.   17.   18.   19.]]

  [[  20.   21.   22.   23.   24.]
   [  25.   26.   27.   28.   29.]
   [  30.   31.   32.   33.   34.]
   [  35.   36.   37.   38.   39.]]

  [[  40.   41.   42.   43.   44.]
   [  45.   46.   47.   48.   49.]
   [  50.   51.   52.   53.   54.]
   [  55.   56.   57.   58.   59.]]]


 [[[  60.   61.   62.   63.   64.]
   [  65.   66.   67.   68.   69.]
   [  70.   71.   72.   73.   74.]
   [  75.   76.   77.   78.   79.]]

  [[  80.   81.   82.   83.   84.]
   [  85.   86.   87.   88.   89.]
   [  90.   91.   92.   93.   94.]
   [  95.   96.   97.   98.   99.]]

  [[ 100.  101.  102.  103.  104.]
   [ 105.  106.  107.  108.  109.]
   [ 110.  111.  112.  113.  114.]
   [ 115.  116.  117.  118.  119.]]]]
<NDArray 2x3x4x5 

#### A final sanity check
Remember that our target at every time step is to predict the next character in the sequence. So our labels should look just like our inputs but offset by one character. Let’s look at corresponding inputs and outputs to make sure everything lined up as expected.

In [22]:
train_label = train_label_1

In [23]:
print(textify(train_data[10, :, 3]))
print(textify(train_label[10, :, 3]))

e no more goals.
Again Varrell took the puck, and with his famil
 no more goals.
Again Varrell took the puck, and with his famili


In [24]:
def generate_batch_data(data, seq_length, dims, batch_size, normalize=None, **kwargs):
    
    data = mx.nd.array(data)
    
    if normalize!=None:
        data = normalize(data, **kwargs)
        
    data = data.reshape((-1, seq_length, dims))
    
    num_batches = len(data) // batch_size    
    data_batch = data.reshape((batch_size, num_batches, seq_length, dims))
    data_batch = nd.swapaxes(data_batch, 0, 1)
    data_batch = nd.swapaxes(data_batch, 1, 2)
    return data_batch




In [25]:
seq_length = 64
num_samples = (len(time_numerical) - 1) // seq_length
batch_size = 32
x = generate_batch_data(data = time_numerical[0:seq_length*num_samples],
                        seq_length = seq_length,
                        dims = vocab_size,
                        batch_size = batch_size,
                        normalize = one_hots,
                        vocab_size = vocab_size
                       )

y = generate_batch_data(data = time_numerical[1:seq_length*num_samples+1],
                        seq_length = seq_length,
                        dims = vocab_size,
                        batch_size = batch_size,
                        normalize = one_hots,
                        vocab_size = vocab_size
                       )


In [26]:
import random
random.seed(1)
train_idx = random.sample(range(len(x)), round(len(x) * 0.7))
valid_idx = [x for x in range(len(x)) if x not in train_idx]

In [27]:
train_x = x[train_idx]
train_y = y[train_idx]
valid_x = x[valid_idx]
valid_y = y[valid_idx]

## 3. Define Activation, Loss, Optimizer

#### Softmax Activation

Softmax Function 형태: exp(f) / ∑exp(f)

하지만 softmax function 을 코딩할 경우,

큰 숫자들을 나누는 것은 numerically unstable 하기 때문에, 노말리제이션 트릭을 사용한다.

(the intermediate terms exp(f) and ∑exp(f) may be very large due to the exponentials.)

In [28]:
# example with 3 classes and each having large scores
f = np.array([123, 456, 789])
# Bad: Numeric problem, potential blowup
# instead: first shift the values of f so that the highest number is 0:
p = np.exp(f) / np.sum(np.exp(f))

# f becomes [-666, -333, 0]
norm_f = f - np.max(f)
# safe to do, gives the correct answer
norm_p = np.exp(norm_f) / np.sum(np.exp(norm_f))

print(f)
print(p)
print(norm_f)
print(norm_p)

[123 456 789]
[  0.   0.  nan]
[-666 -333    0]
[  5.75274406e-290   2.39848787e-145   1.00000000e+000]


  """
  """


In [29]:
def sigmoid(x):
    return 1. / (1. + nd.exp(-x))
# Same as nd.Activation(x, act_type='sigmoid')

def tanh(x):
    return (nd.exp(x) - nd.exp(-x)) / (nd.exp(x) + nd.exp(-x))
# Same as nd.Activation(x, act_type='tanh')

def softmax(y_linear, temperature=1.0):
    lin = (y_linear-nd.max(y_linear, axis=1).reshape((-1,1))) / temperature # shift each row of y_linear by its max
    exp = nd.exp(lin)
    partition =nd.sum(exp, axis=1).reshape((-1,1))
    return exp / partition

In [30]:
####################
# With a temperature of 1 (always 1 during training), we get back some set of probabilities
####################
softmax(nd.array([[1, -1], [-1, 1]]), temperature=1.0)


[[ 0.88079703  0.11920292]
 [ 0.11920292  0.88079703]]
<NDArray 2x2 @cpu(0)>

In [31]:
####################
# If we set a high temperature, we can get more entropic (*noisier*) probabilities
####################
softmax(nd.array([[1,-1],[-1,1]]), temperature=1000.0)


[[ 0.50049996  0.49949998]
 [ 0.49949998  0.50049996]]
<NDArray 2x2 @cpu(0)>

In [32]:
####################
# Often we want to sample with low temperatures to produce sharp probabilities
####################
softmax(nd.array([[10,-10],[-10,10]]), temperature=.1)


[[ 1.  0.]
 [ 0.  1.]]
<NDArray 2x2 @cpu(0)>

#### Cross-entropy loss function
At every time step our task is to predict the next character, given the string up to that point. This is the familiar multi-task classification that we introduced for handwritten digit classification. Accordingly, we’ll rely on the same loss function, cross-entropy.

Cross-Entropy 형태: H(p,q) = -∑plog(q)

In [33]:
# def cross_entropy(yhat, y):
#     return - nd.sum(y * nd.log(yhat))

def cross_entropy(yhat, y):
    return - nd.mean(nd.sum(y * nd.log(yhat), axis=0, exclude=True))

# Define accuracy metric
def accuracy(yhat, y):
    pred = nd.argmax(yhat, axis = 1)
    real = nd.argmax(y, axis = 1)
    correct = nd.sum(pred == real)
    total = yhat.shape[0]
    return correct / total

In [34]:
cross_entropy(nd.array([[.2,.5,.3], [.2,.5,.3]]), nd.array([[1.,0,0], [0, 1.,0]]))


[ 1.15129256]
<NDArray 1 @cpu(0)>

#### Averaging the loss over the sequence
Because the unfolded RNN has multiple outputs (one at every time step) we can calculate a loss at every time step. The weights corresponding to the net at time step t influence both the loss at time step t and the loss at time step t+1. To combine our losses into a single global loss, we’ll take the average of the losses at each time step.

In [35]:
def average_ce_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs,labels):
        total_loss = total_loss + cross_entropy(output, label)
    return total_loss / len(outputs)

def average_acc(outputs, labels):
    assert(len(outputs) == len(labels))
    total_acc = 0.
    for (output, label) in zip(outputs,labels):
        total_acc = total_acc + accuracy(output, label)
    return total_acc / len(outputs)

#### Optimizer

In [36]:
def SGD(params, lr):
    for param in params:
        param[:] = param - lr * param.grad

## 4. Model

#### Recurrent neural networks

Recall that the update for an ordinary hidden layer in a neural network with activation function ϕ is given by

h=ϕ(xW + b)

To make this a recurrent neural network, we’re simply going to add a weight sum of the previous hidden state ht−1ht−1:

ht=ϕ(xtWxh + ht−1Whh+bh)

Then at every time set t, we’ll calculate the output as:

ŷt=softmax(htWhy + by)


In [37]:
class RNNModel_layer1():

    def __init__(self, model, train, valid, hidden_dims, ctx):
        assert(train[0].shape[1:] == valid[0].shape[1:])
        assert(train[1].shape[1:] == valid[1].shape[1:])
        
        self.model = model
        self.train_x = train[0]
        self.train_y = train[1]
        self.valid_x = valid[0]
        self.valid_y = valid[1]
        self.num_batches = self.train_x.shape[0]
        self.input_dims = self.train_x.shape[3]
        self.output_dims = self.train_y.shape[3]
        self.hidden_dims = hidden_dims
        self.seq_outputs = self.train_y.shape[1]
        self.ctx = ctx
                
        
    def allocate_params(self):        
        if self.model == 'simple_rnn':
            ########################
            #  Weights connecting the inputs to the hidden layer
            ########################
            self.Wxh = nd.random_normal(shape=(self.input_dims, self.hidden_dims), ctx=self.ctx) * .01

            ########################
            #  Recurrent weights connecting the hidden layer across time steps
            ########################
            self.Whh = nd.random_normal(shape=(self.hidden_dims, self.hidden_dims), ctx=self.ctx) * .01

            ########################
            #  Bias vector for hidden layer
            ########################
            self.bh = nd.random_normal(shape=self.hidden_dims, ctx=self.ctx) * .01

            ########################
            # Weights to the output nodes
            ########################
            self.Why = nd.random_normal(shape=(self.hidden_dims, self.output_dims), ctx=self.ctx) * .01
            self.by = nd.random_normal(shape=self.output_dims, ctx=self.ctx) * .01

            # NOTE: to keep notation consistent,
            # we should really use capital letters
            # for hidden layers and outputs,
            # since we are doing batchwise computations]

            ########################
            # Attach the gradients
            ########################
            self.params = [self.Wxh, self.Whh, self.bh, self.Why, self.by]
            for self.param in self.params:
                self.param.attach_grad()
        
        
        elif self.model == 'lstm':
            ########################
            #  Weights connecting the inputs to the hidden layer
            ########################
            self.Wxg = nd.random_normal(shape=(self.input_dims, self.hidden_dims), ctx=self.ctx) * .01
            self.Wxi = nd.random_normal(shape=(self.input_dims, self.hidden_dims), ctx=self.ctx) * .01
            self.Wxf = nd.random_normal(shape=(self.input_dims, self.hidden_dims), ctx=self.ctx) * .01
            self.Wxo = nd.random_normal(shape=(self.input_dims, self.hidden_dims), ctx=self.ctx) * .01

            ########################
            #  Recurrent weights connecting the hidden layer across time steps
            ########################
            self.Whg = nd.random_normal(shape=(self.hidden_dims, self.hidden_dims), ctx=self.ctx)* .01
            self.Whi = nd.random_normal(shape=(self.hidden_dims, self.hidden_dims), ctx=self.ctx)* .01
            self.Whf = nd.random_normal(shape=(self.hidden_dims, self.hidden_dims), ctx=self.ctx)* .01
            self.Who = nd.random_normal(shape=(self.hidden_dims, self.hidden_dims), ctx=self.ctx)* .01

            ########################
            #  Bias vector for hidden layer
            ########################
            self.bg = nd.random_normal(shape=self.hidden_dims, ctx=self.ctx) * .01
            self.bi = nd.random_normal(shape=self.hidden_dims, ctx=self.ctx) * .01
            self.bf = nd.random_normal(shape=self.hidden_dims, ctx=self.ctx) * .01
            self.bo = nd.random_normal(shape=self.hidden_dims, ctx=self.ctx) * .01

            ########################
            # Weights to the output nodes
            ########################
            self.Why = nd.random_normal(shape=(self.hidden_dims, self.output_dims), ctx=self.ctx) * .01
            self.by = nd.random_normal(shape=self.output_dims, ctx=self.ctx) * .01

            ########################
            # Attach the gradients
            ########################
            self.params = [self.Wxg, self.Wxi, self.Wxf, self.Wxo]
            self.params = self.params + [self.Whg, self.Whi, self.Whf, self.Who]
            self.params = self.params + [self.bg, self.bi, self.bf, self.bo]
            self.params = self.params + [self.Why, self.by]
            for self.param in self.params:
                self.param.attach_grad()

        elif self.model == 'gru':
            ########################
            #  Weights connecting the inputs to the hidden layer
            ########################
            self.Wxz = nd.random_normal(shape=(self.input_dims, self.hidden_dims), ctx=self.ctx) * .01
            self.Wxr = nd.random_normal(shape=(self.input_dims, self.hidden_dims), ctx=self.ctx) * .01
            self.Wxh = nd.random_normal(shape=(self.input_dims, self.hidden_dims), ctx=self.ctx) * .01

            ########################
            #  Recurrent weights connecting the hidden layer across time steps
            ########################
            self.Whz = nd.random_normal(shape=(self.hidden_dims, self.hidden_dims), ctx=self.ctx)* .01
            self.Whr = nd.random_normal(shape=(self.hidden_dims, self.hidden_dims), ctx=self.ctx)* .01
            self.Whh = nd.random_normal(shape=(self.hidden_dims, self.hidden_dims), ctx=self.ctx)* .01

            ########################
            #  Bias vector for hidden layer
            ########################
            self.bz = nd.random_normal(shape=self.hidden_dims, ctx=self.ctx) * .01
            self.br = nd.random_normal(shape=self.hidden_dims, ctx=self.ctx) * .01
            self.bh = nd.random_normal(shape=self.hidden_dims, ctx=self.ctx) * .01

            ########################
            # Weights to the output nodes
            ########################
            self.Why = nd.random_normal(shape=(self.hidden_dims, self.output_dims), ctx=self.ctx) * .01
            self.by = nd.random_normal(shape=self.output_dims, ctx=self.ctx) * .01
            
            ########################
            # Attach the gradients
            ########################
            self.params = [self.Wxz, self.Wxr, self.Wxh]
            self.params = self.params + [self.Whz, self.Whr, self.Whh]
            self.params = self.params + [self.bz, self.br, self.bh]
            self.params = self.params + [self.Why, self.by]
            for self.param in self.params:
                self.param.attach_grad()
        
        else:
            raise ValueError("Invalid mode %s. Options are simple_rnn, lstm, and gru" % self.mode)
    
    def SGD(self, lr):
        for self.param in self.params:
            self.param[:] = self.param - lr * self.param.grad
    
    def rnn_model(self, inputs, h, c=None, mode='train', **kwargs):
        outputs = []
        for X in inputs:
            if self.model == 'simple_rnn':
                h_linear = nd.dot(X, self.Wxh) + nd.dot(h, self.Whh) + self.bh
                h = nd.tanh(h_linear)
                
            elif self.model == 'lstm':
                g = nd.tanh(nd.dot(X, self.Wxg) + nd.dot(h, self.Whg) + self.bg)
                i = nd.sigmoid(nd.dot(X, self.Wxi) + nd.dot(h, self.Whi) + self.bi)
                f = nd.sigmoid(nd.dot(X, self.Wxf) + nd.dot(h, self.Whf) + self.bf)
                o = nd.sigmoid(nd.dot(X, self.Wxo) + nd.dot(h, self.Who) + self.bo)

                c = f * c + i * g
                h = o * nd.tanh(c)
            elif self.model == 'gru':
                z = nd.sigmoid(nd.dot(X, self.Wxz) + nd.dot(h, self.Whz) + self.bz)
                r = nd.sigmoid(nd.dot(X, self.Wxr) + nd.dot(h, self.Whr) + self.br)
                g = nd.tanh(nd.dot(X, Wxh) + nd.dot(r * h, self.Whh) + self.bh)
                
                h = z * h + (1 - z) * g
                
            yhat_linear = nd.dot(h, self.Why) + self.by
                            
            if(self.output_dims == 2):
                yhat = sigmoid(yhat_linear)
            elif(self.output_dims > 2):
                yhat = softmax(yhat_linear, **kwargs)                
            outputs.append(yhat)
        
        if self.seq_outputs == 1:
            outputs = outputs[len(outputs)-1]
            
        if self.model in ['simple_rnn', 'gru']:
            return (outputs, h)
        elif self.model == 'lstm':
            return (outputs, h, c)

        
    def set_sample_generation(self, prefix, num_chars, temperature=1.0):
        self.prefix = prefix
        self.num_chars = num_chars
        self.temperature = temperature
        
    def run_sample_generation(self):
        #####################################
        # Initialize the string that we'll return to the supplied prefix
        #####################################
        string = self.prefix

        #####################################
        # Prepare the prefix as a sequence of one-hots for ingestion by RNN
        #####################################
        prefix_numerical = [character_dict[char] for char in self.prefix]
        input = one_hots(prefix_numerical)

        #####################################
        # Set the initial state of the hidden representation ($h_0$) to the zero vector
        #####################################
        h = nd.zeros(shape=(1, self.hidden_dims), ctx=self.ctx)
        if self.model == 'lstm':
            c = nd.zeros(shape=(1, self.hidden_dims), ctx=self.ctx)



        #####################################
        # For num_chars iterations,
        #     1) feed in the current input
        #     2) sample next character from from output distribution
        #     3) add sampled character to the decoded string
        #     4) prepare the sampled character as a one_hot (to be the next input)
        #####################################
        for i in range(self.num_chars):
            if self.model in ['simple_rnn', 'gru']:
                outputs, h = self.rnn_model(input, h, temperature=.1)
            elif self.model == 'lstm':
                outputs, h, c = self.rnn_model(input, h, c, temperature=.1)

            #outputs, h = self.simple_rnn(input, h, temperature=.1)
            
            choice = np.random.choice(vocab_size, p=outputs[-1][0].asnumpy())
            string += character_list[choice]
            input = one_hots([choice])
        return string

    def run(self, epochs, learning_rate):
        for e in range(epochs):
            ############################
            # Attenuate the learning rate by a factor of 2 every 100 epochs.
            ############################
            if ((e+1) % 10 == 0):
                learning_rate = learning_rate / 2.0
            
            h = nd.zeros(shape=(batch_size, self.hidden_dims), ctx=ctx)
            if self.model == 'lstm':
                c = nd.zeros(shape=(batch_size, self.hidden_dims), ctx=ctx)
            
            if self.seq_outputs == 1:
                loss_func = cross_entropy
                acc_func = accuracy
                self.train_y = self.train_y[:,0]
                self.valid_y = self.valid_y[:,0]
                
            elif self.seq_outputs > 1:
                loss_func = average_ce_loss
                acc_func = average_acc

            for i in range(self.num_batches):
                with autograd.record():
                    if self.model in ['simple_rnn', 'gru']:
                        outputs, h = self.rnn_model(self.train_x[i], h, mode='train')
                    elif self.model == 'lstm':
                        outputs, h, c = self.rnn_model(self.train_x[i], h, c, mode='train')

                    loss = loss_func(outputs, self.train_y[i])
                    loss.backward()
                self.SGD(learning_rate)
                
                k = random.sample(range(len(valid_x)), 1)[0]
                if self.model in ['simple_rnn', 'gru']:
                    val_outputs, _ = self.rnn_model(self.valid_x[k], h, mode='valid')
                elif self.model == 'lstm':
                    val_outputs, _, _ = self.rnn_model(self.valid_x[k], h, c, mode='valid')
                
                val_loss = loss_func(val_outputs, self.valid_y[k])
                val_acc = acc_func(val_outputs, self.valid_y[k])
                
                ##########################
                #  Keep a moving average of the losses
                ##########################
                tr_loss = round(loss.asnumpy()[0], 3)
                val_loss = round(val_loss.asnumpy()[0], 3)
                val_acc = round(val_acc.asnumpy()[0], 3)

                if (i == 0) and (e == 0):
                    tr_moving_loss = round(np.mean(tr_loss) ,3)
                    val_moving_loss = round(np.mean(val_loss), 3)
                    val_moving_acc = round(np.mean(val_acc) ,3)
                else:
                    tr_moving_loss = round(.9 * tr_moving_loss + .1 * np.mean(tr_loss) ,3)
                    val_moving_loss = round(.9 * val_moving_loss + .1 * np.mean(tr_loss) ,3)
                    val_moving_acc = round(.9 * val_moving_acc + .1 * np.mean(val_acc) ,3)
                
#                 if i % 10 == 0:
#                     print("Epoch %s. Batch %s. Loss: %s. Moving Loss: %s. Val Loss: %s. Val Moving Loss: %s. Val Acc: %s. Val Moving Acc: %s." %
#                           (e, i, tr_loss, tr_moving_loss, val_loss, val_moving_loss, val_acc, val_moving_acc))
            
            print("Epoch %s. Loss: %s. Val Loss: %s. Val ACC: %s" % (e, tr_moving_loss, val_moving_loss, val_moving_acc))
            if 'prefix' in dir(self):
                print(self.run_sample_generation())
                
#             print('**************** End of Epoch %s ****************' % (e))




In [38]:
trainset = [train_x, train_y]
validset = [valid_x, valid_y]

In [39]:

RNN = RNNModel_layer1(model = 'simple_rnn', train = trainset, valid = validset, hidden_dims = 256, ctx=ctx)
RNN.allocate_params()
RNN.set_sample_generation(prefix = 'Do you know', num_chars = 50)
RNN.run(epochs = 50, learning_rate = .5)



Epoch 0. Loss: 3.166. Val Loss: 3.166. Val ACC: 0.148
Do you know                                                  
Epoch 1. Loss: 2.948. Val Loss: 2.948. Val ACC: 0.227
Do you know the the the the the the the the the the the the t
Epoch 2. Loss: 2.755. Val Loss: 2.755. Val ACC: 0.256
Do you know an the the the the the the the the the the the th
Epoch 3. Loss: 2.654. Val Loss: 2.654. Val ACC: 0.276
Do you know an the the the the the the the the the the the th
Epoch 4. Loss: 2.578. Val Loss: 2.578. Val ACC: 0.279
Do you know an the the the the the the the the the the the th
Epoch 5. Loss: 2.519. Val Loss: 2.519. Val ACC: 0.295
Do you know and and and and and and and and and and an the th
Epoch 6. Loss: 2.481. Val Loss: 2.481. Val ACC: 0.305
Do you know an the the the the the the the the the the the th
Epoch 7. Loss: 2.449. Val Loss: 2.449. Val ACC: 0.308
Do you know an the the the the the the the the the the the th
Epoch 8. Loss: 2.411. Val Loss: 2.411. Val ACC: 0.317
Do you know the th

In [None]:

RNN = RNNModel_layer1(model = 'lstm', train = trainset, valid = validset, hidden_dims = 256, ctx=ctx)
RNN.allocate_params()
RNN.set_sample_generation(prefix = 'Do you know', num_chars = 50)
RNN.run(epochs = 50, learning_rate = .5)



In [None]:
RNN = RNNModel_layer1(model = 'gru', train = trainset, valid = validset, hidden_dims = 256, ctx=ctx)
RNN.allocate_params()
RNN.set_sample_generation(prefix = 'Do you know', num_chars = 50)
RNN.run(epochs = 50, learning_rate = .5)

