In [0]:
!pip install mxnet==1.6.0b20200101
!pip install d2lzh==0.8.10

from google.colab import drive
drive.mount('/content/drive')

In [0]:
from mxnet import nd
import random
import zipfile

with open('/content/drive/My Drive/Data/Andy_Lau.txt') as f:
  for line in f:
    corpus_chars = f.read()

corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')

# We map each character to a continuous integer starting from 0, also known as the index, to facilitate subsequent data processing. 
# In order to get the index, we take all the different characters in the data set, and then map them one by one to the index to construct the dictionary. 
# Next, print vocab_size, which is the number of different characters in the dictionary, also known as the vocabulary size.
idx_to_char = list(set(corpus_chars))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
vocab_size


2783

In [0]:
# Each character in the training data set is converted into an index, and the first 20 characters and their corresponding indexes are printed.
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars: 但是不敌愁眉难服气为你不肯心死 如何玩味
indices: [1665, 455, 2030, 224, 1276, 735, 53, 1103, 2241, 277, 819, 2030, 1035, 1852, 1940, 584, 1178, 524, 1509, 258]


In [0]:
import math
from mxnet import autograd, nd
from mxnet.gluon import loss as gloss
import time

# One-hot encoding
# To represent words as vectors into a neural network, a simple way is to use one-hot vectors. Suppose the number of different characters in the dictionary is N (i.e. vocab_size), 
# each character has a one-to-one correspondence with a continuous integer value index from 0 to N−1.
# If the index of a character is an integer i, then we create a vector of length N with all 0s, and set the element whose position is i to 1.
# This vector is a one-hot vector to the original character. The following shows one-hot vectors with indices 0 and 2, respectively. The vector length is equal to the dictionary size.

nd.one_hot(nd.array([0, 2]), vocab_size)

# The shape of our mini-batch for each sample is (batch size, number of time steps). 
# The following function transforms such small batch into several matrices that can be input into the network. Their shape is (batch size, dictionary size). The number of matrices is equal to the number of time steps.
# That is, the input of time step t is Xt ∈ Rn × d, where n is the batch size and d is the number of inputs, i.e. the length of the one-hot vector (dictionary size).

def to_onehot(X, size):
    return [nd.one_hot(x, size) for x in X.T]

X = nd.arange(10).reshape((2, 5))
inputs = to_onehot(X, vocab_size)
len(inputs), inputs[0].shape


(5, (2, 2783))

In [0]:
# Next, we initialize the model parameters. The number of hidden units num_hiddens is a hyperparameter.

import d2lzh as d2l

num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
ctx = d2l.try_gpu()
print('will use', ctx)

def get_params():
    def _one(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)

    # Hidden layer parameters
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = nd.zeros(num_hiddens, ctx=ctx)
    # Output layer parameters
    W_hq = _one((num_hiddens, num_outputs))
    b_q = nd.zeros(num_outputs, ctx=ctx)
    # Attach gradient
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.attach_grad()
    return params

will use cpu(0)


In [0]:
# We implement this model based on the computational expression of a recurrent neural network. First define the init_rnn_state function to return the initialized hidden state. 
# It returns a tuple consisting of an NDArray whose shape is (batch size, number of hidden units). Tuples are used to make it easier to handle cases where the hidden state contains multiple NDArrays.

def init_rnn_state(batch_size, num_hiddens, ctx):
    return (nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx), )

# The following rnn function defines how to calculate the hidden state and output in a time step. The activation function here uses the tanh function. 
# When the elements are evenly distributed in the number domain, the mean value of the tanh function is 0.

def rnn(inputs, state, params):
    # inputs and outputs are both matrices of shape (batch_size, vocab_size). There are num_steps matrices.
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)

In [0]:
# A simple test to observe the number of output results (the number of time steps), the shape of the output layer output at the first time step, and the shape of the hidden state.

state = init_rnn_state(X.shape[0], num_hiddens, ctx)
inputs = to_onehot(X.as_in_context(ctx), vocab_size)
params = get_params()
outputs, state_new = rnn(inputs, state, params)
len(outputs), outputs[0].shape, state_new[0].shape

(5, (2, 2783), (2, 256))

In [0]:
# Defining prediction functions
# The following function predicts the next num_chars characters based on the prefix (a string containing several characters). 
# It sets the recurrent neural unit rnn as a function parameter.

def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx):
    state = init_rnn_state(1, num_hiddens, ctx)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # Use the output of the previous step as the input of the current step
        X = to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
        # Calculate output and update hidden status
        (Y, state) = rnn(X, state, params)
        # Input for the next time step is the character in prefix or the current best predicted character
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y[0].argmax(axis=1).asscalar()))
    return ''.join([idx_to_char[i] for i in output])

In [0]:
# To test the predict_rnn function. We will create a 10-character lyrics (regardless of the prefix length) based on the prefix "男人". Because the model parameters are random values, the prediction results are also random.

predict_rnn('男人', 10, rnn, params, init_rnn_state, num_hiddens, vocab_size,
            ctx, idx_to_char, char_to_idx)

'男人剧性漓然畅巅捕祇牺延'

In [0]:
# Clipping Gradients
# Gradient decay or gradient explosion is more likely to occur in recurrent neural networks. To deal with gradient explosions, we can clip gradients. Suppose we stitch all the elements of the model parameter gradients into a vector g and set the threshold of the cropping as θ.

def grad_clipping(params, theta, ctx):
    norm = nd.array([0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

In [0]:
# We use perplexity to evaluate the quality of a language model. Perplexity is the value obtained from exponential operation from the cross-entropy loss function.

# In the best case, the model always predicts the probability of the label category as 1, and the perplexity degree is 1;
# In the worst case, the model always predicts the probability of the label category as 0, and the perplexity is positive infinity;
# In the baseline case, the model always predicts that the probability is the same for all categories. The degree of perplexity is the number of categories.
# Obviously, the perplexity of any valid model must be less than the number of categories. In this example, the perplexity must be less than vocab_size.

In [0]:
# Defining model training functions

# Evaluate the model using perplexity.
# Clip gradient before iterating model parameters.
# Different sampling methods for time series data will lead to different initialization of hidden state.


def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:
            # If using adjacent sampling, initialize the hidden state at the beginning of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
            # If using random sampling, initialize the hidden state before each mini-batch update
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
            # Otherwise need to use the detach function to detach the hidden state from the calculation graph 
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # outputs num_steps matrices which shape of (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # The shape after stitching is (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # The shape of Y is (batch_size, num_steps). After transposing, it becomes a length of 
                # batch * num_steps vector, which corresponds to the output rows one by one
                y = Y.T.reshape((-1,))
                # Calculate average classification error using cross entropy loss
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            # Clip gradient
            d2l.sgd(params, lr, 1)
            # Because the error has been averaged, the gradient need not be averaged

            l_sum += l.asscalar() * y.size
            n += y.size

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))

In [0]:
# Train models and write lyrics
# First, set the model hyperparameters. We will create a piece of lyrics with a length of 50 characters (regardless of the prefix length) based on the prefixes "男人" and "女人“". 
## Every 50 iterations we write a lyrics based on the currently trained model.

num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['男人', '女人']

In [0]:
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                      vocab_size, ctx, corpus_indices, idx_to_char,
                      char_to_idx, True, num_epochs, num_steps, lr,
                      clipping_theta, batch_size, pred_period, pred_len,
                      prefixes)

epoch 50, perplexity 53.750615, time 101.17 sec
 - 男人的心里 我的爱你我的地 我的天不开你的柔情 我的心情感动 我的泪却是我的家 你的眼神我的心 我的心不
 - 女人 我不是你的心 我的心不是你的眼光 我的心不可以 只管你一天 我的心忘了我的心 我的爱你我的一切 我
epoch 100, perplexity 25.935840, time 97.77 sec
 - 男人究竟不是我的心 你的爱该那么的不会 我踏上了爱你的心中 曾经错失意中 你曾经以不对你 我是我的心也有
 - 女人 从未最相信相见 但愿现在未来 什么时候才明白生命里面 我为何你的身影 你去爱我的心曾有爱你 我是真
epoch 150, perplexity 16.976173, time 99.71 sec
 - 男人究竟为了什么 因为我有感到也不要多么好 天生命中不能不停 我的心情已不够 我这样对我的爱你 我已经不
 - 女人 要不敢爱惜你是我痴心的爱情 我已经不可不可以 只想你的心 无法的风我们都是你 我不能忘记你我的眼中
epoch 200, perplexity 12.711507, time 98.90 sec
 - 男人究竟犯了心 你在我的爱你最后一生不讲千百句 言说谎言却不能隐藏 我的胸前安歇的一切最美一个男孩手走在
 - 女人 既然大家没有错 只有你何必说话 就说一生都可以 平静的发现我的一切变得一个梦 经过多少的心中有你的
