# <div align="center"> 通过代码实现加深对RNN的理解 </div>

In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
import os
import time
from datetime import datetime

In [2]:
class data_preprocess(object):
    def __init__(self):
        self.vocabulary_size = 8000
        self.unknown_token = 'UNKNOWN_TOKEN'
        self.start_token = 'SENTENCE_START'
        self.end_token = "SENTENCE_END"
        
    def process(self, path):
        if os.path.exists('/tmp/Datasets/X_train.npy'):
            return
        else:
            os.mkdir('/tmp/Datasets')
        
        with open(path, 'rt', encoding="utf-8") as f:
            reader = csv.reader(f, skipinitialspace=True)
            reader.__next__() # next(reader)
            # 每行存在多个句子
            sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
            # 每个句子前后加token
            sentences = ["%s %s %s" % (self.start_token, x, self.end_token) for x in sentences]
            print("sentences length: %d" % len(sentences))
            
            # 把句子细化为单词
            tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]            
            # 计算每个单词的频率
            word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
            
            # 选出前vocabulary_size个单词
            vocab = word_freq.most_common(self.vocabulary_size-1)
            index_to_word = [x[0] for x in vocab]
            index_to_word.append(self.unknown_token)
            word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
            print("Using vocabulary size %d." % len(vocab))
            print("The least frequent word is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))
            
            # 将原始句子中没有出现在vocab里的单词标注为unkown_token
            for i, sent in enumerate(tokenized_sentences):
                tokenized_sentences[i] = [w if w in word_to_index else self.unknown_token for w in sent]
                
            for sent in tokenized_sentences:
                print(sent[:-1])
                print(sent[1:])
                break
                
            X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
            y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
            
            # Save to Dist 
            np.save('/tmp/Datasets/index_to_word.npy', index_to_word)
            np.save('/tmp/Datasets/word_to_index.npy', word_to_index)
            np.save('/tmp/Datasets/X_train.npy', X_train)
            np.save('/tmp/Datasets/y_train.npy', y_train)

数据集下载: [百度云盘Datasets](https://pan.baidu.com/s/1gAFZ9gSf4pHJBt5W6_PgPQ "提取码: gxk4")

In [3]:
g = data_preprocess()
g.process('/home/lidong/Datasets/ML/reddit-comments-2015-08.csv')

In [4]:
np.random.seed(10)

In [5]:
X_train = np.load('/tmp/Datasets/X_train.npy', allow_pickle=True)
y_train = np.load('/tmp/Datasets/y_train.npy', allow_pickle=True)
index_to_word = np.load('/tmp/Datasets/index_to_word.npy', allow_pickle=True)

print(X_train.shape, y_train.shape, index_to_word.shape)
print(X_train[0], '\n', y_train[0])
print(index_to_word[X_train[0]])

(79170,) (79170,) (8000,)
[0, 6, 3495, 7, 155, 796, 25, 222, 8, 32, 20, 202, 4955, 350, 91, 6, 66, 207, 5, 2] 
 [6, 3495, 7, 155, 796, 25, 222, 8, 32, 20, 202, 4955, 350, 91, 6, 66, 207, 5, 2, 1]
['SENTENCE_START' 'i' 'joined' 'a' 'new' 'league' 'this' 'year' 'and'
 'they' 'have' 'different' 'scoring' 'rules' 'than' 'i' "'m" 'used' 'to'
 '.']


In [6]:
VOCABULARY_SIZE = 8000

# RNN
class RNNNumpy:
    def __init__(self, word_dim, hidden_dim = 100, bptt_truncate = 4):
        self.K = word_dim
        self.H = hidden_dim
        self.bptt_truncate = bptt_truncate
        # 使用均匀分布初始化参数
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        
test_rnn = RNNNumpy(VOCABULARY_SIZE) 

In [7]:
def softmax(x):
    xt = np.exp(x - np.max(x))
    return xt / np.sum(xt)

$s_t = tanh(Ux_t + W s_{t-1})$

$o_t = softmax(Vs_t)$

In [8]:
def forward_propagation(self, x):
    # x 是一个句子
    T = len(x)
    # T+1:多一个ｓ_{-1}的初始状态
    s = np.zeros((T+1, self.H))
    o = np.zeros((T, self.K))
    for t in np.arange(T):
        s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

RNNNumpy.forward_propagation = forward_propagation

# 输入一个句子测试
print(X_train[10])
print(index_to_word[X_train[10]])
o, s = test_rnn.forward_propagation(X_train[10])
print(o.shape, s.shape)

[0, 72, 63, 13, 124, 5, 26, 1128, 208, 5, 324, 3, 329, 4, 112, 32, 75, 7, 4746, 4, 8, 84, 52, 9, 7, 3155, 1021, 492, 7534, 8, 133, 48, 3096, 4, 10, 95, 51, 4, 128, 17, 37, 314, 577, 2, 40]
['SENTENCE_START' 'no' 'one' 'is' 'going' 'to' 'be' 'honest' 'enough' 'to'
 'run' 'the' 'check' ',' 'see' 'they' "'re" 'a' 'felon' ',' 'and' 'then'
 'all' 'of' 'a' 'sudden' 'immediately' 'turn' 'dishonest' 'and' 'say' '``'
 'nah' ',' 'you' 'know' 'what' ',' 'here' "'s" 'your' 'gun' 'anyway' '.'
 "''"]
(45, 8000) (46, 100)


In [9]:
def predict(self, x):
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis = 1)

RNNNumpy.predict = predict

predictions = test_rnn.predict(X_train[10])
print(predictions.shape)
print(predictions)

(45,)
[1284 5221 7653 7430 1013 3562 7366 1874  224 6601 7299 6722 6892 3198
 4480 5853 2926  261 4073 2371 6299 5376 4146 3761 7051 5981 1549 3765
 4958 1835 6166 5192 2579 5879 4864 5132 6569 2800 2752 6821 4437 7021
 3943 6912 3922]


交叉熵损失函数:
    
$$
L(y, o) = - \frac{1}{N}\sum_{n \in N}y_n \log o_n
$$

训练之前$o_i$出现的任意值的概率为$\dfrac {1}{K}$, 即$K$个词概率均等, $y_n=1$则:
$$
\begin{aligned}
L(y, o) &= - \frac{1}{N}\sum_{n \in N}y_n \log \frac{1}{K} \\
    &= \frac{1}{N} N \log K \\
    &= logK
\end{aligned}
$$

In [10]:
# slice 插曲:

l1 = np.array([[1,2,3], [4,5,6], [7,8,9]])
# 选取所有行的第1,2列
print(l1[:, (1,2)])
# 选取第1行第1列, 选取第2行第2列
print(l1[(1,2), (1,2)])

[[2 3]
 [5 6]
 [8 9]]
[5 9]


In [11]:
def calculate_loss(self, xs, ys):
    # xs: 一批训练数据
    N = sum((len(y_i) for y_i in ys))
    L = 0
    # 遍历句子
    for i in np.arange(len(ys)):
        o, s = self.forward_propagation(xs[i])
        # 选出标签y(正确下标)对应的概率
        correct_word_predictions = o[np.arange(len(ys[i])), ys[i]]
        L += -1 * sum(np.log(correct_word_predictions))
    return L/N

RNNNumpy.calculate_loss = calculate_loss

print("Expected Loss for random prediction: %f" % np.log(8000))
print("Actual loss: %f" % test_rnn.calculate_loss(X_train[:1000], y_train[:1000]))

Expected Loss for random prediction: 8.987197
Actual loss: 8.987393


In [12]:
# V可以做空间映射将100(H) 映射到 8000(K)空间中

def bptt(self, x, y):
    # x句子词向量
    T = len(y)
    o, s = self.forward_propagation(x)
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1  # it is y_hat - y
    for t in np.arange(T):
        dLdV += np.outer(delta_o[t], s[t].T)    # at time step t, shape is word_dim * hidden_dim
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # backpropagation through time (for at most self.bptt_truncate steps)
        # given time step t, go back from time step t, to t-1, t-2, ...
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            # print("Backprogation step t=%d bptt step=%d" %(t, bptt_step))
            dLdW += np.outer(delta_t, s[bptt_step - 1])
            dLdU[:, x[bptt_step]] += delta_t
            # update delta for next step
            dleta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt

In [13]:
def numpy_sgd_step(self, x, y, learning_rate):
    dLdU, dLdV, dLdW = self.bptt(x, y)
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW
    
RNNNumpy.sgd_step = numpy_sgd_step

def train_with_sgd(model, X_train, y_train, learning_rate = 0.005, nepoch = 100, evaluate_loss_after = 5):
    # keep track of the losses so that we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: loss after num_examples_seen=%d epoch=%d: %f" %(time, num_examples_seen, epoch, loss))
            # adjust the learning rate if loss increases
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5
                print("setting learning rate to %f" %(learning_rate))
            sys.stdout.flush()
        # for each training example...
        for i in range(len(y_train)):
            # one sgd step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [14]:
train_with_sgd(test_rnn, X_train[:100], y_train[:100], nepoch = 10, evaluate_loss_after = 2)

2019-09-19 16:24:19: loss after num_examples_seen=0 epoch=0: 8.987280
2019-09-19 16:24:35: loss after num_examples_seen=200 epoch=2: 8.942253
2019-09-19 16:24:50: loss after num_examples_seen=400 epoch=4: 6.361804
2019-09-19 16:25:05: loss after num_examples_seen=600 epoch=6: 5.978650
2019-09-19 16:25:20: loss after num_examples_seen=800 epoch=8: 5.799763
