### 1. 原始语料

In [1]:
# First sample passage of the raw corpus (kept verbatim; segmented with jieba further down).
s1 = """
11月1日出版的《求是》杂志发表习近平总书记重要文章《促进高质量充分就业》。
文章中，总书记深刻总结新时代就业工作成就和经验，
分析当前就业形势和面临的突出问题，就促进高质量充分就业提出要求。
今年10月，总书记在安徽考察时再次强调：“解决好重点人群就业，
完善农村低收入人口常态化帮扶政策，确保不发生规模性返贫致贫。”
"""

In [2]:
# Second sample passage of the raw corpus (kept verbatim).
s2 =  """
“就业是家事，更是国事。”情牵百姓、心系民生，习近平总书记高度重视就业工作。如何依靠发展促进就业？
如何抓好重点群体就业？如何培养更多专业技术人才？如何构建和谐劳动关系？这些问题，总书记念兹在兹。
"""

In [3]:
# Third sample passage of the raw corpus (kept verbatim).
s3 = """
2019年2月1日，习近平总书记在看望北京市前门石头胡同服务点的“快递小哥”时指出：
“要坚持就业优先战略，把解决人民群众就业问题放在更加突出的位置，努力创造更多就业岗位。”
学院的电子屏幕上，展示了该校毕业生去向落实率的柱状图，总书记仔细察看，反复询问具体数据。
"""

### 2. 构建字典

In [4]:
import jieba

In [5]:
# Collect every distinct jieba token from the three passages, together with
# the two special tokens used for padding and for out-of-vocabulary words.
words = {"<PAD>", "<UNK>"}
for s in (s1, s2, s3):
    words.update(jieba.lcut(s))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\63447\AppData\Local\Temp\jieba.cache
Loading model cost 0.359 seconds.
Prefix dict has been built successfully.


In [6]:
"""
    构建字典
"""
# Build the token <-> id lookup tables.
# Enumerating a set of strings directly yields a different order on every
# kernel restart (string hashing is randomized per process), so the ids would
# not be reproducible. The vocabulary is therefore put into a fixed order:
# special tokens first ("<PAD>" -> 0, "<UNK>" -> 1), then all other tokens sorted.
special_tokens = ["<PAD>", "<UNK>"]
vocab = special_tokens + sorted(set(words) - set(special_tokens))
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

In [7]:
# Segment each passage into its own token list.
words1, words2, words3 = (jieba.lcut(text) for text in (s1, s2, s3))

In [8]:
# Number of tokens in each segmented passage.
tuple(map(len, (words1, words2, words3)))

(94, 56, 79)

In [9]:
# Token count per passage, kept as a list for the statistics below.
lens = list(map(len, (words1, words2, words3)))

In [10]:
# Show the token counts of the three passages.
lens

[94, 56, 79]

In [11]:
import numpy as np

In [12]:
# Convert the counts to a NumPy array so array statistics (mean) are available.
lens = np.asarray(lens)

In [13]:
# Common sequence length: the mean token count, truncated to an integer.
SEQ_LEN = int(np.mean(lens))

In [14]:
def pad_sentence(sentence, seq_len=None):
    """
    Return a copy of ``sentence`` that is exactly ``seq_len`` tokens long.

    Longer inputs are truncated on the right; shorter inputs are padded on
    the right with the "<PAD>" token. The input list is never modified
    (the previous ``+=`` extended the caller's list in place).

    Args:
        sentence: sequence of tokens.
        seq_len: target length. When omitted, the notebook-level ``SEQ_LEN``
            is used; it is looked up at call time, so a re-computed
            ``SEQ_LEN`` takes effect without re-defining this function.

    Returns:
        A new list containing exactly ``seq_len`` tokens.
    """
    if seq_len is None:
        seq_len = SEQ_LEN
    # Slicing already clips to seq_len; list() guarantees a fresh list.
    clipped = list(sentence[:seq_len])
    # The multiplier is 0 when nothing needs to be padded.
    return clipped + ["<PAD>"] * (seq_len - len(clipped))

In [15]:
# Bring all three token lists to the common length SEQ_LEN.
words1, words2, words3 = (
    pad_sentence(sentence=tokens) for tokens in (words1, words2, words3)
)

In [16]:
# Map every token to its vocabulary id; tokens missing from the vocabulary
# fall back to the id of "<UNK>".
ids1, ids2, ids3 = [
    [word2idx.get(token, word2idx.get("<UNK>")) for token in tokens]
    for tokens in (words1, words2, words3)
]

In [17]:
# Group the three id sequences into one batch (still a nested Python list here).
X = [ids1, ids2, ids3]

In [18]:
import torch

In [19]:
# Turn the batch of ids into an integer tensor (int64 is what torch.long aliases).
X = torch.tensor(X, dtype=torch.int64)

In [20]:
# Layout of the id batch: [batch_size, seq_len]
X.size()

torch.Size([3, 76])

In [21]:
# Transposed layout: [seq_len, batch_size]
X.permute(1, 0).shape

torch.Size([76, 3])

In [22]:
from torch import nn

In [23]:
# Token-embedding table: one 256-dimensional vector per vocabulary entry.
# padding_idx must be the integer id of "<PAD>", so it is looked up in
# word2idx (token -> id). idx2word maps id -> token, so
# idx2word.get("<PAD>") returned None and silently disabled padding handling.
embed = nn.Embedding(num_embeddings=len(word2idx),
                    embedding_dim=256,
                    padding_idx=word2idx["<PAD>"])

In [24]:
# Look up an embedding vector for every token id in the batch.
X1 = embed(X)

In [25]:
# [batch_size, seq_len, embedding_dim]
X1.shape

torch.Size([3, 76, 256])

In [26]:
# "Automatic" mode: nn.RNN iterates over the seq_len steps internally.
# batch_first=True means inputs are laid out as [batch_size, seq_len, features].
rnn = nn.RNN(256, 512, batch_first=True)

In [27]:
# All-zero initial hidden state: 1 layer, batch of 3, hidden size 512.
h0 = torch.zeros(size=(1, 3, 512), dtype=torch.float)

In [28]:
# Inspect the initial hidden state (all zeros).
h0

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [29]:
# Run the whole batch through the RNN
# inputs:  input, h_0
# outputs: output, h_n
out, hn = rnn(X1, h0)

In [30]:
# Output for every time step: [batch_size, seq_len, hidden_size] = [3, 76, 512]
out.shape

torch.Size([3, 76, 512])

In [31]:
# Final hidden state: [1, batch_size, hidden_size] = [1, 3, 512]
hn.shape

torch.Size([1, 3, 512])

In [32]:
# Sanity check: the output at the last time step equals the final hidden state.
torch.allclose(out[:, -1], hn[0])

True

In [33]:
# Back to the id batch: [batch_size, seq_len]
X.shape

torch.Size([3, 76])

In [34]:
# Embed the id batch again as the starting point for the sequence-first layout.
X1 = embed(X)

In [35]:
# [batch_size, seq_len, embedding_dim]
X1.shape

torch.Size([3, 76, 256])

In [36]:
# Swap the batch and sequence axes: [batch, seq, emb] -> [seq, batch, emb].
X2 = X1.permute(1, 0, 2)

In [37]:
# Sequence-first layout: [seq_len, batch_size, embedding_dim]
X2.size()

torch.Size([76, 3, 256])

$ h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh}) $

In [38]:
"""
    1，参数
    2，逻辑
"""

'\n    1，参数\n    2，逻辑\n'

In [39]:
# Same RNN configuration, but expecting sequence-first input
# ([seq_len, batch_size, features]).
rnn = nn.RNN(256, 512, batch_first=False)

In [40]:
# Fresh all-zero initial hidden state: 1 layer, batch of 3, hidden size 512.
h0 = torch.zeros(size=(1, 3, 512), dtype=torch.float)

In [41]:
# Run the sequence-first batch through the RNN.
out, hn = rnn(X2, h0)

In [42]:
# Features of every time step: [seq_len, batch_size, hidden_size]
out.shape

torch.Size([76, 3, 512])

In [43]:
# Hidden state after the last time step
hn.shape

torch.Size([1, 3, 512])

In [44]:
"""
    手动挡
        - 自己控制循环
        - 系统每次只处理一步
"""

'\n    手动挡\n        - 自己控制循环\n        - 系统每次只处理一步\n'

In [45]:
# Single-step recurrent cell: the loop over time steps is written by hand below.
rnn_cell = nn.RNNCell(256, 512)

In [46]:
# Input for the manual loop: [seq_len, batch_size, embedding_dim]
X2.shape

torch.Size([76, 3, 256])

In [47]:
# Manual unrolling: feed one time step at a time through the RNNCell and
# collect the hidden state produced at every step.
# The initial state is sized from the data and the cell instead of a
# hard-coded (3, 512), so this cell keeps working when the batch size or the
# hidden size changes.
hn = torch.zeros(X2.shape[1], rnn_cell.hidden_size,
                 dtype=X2.dtype, device=X2.device)
out = []
for x in X2:  # iterating over dim 0 yields one [batch_size, embedding_dim] step
    hn = rnn_cell(x, hn)
    out.append(hn)

In [48]:
# Stack the per-step hidden states along a new leading axis:
# [seq_len, batch_size, hidden_size]
torch.stack(out).shape

torch.Size([76, 3, 512])