In [1]:
import torchtext
from torchtext.vocab import Vectors
import torch
import torch.nn as nn
import numpy as np
import random
from time import time
USE_CUDA = torch.cuda.is_available()

# 为了保证实验结果可以复现，我们经常会把各种random seed固定在某一个值
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)

MAX_VOCAB_SIZE = 100000
EMBEDDING_SIZE = 80
HIDDEN_SIZE = 80
BATCH_SIZE = 35

### 1.torchtext for preprocessing

获取Field 以及训练LM的数据集格式

In [2]:
text = torchtext.data.Field(lower=True) #变小写
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
                   path="./dataset", text_field=text,
                   train="new_trainset_havestp_C.txt", 
                   validation="new_testset_havestp_C.txt", 
                   test="new_testset_havestp_C.txt" )

text.build_vocab(train, max_size=MAX_VOCAB_SIZE)

In [3]:
print('词数：',len(text.vocab))
print(text.vocab.itos[:10] )#跟C++ 11用法一样

# 一个编程的好习惯，及时test
assert text.vocab.itos[:10] == ['<unk>', '<pad>', '的', '了', '在', '是', '和', '有', '也', '不']
text.vocab.stoi['<unk>']

词数： 100002
['<unk>', '<pad>', '的', '了', '在', '是', '和', '有', '也', '不']


0

获取迭代器

In [4]:
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=BATCH_SIZE, device=-1, 
    bptt_len=20, repeat=False, shuffle=True)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [5]:
print(train_iter)
print(iter(train_iter)) #迭代器，可以理解为一种指针？
it = iter(train_iter)
print(next(it)) #返回iter的下一个元素

<torchtext.data.iterator.BPTTIterator object at 0x0000023BC473CBA8>
<generator object BPTTIterator.__iter__ at 0x0000023BCD879F10>

[torchtext.data.batch.Batch of size 35]
	[.text]:[torch.LongTensor of size 20x35]
	[.target]:[torch.LongTensor of size 20x35]


In [6]:
batch = next(it)
print(batch.text.shape, batch.target.shape) # bptt_len * batch_size，我们更习惯反过来
batch.text[:,0] # 因为第一个维度是“句”长，so一列是 一句话

torch.Size([20, 35]) torch.Size([20, 35])


tensor([ 1645,  2408,   363,    78,   698,   917,  5940,   794,  1705,  5940,
          794,  3537,  1411, 16840,  2478,    68,    18,   163,  5106,   917])

In [7]:
for i in range(2):
    batch = next(it)
    print(''.join([text.vocab.itos[j] for j in batch.text[:,0]]))
    print(''.join([text.vocab.itos[j] for j in batch.target[:,0]]),'\n')
    # 所以虽是不同的batch，LSTM的 hidden可以一直传下去

签到卡张<eos>哪里回收翡翠啊有一个翡翠手镯卖那里<unk>的啊<eos>实用高层建筑火灾
卡张<eos>哪里回收翡翠啊有一个翡翠手镯卖那里<unk>的啊<eos>实用高层建筑火灾应该 

应该如何逃生年月日举世闻名的美国纽约世贸中心能容纳五万<unk>的姐妹楼在遭受恐怖分子袭击后
如何逃生年月日举世闻名的美国纽约世贸中心能容纳五万<unk>的姐妹楼在遭受恐怖分子袭击后相继 



### 定义模型

In [8]:
class RNN_LM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(RNN_LM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        
    def forward(self, text, hidden):
        # text: seq_len * batch_size
        emb = self.embed(text) # seq_len * embed_size * batch_size
        lstm_out, hidden = self.lstm(emb, hidden) # y, 末尾hidden
        # lstm_out: seq_len * batch_size * hidden_size
        # hidden: 1 * batch_size * hidden_size, 1 * batch_size * hidden_size
        
        vocab_out = self.linear(lstm_out.view(-1, lstm_out.shape[2])) 
        #(seq_len * batch_size) * hidden_size 变成2维
        vocab_out = vocab_out.view(lstm_out.size(0),lstm_out.size(1), self.vocab_size) 
        #恢复seq_len * batch_size * vocab_size
        return vocab_out, hidden #hidden也包含了信息
    
    def init_hidden(self, batch_size, requires_grad=True):
        weight = next(self.parameters())
        #返回：两个和weight一样的全0矩阵，cell 和 hidden
        initrange = 1/(self.embed_size+self.hidden_size)
        cell_w = weight.new_zeros(1,batch_size, self.hidden_size, requires_grad=requires_grad)
        hidden_w = weight.new_zeros(1,batch_size, self.hidden_size, requires_grad=requires_grad)
        with torch.no_grad():
            return (cell_w.uniform_(-initrange, initrange),hidden_w.uniform_(-initrange, initrange))
                

#### 初始化模型

In [9]:
model = RNN_LM(vocab_size=len(text.vocab),
               embed_size=EMBEDDING_SIZE,
               hidden_size=HIDDEN_SIZE)
USE_CUDA = 1 #52 vs 232 per 100iters
device = torch.device('cuda' if USE_CUDA else 'cpu')

if USE_CUDA:
    model = model.to(device)
model

RNN_LM(
  (embed): Embedding(100002, 80)
  (lstm): LSTM(80, 80)
  (linear): Linear(in_features=80, out_features=100002, bias=True)
)

In [10]:
next(model.parameters()) #都在cuda上了

Parameter containing:
tensor([[-1.5256, -0.7502, -0.6540,  ..., -0.5601,  0.3956, -0.9823],
        [-0.5065,  0.0998, -0.6540,  ...,  1.8550, -0.7064,  2.5571],
        [ 0.4175, -0.2127, -0.8400,  ..., -2.3648, -0.9295,  0.2936],
        ...,
        [-1.7138,  0.2265,  1.7561,  ...,  1.0764, -0.2923, -0.4635],
        [ 0.9190,  1.1605, -0.3112,  ...,  0.7747,  0.4566,  0.0675],
        [-0.9111, -0.2605, -0.5237,  ...,  0.4181, -0.0922, -0.3103]],
       device='cuda:0', requires_grad=True)

#### 训练模型

In [11]:
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach() #计算图在此截断，否则相当于非常长的一样计算图
        # 其实也相当于copy啊？
    else:
        return tuple(repackage_hidden(i) for i in h)

loss_fn = nn.CrossEntropyLoss()
learning_rate = 5e-4
optim = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, 0.5)

#### 先定义评估函数

In [12]:
def evaluate(model, data):
    model.eval()
    total_loss = 0.
    it = iter(data)
    total_count = 0.
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for i, batch in enumerate(it):
            data, target = batch.text, batch.target
            if USE_CUDA:
                data, target = data.cuda(), target.cuda()
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))
            total_count += np.multiply(*data.size())
            total_loss += loss.item()*np.multiply(*data.size())
            
    loss = total_loss / total_count
    model.train()
    return loss

In [13]:
EPOCHS = 1
GRAD_CLIP = 5.0
min_loss = float("inf")
a = time()

for epoch in range(EPOCHS):
    model.train()#训练和测试不一样的，但是为啥之前都没写？
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE) #两个
#     print(hidden[0].shape)
    for i, batch in enumerate(it):
        
            
        data, target = batch.text, batch.target
        if USE_CUDA:
            data, target = data.cuda(), target.cuda()
#         print(data.shape, target.shape)
        hidden = repackage_hidden(hidden) #构建新的hidden起点
        output, hidden = model(data, hidden)
        
        loss = loss_fn(output.view(-1,len(text.vocab)), target.view(-1)) 
        #output: batch_size * target_dim, 
        #target: batch_size *1 即可
        optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optim.step()     
        
        if i%10 ==0:
            if loss.item()<min_loss:
                print("iter:", i, ' loss:%.4f'%loss.item(), 'perplexity:%.2f'%np.exp(loss.item()), ' 耗时:%.2fs'%(time()-a))
                min_loss = loss.item()
            a = time()
            
#         if i>=4000:
#             break
#         if i%2 == 0:
#             val_loss = evaluate(model, val_iter)
#             if len(val_losses) == 0 or val_loss < min(val_losses):
#                 print("better model, val loss: ", val_loss)
#                 torch.save(model.state_dict(), "LSTM%s.pth"%i)
#             else:
#                 scheduler.step()
#                 optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
#             val_losses.append(val_loss)

iter: 0  loss:11.5202 perplexity:100726.63  耗时:1.91s
iter: 10  loss:11.5042 perplexity:99127.37  耗时:6.12s
iter: 20  loss:11.4953 perplexity:98256.30  耗时:6.10s
iter: 30  loss:11.4642 perplexity:95242.44  耗时:6.08s
iter: 40  loss:11.4098 perplexity:90201.94  耗时:6.07s
iter: 50  loss:11.2763 perplexity:78925.41  耗时:6.10s
iter: 60  loss:10.6479 perplexity:42104.59  耗时:6.11s
iter: 70  loss:9.9231 perplexity:20395.86  耗时:6.11s
iter: 80  loss:9.5310 perplexity:13780.88  耗时:6.09s
iter: 90  loss:9.0174 perplexity:8245.60  耗时:6.10s
iter: 130  loss:8.9963 perplexity:8073.13  耗时:6.08s
iter: 140  loss:8.7881 perplexity:6555.48  耗时:6.10s
iter: 210  loss:8.7059 perplexity:6038.43  耗时:6.10s
iter: 320  loss:8.6995 perplexity:5999.96  耗时:6.11s
iter: 360  loss:8.5742 perplexity:5293.56  耗时:6.12s
iter: 520  loss:8.5093 perplexity:4960.92  耗时:6.10s
iter: 670  loss:8.4285 perplexity:4575.71  耗时:6.08s
iter: 1060  loss:8.4278 perplexity:4572.45  耗时:6.08s
iter: 1770  loss:8.3761 perplexity:4342.21  耗时:6.11s
iter

In [17]:
torch.save(model.state_dict(), './models/LSTM3.pth')

In [18]:
best_model = model

In [19]:
hidden = best_model.init_hidden(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input = torch.randint(len(text.vocab), (1, 1), dtype=torch.long).to(device)
words = []
for i in range(100):
    output, hidden = best_model(input, hidden)
    word_weights = output.squeeze().exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[0]
    input.fill_(word_idx)
    word = text.vocab.itos[word_idx]
    words.append(word)
print(' '.join(words))

循着 儿科医院 板凳 足总杯 加装 妻女 名 动植物 间接 他人 也 无 第三节 庆典 刘 遍 新型 彻底改变 的 公路 且 规划 安全 不 在 年间 或 发明 关注 到 很难 的 方法 运动 存在 举证 的 骗局 制定 同时 多 <unk> 将 重要 引导 相同 他们 以 增多 下 对 电网 的 达到 检查 其中 及 自身 都 有 激动 不远 的 位置 放在心上 各色 纪律 大家 事发 而 实名 拓宽 从 秒 就是 回国 暂停 手机 于 先进分子 的 蛇口 这 仇恨 登上 的 过程 建设 抗日 需要 郭氏 钱 符合要求 均 贵州 的 富豪 一站式 和 存款
