In [1]:
import torchtext
import torch
import torch.nn as nn
import numpy as np
import random
from time import time
USE_CUDA = torch.cuda.is_available()

# freeze random seed
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)
    torch.cuda.set_device(2)

MAX_VOCAB_SIZE = 100000
EMBEDDING_SIZE = 500
HIDDEN_SIZE = 1000
BATCH_SIZE = 100
BPTT_LEN = 30 #seq_len

RuntimeError: cuda runtime error (10) : invalid device ordinal at ..\torch\csrc\cuda\Module.cpp:33

### 1.torchtext for Preprocessing

获取Field 以及训练LM的数据集格式

In [2]:
text = torchtext.data.Field(lower=True) #变小写
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
                   path="./dataset", text_field=text,
                   train="corpus_havestp.txt", 
                   validation="corpus_havestp.txt", 
                   test="corpus_havestp.txt" )

text.build_vocab(train, max_size=MAX_VOCAB_SIZE)

In [3]:
print('词数：',len(text.vocab))
print(text.vocab.itos[:10] )#同C++ 11

# 好习惯，及时assert
# assert text.vocab.itos[:1] == ['<unk>', '<pad>']
text.vocab.stoi['<unk>']

词数： 100002
['<unk>', '<pad>', '，', '的', '。', '、', '了', '在', '“', '”']


0

获取迭代器

In [4]:
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=BATCH_SIZE, device=-1, 
    bptt_len=BPTT_LEN, repeat=False, shuffle=True)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [5]:
print(train_iter)
print(iter(train_iter)) #迭代器，泛型指针
it = iter(train_iter)
print(next(it)) #返回iter的下一个元素

<torchtext.data.iterator.BPTTIterator object at 0x7f3da5683898>
<generator object BPTTIterator.__iter__ at 0x7f3da568be60>

[torchtext.data.batch.Batch of size 100]
	[.text]:[torch.LongTensor of size 30x100]
	[.target]:[torch.LongTensor of size 30x100]


In [6]:
batch = next(it)
print(batch.text.shape, batch.target.shape) # seq_len * batch_size，我们更习惯反过来
batch.text[:,0] # 因为第一个维度是“句”长，so一列是 一句话

torch.Size([30, 100]) torch.Size([30, 100])


tensor([ 1637,     2,    10,     0,   863,  2204,   236,  2867,    12, 11144,
         2496,    38,     2,   881,   213,  1465,     3, 16749,    45,  1025,
         8172,  1929,   204, 54621,     3,   813,     2,  1037, 28421,    62])

In [7]:
for i in range(2):
    batch = next(it)
    print(''.join([text.vocab.itos[j] for j in batch.text[:,0]]))
    print(''.join([text.vocab.itos[j] for j in batch.target[:,0]]),'\n')
    # 所以虽是不同的batch，LSTM的 hidden可以一直传下去。每次截断计算图 只保留value 

有说法的！今天一<unk>全军2017年1月2日实名举报上黄镇宣国才的贴子（仍被锁定禁止评论）已经
说法的！今天一<unk>全军2017年1月2日实名举报上黄镇宣国才的贴子（仍被锁定禁止评论）已经正好 

正好一整年了=750)window.open('http://img.jsly001.com/attachment/mon_1801/4_
一整年了=750)window.open('http://img.jsly001.com/attachment/mon_1801/4_291085 



### 2.定义模型
|process|shape|备注|
|:----|:----|:----|
|TEXT|s\*b|其实有个word_idx在里面
|embedding|v->e|
|after embed|s\*e\*b|
|lstm|
|after lstm |  |
<img src='./dataset/LSTM_structure_analyse.png' align='center' width='400'>


In [4]:
class RNN_LM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(RNN_LM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        
    def forward(self, text, hidden):
        # text: seq_len * batch_size
        emb = self.embed(text) # seq_len * embed_size * batch_size
        lstm_out, hidden = self.lstm(emb, hidden) # y, 末尾hidden
        # lstm_out: seq_len * batch_size * hidden_size
        # hidden: 1 * batch_size * hidden_size, 1 * batch_size * hidden_size
        
        vocab_out = self.linear(lstm_out.view(-1, lstm_out.shape[2])) 
        #(seq_len * batch_size) * hidden_size 把前2维拼成1维，因为Linear只接受2维input
        vocab_out = vocab_out.view(lstm_out.size(0),lstm_out.size(1), self.vocab_size) 
        #恢复shape:seq_len * batch_size * vocab_size
        return vocab_out, hidden
    
    def init_hidden(self, batch_size, requires_grad=True):
        weight = next(self.parameters())
        initrange = 2.0/(self.embed_size+self.hidden_size)
        cell_w = weight.new_zeros(1, batch_size, self.hidden_size, requires_grad=requires_grad)
        hidden_w = weight.new_zeros(1, batch_size, self.hidden_size, requires_grad=requires_grad)
        with torch.no_grad():#否则无法 _operate
            return (cell_w.uniform_(-initrange, initrange), hidden_w.uniform_(-initrange, initrange))
        #返回：两个和weight一样的rand tensor----cell & hidden
                

#### 初始化模型

In [9]:
model = RNN_LM(vocab_size=len(text.vocab),
               embed_size=EMBEDDING_SIZE,
               hidden_size=HIDDEN_SIZE)
USE_CUDA = 1 #52 vs 232 per 100iters
device = torch.device('cuda' if USE_CUDA else 'cpu')

if USE_CUDA:
    model = model.to(device)
model

RNN_LM(
  (embed): Embedding(100002, 500)
  (lstm): LSTM(500, 1000)
  (linear): Linear(in_features=1000, out_features=100002, bias=True)
)

In [10]:
next(model.parameters()) #都在cuda上了

Parameter containing:
tensor([[-1.5256, -0.7502, -0.6540,  ...,  0.2168, -0.1428,  1.4274],
        [ 0.1643, -0.3161,  0.1285,  ..., -0.1286, -0.0571, -0.0711],
        [ 1.1658,  0.1701,  0.5288,  ...,  0.4481,  0.1001,  1.5422],
        ...,
        [ 0.6190,  0.5853,  1.6793,  ..., -0.2505,  0.7391,  1.3023],
        [-2.7605, -0.2259, -0.7199,  ...,  0.3644,  0.2963,  0.1062],
        [ 0.4066,  0.4807,  0.7690,  ..., -0.4090,  1.0787, -0.7293]],
       device='cuda:2', requires_grad=True)

#### 训练模型

In [11]:
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach() #计算图在此截断，否则相当于非常长的一样计算图
    else:
        return tuple(repackage_hidden(x) for x in h)

loss_fn = nn.CrossEntropyLoss()
learning_rate = 9e-4
optim = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, 0.5)

#### 先定义评估函数

In [12]:
def evaluate(model, data):
    model.eval()
    total_loss = 0.
    it = iter(data)
    total_count = 0.
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for i, batch in enumerate(it):
            data, target = batch.text, batch.target
            if USE_CUDA:
                data, target = data.cuda(), target.cuda()
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))
            total_count += np.multiply(*data.size())
            total_loss += loss.item()*np.multiply(*data.size())
            
    loss = total_loss / total_count
    model.train()
    return loss

In [7]:
import torch as t
a = t.ones(5,6)
a.view(-1).shape

torch.Size([30])

In [13]:
EPOCHS = 100
GRAD_CLIP = 5.0
min_loss = float("inf")
a = time()

for epoch in range(EPOCHS):
    model.train()#训练和测试不一样的，但是为啥之前都没写？
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE) #两个
#     print(hidden[0].shape)
    for i, batch in enumerate(it):
        
            
        data, target = batch.text, batch.target # 都是[s,b]
        if USE_CUDA:
            data, target = data.cuda(), target.cuda()
        hidden = repackage_hidden(hidden) #构建新的hidden起点
        output, hidden = model(data, hidden) #[s,b,v], [1,b,h]
        
        loss = loss_fn(output.view(-1,len(text.vocab)), target.view(-1)) 
        #output: [s*b,v] 
        #target: [s*b] 即可，值就是v的index
        optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP) #clip grad
        optim.step()     
        
        if i%100 ==0:
            if loss.item()<min_loss:
                print("iter:", i, ' loss:%.4f'%loss.item(), 'perplexity:%.2f'%np.exp(loss.item()))
                torch.save(model.state_dict(), './models/LSTM_s%d_e%d_h%d_b%d_p.pth'%(BPTT_LEN,EMBEDDING_SIZE,HIDDEN_SIZE,BATCH_SIZE))
                min_loss = loss.item()
        if i%1000 == 0:
            print('%siter 耗时:%.2fs'%(i, time()-a))
            a = time()
            
    print("epoch:", epoch, ' loss:%.4f'%loss.item(), 'perplexity:%.2f'%np.exp(loss.item()))
    
#         if i>=4000:
#             break
#         if i%2 == 0:
#             val_loss = evaluate(model, val_iter)
#             if len(val_losses) == 0 or val_loss < min(val_losses):
#                 print("better model, val loss: ", val_loss)
#                 torch.save(model.state_dict(), "LSTM%s.pth"%i)
#             else:
#                 scheduler.step()
#                 optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
#             val_losses.append(val_loss)

iter: 0  loss:11.5128 perplexity:99985.28
0iter 耗时:4.41s
iter: 100  loss:7.6488 perplexity:2098.15
iter: 200  loss:7.2306 perplexity:1381.08
iter: 300  loss:6.8902 perplexity:982.64
iter: 500  loss:6.6771 perplexity:794.01
iter: 600  loss:6.5617 perplexity:707.49
iter: 800  loss:6.5260 perplexity:682.67
iter: 1000  loss:6.2520 perplexity:519.07
1000iter 耗时:1246.38s
iter: 1200  loss:6.1209 perplexity:455.29
iter: 1300  loss:6.1075 perplexity:449.23
iter: 1400  loss:6.0799 perplexity:436.97
iter: 1500  loss:5.8064 perplexity:332.42
iter: 1600  loss:5.6130 perplexity:273.98
2000iter 耗时:1260.63s
iter: 2200  loss:5.5258 perplexity:251.09
iter: 3000  loss:5.5202 perplexity:249.67
3000iter 耗时:1254.76s
iter: 3200  loss:5.4930 perplexity:242.99
epoch: 0  loss:5.4485 perplexity:232.40
0iter 耗时:527.14s
iter: 100  loss:5.4673 perplexity:236.81
iter: 200  loss:5.1618 perplexity:174.47
iter: 500  loss:5.0713 perplexity:159.38
iter: 600  loss:4.9923 perplexity:147.27
iter: 1000  loss:4.9052 perplexit

KeyboardInterrupt: 

In [16]:
best_model = RNN_LM(len(text.vocab), EMBEDDING_SIZE, HIDDEN_SIZE)
if USE_CUDA:
    best_model = best_model.cuda()
best_model.load_state_dict(torch.load('./models/LSTM_s30_e500_h1000_b100_p12e5.pth'))

In [116]:
hidden = best_model.init_hidden(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# input = torch.randint(len(text.vocab), (1, 1), dtype=torch.long).to(device)
first_word = '百度'
input = torch.LongTensor([[text.vocab.stoi[first_word]]]).to(device)
num_sente = 5
eos = {'?':1,'？':1,'.':1,'。':1,'!':1,'！':1,'<eos>':1}

for i in range(1000000):
    word = text.vocab.itos[input.item()]
    print(word, end='')
    
    output, hidden = best_model(input, hidden)
    word_prob = output.squeeze().exp().cpu() #shape:[vocab_size]
    word_idx = torch.multinomial(word_prob, 1).item() #多项式分布采样得到一个
#     word_idx = torch.max(word_prob, 0)[1] #max(a,0)会返回(value,index)
    input.fill_(word_idx)

    if word in eos:
        num_sente -= 1
        if num_sente <= 0:
            break

百度次要他称业务范围涵盖稳小吃中，有人在使用这一五大影响之前，已经提前30天了，在动车低1分下，预计3年内已经出现2700天才能顶店面，当块碰到了新房1就重新退房过，这还要4个月的回归到了想必大家的认可！1、重庆站：为确保安全第一款找非现行《2015年国家外国人城镇国际社会经济体系发展情况。对此，建瓯市公安局须主张通报，对近期公布的各类污染源排摸等情况进行分析，通过相关的分析研究制定具体工作方案，让有关办案人员一起交流学习的动态，优先回答，仍需取得“快乐成绩”。截至10月底，他有3个开发区\n相关内容有新华社；6）深入挖掘的核心价值。\n目前，26<unk>火箭创石墨垃圾，山海关德国<unk>近<unk>万；将客户利用产品等，以要准入...一条条给予最正的信任——企业向你的消费者表示自己对，“原来的专家--山东网”首页还要<unk>站在手里么？