### 数据预览和预处理

In [1]:
# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed again, even if reading raises.
with open("./input/poetryFromTang.txt", "r", encoding='utf-8') as f:
    poetry_corpus = f.read()

In [2]:
# Preview the first 50 characters of the corpus as read from disk.
poetry_corpus[:50]

'\n巴山上峡重复重，阳台碧峭十二峰。荆王猎时逢暮雨，\n夜卧高丘梦神女。轻红流烟湿艳姿，行云飞去明星稀。'

In [3]:
# Total number of characters in the corpus (line breaks and punctuation included).
len(poetry_corpus)

16647

文本预处理

In [4]:
# Map line breaks and the two sentence punctuation marks to a plain space in a
# single pass, so every verse ends up separated by whitespace only.
separator_table = str.maketrans({'\n': ' ', '\r': ' ', '，': ' ', '。': ' '})
poetry_corpus = poetry_corpus.translate(separator_table)
poetry_corpus[:100]

' 巴山上峡重复重 阳台碧峭十二峰 荆王猎时逢暮雨  夜卧高丘梦神女 轻红流烟湿艳姿 行云飞去明星稀  目极魂断望不见 猿啼三声泪沾衣   见尽数万里 不闻三声猿 但飞萧萧雨 中有亭亭魂  千载楚襄恨 '

数据生成

In [5]:
import numpy as np
from collections import Counter

# Upper bound on the vocabulary size; only the most frequent characters are kept.
max_features = 10000

# Character -> occurrence count, in order of first appearance in the corpus.
vocab_dict = dict(Counter(poetry_corpus))

# (character, count) pairs, most frequent first. The sort is stable, so
# characters with equal counts keep their first-appearance order.
vocab_list = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)[:max_features]

# Index <-> character lookup tables; index 0 is the most frequent character.
vocab = [x[0] for x in vocab_list]
word_to_idx = {c: i for i, c in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))

In [6]:
def text_to_arr(text):
    """Encode a string as a numpy array of vocabulary indices.

    Each character is looked up in ``word_to_idx``; characters that are not in
    the vocabulary are encoded as ``len(vocab)``, the index reserved for
    unknown characters.
    """
    return np.array([word_to_idx.get(ch, len(vocab)) for ch in text])
def arr_to_text(arr):
    """Decode an iterable of vocabulary indices back into a string.

    The index ``len(vocab)`` is rendered as ``'<unk>'``; smaller indices are
    looked up in ``idx_to_word``.

    Raises:
        Exception: if an index is larger than ``len(vocab)``.
    """
    def decode(index):
        # Guard clauses in the same order as the lookup rules above.
        if index == len(vocab):
            return '<unk>'
        if index < len(vocab):
            return idx_to_word[index]
        raise Exception('Unknown index!')

    return "".join(decode(index) for index in arr)

In [7]:
# Sanity check: encode the first 11 characters of the corpus as vocabulary indices.
text_arr = poetry_corpus[:11]
text_to_arr(text_arr)

array([   0,  921,   10,   28, 1201,  275,   56,  275,    0,  110,  128])

### 构造文本时序数据

In [8]:
# Length of one training sequence, in characters.
sequence = 5
# Number of complete sequences that fit into the corpus.
num_seq = len(poetry_corpus) // sequence
# Drop the trailing characters that would not fill a whole sequence.
text = poetry_corpus[:num_seq * sequence]
num_seq

3329

调整为Torch Tensor

In [9]:
import torch
from torch.utils.data import Dataset,DataLoader,TensorDataset
# Encode the truncated corpus, fold it into one row per sequence
# (num_seq rows) and wrap the result as a torch tensor.
encoded_corpus = text_to_arr(text)
arr = torch.from_numpy(encoded_corpus.reshape((num_seq, -1)))

In [10]:
class TextDataset(Dataset):
    """Map-style dataset of (input, label) pairs for next-character prediction.

    Args:
        arr: 2-D tensor of character indices, one sequence per row.

    Indexing returns ``(x, y)`` where ``x`` is the selected row and ``y`` is
    ``x`` rotated one position to the left, so ``y[i]`` is the character that
    follows ``x[i]``.
    """

    def __init__(self, arr):
        self.arr = arr

    def __getitem__(self, item):
        x = self.arr[item, :]

        # Build the label by rotating the input left by one: the first input
        # character wraps around and becomes the label of the last position.
        # torch.roll keeps the integer dtype of the input, so the labels stay
        # valid class indices instead of being turned into floats.
        y = torch.roll(x, shifts=-1, dims=0)
        return x, y

    def __len__(self):
        # Number of sequences (rows) in the dataset.
        return self.arr.shape[0]

In [11]:
# Wrap the index tensor (one sequence per row) in the dataset class defined above.
train_set = TextDataset(arr)

In [12]:
# Display the dataset object; the class defines no custom repr, so only the default one is shown.
train_set

<__main__.TextDataset at 0x174c7526d68>

In [13]:
# Decode the first sample to check that the label is the input shifted by one character.
x, y = train_set[0]
print(arr_to_text(x.numpy()))
print(arr_to_text(y.numpy()))

 巴山上峡
巴山上峡 


### GRU

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class char_GRU(nn.Module):
    """Character-level language model: embedding -> 2-layer GRU -> linear layer.

    Args:
        vocab_size: number of distinct input indices, which is also the number
            of output classes.
        embedding_dim: size of each character embedding.
        hidden_state: hidden size of the GRU.
        embedding_matrix: optional tensor of pretrained embeddings that is
            copied into the embedding layer.
    """

    def __init__(self, vocab_size,embedding_dim, hidden_state, embedding_matrix=None):
        super(char_GRU, self).__init__()
        # Number of stacked GRU layers, i.e. the first dimension of the hidden state.
        self.num_layers = 2
        self.hidden_size = hidden_state

        # Embedding layer. No padding_idx is set: the sequences fed to this
        # model are never padded, and reserving an index would pin the
        # embedding of an ordinary character to zeros.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if embedding_matrix is not None:
            # Load pretrained embeddings.
            self.embedding.weight.data.copy_(embedding_matrix)
            # To freeze them, set self.embedding.weight.requires_grad = False.
        self.rnn = nn.GRU(embedding_dim, hidden_state, self.num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_state, vocab_size)

    def forward(self, x, hs=None):
        """Run the model on a batch of index sequences.

        Args:
            x: integer tensor of shape (batch, seq_len).
            hs: optional initial hidden state of shape
                (num_layers, batch, hidden_size); ``None`` lets the GRU start
                from zeros.

        Returns:
            Tuple ``(scores, hs)``: ``scores`` has shape
            (batch * seq_len, vocab_size) and ``hs`` is the final hidden state.
        """
        embed = self.embedding(x)
        # Pass the caller's hidden state on, so step-by-step generation
        # continues from the previous state instead of restarting from zeros.
        out, hs = self.rnn(embed, hs)
        out = self.fc(out)
        # Flatten with the layer's own output width rather than a global.
        return out.view(-1, out.size(-1)), hs

In [15]:
from torch.utils.data import DataLoader

batch_size = 128
# One index beyond the vocabulary is reserved for unknown characters.
vocab_size = len(vocab) + 1
# Shuffled mini-batches over the sequence dataset.
train_data = DataLoader(train_set, batch_size=batch_size, shuffle=True)
model = char_GRU(vocab_size, embedding_dim=128, hidden_state=128)

In [16]:
from torch.autograd import Variable
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()
epochs = 25
for e in range(epochs):
    train_loss = 0
    for x, y in train_data:
        # Both the embedding lookup and the loss expect int64 indices.
        x = x.long()
        y = y.long()

        # Forward pass: per-character class scores against the flattened labels.
        score, _ = model(x)
        loss = criterion(score, y.view(-1))

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    # Perplexity is the exponential of the mean batch loss for this epoch.
    print('epoch: {}, perplexity is: {:.3f}'.format(e+1, np.exp(train_loss / len(train_data))))

epoch: 1, perplexity is: 945.885
epoch: 2, perplexity is: 572.340
epoch: 3, perplexity is: 521.260
epoch: 4, perplexity is: 401.698
epoch: 5, perplexity is: 275.885
epoch: 6, perplexity is: 177.950
epoch: 7, perplexity is: 107.476
epoch: 8, perplexity is: 79.147
epoch: 9, perplexity is: 55.702
epoch: 10, perplexity is: 41.112
epoch: 11, perplexity is: 29.948
epoch: 12, perplexity is: 25.140
epoch: 13, perplexity is: 21.323
epoch: 14, perplexity is: 18.985
epoch: 15, perplexity is: 16.399
epoch: 16, perplexity is: 14.243
epoch: 17, perplexity is: 12.725
epoch: 18, perplexity is: 12.243
epoch: 19, perplexity is: 9.589
epoch: 20, perplexity is: 8.733
epoch: 21, perplexity is: 8.724
epoch: 22, perplexity is: 7.560
epoch: 23, perplexity is: 6.959
epoch: 24, perplexity is: 6.771
epoch: 25, perplexity is: 5.751


In [17]:
def pick_top_n(preds, top_n=5):
    """Randomly pick one class among the ``top_n`` highest-scoring ones.

    Args:
        preds: 2-D tensor of raw scores (logits) with a single row,
            shape (1, num_classes).
        top_n: how many of the best-scoring classes to sample from; clamped to
            the number of classes.

    Returns:
        numpy array of shape (1,) holding the sampled class index.
    """
    top_n = min(top_n, preds.size(1))
    top_pred_score, top_pred_label = torch.topk(preds.detach(), top_n, 1)
    # The scores are unnormalised logits and may be negative, so dividing by
    # their sum does not give a probability distribution. Softmax over the
    # selected logits does; float64 keeps the sum within numpy's tolerance.
    top_pred_prob = torch.softmax(top_pred_score.double(), dim=1)
    top_pred_prob = top_pred_prob.squeeze(0).cpu().numpy()
    top_pred_label = top_pred_label.squeeze(0).cpu().numpy()
    top_pred_prob = top_pred_prob / top_pred_prob.sum()
    c = np.random.choice(top_pred_label, size=1, p=top_pred_prob)
    return c

In [18]:
begin = '夜卧高丘梦神'
text_len = 30

model = model.eval()
# Encode the prompt; characters missing from the vocabulary fall back to the
# unknown-character index instead of raising KeyError.
samples = [word_to_idx.get(c, len(vocab)) for c in begin]
input_txt = torch.LongTensor(samples)[None]
result = samples
# Generation needs no gradients.
with torch.no_grad():
    # Prime the model with the whole prompt. The last row of the scores is the
    # prediction for the character after the prompt, so the last prompt
    # character is not fed to the model a second time.
    out, init_state = model(input_txt)
    out = out[-1:]
    for i in range(text_len):
        pred = pick_top_n(out)
        result.append(pred[0])
        # Feed the sampled character back in, together with the running state.
        model_input = torch.LongTensor(pred)[None]
        out, init_state = model(model_input, init_state)
# Kept in its own variable so the corpus variable `text` is not overwritten.
generated_text = arr_to_text(result)
print(generated_text)

夜卧高丘梦神神知妾天漏下下南 隅寒蒲城阙百衔树临临临流萦玉薄过楼楼珠争玉


### LSTM

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class char_LSTM(nn.Module):
    """Character-level language model: embedding -> 2-layer LSTM -> linear layer.

    Args:
        vocab_size: number of distinct input indices, which is also the number
            of output classes.
        embedding_dim: size of each character embedding.
        hidden_state: hidden size of the LSTM.
        embedding_matrix: optional tensor of pretrained embeddings that is
            copied into the embedding layer.
    """

    def __init__(self, vocab_size,embedding_dim, hidden_state, embedding_matrix=None):
        super(char_LSTM, self).__init__()
        # Number of stacked LSTM layers, i.e. the first dimension of each state tensor.
        self.num_layers = 2
        self.hidden_size = hidden_state

        # Embedding layer. No padding_idx is set: the sequences fed to this
        # model are never padded, and reserving an index would pin the
        # embedding of an ordinary character to zeros.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if embedding_matrix is not None:
            # Load pretrained embeddings.
            self.embedding.weight.data.copy_(embedding_matrix)
            # To freeze them, set self.embedding.weight.requires_grad = False.
        self.rnn = nn.LSTM(embedding_dim, hidden_state, self.num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_state, vocab_size)

    def forward(self, x, hn=None):
        """Run the model on a batch of index sequences.

        Args:
            x: integer tensor of shape (batch, seq_len).
            hn: optional initial ``(h, c)`` state tuple, each of shape
                (num_layers, batch, hidden_size); ``None`` lets the LSTM start
                from zeros.

        Returns:
            Tuple ``(scores, hn)``: ``scores`` has shape
            (batch * seq_len, vocab_size) and ``hn`` is the final ``(h, c)``
            state tuple.
        """
        embed = self.embedding(x)
        out, hn = self.rnn(embed, hn)
        out = self.fc(out)
        # Flatten with the layer's own output width rather than a global.
        return out.view(-1, out.size(-1)), hn

In [20]:
from torch.utils.data import DataLoader

batch_size = 128
# One index beyond the vocabulary is reserved for unknown characters.
vocab_size = len(vocab) + 1
# Shuffled mini-batches over the sequence dataset.
train_data = DataLoader(train_set, batch_size=batch_size, shuffle=True)
model = char_LSTM(vocab_size, embedding_dim=128, hidden_state=128)

In [21]:
from torch.autograd import Variable
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()
epochs = 25
for e in range(epochs):
    train_loss = 0
    for x, y in train_data:
        # Both the embedding lookup and the loss expect int64 indices.
        x = x.long()
        y = y.long()

        # Forward pass: per-character class scores against the flattened labels.
        score, _ = model(x)
        loss = criterion(score, y.view(-1))

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    # Perplexity is the exponential of the mean batch loss for this epoch.
    print('epoch: {}, perplexity is: {:.3f}'.format(e+1, np.exp(train_loss / len(train_data))))

epoch: 1, perplexity is: 979.842
epoch: 2, perplexity is: 544.762
epoch: 3, perplexity is: 463.647
epoch: 4, perplexity is: 447.757
epoch: 5, perplexity is: 388.332
epoch: 6, perplexity is: 330.095
epoch: 7, perplexity is: 257.956
epoch: 8, perplexity is: 208.875
epoch: 9, perplexity is: 177.638
epoch: 10, perplexity is: 141.000
epoch: 11, perplexity is: 109.061
epoch: 12, perplexity is: 85.458
epoch: 13, perplexity is: 64.365
epoch: 14, perplexity is: 46.743
epoch: 15, perplexity is: 32.446
epoch: 16, perplexity is: 26.736
epoch: 17, perplexity is: 19.438
epoch: 18, perplexity is: 15.259
epoch: 19, perplexity is: 12.001
epoch: 20, perplexity is: 9.881
epoch: 21, perplexity is: 8.350
epoch: 22, perplexity is: 6.784
epoch: 23, perplexity is: 5.939
epoch: 24, perplexity is: 4.902
epoch: 25, perplexity is: 4.203


In [22]:
begin = '荆王猎时逢暮'
text_len = 30

model = model.eval()
# Encode the prompt; characters missing from the vocabulary fall back to the
# unknown-character index instead of raising KeyError.
samples = [word_to_idx.get(c, len(vocab)) for c in begin]
input_txt = torch.LongTensor(samples)[None]
result = samples
# Generation needs no gradients.
with torch.no_grad():
    # Prime the model with the whole prompt. The last row of the scores is the
    # prediction for the character after the prompt, so the last prompt
    # character is not fed to the model a second time.
    out, init_state = model(input_txt)
    out = out[-1:]
    for i in range(text_len):
        pred = pick_top_n(out)
        result.append(pred[0])
        # Feed the sampled character back in, together with the running state.
        model_input = torch.LongTensor(pred)[None]
        out, init_state = model(model_input, init_state)
# Kept in its own variable so the corpus variable `text` is not overwritten.
generated_text = arr_to_text(result)
print(generated_text)

荆王猎时逢暮何古人世问平白草平知 日世事平朝 时东开 平朝云平十高高十古
