In [1]:
# -*- coding: utf-8 -*-

import os
import re
import time
import codecs



TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
BASE_FOLDER = "C:/Users/sethf/source/repos/chinesepoem/" # os.path.abspath(os.path.dirname(__file__))
DATA_FOLDER = os.path.join(BASE_FOLDER, 'data')
DEFAULT_FIN = os.path.join(DATA_FOLDER, '唐诗语料库.txt')
DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')
reg_noisy = re.compile('[^\u3000-\uffee]')
reg_note = re.compile('(（.*）)') # Cannot deal with （） in seperate lines
# 中文及全角标点符号(字符)是\u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee

DEFAULT_Char2Vec = os.path.join(DATA_FOLDER, 'Char2Vec100.bin')

# Test the material

In [3]:
def GetFirstNline(filePath, linesNumber):
    fd = codecs.open(filePath, 'r', 'utf-8')
    for i in range(1,linesNumber):
        print(fd.readline())
    fd.close()

GetFirstNline(DEFAULT_FOUT, 3)

饮马长城窟行

塞外悲风切，交河冰已结。瀚海百重波，阴山千里雪。迥戍危烽火，层峦引高节。悠悠卷旆旌，饮马出长城。寒沙连骑迹，朔吹断边声。胡尘清玉塞，羌笛韵金钲。绝漠干戈戢，车徒振原隰。都尉反龙堆，将军旋马邑。扬麾氛雾静，纪石功名立。荒裔一戎衣，灵台凯歌入。



In [4]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
text = codecs.open(DEFAULT_FOUT, 'r', 'utf-8').read()
print('corpus length:', len(text))
print('{} STOP'.format(time.strftime(TIME_FORMAT)))

2017-10-16 20:45:55 START
corpus length: 3364177
2017-10-16 20:45:55 STOP


In [5]:
text = text[:100000]
print('test corpus length:', len(text))

test corpus length: 100000


In [6]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)
print('{} STOP'.format(time.strftime(TIME_FORMAT)))

2017-10-16 20:45:58 START
total chars: 3826
2017-10-16 20:45:58 STOP


In [7]:
chars.insert(0, "\0")

In [8]:
''.join(chars[1:200])

'\n·。一丁七万丈三上下不与丑专且丕世丘业丛东丝丞两严丧个中丰丱临丸丹为主丽举乃久义之乌乍乎乏乐乔乖乘乙九也习乡书买乱乳乾了予争事二于亏云互五井亘亚亟亡交亥亦产亨亩享京亭亮亲人亿仁仆仇今介仍从仑仓他仗付仙仞代令以仪仰仲价任仿伉伊伍伎伏伐休众优会伟传伣伤伦伪伫伯估伴伶伸似但位低住佐佑体何余佛作佞佩佯佳佶佾使侁侈侍侑侔供依侠侣侧侪侬侮侯侵便促俄俎俗俘保俞俟信俦俨俩俪俭修俯俱俾倍倏倒倘候倚借倡倢倦倩值倾'

In [9]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [10]:
idx = [char_indices[c] for c in text]

In [11]:
idx[:10]

[3648, 3672, 3460, 619, 2357, 2953, 1, 632, 657, 1124]

In [12]:
''.join(indices_char[i] for i in idx[1000:1040])

'行。遍野屯万骑，临原驻五营。登山麾武节，背水纵神兵。在昔戎戈动，今来宇宙平。\n入'

# import the pre-trained vector

In [25]:

DEFAULT_Char2Vec50 = os.path.join(DATA_FOLDER, 'Char2Vec50.bin')

from gensim.models import word2vec
import numpy as np

model = word2vec.Word2Vec.load(DEFAULT_Char2Vec50)

model[u'行']


array([-0.04007597,  1.43749964, -0.50613195,  0.30402976, -0.61863619,
        0.86658812, -0.85327005,  1.0774374 , -1.7388643 , -0.66667581,
        1.41951251,  1.73685801,  0.88174158, -1.08179212,  0.04298396,
       -0.06425346,  0.70207226, -1.53141725,  0.71608716, -0.90501112,
       -0.33681136,  0.14992777, -0.68830115, -0.19420403,  1.46785641,
       -1.46016061, -0.05367612, -0.05942056,  0.43858421, -0.82355237,
        1.08669186,  1.04226518, -1.23372197, -2.04672313,  0.13865796,
        0.07540765,  1.05799055,  0.84494865,  1.86665952,  0.52074122,
       -0.76229119,  0.08235706, -1.87348688, -1.76977718, -1.9432348 ,
       -0.51557565, -3.51499605, -0.06386752, -2.19704294,  2.92216778], dtype=float32)

In [32]:
print ('creating embedding matrix...')
embedding_matrix = np.zeros((vocab_size, 50))

for i, c in enumerate(chars):
    #print(c)
    if c in model:
        embedding_matrix[i] = model[c]
    # if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
   #     embedding_matrix[i] = embedding_vector

print('Found %s word vectors.' % len(embedding_matrix))

creating embedding matrix...
Found 3826 word vectors.


# Preprocess and create model

In [33]:
maxlen = 40
sentences = []
next_chars = []
for i in range(0, len(idx) - maxlen+1):
    sentences.append(idx[i: i + maxlen])
    next_chars.append(idx[i+1: i+maxlen+1])
print('nb sequences:', len(sentences))

nb sequences: 99961


In [34]:
import numpy as np
import keras
from keras.layers import TimeDistributed, Activation
from numpy.random import choice

Using TensorFlow backend.


In [35]:

sentences = np.concatenate([[np.array(o)] for o in sentences[:-2]])
next_chars = np.concatenate([[np.array(o)] for o in next_chars[:-2]])

In [36]:
sentences.shape, next_chars.shape

((99959, 40), (99959, 40))

In [37]:
n_fac = 50

In [38]:
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.regularizers import l2, l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
#from keras.utils.layer_utils import layer_from_config
from keras.layers import deserialize as layer_from_config

from keras.metrics import categorical_crossentropy, categorical_accuracy
from keras.layers.convolutional import *
from keras.preprocessing import image, sequence
from keras.preprocessing.text import Tokenizer

# Import the training word embedding


In [39]:

model=Sequential([
        Embedding(vocab_size, n_fac, input_length=maxlen, weights=[embedding_matrix],trainable=False),
        Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2,implementation=2)),
        Dropout(0.2),
        Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2,implementation=2)),
        Dropout(0.2),
        TimeDistributed(Dense(vocab_size)),
        Activation('softmax')
    ])   

In [40]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

# Training

In [41]:
def print_example():
    seed_string=u'行。遍野屯万骑，临原驻五营。登山麾武节，背水纵神兵。在昔戎戈动，今来宇宙平。\n入'
    for i in range(320):
        x=np.array([char_indices[c] for c in seed_string[-40:]])[np.newaxis,:]
        preds = model.predict(x, verbose=0)[0][-1]
        preds = preds/np.sum(preds)
        next_char = choice(chars, p=preds)
        seed_string = seed_string + next_char
    print(seed_string)

In [42]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x20815452748>

In [43]:
print_example()

行。遍野屯万骑，临原驻五营。登山麾武节，背水纵神兵。在昔戎戈动，今来宇宙平。
入弄柳曲慢
君言若成酌，终此惜最顾。
相和歌辞·铜雀辞
百二胡第头，莫间西出时。君逢焚马绝，攀双歌舞音。刘琅若主，小尽夸人。画粉丛翠。雁色辽阳。暮怜宜骨难，者争苦问田。
横吹曲辞·阳南
汉家若高女，绿刺云山红。悬羞失须边，用有按春看。君遇画柳，空华兰折。
相和歌辞·古堤行
瞩海痕不魂，共万眉胡桑。北元在北雪，黄植泪荷满。暂夜丹金滋，岂怜闲春藏。啄向雾魂起，馹声是丈缯。君可悲晖江，松月一来愁。何何如此时，三十鸢春明。青水危陵上，不古敦衣羞。青卒古当下，枝粉长新镜。想想高筠满，须延万年春。
相和歌辞·折杨曲
妾惜二冬空，非名行未肯。满月鸮三里，古然无云多。珮阁萦囊佩，制阶叶错风。柱启网氛圭，悠妆雁离间。
题恭礼主五殿赐玉亭残
夏杰真初兆，


In [44]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x207d8ded7b8>

In [45]:
print_example()

行。遍野屯万骑，临原驻五营。登山麾武节，背水纵神兵。在昔戎戈动，今来宇宙平。
入潼关
崤函乘洞律，黛锁显宫前。万壑浮云起，风调轸玉声。郊坰清广转，法乐调和风。
唐大飨拜洛乐章·昭和
烂俎牺荐，羞荐斯陈。黑修之牡，子器斯中。上帝配食，雍于执敬。爰容景福，永于旧章。
郊庙歌辞·祀雨师乐章·迎俎酌献
撰行协序，垂舞递成。鸾鸾凤舞，飘洞发旂。煌煌开御，穆穆雍雍。
唐明堂乐章·徵音
赫赫离精御炎陆，滔滔炽草郊开。画玉交桃养大人，谁教亿乙化殊平。
郊庙歌辞·汉宗庙乐舞辞·忠顺
明庭展赫，文物昭新。敬承茂典，敢择深衷。睟周历庙，载纬鸿休。
郊庙歌辞·梁太庙乐舞辞·登歌
既赫皇考，浚哉帝台。闓华而及，瑟彼飞香。大矣昭德，夙望明年。
郊庙歌辞·周宗庙乐舞辞·肃顺
恭彻祀礼，既以严禋。魏诚内庙，敢择良辰。载启其著，鸿休用职。
郊庙


In [46]:
DEFAULT_modelweights = os.path.join(DATA_FOLDER, 'char_rnn.h5')
model.save_weights(DEFAULT_modelweights )

In [47]:
model.optimizer.lr=0.0001
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x20817684be0>

In [48]:
print_example()

行。遍野屯万骑，临原驻五营。登山麾武节，背水纵神兵。在昔戎戈动，今来宇宙平。
入潼汤
崤函重律险，壮夫三十二。浇俗杂良霜，黄泥夜点营。凯胡氛九匣，单帐即归时。横沙何有静，谁能扫虏归。
横吹曲辞·陇头水
借问陇头水，今年陇头家。塞头年不见，明月竟不寐。
横吹曲辞·关山月
胡风月夜长，北客终无远。拂拂东飞曙，花成玉座声。草除新树地，凉吹轸离心。林黄帷阁上，砌影乱花呈。登高思遗老，含毫属昭阳。
上巳日赐裴度之张
端拱乘轩镜，昏开景重长。芳菲分日暮，娇罢在浮云。分枝怜菊蕊，风暖洒檀栊。
咏小山
近谷交萦蕊，遥峰对出莲。径细新苔树，寒山带夕华。坐此攀垂萼，今年那必寻。只辞秋雁曲，还用旧来还。
相和歌辞·阳春曲
妾妒白蘋浦，团扇薄时衣。初花复独好，花似绮蛾身。日暮西施望，春深不忆新。
相和歌辞·婕妤怨
谗谤潜来起不知，携手


In [49]:
model.optimizer.lr=0.00001
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x208175c6c50>

In [50]:
print_example()

行。遍野屯万骑，临原驻五营。登山麾武节，背水纵神兵。在昔戎戈动，今来宇宙平。
入潼河
崤函称地险，襟带壮两京。云台先著美，戈门远望新。交河拥已节，寒碛古寒霜。四兵屯九服，汉赏谁封侯。功高无边山，高轼多在张。老日无声影，三春不可穷。空馀片死事，所冀渭滨中。
琴曲歌辞·拘幽操
目掩掩兮其凝其盲，耳肃肃兮听欲闻。天气无为兮为君王圣，厚号万国兮孕八荒。天符既出兮帝业昌，愿临明祀兮降祯祥。
郊庙歌辞·享太庙乐章·景云舞
景云霏烂，告我帝符。噫帝冲德，唯传重圣。格调八簋，时居积庆。懿盛，登遐休。
郊庙歌辞·后唐宗庙乐舞辞·观成舞
穆穆王国，奕奕神猷。毖祀圜配，上帝冲宅。
郊庙歌辞·享章怀太子庙乐章·登歌酌鬯
誉阐元储，寄崇明两。玉裕虽晦，铜楼可想。弦诵辍音，笙歌罢响。币帛言设，灵心不测。
郊庙歌辞·享惠昭太子庙乐章·送神



In [51]:
model.save_weights(DEFAULT_modelweights )

### 用训练好了word2vec代替原来embedding layer, 在速度上有很大的提高，且词句的组织也比较好了

### 最后在8G的GPU上，再用Bidirectional LSMT来训练，这样的希望能看到一个好的结果