In [12]:
# -*- coding: utf-8 -*-

import os
import re
import time
import codecs



TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
BASE_FOLDER = "C:/Users/sethf/source/repos/chinesepoem/" # os.path.abspath(os.path.dirname(__file__))
DATA_FOLDER = os.path.join(BASE_FOLDER, 'data')
DEFAULT_FIN = os.path.join(DATA_FOLDER, '唐诗语料库.txt')
DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')
reg_noisy = re.compile('[^\u3000-\uffee]')
reg_note = re.compile('(（.*）)') # Cannot deal with （） in seperate lines
# 中文及全角标点符号(字符)是\u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee

DEFAULT_Char2Vec = os.path.join(DATA_FOLDER, 'Char2Vec100.bin')

# Test the material

In [13]:
def GetFirstNline(filePath, linesNumber):
    fd = codecs.open(filePath, 'r', 'utf-8')
    for i in range(1,linesNumber):
        print(fd.readline())
    fd.close()

GetFirstNline(DEFAULT_FOUT, 3)

饮马长城窟行

塞外悲风切，交河冰已结。瀚海百重波，阴山千里雪。迥戍危烽火，层峦引高节。悠悠卷旆旌，饮马出长城。寒沙连骑迹，朔吹断边声。胡尘清玉塞，羌笛韵金钲。绝漠干戈戢，车徒振原隰。都尉反龙堆，将军旋马邑。扬麾氛雾静，纪石功名立。荒裔一戎衣，灵台凯歌入。



In [53]:
text = codecs.open(DEFAULT_FOUT, 'r', 'utf-8').read()

In [54]:
print('corpus length:', len(text))

corpus length: 3364177


In [55]:
text = text[:500000]

In [56]:
chars = sorted(list(set(Text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 7503


In [57]:
chars.insert(0, "\0")

In [58]:
''.join(chars[1:200])

'\n()13569CDFGHJLMOQXZ[]·…、。々《》ヨ一丁七万丈三上下不与丐丑专且丕世丘丙业丛东丝丞两严丧个丫中丰丱串丳临丸丹为主丽举乂乃久么义之乌乍乎乏乐乔乖乘乙九乞也习乡书买乱乳乾了予争事二于亏云互五井亘亚些亟亡亢交亥亦产亨亩享京亭亮亲亳亵亶亸亹人亿什仁仄仅仆仇今介仍从仑仓仔仕他仗付仙仝仞仡代令以仪仰仲仳价任仿企伉伊伋伍伎伏伐休众优会伛伞伟传伣伤伥伦伧伪伫伯估伴伶伸伺似伽伾但佉位低住佐'

In [59]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [60]:
idx = [char_indices[c] for c in Text]

In [61]:
idx[:10]

[7050, 7107, 6697, 1044, 4379, 5659, 1, 1084, 1132, 1896]

In [62]:
''.join(indices_char[i] for i in idx[:70])

'饮马长城窟行\n塞外悲风切，交河冰已结。瀚海百重波，阴山千里雪。迥戍危烽火，层峦引高节。悠悠卷旆旌，饮马出长城。寒沙连骑迹，朔吹断边声。胡尘清'

# Preprocess and create model

In [63]:
maxlen = 80
sentences = []
next_chars = []
for i in range(0, len(idx) - maxlen+1):
    sentences.append(idx[i: i + maxlen])
    next_chars.append(idx[i+1: i+maxlen+1])
print('nb sequences:', len(sentences))

nb sequences: 3364098


In [64]:
import numpy as np
import keras
from keras.layers import TimeDistributed, Activation
from numpy.random import choice

In [65]:

sentences = np.concatenate([[np.array(o)] for o in sentences[:-2]])
next_chars = np.concatenate([[np.array(o)] for o in next_chars[:-2]])

In [66]:
sentences.shape, next_chars.shape

((3364096, 80), (3364096, 80))

In [67]:
n_fac = 100

In [68]:
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.regularizers import l2, l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
#from keras.utils.layer_utils import layer_from_config
from keras.layers import deserialize as layer_from_config

from keras.metrics import categorical_crossentropy, categorical_accuracy
from keras.layers.convolutional import *
from keras.preprocessing import image, sequence
from keras.preprocessing.text import Tokenizer

In [69]:

model=Sequential([
        Embedding(vocab_size, n_fac, input_length=maxlen),
        LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2,implementation=2),
        Dropout(0.2),
        LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2,implementation=2),
        Dropout(0.2),
        TimeDistributed(Dense(vocab_size)),
        Activation('softmax')
    ])   

In [70]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

# Training

In [71]:
def print_example():
    seed_string="ethics is a basic foundation of all that"
    for i in range(320):
        x=np.array([char_indices[c] for c in seed_string[-40:]])[np.newaxis,:]
        preds = model.predict(x, verbose=0)[0][-1]
        preds = preds/np.sum(preds)
        next_char = choice(chars, p=preds)
        seed_string = seed_string + next_char
    print(seed_string)

In [None]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=6, epochs=1)

Epoch 1/1
   1968/3364096 [..............................] - ETA: 26227s - loss: 6.7984