# 基于Seq2Seq模型实现自动对下联

本文主要介绍了使用`keras`来构建一个Seq2Seq模型，实现自动对下联的功能    
本文的对联数据可以在[这里](https://github.com/wb14123/couplet-dataset)找到，训练集中有70多万条对联。这里对字符集文件（`vocabs`）的前两个字符稍作修改，将其改成自定义的“结束”(`$`)和“开始”(`^`)符号

## 预处理数据集

In [1]:
import numpy as np

读入字符集，一共有9000多个字符（包括汉字和标点符号）

In [2]:
# Load the vocabulary: one entry per line, with surrounding whitespace removed.
with open('couplet/vocabs', encoding='utf8') as vocab_file:
    VOCABS = [entry.strip() for entry in vocab_file]

# Notebook display of the first ten entries.
VOCABS[:10]

['$', '^', '。', '，', '风', '春', '一', '人', '月', '山']

In [3]:
# Special symbols: START_SYM is prepended to the decoder input and END_SYM is
# used to pad sentences up to the maximum length.
START_SYM = '^'
END_SYM = '$'
# Longest sentence kept; longer lines are skipped when reading the data.
MAX_SENTENCE_LENGTH = 30
# Number of vocabulary entries; used as the one-hot feature dimension.
VOCABS_SIZE = len(VOCABS)

In [4]:
# Token -> index lookup. If VOCABS contains duplicate entries, the last
# occurrence wins.
VOCAB_TO_INDEX = {v: i for i, v in enumerate(VOCABS)}
# Index -> token lookup, built directly from the list so that every index in
# range(len(VOCABS)) can be decoded. Inverting VOCAB_TO_INDEX instead would
# drop the earlier index of any duplicated entry and make decoding that index
# raise KeyError.
INDEX_TO_VOCAB = dict(enumerate(VOCABS))

由于读入所有对联会消耗大量内存，所以这里定义一个读取对联的生成器，而不是一次性读入所有的对联

In [5]:
def auto_read_couplet(file, max_len=MAX_SENTENCE_LENGTH):
    """Yield tokenised lines of ``file`` endlessly, restarting at end of file.

    Each line is stripped and split on single spaces. Lines with more than
    ``max_len`` tokens are skipped. A blank line yields ``['']`` because
    ``''.split(' ')`` returns a one-element list.

    Args:
        file: Path of a UTF-8 text file with one space-separated sentence
            per line.
        max_len: Maximum number of tokens a yielded line may have.

    Yields:
        list[str]: The tokens of the next accepted line.

    Raises:
        ValueError: If a complete pass over the file produced no line, i.e.
            the file is empty or every line is longer than ``max_len``.
            Without this check the generator would re-open the file forever
            without ever yielding.
    """
    while True:
        yielded_any = False
        with open(file, encoding='utf8') as f:
            for line in f:
                words = line.strip().split(' ')
                if len(words) > max_len:
                    continue
                yielded_any = True
                yield words
        if not yielded_any:
            raise ValueError(
                f'no line of {file!r} has at most {max_len} tokens')

In [6]:
def pad_sentece(sentence, max_len=MAX_SENTENCE_LENGTH):
    """Pad ``sentence`` with END_SYM until it holds ``max_len`` items.

    The list is extended in place and the same list object is returned.
    A sentence that already has ``max_len`` or more items is left unchanged.
    """
    sentence.extend(END_SYM for _ in range(len(sentence), max_len))
    return sentence

def sentence_to_vec(sentence, feature_n):
    """Convert a sequence of tokens into an array of one-hot rows.

    Each token becomes a length-``feature_n`` vector of zeros with a single 1
    at the token's index in VOCAB_TO_INDEX. A token missing from
    VOCAB_TO_INDEX raises KeyError.
    """
    def one_hot(token):
        row = np.zeros(feature_n)
        row[VOCAB_TO_INDEX[token]] = 1
        return row

    return np.array([one_hot(token) for token in sentence])

def vec_to_sentence(vector):
    """Decode a sequence of score vectors into a string.

    For every row the index of the largest value is looked up in
    INDEX_TO_VOCAB, and the resulting tokens are concatenated in order.
    """
    return ''.join(INDEX_TO_VOCAB[np.argmax(row)] for row in vector)

生成对联数据，每次返回`batch_size`大小的数据  
函数返回两部分：第一部分为输入，包括`encoder`的输入（即上联）和`decoder`的输入（即以“开始”符号开头、整体右移一位的下联）；第二部分为输出，即下联

In [7]:
def generate_data(batch_size=128, max_len=MAX_SENTENCE_LENGTH, feature_n=VOCABS_SIZE):
    """Endlessly yield one-hot encoded training batches of couplets.

    Upper lines are read from ``couplet/train/in.txt`` and lower lines from
    ``couplet/train/out.txt``. Every sentence is padded with END_SYM to
    ``max_len`` tokens and one-hot encoded with ``feature_n`` features.

    Args:
        batch_size: Number of couplets per yielded batch.
        max_len: Maximum sentence length. It is forwarded to both the reader
            and the padding helper so that a non-default value takes effect.
        feature_n: Size of each one-hot vector.

    Yields:
        tuple: ``([encoder_inputs, decoder_inputs], outputs)``. Each array
        has shape ``(batch_size, max_len, feature_n)``. ``decoder_inputs`` is
        the lower line shifted right by one position with START_SYM in
        front; ``outputs`` is the lower line itself.

    NOTE: the two files are length-filtered independently, so upper and lower
    lines stay paired only if line i of both files is accepted or skipped
    together (e.g. both lines of a couplet have the same number of tokens)
    and both files have the same number of lines.
    """
    input_generator = auto_read_couplet('couplet/train/in.txt', max_len)
    output_generator = auto_read_couplet('couplet/train/out.txt', max_len)
    while True:
        encode_inputs = []
        decode_inputs = []
        outputs = []
        for _ in range(batch_size):
            upper_line = pad_sentece(next(input_generator), max_len)
            lower_line = pad_sentece(next(output_generator), max_len)
            # Decoder input for teacher forcing: START_SYM followed by the
            # lower line without its last token, keeping the length the same.
            shifted_lower_line = [START_SYM] + lower_line[:-1]

            encode_inputs.append(sentence_to_vec(upper_line, feature_n))
            outputs.append(sentence_to_vec(lower_line, feature_n))
            decode_inputs.append(sentence_to_vec(shifted_lower_line, feature_n))

        yield [np.array(encode_inputs), np.array(decode_inputs)], np.array(outputs)

## Seq2Seq模型

这里采用的是基于encoder-decoder框架的Seq2Seq模型，更多模型的细节可以在[基于Encoder-Decoder框架的Seq2Seq模型](https://github.com/snowhyzhang/yukino/blob/master/deep_learning/encoder-decoder_seq2seq_model.ipynb)上找到

In [8]:
from keras import layers
from keras.models import Model
from keras.models import load_model

class Seq2SeqModel:
    """LSTM encoder-decoder (Seq2Seq) model with separate inference sub-models.

    Three Keras models sharing the same layers are built on construction:

    * ``model`` -- training model mapping ``[encoder_inputs, decoder_inputs]``
      to the decoder's per-step output.
    * ``encoder_model`` -- maps an input sequence to the encoder LSTM's final
      ``[state_h, state_c]``.
    * ``decoder_model`` -- runs the decoder LSTM and dense layer from
      explicitly supplied states; used step by step in ``predict``.
    """

    def __init__(self, model_name, feature_dim, output_len, hidden_size=128, output_activation='softmax',
                 optimizer='adam', loss='categorical_crossentropy', metrics=None):
        """Build and compile the model.

        Args:
            model_name: Name stored on the instance (not otherwise used here).
            feature_dim: Size of each input/output feature vector.
            output_len: Number of decoding steps performed by ``predict``.
            hidden_size: Number of units in the encoder and decoder LSTMs.
            output_activation: Activation of the final dense layer.
            optimizer: Optimizer passed to ``compile``.
            loss: Loss passed to ``compile``.
            metrics: Metrics passed to ``compile``; ``None`` means ``['acc']``.
                A ``None`` sentinel is used so that no mutable list is shared
                between instances through the default argument.
        """
        self.model_name = model_name
        self.feature_dim = feature_dim
        self.output_len = output_len

        self.hidden_size = hidden_size
        self.output_activation = output_activation
        self.optimizer = optimizer
        self.loss = loss
        self.metrics = ['acc'] if metrics is None else metrics

        self.__build_model()
        self.__compile_model()

    def __build_model(self):
        """Create the training model and the encoder/decoder inference models."""
        # sequence to sequence model
        encoder_inputs = layers.Input(shape=(None, self.feature_dim))
        encoder = layers.LSTM(self.hidden_size, return_state=True)
        _, state_h, state_c = encoder(encoder_inputs)
        encoder_state = [state_h, state_c]

        decoder_inputs = layers.Input(shape=(None, self.feature_dim))
        decoder_lstm = layers.LSTM(self.hidden_size, return_sequences=True, return_state=True)
        # During training the decoder starts from the encoder's final states.
        decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_state)
        decoder_dense = layers.Dense(self.feature_dim, activation=self.output_activation)
        decoder_outputs = decoder_dense(decoder_outputs)
        self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

        # encoder
        self.encoder_model = Model(encoder_inputs, encoder_state)

        # decoder: reuses decoder_lstm and decoder_dense, but takes its
        # initial states as explicit inputs and also returns the new states.
        decoder_state_input_h = layers.Input(shape=(self.hidden_size, ))
        decoder_state_input_c = layers.Input(shape=(self.hidden_size, ))
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
        decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs,
                                                         initial_state=decoder_states_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = decoder_dense(decoder_outputs)
        self.decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                                   [decoder_outputs] + decoder_states)

    def __compile_model(self):
        """Compile the training model with the configured optimizer, loss and metrics."""
        self.model.compile(self.optimizer, loss=self.loss, metrics=self.metrics)

    def train(self, encode_inputs, decode_inputs, outputs, validation_data=None,
              epochs=1, batch_size=128, verbose=1, callbacks=None):
        """Fit the training model on in-memory arrays; arguments are passed to ``fit``."""
        self.model.fit([encode_inputs, decode_inputs], outputs, validation_data=validation_data,
                       epochs=epochs, batch_size=batch_size, verbose=verbose, callbacks=callbacks)

    def train_generator(self, gen, validation_data=None, steps_per_epoch=128, epochs=1,
                        verbose=1, callbacks=None):
        """Fit the training model on batches produced by the generator ``gen``.

        NOTE(review): ``fit_generator`` is deprecated in newer Keras releases
        in favour of ``fit`` -- confirm against the installed Keras version.
        """
        self.model.fit_generator(gen, validation_data=validation_data, steps_per_epoch=steps_per_epoch,
                                 epochs=epochs, verbose=verbose, callbacks=callbacks)

    def predict(self, seq, start_index=None):
        """Decode ``output_len`` steps for a single encoded input sequence.

        Args:
            seq: Encoder input for one sequence (batch size 1).
            start_index: Optional feature index of the start symbol. When
                given, the first decoder input is the one-hot vector of that
                index, so a caller that trained with a start symbol can begin
                decoding the same way. When ``None`` (default), the first
                decoder input is an all-zero vector, as before.

        Returns:
            numpy.ndarray: Array of shape ``(output_len, feature_dim)``
            holding the decoder output of every step.
        """
        state = self.encoder_model.predict(seq)
        target_seq = np.zeros((1, 1, self.feature_dim))
        if start_index is not None:
            target_seq[0, 0, start_index] = 1
        output = []
        for _ in range(self.output_len):
            y_pred, h, c = self.decoder_model.predict([target_seq] + state)
            output.append(y_pred[0, 0, :])
            state = [h, c]
            # The raw decoder output (not a one-hot of its argmax) is fed
            # back as the next decoder input.
            target_seq = y_pred
        return np.array(output)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [9]:
# Build the couplet model: one-hot vocabulary features, fixed decoding length,
# and a 256-unit hidden state.
seq2seq_model = Seq2SeqModel(
    model_name='couplet',
    feature_dim=VOCABS_SIZE,
    output_len=MAX_SENTENCE_LENGTH,
    hidden_size=256,
)
# Endless generator of training batches, 128 couplets per batch.
data_generator = generate_data(128)

训练模型  

In [10]:
# Train silently for 100 epochs, drawing 128 batches from the generator per epoch.
seq2seq_model.train_generator(
    data_generator,
    epochs=100,
    steps_per_epoch=128,
    verbose=0,
)

使用训练好的模型来对下联

In [11]:
def predict_couplet(couplet):
    """Print the lower line predicted by ``seq2seq_model`` for an upper line.

    The upper line is split into characters, padded with END_SYM, one-hot
    encoded and passed to the model as a batch of one. Every END_SYM is
    removed from the decoded text before printing.
    """
    padded_chars = pad_sentece(list(couplet))
    encoder_input = sentence_to_vec(padded_chars, VOCABS_SIZE)[np.newaxis, ...]
    prediction = seq2seq_model.predict(encoder_input)
    output_couplet = vec_to_sentence(prediction).replace(END_SYM, '')

    print(f'input : {couplet}\noutput: {output_couplet}')

In [12]:
# Generate and print a lower line for a sample upper line.
predict_couplet('秋雨无情，三杯浅醉')

input : 秋雨无情，三杯浅醉
output: 春风有梦，一点清香
