# Seq2Seq在机器翻译上的应用

### 01 预处理数据

In [1]:
# generate vocab of english and chinese
import codecs
import collections
from operator import itemgetter

# DARE_TYPE selects which side of the corpus to process;
# it must be either "english" or "chinese".

def get_vocab(DARE_TYPE):
    """Build a frequency-ranked vocabulary file for one side of the corpus.

    Reads the whitespace-tokenised training text, counts every token and
    writes one token per line to the vocabulary file, most frequent first.
    The special tokens "<unk>", "<sos>" and "<eos>" always occupy the first
    three lines, and the list is truncated to the language's vocabulary size
    (the three special tokens count towards that size).

    Args:
        DARE_TYPE: "chinese" (reads ./train.txt.zh, writes zh.vocab, at most
            4000 entries) or "english" (reads ./train.txt.en, writes
            en.vocab, at most 10000 entries).

    Raises:
        ValueError: if DARE_TYPE is not one of the two supported values.
    """
    if DARE_TYPE == "chinese":
        RAW_DATA = "./train.txt.zh"
        VOCAB_OUTPUT = "zh.vocab"
        VOCAB_SIZE = 4000
    elif DARE_TYPE == "english":
        RAW_DATA = "./train.txt.en"
        VOCAB_OUTPUT = "en.vocab"
        VOCAB_SIZE = 10000
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # on RAW_DATA further down.
        raise ValueError(
            'DARE_TYPE must be "chinese" or "english", got %r' % (DARE_TYPE,))

    counter = collections.Counter()
    with codecs.open(RAW_DATA, "r", "utf-8") as f:
        for line in f:
            # split() without arguments already ignores surrounding whitespace.
            counter.update(line.split())

    # most_common() orders by descending count; ties keep first-seen order.
    ranked_words = [word for word, _ in counter.most_common()]

    # Reserved tokens come first so that their ids are 0, 1 and 2.
    sorted_word_list = ["<unk>", "<sos>", "<eos>"] + ranked_words
    sorted_word_list = sorted_word_list[:VOCAB_SIZE]

    with codecs.open(VOCAB_OUTPUT, 'w', 'utf-8') as file_output:
        for word in sorted_word_list:
            file_output.write(word + '\n')

In [2]:
# Convert every sentence of the corpus into a sequence of word ids
# Which side of the parallel corpus this cell converts.
DATA_TYPE = "english"

if DATA_TYPE == "english":  # English half of the translation corpus
    RAW_DATA, VOCAB, OUTPUT_DATA = "./train.txt.en", "en.vocab", "train.en"
elif DATA_TYPE == "chinese":  # Chinese half of the translation corpus
    RAW_DATA, VOCAB, OUTPUT_DATA = "./train.txt.zh", "zh.vocab", "train.zh"

# Load the vocabulary: one entry per line, the line number is the word id.
with codecs.open(VOCAB, 'r', 'utf-8') as f_vocab:
    vocab = [entry.strip() for entry in f_vocab.readlines()]

# Map each vocabulary entry to its position in the file.
word_to_id = dict(zip(vocab, range(len(vocab))))

def get_id(word):
    """Return the id of `word` in `word_to_id`, or the id of '<unk>' if absent."""
    if word not in word_to_id:
        # Out-of-vocabulary words all share the '<unk>' id.
        return word_to_id['<unk>']
    return word_to_id[word]

# Rewrite the raw corpus as lines of space-separated word ids, appending the
# "<eos>" id to every sentence. The context manager closes both files even
# if the conversion fails part-way through.
with codecs.open(RAW_DATA, 'r', 'utf-8') as fin, \
        codecs.open(OUTPUT_DATA, 'w', 'utf-8') as fout:
    for line in fin:
        words = line.strip().split() + ["<eos>"]
        out_line = ' '.join([str(get_id(w)) for w in words]) + '\n'
        fout.write(out_line)

In [4]:
get_id("<sos>")

1

In [6]:
#padding and batching
import tensorflow as tf

# Longest sentence (counted in token ids, including the trailing <eos>) that
# the length filter in MakeSrcTrgDataset keeps.
MAX_LEN = 50
# Id prepended to each target sentence to form the decoder input; "<sos>" is
# written as the second vocabulary entry, i.e. index 1.
SOS_ID = 1

def MakeDataset(file_path):
    """Load a file of space-separated word ids as a dataset of (ids, length).

    Each text line becomes one element: an int32 vector of the ids on that
    line, paired with the number of ids it contains.
    """
    def _split_tokens(text_line):
        # string_split works on a batch of strings, so wrap the single line.
        return tf.string_split([text_line]).values

    def _to_int_ids(tokens):
        return tf.string_to_number(tokens, tf.int32)

    def _with_length(ids):
        return (ids, tf.size(ids))

    lines = tf.data.TextLineDataset(file_path)
    return lines.map(_split_tokens).map(_to_int_ids).map(_with_length)

def MakeSrcTrgDataset(src_path,trg_path,batch_size):
    """Build the padded, batched training dataset from two id files.

    Each batch has the structure
    ((src_input, src_len), (trg_input, trg_label, trg_len)), where trg_input
    is trg_label shifted right by one position with SOS_ID in front.
    Sentence pairs in which either side has length <= 1 or > MAX_LEN are
    dropped before batching.
    """
    def _both_lengths_ok(src_tuple, trg_tuple):
        # Keep a pair only when both sentence lengths lie in (1, MAX_LEN].
        _, src_len = src_tuple
        _, trg_len = trg_tuple

        def _in_range(length):
            return tf.logical_and(tf.greater(length, 1),
                                  tf.less_equal(length, MAX_LEN))

        return tf.logical_and(_in_range(src_len), _in_range(trg_len))

    def _attach_decoder_input(src_tuple, trg_tuple):
        # Decoder input = SOS_ID followed by the label without its last id.
        src_input, src_len = src_tuple
        trg_label, trg_len = trg_tuple
        trg_input = tf.concat([[SOS_ID], trg_label[:-1]], axis=0)
        return ((src_input, src_len), (trg_input, trg_label, trg_len))

    paired = tf.data.Dataset.zip((MakeDataset(src_path), MakeDataset(trg_path)))
    paired = paired.filter(_both_lengths_ok)
    paired = paired.map(_attach_decoder_input)
    paired = paired.shuffle(10000)

    variable_length_vector = tf.TensorShape([None])
    scalar = tf.TensorShape([])
    padded_shapes = (
        (variable_length_vector,   # source sentence: vector of unknown length
         scalar),                  # source sentence length: a single number
        (variable_length_vector,   # decoder input: vector of unknown length
         variable_length_vector,   # decoder target output: vector of unknown length
         scalar),                  # target sentence length: a single number
    )

    return paired.padded_batch(batch_size, padded_shapes)

  from ._conv import register_converters as _register_converters


### seq2seq模型建立

In [None]:
class NMTModel(object):
    """Encoder-decoder (seq2seq) translation model built from stacked LSTMs.

    Relies on module-level hyper-parameters that are not defined in this
    class: HIDDEN_SIZE, NUM_LAYERS, SRC_VOCAB_SIZE, TRG_VOCAB_SIZE,
    SHARE_EMB_AND_SOFTMAX, KEEP_PROB and MAX_GRAD_NORM.
    """

    def __init__(self):
        """Create the encoder/decoder cells, embeddings and softmax parameters."""
        # Encoder and decoder are separate stacks of NUM_LAYERS LSTM cells.
        self.enc_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.LSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])
        self.dec_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.LSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])

        # One embedding matrix per language.
        self.src_embedding = tf.get_variable('src_emb', [SRC_VOCAB_SIZE, HIDDEN_SIZE])
        self.trg_embedding = tf.get_variable('trg_emb', [TRG_VOCAB_SIZE, HIDDEN_SIZE])

        # Softmax projection: either tied to the target embedding (transposed)
        # or a separate [HIDDEN_SIZE, TRG_VOCAB_SIZE] matrix.
        if SHARE_EMB_AND_SOFTMAX:
            self.softmax_weight = tf.transpose(self.trg_embedding)
        else:
            self.softmax_weight = tf.get_variable("weight", [HIDDEN_SIZE, TRG_VOCAB_SIZE])
        # NOTE(review): the variable is named 'softmax_loss' although it holds
        # the softmax bias; the name is kept unchanged so the graph/checkpoint
        # variable names stay the same -- confirm before renaming.
        self.softmax_bias = tf.get_variable('softmax_loss', [TRG_VOCAB_SIZE])

    def forward(self,src_input,src_size,trg_input,trg_label,trg_size):
        """Build the training graph for one batch.

        Args:
            src_input: source word ids; axis 0 is the batch dimension.
            src_size: length of each source sentence, passed to dynamic_rnn
                as the sequence length.
            trg_input: decoder input word ids.
            trg_label: decoder target word ids; axis 1 is used as the maximum
                target length when masking the loss.
            trg_size: length of each target sentence.

        Returns:
            A tuple (cost_per_token, train_op): the masked cross-entropy
            averaged over non-padding target tokens, and the op that applies
            one clipped gradient-descent update.
        """
        batch_size = tf.shape(src_input)[0]

        # Look up the word vectors and apply dropout to both sides.
        src_emb = tf.nn.embedding_lookup(self.src_embedding, src_input)
        trg_emb = tf.nn.embedding_lookup(self.trg_embedding, trg_input)
        src_emb = tf.nn.dropout(src_emb, KEEP_PROB)
        trg_emb = tf.nn.dropout(trg_emb, KEEP_PROB)

        # The encoder reads the word vector at every source position and
        # returns the hidden state of its last step, enc_state.
        with tf.variable_scope('encoder'):
            enc_outputs, enc_state = tf.nn.dynamic_rnn(
                self.enc_cell, src_emb, src_size, dtype=tf.float32)

        # The decoder reads the target word vectors, starting from the
        # encoder's final state, and produces one output per target position.
        with tf.variable_scope('decoder'):
            dec_outputs, _ = tf.nn.dynamic_rnn(
                self.dec_cell, trg_emb, trg_size, initial_state=enc_state)

        # Per-token log-perplexity: flatten the decoder outputs, project them
        # onto the target vocabulary and compare with the labels.
        output = tf.reshape(dec_outputs, [-1, HIDDEN_SIZE])
        logits = tf.matmul(output, self.softmax_weight) + self.softmax_bias
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.reshape(trg_label, [-1]), logits=logits)

        # Zero out the loss at padding positions so they do not affect training.
        label_weights = tf.sequence_mask(
            trg_size, maxlen=tf.shape(trg_label)[1], dtype=tf.float32)
        label_weights = tf.reshape(label_weights, [-1])
        cost = tf.reduce_sum(loss * label_weights)
        cost_per_token = cost / tf.reduce_sum(label_weights)

        # Back-propagation: gradients of the per-sentence cost with respect to
        # every trainable variable, clipped by global norm.
        trainable_variables = tf.trainable_variables()
        grads = tf.gradients(cost / tf.to_float(batch_size), trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)

        # Plain gradient descent with learning rate 1.0. apply_gradients is the
        # second half of minimize(): it returns the op that updates each
        # variable with its (clipped) gradient.
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
        train_op = optimizer.apply_gradients(zip(grads, trainable_variables))
        return cost_per_token, train_op