In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd


ModuleNotFoundError: No module named 'pandas'

In [1]:
def multihead_attention(queries,
                        keys,
                        num_units=None,
                        num_heads=8,
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention",
                        reuse=None):
    '''Applies multihead scaled dot-product attention, then a residual add and normalization.

    `keys` serves as both the keys and the values. Q, K and V are produced by
    ReLU-activated dense projections, split into `num_heads` heads along the
    channel axis, attended per head, and concatenated back.

    Args
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k].
      num_units: A scalar. Attention size. When None, falls back to the
        statically known last dimension of `queries` (C_q). Must be divisible
        by `num_heads` (required by `tf.split`) and shape-compatible with
        `queries` for the residual addition.
      num_heads: An int. Number of heads.
      dropout_rate: A floating point number. Dropout rate applied to the
        attention weights.
      is_training: Boolean. Controller of mechanism for dropout.
      causality: Boolean. If true, units that reference the future are masked.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns
      A 3d tensor with shape of (N, T_q, C)
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units.
        # `as_list` is a method: it has to be called before indexing, otherwise
        # this branch raises TypeError ('method' object is not subscriptable).
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]

        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)

        # Split and concat: heads are stacked along the batch axis
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h)

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k)

        # Scale by sqrt of the per-head depth
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)

        # Key Masking: key positions whose feature vector sums to zero get a
        # mask value of 0 and are excluded from the softmax below.
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k)

        # Very large negative logits become ~0 after the softmax.
        paddings = tf.ones_like(outputs)*(-2**32+1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k)

        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k)
            # Lower-triangular matrix of ones: position i may only see keys j <= i.
            tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k)

            paddings = tf.ones_like(masks)*(-2**32+1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k)

        # Activation
        outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k)

        # Query Masking: zero the attention rows of query positions whose
        # feature vector sums to zero.
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k)
        outputs *= query_masks # element-wise. (h*N, T_q, T_k)

        # Dropouts
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))

        # Weighted sum
        outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h)

        # Restore shape: undo the head stacking done in "Split and concat"
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C)

        # Residual connection
        outputs += queries

        # Normalize
        # NOTE(review): `normalize` is not defined in this cell; presumably a
        # layer-normalization helper defined elsewhere - confirm.
        outputs = normalize(outputs) # (N, T_q, C)

    return outputs

## Encoder

In [None]:
with tf.variable_scope("encoder"):
    # Embedding: look up vectors for the source ids in self.x,
    # with the vocabulary size taken from de2idx.
    self.enc = embedding(
        self.x,
        vocab_size=len(de2idx),
        num_units=hp.hidden_units,
        scale=True,
        scope="enc_embed",
    )

    # Positional encoding: add the position vectors to the embeddings.
    position_signal = positional_encoding(
        self.x,
        num_units=hp.hidden_units,
        zero_pad=False,
        scale=False,
        scope="enc_pe",
    )
    self.enc += position_signal

    # Dropout on the input vectors to reduce overfitting.
    training_flag = tf.convert_to_tensor(is_training)
    self.enc = tf.layers.dropout(
        self.enc,
        rate=hp.dropout_rate,
        training=training_flag,
    )

    # Blocks: a stack of hp.num_blocks identical blocks, each in its own scope.
    for block_idx in range(hp.num_blocks):
        block_scope = "num_blocks_{}".format(block_idx)
        with tf.variable_scope(block_scope):
            # Multihead attention with the encoder state as both queries and keys.
            block_input = self.enc
            self.enc = multihead_attention(
                queries=block_input,
                keys=block_input,
                num_units=hp.hidden_units,
                num_heads=hp.num_heads,
                dropout_rate=hp.dropout_rate,
                is_training=is_training,
                causality=False,
            )

            # Feed forward: followed by a fully connected layer
            # (feedforward helper is defined elsewhere).
            ffn_units = [4*hp.hidden_units, hp.hidden_units]
            self.enc = feedforward(self.enc, num_units=ffn_units)

## Decoder

In [None]:
# Decoder
with tf.variable_scope("decoder"):
    # Embedding: look up vectors for the ids in self.decoder_inputs,
    # with the vocabulary size taken from en2idx.
    self.dec = embedding(
        self.decoder_inputs,
        vocab_size=len(en2idx),
        num_units=hp.hidden_units,
        scale=True,
        scope="dec_embed",
    )

    # Positional encoding: add the position vectors to the embeddings.
    # NOTE(review): unlike the encoder's positional_encoding call, this one
    # also passes vocab_size=hp.maxlen. The helper is not visible here -
    # confirm that it accepts this keyword.
    position_signal = positional_encoding(
        self.decoder_inputs,
        vocab_size=hp.maxlen,
        num_units=hp.hidden_units,
        zero_pad=False,
        scale=False,
        scope="dec_pe",
    )
    self.dec += position_signal

    # Dropout on the decoder input vectors.
    training_flag = tf.convert_to_tensor(is_training)
    self.dec = tf.layers.dropout(
        self.dec,
        rate=hp.dropout_rate,
        training=training_flag,
    )

    # Blocks: a stack of hp.num_blocks blocks, each in its own scope.
    for block_idx in range(hp.num_blocks):
        block_scope = "num_blocks_{}".format(block_idx)
        with tf.variable_scope(block_scope):
            # Self-attention: decoder state is both queries and keys;
            # causality=True masks units that reference the future.
            self_attn_input = self.dec
            self.dec = multihead_attention(
                queries=self_attn_input,
                keys=self_attn_input,
                num_units=hp.hidden_units,
                num_heads=hp.num_heads,
                dropout_rate=hp.dropout_rate,
                is_training=is_training,
                causality=True,
                scope="self_attention",
            )

            # Vanilla attention: decoder state queries the encoder output (self.enc).
            self.dec = multihead_attention(
                queries=self.dec,
                keys=self.enc,
                num_units=hp.hidden_units,
                num_heads=hp.num_heads,
                dropout_rate=hp.dropout_rate,
                is_training=is_training,
                causality=False,
                scope="vanilla_attention",
            )

            # Feed forward (feedforward helper is defined elsewhere).
            ffn_units = [4*hp.hidden_units, hp.hidden_units]
            self.dec = feedforward(self.dec, num_units=ffn_units)