In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

In [2]:
# 1. preprocessing data
# 2. build model
# 2.1 encoder
# 2.2 attention
# 2.3 decoder
# 2.4 loss & optimizer
# 2.5 train
# 3. evaluation
# 3.1 given a sentence, return the translated result
# 3.2 visualize results (attention)

In [1]:
# Path of the tab-separated English/Spanish sentence-pair file that is later
# passed to parse_data().
en_spa_file_path = './spa-eng/spa.txt'

import unicodedata
def unicode_to_ascii(s):
    """Strip combining accent marks from ``s``.

    The text is first decomposed with NFD normalisation, which splits an
    accented character into its base character followed by combining marks.
    Every character whose Unicode category is ``Mn`` (nonspacing mark) is then
    dropped. Characters that have no decomposition (for example ``¿``) are
    returned unchanged, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Sample sentence pair used to sanity-check the text helpers.
en_sentence = 'Then what?'
sp_sentence = '¿Entonces qué?'

# Show both sentences after accent stripping, one per line.
print(unicode_to_ascii(en_sentence), unicode_to_ascii(sp_sentence), sep='\n')

T h e n   w h a t ?
¿ E n t o n c e s   q u e ?


In [3]:
import re

def preprocess_sentence(s):
    """Normalise a raw sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps: lower-case and trim, strip accents with ``unicode_to_ascii``, put
    spaces around ``? . ! , ¿``, replace everything that is neither an ASCII
    letter nor one of those punctuation marks with a space, and finally join
    the remaining tokens with exactly one space each.

    Args:
        s: the raw sentence.

    Returns:
        The cleaned sentence in the form ``'<start> tok1 tok2 ... <end>'``.
        Tokens are always separated by a single space, so ``split(' ')`` never
        yields empty tokens.
    """
    s = unicode_to_ascii(s.lower().strip())
    # Surround punctuation with spaces so that each mark becomes its own token.
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    # Collapse runs of spaces (and double quotes) into a single space.
    s = re.sub(r'[" "]+', " ", s)
    # Replace each *run* of disallowed characters with one space. Without the
    # `+`, input such as "a 2 b" turned into "a   b" (three spaces).
    s = re.sub(r'[^a-zA-Z?.!,¿]+', " ", s)
    # split()/join drops leading/trailing blanks and any leftover double space.
    return ' '.join(['<start>'] + s.split() + ['<end>'])

# Show the fully preprocessed versions of the two sample sentences, one per line.
print(*map(preprocess_sentence, (en_sentence, sp_sentence)), sep='\n')

<start> t h e n w h a t ? <end>
<start> ¿ e n t o n c e s q u e ? <end>


In [4]:
def parse_data(file_name):
    """Read a tab-separated parallel corpus and preprocess both languages.

    Each line is expected to hold an English sentence and a Spanish sentence
    separated by a tab. Any further tab-separated fields on a line (e.g. an
    attribution column, if the file has one) are ignored.

    Args:
        file_name: path of the UTF-8 encoded corpus file.

    Returns:
        A ``zip`` iterator that yields two tuples: all preprocessed English
        sentences, then all preprocessed Spanish sentences.
    """
    # The context manager closes the file handle even if reading fails.
    with open(file_name, encoding='UTF-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')
    # Keep only the first two columns so extra trailing fields cannot break
    # the (en, sp) unpacking below.
    sentence_pairs = [line.split('\t')[:2] for line in lines]
    preprocessed_sentence_pairs = [
        (preprocess_sentence(en), preprocess_sentence(sp)) for en, sp in sentence_pairs
    ]
    return zip(*preprocessed_sentence_pairs)

# parse_data() yields two parallel tuples: English sentences, then Spanish ones.
en_dataset, sp_dataset = parse_data(en_spa_file_path)
# Peek at the last sentence pair of the corpus.
print(en_dataset[-1], sp_dataset[-1], sep='\n')

<start> i f y o u w a n t t o s o u n d l i k e a n a t i v e s p e a k e r , y o u m u s t b e w i l l i n g t o p r a c t i c e s a y i n g t h e s a m e s e n t e n c e o v e r a n d o v e r i n t h e s a m e w a y t h a t b a n j o p l a y e r s p r a c t i c e t h e s a m e p h r a s e o v e r a n d o v e r u n t i l t h e y c a n p l a y i t c o r r e c t l y a n d a t t h e d e s i r e d t e m p o . <end>
<start> s i q u i e r e s s o n a r c o m o u n h a b l a n t e n a t i v o , d e b e s e s t a r d i s p u e s t o a p r a c t i c a r d i c i e n d o l a m i s m a f r a s e u n a y o t r a v e z d e l a m i s m a m a n e r a e n q u e u n m u s i c o d e b a n j o p r a c t i c a e l m i s m o f r a s e o u n a y o t r a v e z h a s t a q u e l o p u e d a n t o c a r c o r r e c t a m e n t e y e n e l t i e m p o e s p e r a d o . <end>


In [11]:
# Small demo of the zip(*pairs) "unzip" idiom used by parse_data().
a = [(1, 2), (3, 4), (5, 6)]
c, d = map(tuple, zip(*a))
print(c, d)


def tokenizer(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as padded id sequences.

    Args:
        lang: sequence of sentences whose tokens are separated by spaces.

    Returns:
        ``(tensor, lang_tokenizer)``: the id matrix, zero-padded at the end of
        each row, and the fitted tokenizer.
    """
    # No character filtering and a plain space split: the tokenizer is told to
    # take the sentences exactly as they are given.
    lang_tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=None, filters='', split=' ')
    lang_tokenizer.fit_on_texts(lang)
    # text -> ids, then pad every row at the end to a common length.
    padded_ids = keras.preprocessing.sequence.pad_sequences(
        lang_tokenizer.texts_to_sequences(lang), padding='post')
    return padded_ids, lang_tokenizer

# Only the first 30000 sentence pairs are used. Spanish is the model input,
# English is the target.
input_tensor, input_tokenizer = tokenizer(sp_dataset[:30000])
output_tensor, output_tokenizer = tokenizer(en_dataset[:30000])

def max_length(tensor):
    """Return the length of the longest row in ``tensor``."""
    return max(map(len, tensor))

# Longest row of each id matrix (input side, then output side).
max_length_input, max_length_output = max_length(input_tensor), max_length(output_tensor)
print(max_length_input, max_length_output)

from sklearn.model_selection import train_test_split
# Hold out 20% of the sentence pairs for evaluation. No random_state is
# passed, so the split is not fixed by this call.
input_train, input_eval, output_train, output_eval = train_test_split(input_tensor, output_tensor, test_size=0.2)

# Bare expression: it is only displayed when it is the last statement of a
# notebook cell.
len(input_train), len(input_eval), len(output_train), len(output_eval)

def convert(example, tokenizer):
    """Print an ``id --> token`` line for every non-zero id in ``example``.

    Args:
        example: iterable of token ids; ids equal to 0 are skipped.
        tokenizer: object exposing an ``index_word`` id-to-token mapping.
    """
    non_zero_ids = (token_id for token_id in example if token_id != 0)
    for token_id in non_zero_ids:
        print('%d --> %s' % (token_id, tokenizer.index_word[token_id]))
            
# Show the first training pair as id --> token lines: the input sentence,
# a blank separator line, then the target sentence.
convert(input_train[0], input_tokenizer)
print()
convert(output_train[0], output_tokenizer)

def make_dataset(input_tensor, output_tensor, batch_size, epochs, shuffle,
                 shuffle_buffer_size=30000):
    """Build a batched ``tf.data.Dataset`` of (input, output) pairs.

    Args:
        input_tensor: id matrix of the source sentences.
        output_tensor: id matrix of the target sentences, row-aligned with
            ``input_tensor``.
        batch_size: number of pairs per batch.
        epochs: how many times the data is repeated.
        shuffle: whether to shuffle the pairs before repeating and batching.
        shuffle_buffer_size: buffer size handed to ``Dataset.shuffle``. The
            default keeps the value that used to be hard-coded here.

    Returns:
        The dataset. Incomplete final batches are dropped, so every batch has
        exactly ``batch_size`` rows.
    """
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor, output_tensor))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    # drop_remainder=True keeps the batch dimension fixed; the encoder builds
    # its initial hidden state for a fixed batch size.
    dataset = dataset.repeat(epochs).batch(batch_size, drop_remainder=True)
    return dataset

batch_size=64
epochs=20
# Training data is shuffled and repeated `epochs` times; evaluation data is a
# single unshuffled pass.
train_dataset = make_dataset(input_train, output_train, batch_size, epochs, True)
eval_dataset = make_dataset(input_eval, output_eval, batch_size, 1, False)

# Peek at one batch. NOTE: `x` stays bound after this loop and is reused by
# the Encoder smoke test in a later cell.
for x, y in train_dataset.take(1):
    print(x.shape)

(1, 3, 5) (2, 4, 6)
57 20
37 --> <start>v
1 --> e
9 --> t
1 --> e
13 --> d
1 --> e
2 --> a
24 --> q
11 --> u
8 --> i
38 --> !<end>

39 --> <start>g
2 --> o
4 --> a
17 --> w
4 --> a
14 --> y
6 --> .<end>
(64, 57)


In [12]:
# Model hyper-parameters.
embedding_units = 256  # embedding size passed to Encoder and Decoder
units = 1024  # GRU hidden size passed to Encoder and Decoder
# The +1 presumably accounts for id 0, which is used for padding and is not
# an entry of word_index.
input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = len(output_tokenizer.word_index) + 1

In [16]:
class Encoder(keras.Model):
    """Embedding layer followed by a GRU that encodes the source sentence.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_units: size of each token embedding.
        encoding_units: hidden size of the GRU.
        batch_size: batch size used when building the initial hidden state.
    """

    def __init__(self, vocab_size, embedding_units, encoding_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_units)
        # Author's note: forget gate + input gate = 1, and the GRU is what
        # expresses this "1".
        self.gru = keras.layers.GRU(
            self.encoding_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Encode the id batch ``x`` starting from the state ``hidden``.

        Returns:
            ``(output, state)``: the GRU output at every position and the
            final GRU state.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_size, encoding_units)``."""
        state_shape = (self.batch_size, self.encoding_units)
        return tf.zeros(state_shape)
    
encoder = Encoder(input_vocab_size, embedding_units, units, batch_size)
    
# Smoke test of the encoder. NOTE: `x` is the batch left over from the
# `for x, y in train_dataset.take(1)` loop in an earlier cell.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(x, sample_hidden)
print("sample_output.shape:", sample_output.shape)
print("sample_hidden.shape:", sample_hidden.shape)

sample_output.shape: (64, 57, 1024)
sample_hidden.shape: (64, 1024)


In [19]:
'''
EO: encoder outputs at every input position
H: decoder hidden state at the current step
FC: fully connected (dense) layer
X: the decoder input at the current step
score = FC(tanh(FC(EO) + FC(H)))

attention_weights = softmax(score, axis=1)
context = sum(attention_weights * EO, axis=1)
final_input = concat(context, embed(X))
'''

class BahdanauAttention(keras.Model):
    """Additive attention: ``score = V(tanh(W1(encoder_outputs) + W2(hidden)))``.

    Args:
        units: width of the two inner projections ``W1`` and ``W2``.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)

    def call(self, decoder_hidden, encoder_outputs):
        """Compute the context vector for one decoding step.

        Args:
            decoder_hidden: (batch_size, units) decoder state.
            encoder_outputs: (batch_size, length, units) encoder outputs.

        Returns:
            ``(context_vector, attention_weights)`` with shapes
            (batch_size, units) and (batch_size, length, 1).
        """
        # (batch_size, units) -> (batch_size, 1, units) so that the sum below
        # broadcasts over the length axis.
        query = tf.expand_dims(decoder_hidden, 1)
        projected_outputs = self.W1(encoder_outputs)
        projected_query = self.W2(query)
        # (batch_size, length, 1): one unnormalised score per input position.
        score = self.V(tf.nn.tanh(projected_outputs + projected_query))
        # Normalise over the length axis.
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weighted sum of the encoder outputs over the length axis.
        context_vector = tf.reduce_sum(attention_weights * encoder_outputs, axis=1)
        return context_vector, attention_weights
    
# Smoke test of the attention layer on the encoder sample. `units=10` only
# sizes the inner W1/W2 projections.
attention_model = BahdanauAttention(units=10)
attention_results, attention_weights = attention_model(sample_hidden, sample_output)

print("attention_results.shape: ", attention_results.shape)
print("attention_weights.shape: ", attention_weights.shape)

attention_results.shape:  (64, 1024)
attention_weights.shape:  (64, 57, 1)


In [20]:
class Decoder(keras.Model):
    """GRU decoder with Bahdanau attention; each call decodes one target step.

    Args:
        vocab_size: size of the target vocabulary (embedding rows and logits).
        embedding_units: size of each token embedding.
        decoding_units: hidden size of the GRU and of the attention layer.
        batch_size: stored on the instance; not used by ``call``.
    """

    def __init__(self, vocab_size, embedding_units, decoding_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.decoding_units = decoding_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_units)
        self.gru = keras.layers.GRU(self.decoding_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
        self.fc = keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.decoding_units)

    def call(self, x, hidden, encoding_outputs):
        """Run one decoding step.

        Args:
            x: (batch_size, 1) ids of the previous target token.
            hidden: (batch_size, decoding_units) previous decoder state. It is
                used as the attention query and as the GRU's initial state, so
                its width must equal ``decoding_units``.
            encoding_outputs: (batch_size, length, units) encoder outputs.

        Returns:
            ``(output, state, attention_weights)``: logits of shape
            (batch_size, vocab_size), the new decoder state, and the attention
            weights of shape (batch_size, length, 1).
        """
        # context_vector.shape: (batch_size, units)
        context_vector, attention_weights = self.attention(hidden, encoding_outputs)

        # before embedding: x.shape: (batch_size, 1)
        # after embedding: x.shape: (batch_size, 1, embedding_units)
        x = self.embedding(x)
        combined_x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Carry the previous state into the GRU. Without initial_state the GRU
        # restarts from zeros at every step, and the returned state depends
        # only on the current input.
        # output.shape: [batch_size, 1, decoding_units]
        # state.shape: [batch_size, decoding_units]
        output, state = self.gru(combined_x, initial_state=hidden)

        # output.shape: [batch_size, decoding_units]
        output = tf.reshape(output, (-1, output.shape[2]))

        # output.shape: [batch_size, vocab_size]
        output = self.fc(output)
        return output, state, attention_weights
    
# Smoke test of the decoder: one decoding step on a dummy (batch_size, 1)
# input, using the encoder sample state and outputs.
decoder = Decoder(output_vocab_size, embedding_units, units, batch_size)
outputs = decoder(tf.random.uniform((batch_size, 1)), sample_hidden, sample_output)

decoder_output, decoder_hidden, decoder_aw = outputs
print("decoder_output.shape:", decoder_output.shape)
print("decoder_hidden.shape:", decoder_hidden.shape)
print("deoder_attention_weights.shape:",decoder_aw.shape)

decoder_output.shape: (64, 60)
decoder_hidden.shape: (64, 1024)
deoder_attention_weights.shape: (64, 57, 1)


In [22]:
optimizer = keras.optimizers.Adam()

# from_logits: the decoder's final fc layer is a plain Dense without an
# activation function, so it outputs logits; hence from_logits=True.
# reduction: how the per-example losses are aggregated ('none' keeps them
# separate so that loss_function can mask them).
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy for one decoding step with padding positions zeroed out.

    Args:
        real: target token ids; id 0 is treated as padding.
        pred: logits predicted for those targets.

    Returns:
        The mean of the masked per-example losses. Masked positions count as
        zero loss but still count in the mean's denominator.
    """
    # Fix: the module is `tf.math`; `tf.match` does not exist and raised
    # AttributeError on the first call.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    # Cast the boolean mask to the loss dtype so the two can be multiplied.
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [23]:
@tf.function
def train_step(inp, targ, encoding_hidden):
    """Run one optimisation step on a batch with teacher forcing.

    Args:
        inp: batch of source id sequences.
        targ: batch of target id sequences (first axis batch, second axis time).
        encoding_hidden: initial hidden state for the encoder.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0
    with tf.GradientTape() as tape:
        encoding_outputs, encoding_hidden = encoder(inp, encoding_hidden)
        decoding_hidden = encoding_hidden

        # eg: <start> I am here <end>
        # 1. <start> -> I
        # 2. I -> am
        # 3. am -> here
        # 4. here -> <end>
        # Teacher forcing: the true token at step t is the decoder input and
        # the true token at step t + 1 is the prediction target.
        for t in range(0, targ.shape[1] - 1):
            decoding_input = tf.expand_dims(targ[:, t], 1)
            predictions, decoding_hidden, _ = decoder(decoding_input, decoding_hidden, encoding_outputs)
            loss += loss_function(targ[:, t + 1], predictions)
    # loss_function already averages over the batch, so the accumulated loss
    # is normalised by the sequence length (axis 1), not by the batch size.
    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
         

In [None]:
epochs = 10
# One epoch is one pass over the training split. train_dataset is built from
# input_train, so the step count uses its length rather than the full corpus.
steps_per_epoch = len(input_train) // batch_size

for epoch in range(epochs):
    start = time.time()

    # The encoder starts every epoch from an all-zero hidden state.
    encoding_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    # Fix: the variable is `steps_per_epoch`; `steps_per_epochs` raised NameError.
    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, encoding_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))

    print('Epoch {} Loss {:.4f}'.format(epoch+1, total_loss / steps_per_epoch))

    print('Time take for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
def evaluate(input_sentence):
    """Greedily translate one sentence and record the attention weights.

    Args:
        input_sentence: raw source sentence.

    Returns:
        ``(results, input_sentence, attention_matrix)``: the predicted tokens
        joined by spaces (with a trailing space), the preprocessed input
        sentence, and a (max_length_output, max_length_input) array whose row
        ``t`` holds the attention weights of decoding step ``t``.

    Raises:
        KeyError: if a token of the sentence is not in ``input_tokenizer``.
    """
    attention_matrix = np.zeros((max_length_output, max_length_input))
    input_sentence = preprocess_sentence(input_sentence)
    # Skip empty tokens that repeated spaces would produce.
    inputs = [input_tokenizer.word_index[token]
              for token in input_sentence.split(' ') if token]
    # The keyword is `maxlen`; `max_length` is not accepted by pad_sequences.
    inputs = keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_input, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    results = ''
    # A batch of one sentence, so the state is built here rather than with
    # encoder.initialize_hidden_state(), which uses the training batch size.
    encoding_hidden = tf.zeros((1, units))
    encoding_outputs, encoding_hidden = encoder(inputs, encoding_hidden)
    decoding_hidden = encoding_hidden

    # eg:  <start> -> A
    # A -> B -> C -> D
    # decoding_input.shape: (1, 1)      (batch_size, input_length)
    decoding_input = tf.expand_dims([output_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_output):
        predictions, decoding_hidden, attention_weights = decoder(
            decoding_input, decoding_hidden, encoding_outputs)

        # attention_weights.shape: (batch_size, input_length, 1)
        # Flatten to 1-D so it can be assigned to one row of the matrix.
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_matrix[t] = attention_weights.numpy()

        # predictions.shape: (batch_size, vocab_size)
        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = output_tokenizer.index_word[predicted_id]
        results += predicted_word + ' '
        if predicted_word == '<end>':
            break

        # Feed the prediction back in as the next decoder input.
        decoding_input = tf.expand_dims([predicted_id], 0)

    return results, input_sentence, attention_matrix

In [None]:
def plot_attention(attention_matrix, input_sentence, predicted_sentence):
    """Show the attention weights as a heat map.

    Args:
        attention_matrix: 2-D array; rows are output steps, columns are input
            positions.
        input_sentence: list of input tokens, used as x tick labels.
        predicted_sentence: list of predicted tokens, used as y tick labels.
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    # Fix: the colormap name had a trailing space, which matplotlib rejects.
    ax.matshow(attention_matrix, cmap='viridis')
    font_dict = {'fontsize': 14}
    # Pin one tick per token before labelling; otherwise the labels are
    # attached to whatever ticks the default locator happens to choose.
    ax.set_xticks(range(len(input_sentence)))
    ax.set_yticks(range(len(predicted_sentence)))
    ax.set_xticklabels(input_sentence, fontdict=font_dict, rotation=90)
    ax.set_yticklabels(predicted_sentence, fontdict=font_dict)
    plt.show()
    
def translate(input_sentence):
    """Translate ``input_sentence``, print the result and plot the attention.

    Args:
        input_sentence: raw source sentence.
    """
    results, input_sentence, attention_matrix = evaluate(input_sentence)
    print("Input: %s" % (input_sentence))
    print("Predicted translation: %s" % results)

    # split() without an argument drops the empty token caused by the
    # trailing space in `results`, which used to add an all-zero row and a
    # blank label to the plot.
    input_tokens = input_sentence.split()
    predicted_tokens = results.split()
    # Keep only the rows and columns that correspond to real tokens.
    attention_matrix = attention_matrix[:len(predicted_tokens), :len(input_tokens)]
    plot_attention(attention_matrix, input_tokens, predicted_tokens)

In [None]:
translate(u'xxxxxxxxxxxxxxxxxxxxxxxx')  # placeholder: put the Spanish sentence to translate here