# Machine Translation 
  - Author: Sabrina Li


In [0]:
!pip install -q tensorflow-gpu==2.0.0-alpha0

[K    100% |████████████████████████████████| 332.1MB 45kB/s 
[K    100% |████████████████████████████████| 3.0MB 6.8MB/s 
[K    100% |████████████████████████████████| 419kB 11.3MB/s 
[K    100% |████████████████████████████████| 61kB 22.0MB/s 
[?25h

In [0]:
!pip install sacrebleu # https://github.com/mjpost/sacreBLEU

Collecting sacrebleu
  Downloading https://files.pythonhosted.org/packages/12/5b/7196b11bca204cb6ca9000b5dc910e809081f224c73ef28e9991080e4e51/sacrebleu-1.3.1.tar.gz
Building wheels for collected packages: sacrebleu
  Building wheel for sacrebleu (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/56/c0/fb/1c7f9b3a71f64cdf86291cc645596f71746807bf2f72b3c1dd
Successfully built sacrebleu
Installing collected packages: sacrebleu
Successfully installed sacrebleu-1.3.1


In [0]:
import numpy as np
import re
import sacrebleu
import tensorflow as tf
import time
import unicodedata
import os
import io

## 1. Train a model to translate from English to Spanish.

In [0]:
# Download the file
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [0]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

    w = w.rstrip().strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [0]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
    np.random.seed(123)
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    lines = np.reshape(np.array(lines), (int(len(lines)/2), 2))
    indexes = np.random.choice(lines.shape[0], num_examples)
    selected_lines = np.array([lines[i] for i in indexes]).flatten()
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in selected_lines]
    return word_pairs, zip(*word_pairs)

In [0]:
num_sample = 3000
sentences, (en, sp) = create_dataset(path_to_file, num_sample)
print(en[-1])
print(sp[-1])

<start> over ten thousand messages are sent every second on facebook . <end>
<start> mas de diez mil mensajes son enviados cada segundo en facebook . <end>


In [0]:
#print("Original:", sentences[0])
#sentences = [(preprocess_sentence(source), preprocess_sentence(target)) for (source, target) in sentences]
#print("Preprocessed:", sentences[0])

In [0]:
source_sentences, target_sentences = en, sp

In [0]:
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
source_tokenizer.fit_on_texts(source_sentences)
source_data = source_tokenizer.texts_to_sequences(source_sentences)
print("Sequence:", source_data[0])
source_data = tf.keras.preprocessing.sequence.pad_sequences(source_data, padding='post')
print("Padded:", source_data[0])

Sequence: [1, 27, 12, 39, 25, 678, 3, 23, 15, 91, 9, 213, 3, 2]
Padded: [  1  27  12  39  25 678   3  23  15  91   9 213   3   2   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]


In [0]:
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_sentences)
target_data = target_tokenizer.texts_to_sequences(target_sentences)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')

In [0]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels = np.zeros(target_data.shape)
target_labels[:,0:target_data.shape[1] -1] = target_data[:,1:]

print("Target sequence", target_data[0])
print("Target label", target_labels[0])

Target sequence [  1  26   8  13  23 597   3  13  53  22 529   3   2   0   0   0   0   0
   0   0   0   0   0   0   0   0]
Target label [ 26.   8.  13.  23. 597.   3.  13.  53.  22. 529.   3.   2.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]


In [0]:
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [0]:
def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
decode(source_data[0], source_tokenizer)

1 -> <start>
27 -> mary
12 -> is
39 -> not
25 -> my
678 -> girlfriend
3 -> .
23 -> she
15 -> s
91 -> just
9 -> a
213 -> friend
3 -> .
2 -> <end>


In [0]:
batch_size = 5
dataset = tf.data.Dataset.from_tensor_slices((source_data, target_data, target_labels)).batch(batch_size)

In [0]:
example_batch = next(iter(dataset))
source, target, taget_labels = example_batch
print("Shapes:", source.shape, target.shape, taget_labels.shape)

Shapes: (5, 29) (5, 26) (5, 26)


In [0]:
embedding_size = 32
rnn_size = 64

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self):
    super(Encoder, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))

Demonstrate calling the encoder.

In [0]:
# Create a batch of one sentence
ex_sentence = tf.expand_dims(source_data[0], axis=0)
ex_translation = tf.expand_dims(target_data[0], axis=0)
ex_labels = tf.expand_dims(target_labels[0], axis=0)
print(ex_sentence.shape)

encoder = Encoder()
hidden_state = encoder.init_state(batch_size=1)
print(hidden_state.shape)

output, hidden_state = encoder(ex_sentence, hidden_state)
print(output.shape)

(1, 29)
(1, 64)
(1, 29, 64)


In [0]:
class Decoder(tf.keras.Model):
  def __init__(self):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state

Demonstrate calling the decoder.

In [0]:
decoder = Decoder()
decoder_output, decoder_state = decoder(ex_labels, hidden_state)
print(decoder_output.shape)

(1, 26, 5424)


In [0]:
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)

print("Loss", calc_loss(ex_labels, decoder_output))

Loss tf.Tensor(3.9689026, shape=(), dtype=float32)


In [0]:
def translate(idx=None, input_sent=None):
  
    if idx == None and input_sent == None: 
      idx = np.random.choice(len(sentences))
      out_input_sent = sentences[idx][0]
      input_sent = source_data[idx]
      out_target_sent = sentences[idx][1]
    elif idx == None and input_sent != None:
      out_target_sent = "NOT AVAILABLE"
      out_input_sent = input_sent
      #print(out_input_sent)
      source_data2 = source_tokenizer.texts_to_sequences(out_input_sent)
      #print(source_data2)
      source_data2 = tf.keras.preprocessing.sequence.pad_sequences(source_data2, padding='post')
      input_sent = source_data2
      input_sent = np.array(input_sent).reshape(len(input_sent),)
      #print(input_sent)
    else:
      out_input_sent = sentences[idx][0]
      input_sent = source_data[idx]
      out_target_sent = sentences[idx][1]
    #input_sent = source_data[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder.init_state(batch_size=1)
    output, hidden_state = encoder(input_sent, hidden_state)
    
    decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    out_words = []
    
    decoder_state = hidden_state

    while True:
      
        decoder_output, decoder_state = decoder(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return out_input_sent, out_target_sent, translation

In [0]:
input_sent, target_sent, translation = translate()
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Input: <start> that s not how the world works . <end>
Target: <start> asi no es como funciona el mundo . <end>
Translation: verme refran comida on amas acuerdo entregale favor charlando apoderan velas averguenza metro papa encontramos moriras perdio darle notara jazz



In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function # remove this annotation when debugging
def train_step(source_seq, target_seq, target_labels, initial_state):
  
  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder(source_seq, initial_state)
    logits, decoder_state = decoder(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [0]:
EPOCHS = 400

for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states = encoder.init_state(batch_size)
    
    for batch, (source_seq, target_seq, target_labels) in enumerate(dataset):
      #print(source_seq.shape, target_seq.shape, target_labels.shape)
      loss = train_step(source_seq, target_seq, target_labels, en_initial_states)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
      input_sent, target_sent, translation = translate()
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Epoch #0, Loss 2.0388, Time 13.94 sec
Input: <start> the king reigned over the island . <end>
Target: <start> el rey reino sobre la isla . <end>
Translation: tom que no que no que no que no que no que no que no que no que no que

Epoch #10, Loss 1.1885, Time 10.71 sec
Input: <start> she was indignant at the way she had been treated . <end>
Target: <start> ella se indigno por la forma en que fue tratada . <end>
Translation: ella se quedo a mary . <end>

Epoch #20, Loss 0.8260, Time 10.68 sec
Input: <start> the statue of liberty is the symbol of the united states . <end>
Target: <start> la estatua de la libertad es el simbolo de america . <end>
Translation: la estatua de la libertad es el simbolo de america . <end>

Epoch #30, Loss 0.5542, Time 10.69 sec
Input: <start> the pot is boiling over . <end>
Target: <start> la olla esta hirviendo . <end>
Translation: la vida es un buen extranjero por nuestro pais . <end>

Epoch #40, Loss 0.4851, Time 10.72 sec
Input: <start> when did you see her

Calculate BLEU Score.

In [0]:
references, hypotheses = [], []

for i in range(len(sentences)):
  input_sent, target_sent, translation = translate()
  references.append(target_sent)
  hypotheses.append("<start> " + translation)
  
results = sacrebleu.raw_corpus_bleu(hypotheses, [references])
print(results)

### 2. Train a second model to translate between the same two languages in reverse order (from Spanish to English).

In [0]:
num_sample = 3000
sentences, (en, sp) = create_dataset(path_to_file, num_sample)
print(en[-1])
print(sp[-1])

In [0]:
source_sentences_re, target_sentences_re = sp, en

In [0]:
source_tokenizer_re = tf.keras.preprocessing.text.Tokenizer(filters='')
source_tokenizer_re.fit_on_texts(source_sentences_re)
source_data_re = source_tokenizer_re.texts_to_sequences(source_sentences_re)
print("Sequence:", source_data_re[1])
source_data_re = tf.keras.preprocessing.sequence.pad_sequences(source_data_re, padding='post')
print("Padded:", source_data_re[1])

In [0]:
target_tokenizer_re = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer_re.fit_on_texts(target_sentences_re)
target_data_re = target_tokenizer_re.texts_to_sequences(target_sentences_re)
target_data_re = tf.keras.preprocessing.sequence.pad_sequences(target_data_re, padding='post')

In [0]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels_re = np.zeros(target_data_re.shape)
target_labels_re[:,0:target_data_re.shape[1] -1] = target_data_re[:,1:]

print("Target sequence", target_data_re[0])
print("Target label", target_labels_re[0])

In [0]:
source_vocab_size_re = len(source_tokenizer_re.word_index) + 1
target_vocab_size_re = len(target_tokenizer_re.word_index) + 1

In [0]:
def decode_re(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
decode_re(source_data_re[0], source_tokenizer_re)

In [0]:
batch_size = 5
target_labels_re = tf.convert_to_tensor(target_labels_re)
dataset_re = tf.data.Dataset.from_tensor_slices((source_data_re, target_data_re,target_labels_re)).batch(batch_size)

In [0]:
example_batch = next(iter(dataset_re))
source, target, taget_labels = example_batch
print("Shapes:", source.shape, target.shape, taget_labels.shape)

In [0]:
embedding_size_re = 32
rnn_size_re = 64

In [0]:
class Encoder_re(tf.keras.Model):
  def __init__(self):
    super(Encoder_re, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size_re,
                                               embedding_size_re)
    self.gru = tf.keras.layers.GRU(rnn_size_re, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size_re))

Demonstrate calling the encoder.

In [0]:
# Create a batch of one sentence
ex_sentence_re = tf.expand_dims(source_data_re[0], axis=0)
ex_translation_re = tf.expand_dims(target_data_re[0], axis=0)
ex_labels_re = tf.expand_dims(target_labels_re[0], axis=0)
print(ex_sentence_re.shape)

encoder_re = Encoder_re()
hidden_state_re = encoder.init_state(batch_size=1)
print(hidden_state_re.shape)

output_re, hidden_state_re = encoder(ex_sentence_re, hidden_state_re)
print(output_re.shape)

In [0]:
class Decoder_re(tf.keras.Model):
  def __init__(self):
    super(Decoder_re, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size_re, 
                                               embedding_size_re)
    self.gru = tf.keras.layers.GRU(rnn_size_re, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size_re)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state

Demonstrate calling the decoder.

In [0]:
decoder_re = Decoder_re()
decoder_output_re, decoder_state_re = decoder_re(ex_labels_re, hidden_state_re)
print(decoder_output_re.shape)

In [0]:
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)

print("Loss", calc_loss(ex_labels_re, decoder_output_re))

In [0]:
def translate_re(idx=None, input_sent=None):
  
    if idx == None and input_sent == None: 
      idx = np.random.choice(len(sentences))
      input_sent = source_data_re[idx]
      out_input_sent = sentences[idx][1]
      out_target_sent = sentences[idx][0]
    elif idx == None and input_sent != None:
      out_input_sent = input_sent
      #input_sent = input_sent.strip('<start>')
      #input_sent = input_sent.strip('<end>')
      
      input_sent = preprocess_sentence(input_sent)
      #print(input_sent.split(' '))
      inputs = [source_tokenizer_re.word_index[i] for i in input_sent.split(' ')[2:-2]]
      input_sent = tf.keras.preprocessing.sequence.pad_sequences([inputs], padding='post')
      #input_sent = tf.convert_to_tensor(inputs)
      input_sent = input_sent[0]
      out_target_sent = "UNKNOWN"
      
      
      '''
      idx = np.random.choice(len(sentences))
      
      source_data2 = source_tokenizer_re.texts_to_sequences(input_sent)
      source_data2 = tf.keras.preprocessing.sequence.pad_sequences(source_data2, padding='post')
      input_sent = source_data2
      input_sent = np.array(input_sent).reshape(len(input_sent),)
      #print(input_sent)
      '''
      
      #out_target_sent = sentences[idx][0]     
    else:
      out_input_sent = sentences[idx][1]
      #source_data2 = source_tokenizer_re.texts_to_sequences(out_input_sent)
      #source_data2 = tf.keras.preprocessing.sequence.pad_sequences(source_data2, padding='post')
      input_sent = source_data_re[idx]
      #print(input_sent)
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder_re.init_state(batch_size=1)
    output, hidden_state = encoder_re(input_sent, hidden_state)
    decoder_input = tf.expand_dims([target_tokenizer_re.word_index['<start>']], 0)
    out_words = []
    
    decoder_state = hidden_state

    while True:
      
        decoder_output, decoder_state = decoder_re(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer_re.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)
    
    
    
    return out_input_sent, out_target_sent, translation

In [0]:
input_sent, target_sent, translation = translate_re()
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function # remove this annotation when debugging
def train_step_re(source_seq_re, target_seq_re, target_labels_re, initial_state_re):
  
  with tf.GradientTape() as tape:
    encoder_output_re, encoder_state_re = encoder_re(source_seq_re, initial_state_re)
    logits, decoder_state_re = decoder_re(target_seq_re, encoder_state_re)
    loss = calc_loss(target_labels_re, logits)

  variables = encoder_re.trainable_variables + decoder_re.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [0]:
EPOCHS = 400

for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states_re = encoder_re.init_state(batch_size)
    
    for batch, (source_seq_re, target_seq_re, target_labels_re) in enumerate(dataset_re):
      #print(source_seq.shape, target_seq.shape, target_labels.shape)
      loss = train_step_re(source_seq_re, target_seq_re, target_labels_re, en_initial_states_re)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
      input_sent, target_sent, translation = translate_re()
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Calculate BLEU score

In [0]:
references, hypotheses = [], []

for i in range(len(sentences)):
  input_sent, target_sent, translation = translate_re()
  references.append(target_sent)
  hypotheses.append("<start> " + translation)
  
results = sacrebleu.raw_corpus_bleu(hypotheses, [references])
print(results)

### 3. Back-translate. Use the previous two models to translate a sentence from English to Spanish, and then back to English. Compare the original sentence, and the back-translated sentence. Repeat this using an evaluation corpus of 1,000 sentences, and report the BLEU score.

In [0]:
sentences = np.array(sentences)
selected_index = np.random.randint(sentences.shape[0], size=100)


In [0]:
references, translations, hypotheses = [], [], []

for i in selected_index:
  #print(sentences[i][0])
  input_sent, target_sent, translation = translate(i)
  print('input:   ', input_sent)
  translation = "<start>" + translation
  references.append(input_sent)
  translations.append(translation)
  #print('translation: ', translation)


for tr in translations:

  
  input_sent, target_sent, new_translation = translate_re(input_sent= tr)
  #print('input from translation_re: ', input_sent)
  #print('translation from translation_re: ', translation)
  print('new translation from translation_re: ', new_translation)
  hypotheses.append("<start> " + new_translation + " <end>)
  
results = sacrebleu.raw_corpus_bleu(hypotheses, [references])
print(results)

