In [2]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import tensorflow as tf
import unicodedata
from tqdm import tqdm_notebook
print(tf.__version__)

2.1.0


In [4]:
!wget http://www.manythings.org/anki/fra-eng.zip
!unzip fra-eng.zip

--2020-03-09 10:13:11--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:3033::6818:6dc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5939832 (5.7M) [application/zip]
Saving to: ‘fra-eng.zip’


2020-03-09 10:13:12 (21.1 MB/s) - ‘fra-eng.zip’ saved [5939832/5939832]

Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


In [0]:
df = pd.read_csv('/content/fra.txt', sep='\t', header=None)
df.columns = ['eng', 'fra', 'attr']

In [6]:
df.head()

Unnamed: 0,eng,fra,attr
0,Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Salut !,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Hi.,Salut.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Run!,Cours !,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run!,Courez !,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [0]:
df = df[['eng', 'fra']]

In [8]:
df.head()

Unnamed: 0,eng,fra
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


### Clear text

In [0]:
def unicode_to_ascii(sentence):
  return ''.join(char for char in unicodedata.normalize('NFD', sentence) if unicodedata.category(char) != 'Mn')

def clean_text(sentence):
  s = unicode_to_ascii(sentence.lower().strip())
  s = re.sub(r'([!.,?])', r' \1 ', s)
  s = re.sub(r'[^a-zA-Z!.,?]+', r' ', s)
  s = re.sub(r'[\s]+', ' ', s)
  s = '<start> ' + s + ' <end>'
  return s

In [10]:
test_sentence = 'helllo!  its   me.'

print(clean_text(test_sentence))

<start> helllo ! its me .  <end>


In [0]:
df['eng_clean'] = df['eng'].apply(clean_text)
df['fra_clean'] = df['fra'].apply(clean_text)

In [12]:
df.head()

Unnamed: 0,eng,fra,eng_clean,fra_clean
0,Go.,Va !,<start> go . <end>,<start> va ! <end>
1,Hi.,Salut !,<start> hi . <end>,<start> salut ! <end>
2,Hi.,Salut.,<start> hi . <end>,<start> salut . <end>
3,Run!,Cours !,<start> run ! <end>,<start> cours ! <end>
4,Run!,Courez !,<start> run ! <end>,<start> courez ! <end>


In [0]:
eng_data = df['eng_clean'].tolist()
fra_data = df['fra_clean'].tolist()

In [0]:
def lang_tokenize(data):
  lang_token = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_token.fit_on_texts(data)

  convert_data = lang_token.texts_to_sequences(data)
  return convert_data, lang_token

In [0]:
eng_token_data, eng_token = lang_tokenize(eng_data)
fra_token_data, fra_token = lang_tokenize(fra_data)

In [16]:
# Get # of vocabulary

eng_vocab_size = len(eng_token.word_index) + 1
fra_vocab_size = len(fra_token.word_index) + 1

print(eng_vocab_size)
print(fra_vocab_size)

13860
22791


In [0]:
def max_seq_length(data):
  return max([len(seq) for seq in data])

In [18]:
eng_maxlen = max_seq_length(eng_token_data)
fra_maxlen = max_seq_length(fra_token_data)


print(eng_maxlen)
print(fra_maxlen)

54
65


In [0]:
def padding_data(data):
  maxlen = max_seq_length(data)
  return tf.keras.preprocessing.sequence.pad_sequences(data, maxlen=maxlen, padding='post')

In [0]:
eng_token_data = padding_data(eng_token_data)
fra_token_data = padding_data(fra_token_data)

In [21]:
eng_token_data

array([[   1,   49,    3, ...,    0,    0,    0],
       [   1, 2658,    3, ...,    0,    0,    0],
       [   1, 2658,    3, ...,    0,    0,    0],
       ...,
       [   1,  365,   51, ...,    0,    0,    0],
       [   1,   69,  280, ...,    3,    2,    0],
       [   1,   14,  175, ..., 3418,    3,    2]], dtype=int32)

In [22]:
fra_token_data

array([[   1,  123,   38, ...,    0,    0,    0],
       [   1, 3538,   38, ...,    0,    0,    0],
       [   1, 3538,    3, ...,    0,    0,    0],
       ...,
       [   1, 7296,   12, ...,    0,    0,    0],
       [   1,   60,  175, ..., 2108,    3,    2],
       [   1,   12,    9, ...,    2,    0,    0]], dtype=int32)

In [0]:
eng_input = eng_token_data
fra_input = fra_token_data
fra_output = np.zeros(shape=fra_input.shape, dtype=np.int)
fra_output[:, :-1] = fra_input[:, 1:]

In [24]:
print(eng_input.shape)
print(eng_input)
print('==========')
print(fra_input.shape)
print(fra_input)
print('==========')
print(fra_output.shape)
print(fra_output)

(174481, 54)
[[   1   49    3 ...    0    0    0]
 [   1 2658    3 ...    0    0    0]
 [   1 2658    3 ...    0    0    0]
 ...
 [   1  365   51 ...    0    0    0]
 [   1   69  280 ...    3    2    0]
 [   1   14  175 ... 3418    3    2]]
(174481, 65)
[[   1  123   38 ...    0    0    0]
 [   1 3538   38 ...    0    0    0]
 [   1 3538    3 ...    0    0    0]
 ...
 [   1 7296   12 ...    0    0    0]
 [   1   60  175 ... 2108    3    2]
 [   1   12    9 ...    2    0    0]]
(174481, 65)
[[ 123   38    2 ...    0    0    0]
 [3538   38    2 ...    0    0    0]
 [3538    3    2 ...    0    0    0]
 ...
 [7296   12   42 ...    0    0    0]
 [  60  175   21 ...    3    2    0]
 [  12    9  105 ...    0    0    0]]


In [25]:
print(fra_input.shape)
print(fra_input[0])
print('==========')
print(fra_output.shape)
print(fra_output[0])

(174481, 65)
[  1 123  38   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]
(174481, 65)
[123  38   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]


## Build Encoder Decoder Architecture

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, emb_dim, rnn_units):
    super(Encoder, self).__init__()
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.rnn_units = rnn_units

    self.emb = tf.keras.layers.Embedding(vocab_size, emb_dim)
    self.gru_1 = tf.compat.v1.keras.layers.CuDNNGRU(rnn_units, return_sequences=True, return_state=True, name='encoder')

  def call(self, x, hidden):
    emb = self.emb(x)
    rnn_out, rnn_hidden = self.gru_1(emb, initial_state=hidden)
    return rnn_out, rnn_hidden

  def initial_hidden(self, batch_size):
    return tf.zeros([batch_size, self.rnn_units])

In [0]:
# debuging
test_input = tf.constant([1,2,3,4,5,6,7,8,9,0])
test_input = tf.expand_dims(test_input, 0)

test_enc = Encoder(100, 120, 200) # vocab-size, emb-dim, rnn-units

rnn_initial_hidden = test_enc.initial_hidden(1)
rnn_out, hidden = test_enc(test_input, rnn_initial_hidden)

In [28]:
print('rnn output shape')
print(rnn_out.shape, '\n')
print('output hidden state')
print(hidden.shape)

rnn output shape
(1, 10, 200) 

output hidden state
(1, 200)


In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, emb_dim, rnn_units):
    super(Decoder,self).__init__()
    self.vocab_size = vocab_size
    self.emb_dim =emb_dim
    self.rnn_units=rnn_units

    self.emb = tf.keras.layers.Embedding(vocab_size, emb_dim)
    self.gru_1 = tf.compat.v1.keras.layers.CuDNNGRU(rnn_units, return_sequences=True, return_state=True, name='decoder')
    # self.out_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')
    self.out_layer = tf.keras.layers.Dense(vocab_size)


  def call(self, x, hidden):
    emb = self.emb(x)
    rnn_out, rnn_hidden = self.gru_1(emb, initial_state=hidden)
    result = self.out_layer(rnn_out)

    return result, rnn_hidden

In [0]:
# debuging
test_input = tf.constant([1,2,3,4,5,6,7,8,9,0])
test_input = tf.expand_dims(test_input, 0)


test_dec = Decoder(100, 120, 200) # vocab-size, emb-dim, rnn-units


decoder_initial_hidden = hidden # from encoder hidden_state
rnn_out, hidden = test_dec(test_input, decoder_initial_hidden)

In [31]:
print('rnn output shape')
print(rnn_out.shape, '\n')
print('output hidden state')
print(hidden.shape)

rnn output shape
(1, 10, 100) 

output hidden state
(1, 200)


In [0]:
BATCH_SIZE = 128

dataset = tf.data.Dataset.from_tensor_slices((eng_input, fra_input, fra_output))
dataset = dataset.batch(BATCH_SIZE)

encoder = Encoder(eng_vocab_size, 256, 512)
decoder = Decoder(fra_vocab_size, 256, 512)

In [0]:
LOSS_OBJECT = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def loss_func(true, pred):
  mask = tf.math.logical_not(tf.math.equal(true, 0))
  mask = tf.cast(mask, dtype=tf.int64)

  loss = LOSS_OBJECT(true, pred, sample_weight=mask)
  return loss

In [0]:
optimizer = tf.keras.optimizers.Adam(clipnorm=5.0)

In [0]:
@tf.function
def train_step(eng_input, fra_input, fra_output, en_init_state):

  # total_loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(eng_input, en_init_state)

    dec_hidden = enc_hidden

    de_outputs = decoder(fra_input, dec_hidden)
    logits= de_outputs[0]
    loss = loss_func(fra_output, logits)

  train_vars = encoder.trainable_variables + decoder.trainable_variables
  grad = tape.gradient(loss, train_vars)
  optimizer.apply_gradients(zip(grad, train_vars))

  return loss 

In [36]:
steps_per_epoch = len(eng_input) // BATCH_SIZE

steps_per_epoch

1363

In [0]:
def inference():
  fra_idx_word = {v: k for k, v in fra_token.word_index.items()}

  selection_index = np.random.choice(len(eng_data))
  sample_eng_sequence = eng_data[selection_index]
  sample_fra_sequence = fra_data[selection_index]
  print('Original Eng Sentence - Preprocessed')
  print(sample_eng_sequence)
  print('=======================')
  print('Original Fra Sentence -Preprocessed')
  print(sample_fra_sequence)

  
  eng_inference_data = eng_token.texts_to_sequences([sample_eng_sequence])
  eng_inference_data = tf.keras.preprocessing.sequence.pad_sequences(eng_inference_data, maxlen=eng_maxlen, padding='post')

  eng_inference_data = tf.constant(eng_inference_data)
  en_init_state = encoder.initial_hidden(1)
  encoder_output, encoder_hidden = encoder(eng_inference_data, en_init_state)

  de_input = tf.constant([[fra_token.word_index['<start>']]]) # (1,1)
  de_init_state = encoder_hidden

  out_word = []
  while True:
    de_output, de_init_state = decoder(de_input, de_init_state)
    # print(de_output, de_init_state)
    # print(tf.argmax(de_output, -1))
    de_input = tf.argmax(de_output, -1)
    
    # print(de_input.numpy()[0][0])
    out_word.append(fra_idx_word[de_input.numpy()[0][0]])

    if out_word[-1] == '<end>' or len(out_word) >= 20:
      break
  
  print('----------------------')
  print('Translated Result')
  print(' '.join(out_word))


In [80]:
inference()

Original Eng Sentence - Preprocessed
<start> he has enough ability to manage a business .  <end>
Original Fra Sentence -Preprocessed
<start> il est assez apte a gerer une affaire .  <end>
----------------------
Translated Result
parlerait reincarne figurine ustensiles sejournez rirais debats deregule empoisonnez repondez realisent distribue abstrait changent monroe curiosite deconseilla maladroit affreuses insolence


In [81]:
EPOCH = 200

for epoch in tqdm_notebook(range(EPOCH)):
  enc_hidden = encoder.initial_hidden(BATCH_SIZE)
  
  for n, (eng, fra, fra_targ) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(eng, fra, fra_targ, enc_hidden)

    if n % 100 == 0:
      print('batch loss:', batch_loss)

  print(epoch, '\t', batch_loss,'\n')
  print('------- INFERENCE MODE --------')
  print(inference())

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

batch loss: tf.Tensor(0.6259085, shape=(), dtype=float32)
batch loss: tf.Tensor(0.4528869, shape=(), dtype=float32)
batch loss: tf.Tensor(0.47813782, shape=(), dtype=float32)
batch loss: tf.Tensor(0.6457364, shape=(), dtype=float32)
batch loss: tf.Tensor(0.6598836, shape=(), dtype=float32)
batch loss: tf.Tensor(0.5036615, shape=(), dtype=float32)
batch loss: tf.Tensor(0.64032376, shape=(), dtype=float32)
batch loss: tf.Tensor(0.5848668, shape=(), dtype=float32)
batch loss: tf.Tensor(0.5931212, shape=(), dtype=float32)
batch loss: tf.Tensor(0.6816991, shape=(), dtype=float32)
batch loss: tf.Tensor(0.6987297, shape=(), dtype=float32)
batch loss: tf.Tensor(0.7667793, shape=(), dtype=float32)
batch loss: tf.Tensor(0.7937891, shape=(), dtype=float32)
batch loss: tf.Tensor(0.8782541, shape=(), dtype=float32)
0 	 tf.Tensor(2.1257715, shape=(), dtype=float32) 

------- INFERENCE MODE --------
Original Eng Sentence - Preprocessed
<start> do you like me , too ?  <end>
Original Fra Sentence -Prep

KeyboardInterrupt: ignored