## Similar Souce with tensorflow

https://machinetalk.org/2019/03/29/neural-machine-translation-with-attention-mechanism/

In [0]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import unicodedata

In [0]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [0]:
df = pd.read_csv(path_to_file, sep='\t', header=None)
df.columns = ['eng', 'spa']

In [0]:
df.shape

(118964, 2)

In [0]:
df.sample(10)

Unnamed: 0,eng,spa
38251,Tom wore a white jacket.,Tomás tenía puesto una campera blanca.
31233,He was drunk and angry.,Estaba borracho y furioso.
108661,I can't afford to shop at such an expensive st...,No puedo permitirme comprar en una tienda tan ...
116800,What I can't bear is the sound of chalk squeak...,Lo que no puedo soportar es el sonido de la ti...
79411,I can't believe you're really here.,No me puedo creer que estés aquí.
86483,The theft must've been an inside job.,El robo tiene que haberse hecho por alguien de...
100781,I want to spend the whole weekend in Boston.,Quiero pasar el fin de semana completo en Boston.
72336,I always get along well with him.,Yo siempre me llevo bien con él.
59387,We traveled around Australia.,Viajamos por toda Australia.
20550,I'm a tennis player.,Soy tenista.


In [0]:
def unicode_to_ascii(sentence):
  return ''.join(word for word in unicodedata.normalize('NFD', sentence) if unicodedata.category(word) != 'Mn')

In [0]:
unicodedata.category('¿')

'Po'

In [0]:
def preprocess_text(sentence):
  sentence = unicode_to_ascii(sentence)
  sentence = re.sub(r'([!.,?¿])', r' \1 ', sentence) # \1 means the first charater.. "<space> <first char> <space>"
  sentence = re.sub(r'[" "]+', ' ', sentence)
  sentence = re.sub(r'[^a-zA-Z!.,?¿]+', ' ', sentence)
  sentence = sentence.rstrip().strip()
  sentence = '<start> '+sentence+' <end>'
  return sentence

In [0]:
preprocess_text('heoool.')

'<start> heoool . <end>'

In [0]:
df['eng'] = df['eng'].apply(preprocess_text)
df['spa'] = df['spa'].apply(preprocess_text)

In [0]:
df.head()

Unnamed: 0,eng,spa
0,<start> Go . <end>,<start> Ve . <end>
1,<start> Go . <end>,<start> Vete . <end>
2,<start> Go . <end>,<start> Vaya . <end>
3,<start> Go . <end>,<start> Vayase . <end>
4,<start> Hi . <end>,<start> Hola . <end>


In [0]:
eng_data = df['eng'].tolist()
spa_data = df['spa'].tolist()

In [0]:
lang_token = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<UNK>')

In [0]:
def tokenize_lang(language_data):
  lang_token = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<UNK>')
  lang_token.fit_on_texts(language_data)
  lang_sequence = lang_token.texts_to_sequences(language_data)
  return lang_sequence, lang_token

In [0]:
eng_sequence, eng_token = tokenize_lang(eng_data)
spa_sequence, spa_token = tokenize_lang(spa_data)

In [0]:
def lang_maxlength(lang_sequence):
  result = max([len(seq) for seq in lang_sequence])
  return result

print(lang_maxlength(eng_sequence))

51


In [0]:
eng_maxlen = lang_maxlength(eng_sequence)
spa_maxlen = lang_maxlength(spa_sequence)

def pad_sequence(lang_seq, max_len):
  return tf.keras.preprocessing.sequence.pad_sequences(lang_seq, maxlen=max_len, padding='post')

eng_sequence = pad_sequence(eng_sequence, eng_maxlen)
spa_sequence = pad_sequence(spa_sequence, spa_maxlen)

print(f'eng : {eng_sequence.shape}, eng_max_length: {eng_maxlen}, Token: eng_token')
print(f'eng : {spa_sequence.shape}, eng_max_length: {spa_maxlen}, Token: spa_token')

eng : (118964, 51), eng_max_length: 51, Token: eng_token
eng : (118964, 53), eng_max_length: 53, Token: spa_token


In [0]:
spa_target_sequence = np.zeros(spa_sequence.shape, dtype=np.int)
spa_target_sequence[:, :-1] = spa_sequence[:, 1:]

print(spa_sequence[0])
print(f'spa input seq : {spa_sequence.shape}')
print('==========================================')
print(spa_target_sequence[0])
print(f'spa target seq : {spa_target_sequence.shape}')

[  2 365   4   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
spa input seq : (118964, 53)
[365   4   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
spa target seq : (118964, 53)


In [0]:
dataset = tf.data.Dataset.from_tensor_slices((eng_sequence, spa_sequence, spa_target_sequence)).batch(5)

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, lstm_units):
    super(Encoder, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.lstm_units = lstm_units

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(lstm_units, return_sequences=True, return_state=True)

  def call(self, inputs):
    sequence = inputs[0]
    state = inputs[1]

    embed = self.embedding(sequence)
    # print(embed)
    out, state_h, state_c = self.lstm(embed, initial_state=state)
    # print(out)
    # print(state_h)
    # print(state_c)
    return out, state_h, state_c

  def init_state(self, batch_size):
    return (tf.zeros([batch_size, self.lstm_units]),
            tf.zeros([batch_size, self.lstm_units]))

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, lstm_units):
    super(Decoder, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.lstm_units = lstm_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(lstm_units, return_sequences=True, return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)
  
  def call(self, inputs):
    sequence = inputs[0]
    state = inputs[1]

    embed = self.embedding(sequence)
    # print(embed)
    
    lstm_out, state_h, state_c = self.lstm(embed, initial_state=state)
    # print(lstm_out)
    # print(state_h)
    # print(state_c)

    logits = self.dense(lstm_out)
    # print(logits)

    return logits, state_h, state_c

In [0]:
class test_env:
  embedding_size = 300
  lstm_unit = 256
  en_vocab_size = len(eng_token.word_index) + 1
  sp_vocab_size = len(spa_token.word_index) + 1

test_hp = test_env()

print('eng')
print(test_hp.en_vocab_size)
print('spa')
print(test_hp.sp_vocab_size)

eng
12935
spa
24795


In [0]:
encoder = Encoder(test_hp.en_vocab_size, test_hp.embedding_size, test_hp.lstm_unit)

decoder = Decoder(test_hp.sp_vocab_size, test_hp.embedding_size, test_hp.lstm_unit)

# testing
source_input = tf.constant([[1,3,5,7,2,0,0,0]])
init_state = encoder.init_state(1)

encoder_output, en_state_h, en_state_c = encoder([source_input, init_state])

target_input = tf.constant([[1,4,6,9,2,0,0]])
decoder_output, de_state_h, de_state_c = decoder([target_input, (en_state_h, en_state_c)])

In [0]:
print('Source sequences', source_input.shape)
print('Encoder outputs', encoder_output.shape)
print('Encoder state_h', en_state_h.shape)
print('Encoder state_c', en_state_c.shape)
print()
print('Destination sequences', target_input.shape)
print('Decoder outputs', decoder_output.shape)
print('Decoder state_h', de_state_h.shape)
print('Decoder state_c', de_state_c.shape)

Source sequences (1, 8)
Encoder outputs (1, 8, 256)
Encoder state_h (1, 256)
Encoder state_c (1, 256)

Destination sequences (1, 7)
Decoder outputs (1, 7, 24795)
Decoder state_h (1, 256)
Decoder state_c (1, 256)


In [0]:
# 1
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def loss_function(targets, logits):
  mask = tf.math.equal(targets, 0)
  mask = tf.math.logical_not(mask)
  mask = tf.cast(mask, tf.int64)
  loss = crossentropy(targets, logits, sample_weight=mask)
  return loss

# 2
ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def non_mask_loss_function(targets, logits):
  loss = ce(targets, logits)
  return loss


# 3
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def original_loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [0]:
test_target = tf.constant([[1,2,3,0,0]])

test_pred = tf.constant([[[0.7,0.1,0.1,0.1],    # 0
                          [0.1,0.7,0.1,0.1],    # 1
                          [0.1,0.1,0.7,0.1],    # 2
                          [0.7,0.1,0.1,0.1],    # 0
                          [0.7,0.1,0.1,0.1]]])  # 0

test_pred_2 = tf.constant([[
                          [0.1,0.7,0.1,0.1],    # 1
                          [0.1,0.1,0.7,0.1],    # 2
                          [0.1,0.1,0.1,0.7],    # 3
                          [0.7,0.1,0.1,0.1],    # 0
                          [0.7,0.1,0.1,0.1]     # 0
                          ]])               

print(test_pred)
print('----')
print(test_target)
print('with mask loss')
print('0,1,2,0,0')
print(loss_function(test_target, test_pred))
print('1,2,3,0,0')
print(loss_function(test_target, test_pred_2))
print()
print('with out mask loss')
print('0,1,2,0,0')
print(non_mask_loss_function(test_target, test_pred))
print('1,2,3,0,0')
print(non_mask_loss_function(test_target, test_pred_2))
print()
print('original tf loss')
print('0,1,2,0,0')
print(original_loss_function(test_target, test_pred))
print('1,2,3,0,0')
print(original_loss_function(test_target, test_pred_2))

tf.Tensor(
[[[0.7 0.1 0.1 0.1]
  [0.1 0.7 0.1 0.1]
  [0.1 0.1 0.7 0.1]
  [0.7 0.1 0.1 0.1]
  [0.7 0.1 0.1 0.1]]], shape=(1, 5, 4), dtype=float32)
----
tf.Tensor([[1 2 3 0 0]], shape=(1, 5), dtype=int32)
with mask loss
0,1,2,0,0
tf.Tensor(0.94392794, shape=(), dtype=float32)
1,2,3,0,0
tf.Tensor(0.58392805, shape=(), dtype=float32)

with out mask loss
0,1,2,0,0
tf.Tensor(1.3332134, shape=(), dtype=float32)
1,2,3,0,0
tf.Tensor(0.9732134, shape=(), dtype=float32)

original tf loss
0,1,2,0,0
tf.Tensor(0.94392794, shape=(), dtype=float32)
1,2,3,0,0
tf.Tensor(0.58392805, shape=(), dtype=float32)


In [0]:
encoder = Encoder(test_hp.en_vocab_size, test_hp.embedding_size, test_hp.lstm_unit)
decoder = Decoder(test_hp.sp_vocab_size, test_hp.embedding_size, test_hp.lstm_unit)

In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
# @tf.function
# def train_step(source_sequence, target_seq_in, target_seq_out, en_initial_state):
#   with tf.GradientTape() as tape:
#     en_outputs, en_h_state, en_c_state = encoder([source_sequence, en_initial_state])
#     en_states = (en_h_state, en_c_state) # state_h & state_c
#     de_states = en_states

#     de_outputs = decoder([target_seq_in, de_states])
#     logits= de_outputs[0]
#     loss = loss_function(target_seq_out, logits)
  
#   training_variables = encoder.trainable_variables + decoder.trainable_variables
#   gradients = tape.gradient(loss, training_variables)
#   optimizer.apply_gradients(zip(gradients, training_variables))
#   return loss

In [0]:
def loss_func(targets, logits):
  crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True)
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  loss = crossentropy(targets, logits, sample_weight=mask)
  return loss

    
# @tf.function
# def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):

#   with tf.GradientTape() as tape:
#       en_outputs = encoder([source_seq, en_initial_states])  # input: (b, 51)
#                                                            # init_state: (b, units)   
#       en_states = en_outputs[1:]                           # state_h, state_c: (b, units), (b, units)
#       de_states = en_states                                 

#       de_outputs = decoder([target_seq_in, de_states])       # input: (b, 53)
#                                                            # de state: state_h, state_c: (b, units), (b, units)
#       logits = de_outputs[0]                               # logits: (b, output_units)          
#       loss = loss_func(target_seq_out, logits)

#   variables = encoder.trainable_variables + decoder.trainable_variables
#   gradients = tape.gradient(loss, variables)
#   optimizer.apply_gradients(zip(gradients, variables))
#   return loss

@tf.function
def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):
  loss = 0

  with tf.GradientTape() as tape:
    en_outputs, en_state_h, en_state_c = encoder([source_seq, en_initial_states])

    dec_input = tf.expand_dims([spa_token.word_index['<start>']] * 32, 1)
    dec_states = [en_state_h, en_state_c]

    for t in range(target_seq_in.shape[1]):
      prediction, de_state_h, de_state_c = decoder([dec_input, dec_states])
      loss += loss_func(target_seq_out[:, t], prediction)
      dec_input = tf.expand_dims(target_seq_out[:, t], 1)
    
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return loss / target_seq_out.shape[1]

In [0]:
def inference():
  test_source_text = eng_data[np.random.choice(len(eng_data))]
  print(test_source_text)
  test_source_seq = eng_token.texts_to_sequences([test_source_text])
  print(test_source_seq)

  idx2spa = {v:k for k, v in spa_token.word_index.items()}


  en_initial_states = encoder.init_state(1) # batch 1
  en_outputs = encoder([tf.constant(test_source_seq), en_initial_states])

  de_input = tf.constant([[spa_token.word_index['<start>']]]) # shape 1, 1
  de_state_h, de_state_c = en_outputs[1:]

  out_word = []

  while True:
    de_output, de_state_h, de_state_c = decoder([de_input, (de_state_h, de_state_c)])
    de_input = tf.argmax(de_output, axis=-1)
    out_word.append(spa_token.index_word[de_input.numpy()[0][0]])
    if out_word[-1] == '<end>' or len(out_word) >= 20:
      break
  
  print(' '.join(out_word))

In [164]:
inference()

<start> I began to sweat . <end>
[[2, 5, 489, 7, 2445, 4, 3]]
. <end>


In [0]:
dataset = tf.data.Dataset.from_tensor_slices((eng_sequence, spa_sequence, spa_target_sequence))
dataset = dataset.batch(32)

In [0]:
for n, (eng, spa, spa_out) in enumerate(dataset):
  print(n)
  print('=======')
  print(eng)
  print(eng.shape)
  print('=======')
  print(spa)
  print(spa.shape)
  print('=======')
  print(spa_out)
  print(spa_out.shape)
  print('=======')

  if n == 0:
    break

0
tf.Tensor(
[[   2   50    4 ...    0    0    0]
 [   2   50    4 ...    0    0    0]
 [   2   50    4 ...    0    0    0]
 ...
 [   2 1336    4 ...    0    0    0]
 [   2  909    4 ...    0    0    0]
 [   2 1279  120 ...    0    0    0]], shape=(32, 51), dtype=int32)
(32, 51)
tf.Tensor(
[[    2   365     4 ...     0     0     0]
 [    2  1322     4 ...     0     0     0]
 [    2   501     4 ...     0     0     0]
 ...
 [    2 10866    27 ...     0     0     0]
 [    2  3856     4 ...     0     0     0]
 [    2    37  2188 ...     0     0     0]], shape=(32, 53), dtype=int32)
(32, 53)
tf.Tensor(
[[  365     4     3 ...     0     0     0]
 [ 1322     4     3 ...     0     0     0]
 [  501     4     3 ...     0     0     0]
 ...
 [10866    27  7555 ...     0     0     0]
 [ 3856     4     3 ...     0     0     0]
 [   37  2188    85 ...     0     0     0]], shape=(32, 53), dtype=int64)
(32, 53)


In [169]:
NUM_EPOCHS = 200
BATCH_SIZE = 32

for e in range(NUM_EPOCHS):
  en_init_state = encoder.init_state(32)
  for n, (eng, spa, spa_target) in enumerate(dataset.take(1)):
    loss_result = train_step(eng, spa, spa_target, en_init_state)
  # print(loss_result.numpy())
  print('Epoch {} Loss {:.4f}'.format(e + 1, loss_result.numpy()))
  print('------')
  print(inference())
  print('======')

Epoch 1 Loss 0.1237
------
<start> What s going on here ? <end>
[[2, 30, 16, 79, 37, 63, 11, 3]]
. <end>
None
Epoch 2 Loss 0.1129
------
<start> One after another the animals died . <end>
[[2, 75, 183, 321, 6, 760, 308, 4, 3]]
. <end>
None
Epoch 3 Loss 0.1130
------
<start> I understand the risks . <end>
[[2, 5, 209, 6, 2799, 4, 3]]
<end>
None
Epoch 4 Loss 0.1148
------
<start> I need some soap . <end>
[[2, 5, 92, 106, 2563, 4, 3]]
<end>
None
Epoch 5 Loss 0.1134
------
<start> The story is full of holes . <end>
[[2, 6, 414, 12, 520, 19, 3897, 4, 3]]
! <end>
None
Epoch 6 Loss 0.1109
------
<start> Have you made a decision yet ? <end>
[[2, 23, 8, 146, 10, 796, 300, 11, 3]]
! <end>
None
Epoch 7 Loss 0.1096
------
<start> You can t blame them . <end>
[[2, 8, 34, 13, 678, 193, 4, 3]]
! <end>
None
Epoch 8 Loss 0.1096
------
<start> My mission is to photograph the documents . <end>
[[2, 26, 2576, 12, 7, 2517, 6, 2602, 4, 3]]
! <end>
None
Epoch 9 Loss 0.1096
------
<start> He licked his finger

In [0]:
target = 0.2

tf.math.equal(target, 0)

<tf.Tensor: shape=(), dtype=bool, numpy=False>

In [0]:
tf.math.logical_not(tf.math.equal(target, 0))

<tf.Tensor: shape=(), dtype=bool, numpy=True>

In [0]:
target = 0

tf.math.equal(target, 0)

<tf.Tensor: shape=(), dtype=bool, numpy=True>

In [0]:
tf.math.logical_not(tf.math.equal(target, 0))

<tf.Tensor: shape=(), dtype=bool, numpy=False>

In [0]:
tf.cast(tf.math.logical_not(tf.math.equal(target, 0)), tf.int64)

<tf.Tensor: shape=(), dtype=int64, numpy=1>

In [0]:
tf.keras.losses.SparseCategoricalCrossentropy()