In [85]:
import tensorflow as tf
from tensorflow.keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import gc
import time
import re
import unicodedata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [86]:
data_folder_name = 'fra-eng'
train_filename = 'fra.txt'

# Variable for data directory
DATA_PATH = "/Users/admin/Downloads/fra-eng"
train_filenamepath = "/Users/admin/Downloads/fra-eng/fra.txt"

# Both train and test set are in the root data directory
train_path = DATA_PATH
test_path = DATA_PATH
print(test_path)


/Users/admin/Downloads/fra-eng


In [87]:
# Parameters for our model
INPUT_COLUMN = 'input'
TARGET_COLUMN = 'target'
TARGET_FOR_INPUT = 'target_for_input'
NUM_SAMPLES = 20000 #40000
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM = 128
HIDDEN_DIM=1024 #512

BATCH_SIZE = 64  # Batch size for training.
EPOCHS = 20  # Number of epochs to train for.

ATTENTION_FUNC='general'

In [88]:
# Some function to preprocess the text data, taken from the Neural machine translation with attention tutorial
# in Tensorflow
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    ''' Preprocess the input text w applying lowercase, removing accents,
    creating a space between a word and the punctuation following it and
    replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    Input:
        - w: a string, input text
    Output:
        - a string, the cleaned text
    '''
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    #w = '<start> ' + w + ' <end>'

    return w


In [89]:
# Load the dataset: sentence in english, sentence in spanish
df=pd.read_csv(train_filenamepath, sep="\t", header=None, names=[INPUT_COLUMN,TARGET_COLUMN], usecols=[0,1],
               nrows=NUM_SAMPLES)
# Preprocess the input data
input_data=df[INPUT_COLUMN].apply(lambda x : preprocess_sentence(x)).tolist()
# Preprocess and include the end of sentence token to the target text
target_data=df[TARGET_COLUMN].apply(lambda x : preprocess_sentence(x)+ ' <eos>').tolist()
# Preprocess and include a start of setence token to the input text to the decoder, it is rigth shifted
target_input_data=df[TARGET_COLUMN].apply(lambda x : '<sos> '+ preprocess_sentence(x)).tolist()

print(df)
print(input_data[:5])
print(target_data[:5])
print(target_input_data[:5])

                   input                        target
0                    Go.                          Va !
1                    Go.                       Marche.
2                    Go.                    En route !
3                    Go.                       Bouge !
4                    Hi.                       Salut !
...                  ...                           ...
19995  He shares a room.       Il partage une chambre.
19996  He slept all day.  Il a dormi toute la journée.
19997  He slept an hour.         Il a dormi une heure.
19998  He slept an hour.          Il dormit une heure.
19999  He slept soundly.      Il dormait profondément.

[20000 rows x 2 columns]
['go .', 'go .', 'go .', 'go .', 'hi .']
['va ! <eos>', 'marche . <eos>', 'en route ! <eos>', 'bouge ! <eos>', 'salut ! <eos>']
['<sos> va !', '<sos> marche .', '<sos> en route !', '<sos> bouge !', '<sos> salut !']


In [90]:
# Create a tokenizer for the input texts and fit it to them
tokenizer_inputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer_inputs.fit_on_texts(input_data)
# Tokenize and transform input texts to sequence of integers
input_sequences = tokenizer_inputs.texts_to_sequences(input_data)
# Claculate the max length
input_max_len = max(len(s) for s in input_sequences)
print('Max Input Length: ', input_max_len)
# Show some example of tokenize sentences, useful to check the tokenization
print(input_data[1000])
print(input_sequences[1000])

Max Input Length:  7
get going .
[33, 125, 1]


In [91]:
# tokenize the outputs
# don't filter out special characters (filters = '')
# otherwise <sos> and <eos> won't appear
# By default, Keras’ Tokenizer will trim out all the punctuations, which is not what we want.
# we can just set filters as blank here.

# Create a tokenizer for the output texts and fit it to them
tokenizer_outputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer_outputs.fit_on_texts(target_data)
tokenizer_outputs.fit_on_texts(target_input_data)
# Tokenize and transform output texts to sequence of integers
target_sequences = tokenizer_outputs.texts_to_sequences(target_data)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_input_data)

# determine maximum length output sequence
target_max_len = max(len(s) for s in target_sequences)
print('Max Target Length: ', target_max_len)

print(target_data[1000])
print(target_sequences[1000])
print(target_input_data[1000])
print(target_sequences_inputs[1000])

Max Target Length:  16
en route ! <eos>
[23, 352, 7, 2]
<sos> en route !
[3, 23, 352, 7]


In [92]:
# get the word to index mapping for input language
word2idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))

# get the word to index mapping for output language
word2idx_outputs = tokenizer_outputs.word_index
print('Found %s unique output tokens.' % len(word2idx_outputs))

# store number of output and input words for later
# remember to add 1 since indexing starts at 1
num_words_output = len(word2idx_outputs) + 1
num_words_inputs = len(word2idx_inputs) + 1

# map indexes back into real words
# so we can view the results
idx2word_inputs = {v:k for k, v in word2idx_inputs.items()}
idx2word_outputs = {v:k for k, v in word2idx_outputs.items()}
print(idx2word_inputs)
print(idx2word_outputs)

Found 3262 unique input tokens.
Found 5912 unique output tokens.
{1: '.', 2: 'i', 3: 'you', 4: '?', 5: 'it', 6: 'tom', 7: 's', 8: 'is', 9: 'a', 10: 're', 11: 'm', 12: 'we', 13: '!', 14: 't', 15: 'me', 16: 'he', 17: 'that', 18: 'are', 19: 'was', 20: 'they', 21: 'this', 22: 'the', 23: 'go', 24: 'can', 25: 'do', 26: 'to', 27: 'don', 28: 'll', 29: 'be', 30: 'have', 31: 'let', 32: 'not', 33: 'get', 34: 'my', 35: 'up', 36: 'here', 37: 'did', 38: 'she', 39: 'no', 40: 'your', 41: 'what', 42: 'in', 43: 'like', 44: 'who', 45: 'all', 46: 'come', 47: 'need', 48: 'on', 49: 'him', 50: 'love', 51: 'take', 52: ',', 53: 'want', 54: 'got', 55: 'out', 56: 'stop', 57: 'one', 58: 'how', 59: 'help', 60: 'us', 61: 'look', 62: 'so', 63: 'please', 64: 'now', 65: 'keep', 66: 'saw', 67: 'too', 68: 'see', 69: 'stay', 70: 'am', 71: 'give', 72: 've', 73: 'home', 74: 'there', 75: 'just', 76: 'back', 77: 'must', 78: 'lost', 79: 'try', 80: 'know', 81: 'will', 82: 'had', 83: 'were', 84: 'leave', 85: 'good', 86: 'has', 

In [93]:
# pad the input sequences
encoder_inputs = pad_sequences(input_sequences, maxlen=input_max_len, padding='post')
print("encoder_inputs.shape:", encoder_inputs.shape)
print("encoder_inputs[0]:", encoder_inputs[0])
# pad the decoder input sequences
decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=target_max_len, padding='post')
print("decoder_inputs[0]:", decoder_inputs[0])
print("decoder_inputs.shape:", decoder_inputs.shape)
# pad the target output sequences
decoder_targets = pad_sequences(target_sequences, maxlen=target_max_len, padding='post')

encoder_inputs.shape: (20000, 7)
encoder_inputs[0]: [23  1  0  0  0  0  0]
decoder_inputs[0]: [ 3 55  7  0  0  0  0  0  0  0  0  0  0  0  0  0]
decoder_inputs.shape: (20000, 16)


In [94]:
# Define a dataset
dataset = tf.data.Dataset.from_tensor_slices(
    (encoder_inputs, decoder_inputs, decoder_targets))
dataset = dataset.shuffle(len(input_data)).batch(
    BATCH_SIZE, drop_remainder=True)

In [95]:
#Encoder layer
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        #Define the embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(
            hidden_dim, return_sequences=True, return_state=True)
    def call(self, input_sequence, states):
        #embed the input
        embed = self.embedding(input_sequence)
        #call the LSTM unit
        output, state_h, state_c = self.lstm(embed, initial_state=states)
        return output, state_h, state_c
    def init_states(self, batch_size):
        # Return a all 0s initial states
        return (tf.zeros([batch_size, self.hidden_dim]),
                tf.zeros([batch_size, self.hidden_dim]))

In [96]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Decoder, self).__init__()
        self.hidden_dim = hidden_dim
          # Define the embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # Define the RNN layer, LSTM
        self.lstm = tf.keras.layers.LSTM(
            hidden_dim, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)
    def call(self, input_sequence, state):
        # Embed the input
        embed = self.embedding(input_sequence)
        # Call the LSTM unit
        lstm_out, state_h, state_c = self.lstm(embed, state)
        # Dense layer to predict output token
        logits = self.dense(lstm_out)

        return logits, state_h, state_c

In [97]:
#Set the length of the input and output vocabulary
num_words_inputs = len(word2idx_inputs) + 1
num_words_output = len(word2idx_outputs) + 1

#Create the encoder
encoder = Encoder(num_words_inputs, EMBEDDING_DIM, HIDDEN_DIM)
# Get the initial states
initial_state = encoder.init_states(1)
# Call the encoder for testing
test_encoder_output = encoder(tf.constant(
    [[1, 23, 4, 5, 0, 0]]), initial_state)
print(test_encoder_output[0].shape)
#Create the Decoder
decoder = Decoder(num_words_output, EMBEDDING_DIM, HIDDEN_DIM)
de_initial_state = test_encoder_output[1:]
test_decoder_output = decoder(tf.constant(
    [[1, 3, 5, 7, 9, 0, 0, 0]]), de_initial_state)
print(test_decoder_output[0].shape)

(1, 6, 1024)
(1, 8, 5913)


In [98]:
def loss_func(targets, logits):
    crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(targets,0))
    mask = tf.cast(mask, dtype = tf.int64)
    #calculate the loss value
    loss = crossentropy(targets, logits, sample_weight=mask)
    return loss

def accuracy_fn(y_true, y_pred):
        # y_pred shape is batch_size, seq length, vocab size
    # y_true shape is batch_size, seq length
    pred_values = K.cast(K.argmax(y_pred, axis=-1), dtype='int32')
    correct = K.cast(K.equal(y_true, pred_values), dtype='float32')
    # 0 is padding, don't include those
    mask = K.cast(K.greater(y_true, 0), dtype='float32')
    n_correct = K.sum(mask * correct)
    n_total = K.sum(mask)
  
    return n_correct / n_total

In [99]:
# Use the @tf.function decorator to take advance of static graph computation
@tf.function
def train_step(input_seq, target_seq_in, target_seq_out, en_initial_states, optimizer):
    ''' A training step, train a batch of the data and return the loss value reached
        Input:
        - input_seq: array of integers, shape [batch_size, max_seq_len, embedding dim].
            the input sequence
        - target_seq_out: array of integers, shape [batch_size, max_seq_len, embedding dim].
            the target seq, our target sequence
        - target_seq_in: array of integers, shape [batch_size, max_seq_len, embedding dim].
            the input sequence to the decoder, we use Teacher Forcing
        - en_initial_states: tuple of arrays of shape [batch_size, hidden_dim].
            the initial state of the encoder
        - optimizer: a tf.keras.optimizers.
        Output:
        - loss: loss value
        
    '''
    with tf.GradientTape() as tape:
        en_outputs = encoder(input_seq, en_initial_states)
        # Set the encoder and decoder states
        en_states = en_outputs[1:]
        de_states = en_states
        # Get the encoder outputs
        de_outputs = decoder(target_seq_in, de_states)
        # Take the actual output
        logits = de_outputs[0]
        # Calculate the loss function
        loss = loss_func(target_seq_out, logits)
        acc = accuracy_fn(target_seq_out, logits)
    variables = encoder.trainable_variables + decoder.trainable_variables
    # Calculate the gradients for the variables
    gradients = tape.gradient(loss, variables)
    # Apply the gradients and update the optimizer
    optimizer.apply_gradients(zip(gradients, variables))

    return loss, acc

In [100]:
# Create the main train function
def main_train(encoder, decoder, dataset, n_epochs, batch_size, optimizer, checkpoint, checkpoint_prefix):
    losses = []
    accuracies = []
    for e in range(n_epochs):
        #get the initial time
        start = time.time()
        #get the initial state for the encoder
        en_initial_states = encoder.init_states(batch_size)
        #for every batch
        for batch, (input_seq, target_seq_in, target_seq_out) in enumerate(dataset.take(-1)):
            loss, accuracy = train_step(input_seq, target_seq_in, target_seq_out, en_initial_states, optimizer)
        
            if batch % 100 == 0:
                losses.append(loss)
                accuracies.append(accuracy)
                print('Epoch {} Batch {} Loss {:.4f} Acc:{:.4f}'.format(e + 1, batch, loss.numpy(), accuracy.numpy()))
                    # saving (checkpoint) the model every 2 epochs
        if (e + 1) % 2 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)
    
        print('Time taken for 1 epoch {:.4f} sec\n'.format(time.time() - start))
        
    return losses, accuracies

In [101]:
# Create an Adam optimizer and clips gradients by norm
optimizer = tf.keras.optimizers.Adam(clipnorm=5.0)
# Create a checkpoint object to save the model
checkpoint_dir = './training_ckpt_seq2seq'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

losses, accuracies = main_train(encoder, decoder, dataset, EPOCHS, BATCH_SIZE, optimizer, checkpoint, checkpoint_prefix)
print(losses,accuracies)

2023-10-19 15:57:33.978982: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp_10.


Epoch 1 Batch 0 Loss 3.2143 Acc:0.0000
Epoch 1 Batch 100 Loss 1.5984 Acc:0.3229
Epoch 1 Batch 200 Loss 1.3719 Acc:0.3777
Epoch 1 Batch 300 Loss 1.2475 Acc:0.4609
Time taken for 1 epoch 18.6542 sec

Epoch 2 Batch 0 Loss 1.1470 Acc:0.4620
Epoch 2 Batch 100 Loss 1.1541 Acc:0.4779
Epoch 2 Batch 200 Loss 1.0889 Acc:0.4694
Epoch 2 Batch 300 Loss 1.0255 Acc:0.4807
Time taken for 1 epoch 15.1702 sec

Epoch 3 Batch 0 Loss 0.9692 Acc:0.4903
Epoch 3 Batch 100 Loss 0.9560 Acc:0.5230
Epoch 3 Batch 200 Loss 0.9368 Acc:0.5256
Epoch 3 Batch 300 Loss 0.9979 Acc:0.4761
Time taken for 1 epoch 14.9852 sec

Epoch 4 Batch 0 Loss 0.9621 Acc:0.5051
Epoch 4 Batch 100 Loss 0.8609 Acc:0.5278
Epoch 4 Batch 200 Loss 0.9160 Acc:0.5096
Epoch 4 Batch 300 Loss 0.8907 Acc:0.5467
Time taken for 1 epoch 15.0227 sec

Epoch 5 Batch 0 Loss 0.8115 Acc:0.5246
Epoch 5 Batch 100 Loss 0.7618 Acc:0.5851
Epoch 5 Batch 200 Loss 0.8079 Acc:0.5611
Epoch 5 Batch 300 Loss 0.7295 Acc:0.5746
Time taken for 1 epoch 14.7262 sec

Epoch 6 Ba

In [105]:

checkpoint_dir = './training_ckpt_seq2seq'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2ccaf7a30>

In [106]:
def predict(input_text, encoder, input_max_len, tokenizer_inputs, word2idx_outputs, idx2word_outputs):
    if input_text is None:
        input_text = input_data[np.random.choice(len(input_data))]
        print(input_text)
    # Tokenize the input sequence
    input_seq = tokenizer_inputs.texts_to_sequences([input_text])
    # Pad the sentence
    input_seq = pad_sequences(input_seq, maxlen=input_max_len, padding='post')
    print(input_seq)
    # Set the encoder initial state
    en_initial_states = encoder.init_states(1)
    en_outputs = encoder(tf.constant(input_seq), en_initial_states)
    # Create the decoder input, the sos token
    de_input = tf.constant([[word2idx_outputs['<sos>']]])
    # Set the decoder states to the encoder vector or encoder hidden state
    de_state_h, de_state_c = en_outputs[1:]
    
    out_words = []
    while True:
        # Decode and get the output probabilities
        de_output, de_state_h, de_state_c = decoder(
            de_input, (de_state_h, de_state_c))
        # Select the word with the highest probability
        de_input = tf.argmax(de_output, -1)
        # Append the word to the predicted output
        out_words.append(idx2word_outputs[de_input.numpy()[0][0]])
        # Finish when eos token is found or the max length is reached
        if out_words[-1] == '<eos>' or len(out_words) >= 20:
            break

    print(' '.join(out_words))

In [107]:
test_sents = [input_data[10003], input_data[10120]]
#test_sents = [encoder_inputs[1000]]
print(test_sents)
for test_sent in test_sents:
    predict(test_sent, encoder, input_max_len, tokenizer_inputs, word2idx_outputs, idx2word_outputs)

['what a scream !', 'you hypocrite !']
[[ 41   9 926  13   0   0   0]]
quel taudis ! <eos>
[[   3 2629   13    0    0    0    0]]
espece d idiot ! <eos>
