In [1]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import load_model
import random
import string
import pandas as pd
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import re
import numpy as np
import os
import io
import time

In [40]:
# Download and extract the spa-eng sentence-pair archive
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# Build the corpus path with os.path.join instead of gluing strings with "/",
# so the separator is right on every platform.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

In [3]:
def unicode_to_ascii(s):
    """Return ``s`` with combining accent marks removed.

    The string is decomposed with Unicode NFD normalization and every code
    point whose category is ``Mn`` (nonspacing mark) is dropped, so an
    accented letter collapses to its base letter. Characters that have no
    such decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)


def preprocess_sentence(w):
    """Normalize a sentence to lowercase ASCII letters separated by single spaces.

    Steps: lowercase and trim, strip accents via ``unicode_to_ascii``, pad
    punctuation with spaces so it cannot glue two words together, drop every
    character that is not an ASCII letter or a space, then collapse the
    whitespace that is left behind.

    Args:
        w: Raw sentence string.

    Returns:
        The cleaned sentence. No start or end tokens are added here.
    """
    w = unicode_to_ascii(w.lower().strip())

    # Put spaces around punctuation so "hello,world" becomes two words
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # Keep only letters and spaces
    w = re.sub(r"[^a-zA-Z ]+", "", w)

    # Deleting a space-padded punctuation mark leaves two adjacent spaces
    # ("hello , world" -> "hello  world"); collapse them to one.
    w = re.sub(r" +", " ", w)

    return w.strip()


# Quick check of the preprocessing on a single English sentence
en_sentence = "May I borrow this book?"
cleaned_sentence = preprocess_sentence(en_sentence)
print(cleaned_sentence)

may i borrow this book


In [4]:
def noise(data):
    """Build (noisy input, clean target) text pairs.

    Each line of ``data`` is lowercased and stripped of everything except
    ASCII letters and spaces. Three pairs are emitted per line: the clean
    text itself, followed by two copies that are each corrupted with
    probability 0.9. All three share the same target, which is the clean text
    wrapped in a leading tab and a trailing newline.

    A corruption picks one character of the text at random and replaces every
    occurrence of it with a random ASCII letter (lowercased afterwards).

    Args:
        data: Iterable of strings.

    Returns:
        Tuple ``(input_texts, target_texts)`` of two equal-length lists.
    """
    input_texts = []
    target_texts = []
    for line in data:
        clean_text = re.sub(r'[^a-zA-Z ]+', '', line.lower())
        target_text = "\t" + clean_text + "\n"
        input_texts.append(clean_text)
        target_texts.append(target_text)

        for _ in range(2):
            noisy_text = clean_text
            # Draws 0 with probability 0.1 and 1 with probability 0.9
            corrupt = np.random.choice(np.arange(0, 2), p=[0.1, 0.9])
            # An empty line has no character to corrupt; random.choice would
            # raise IndexError on it, so it is passed through unchanged.
            if corrupt and noisy_text:
                victim = random.choice(noisy_text)
                substitute = random.choice(string.ascii_letters)
                noisy_text = noisy_text.replace(victim, substitute)

            input_texts.append(noisy_text.lower())
            target_texts.append(target_text)

    return input_texts, target_texts

In [7]:
def create_dataset(path, num_examples):
    """Read the corpus and return noisy-input / clean-target text lists.

    Only the first tab-separated field of each line is used. It is cleaned
    with ``preprocess_sentence`` and then expanded by ``noise``.

    Args:
        path: Path to the UTF-8 encoded, tab-separated corpus file.
        num_examples: Number of leading lines to use; ``None`` uses all lines.

    Returns:
        Tuple ``(inp, targ)`` as returned by ``noise``.
    """
    # Use a context manager so the file handle is closed as soon as the
    # contents are read, instead of being left open.
    with io.open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')

    en = [preprocess_sentence(line.split('\t')[0]) for line in lines[:num_examples]]

    inp, targ = noise(en)

    return inp, targ


def tokenize(lang):
    """Fit a character-level tokenizer on ``lang`` and encode it.

    Args:
        lang: List of strings.

    Returns:
        Tuple ``(tensor, tokenizer)``: the texts converted to id sequences and
        padded at the end (``padding='post'``), and the fitted tokenizer.
    """
    char_tokenizer = keras.preprocessing.text.Tokenizer(filters='', char_level=True)
    char_tokenizer.fit_on_texts(lang)

    padded_ids = keras.preprocessing.sequence.pad_sequences(
        char_tokenizer.texts_to_sequences(lang),
        padding='post',
    )
    return padded_ids, char_tokenizer


def load_dataset(path, num_examples=None):
    """Create the noisy/clean text pairs and tokenize both sides.

    Args:
        path: Path to the corpus file, forwarded to ``create_dataset``.
        num_examples: Number of corpus lines to use; ``None`` uses all lines.

    Returns:
        Tuple ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)``.
    """
    inp_lang, targ_lang = create_dataset(path, num_examples)

    # Debug preview of one generated input. The index is fixed, so only print
    # when the dataset is large enough; an unconditional lookup raises
    # IndexError for small ``num_examples``.
    preview_index = 55000
    if len(inp_lang) > preview_index:
        print(inp_lang[preview_index])

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer


In [8]:
# Number of corpus lines to read
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Padded sequence length (second axis) of the input and target tensors
max_length_inp = input_tensor.shape[1]
max_length_targ = target_tensor.shape[1]

theell call


In [9]:
# Hold out 20% of the pairs for validation
(input_tensor_train, input_tensor_val,
 target_tensor_train, target_tensor_val) = train_test_split(
    input_tensor, target_tensor, test_size=0.2)

# Report the size of each split
print(*(len(split) for split in (input_tensor_train, target_tensor_train,
                                 input_tensor_val, target_tensor_val)))

24000 24000 6000 6000


In [10]:
def convert(lang, tensor):
    """Print ``id ----> token`` for every non-zero id in ``tensor``.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to token.
        tensor: Iterable of integer ids; entries equal to 0 are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print(f'{token_id} ----> {lang.index_word[token_id]}')

# Show the id -> character mapping of one training example on each side
for _heading, _tokenizer, _example in (
        ("Input Language; index to word mapping", inp_lang, input_tensor_train[5]),
        ("\nTarget Language; index to word mapping", targ_lang, target_tensor_train[5]),
):
    print(_heading)
    convert(_tokenizer, _example)

Input Language; index to word mapping
13 ----> d
4 ----> o
1 ---->  
18 ----> c
4 ----> o
11 ----> m
2 ----> e
1 ---->  
6 ----> a
17 ----> g
6 ----> a
5 ----> i
9 ----> n

Target Language; index to word mapping
3 ----> 	
15 ----> d
6 ----> o
1 ---->  
20 ----> c
6 ----> o
13 ----> m
2 ----> e
1 ---->  
8 ----> a
19 ----> g
8 ----> a
7 ----> i
11 ----> n
4 ----> 



In [11]:
# Hyperparameters
BATCH_SIZE = 64
BUFFER_SIZE = len(input_tensor_train)  # shuffle buffer spans the whole training set
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024

# Vocabulary sizes: one more than the number of entries in each word index
vocab_inp_size = 1 + len(inp_lang.word_index)
vocab_tar_size = 1 + len(targ_lang.word_index)

# Input pipeline: shuffle, then batch; the last partial batch is dropped
dataset = (
    tf.data.Dataset
    .from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Pull one batch to inspect shapes
example_input_batch, example_target_batch = next(iter(dataset))
print(example_input_batch.shape, example_target_batch.shape)

(64, 16) (64, 18)


In [32]:
# Encoder: embedding layer followed by a single GRU
class Encoder(tf.keras.Model):
    """Embeds input ids and runs them through a GRU.

    Args:
        vocab_size: Size of the embedding table.
        embedding_dim: Width of each embedding vector.
        enc_units: Number of GRU units.
        batch_sz: Batch size used by ``initialize_hidden_state``.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns both the per-step outputs and the final state
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Return ``(output, state)`` of the GRU, started from ``hidden``."""
        embedded = self.embedding(x)
        seq_output, final_state = self.gru(embedded, initial_state=hidden)
        return seq_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_sz, enc_units)``."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)


encoder = Encoder(
    vocab_size=vocab_inp_size,
    embedding_dim=embedding_dim,
    enc_units=units,
    batch_sz=BATCH_SIZE,
)

In [13]:
# Run the encoder on the example batch, starting from a zero state, and report shapes
sample_output, sample_hidden = encoder(example_input_batch, encoder.initialize_hidden_state())
for _label, _tensor in (
        ('Encoder output shape: (batch size, sequence length, units)', sample_output),
        ('Encoder Hidden state shape: (batch size, units)', sample_hidden),
):
    print(_label, _tensor.shape)

Encoder output shape: (batch size, sequence length, units) (64, 16, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [14]:
# Additive (Bahdanau) attention
class BahdanauAttention(tf.keras.layers.Layer):
    """Scores each step of ``values`` against ``query`` and returns their weighted sum.

    Args:
        units: Width of the two dense projections applied before the score.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``.

        ``attention_weights`` is a softmax over axis 1 of ``values``;
        ``context_vector`` is ``values`` weighted by it and summed over axis 1.
        """
        # Add an axis at position 1 so the query broadcasts against every step
        expanded_query = tf.expand_dims(query, 1)
        projected_query = self.W1(expanded_query)
        projected_values = self.W2(values)
        score = self.V(tf.nn.tanh(projected_query + projected_values))

        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

# Try the attention layer on the encoder's sample state and outputs
attention_layer = BahdanauAttention(units=10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

for _label, _tensor in (
        ("Attention result shape: (batch size, units)", attention_result),
        ("Attention weights shape: (batch size, sequence_length, 1)", attention_weights),
):
    print(_label, _tensor.shape)

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch size, sequence_length, 1) (64, 16, 1)


In [15]:
# Decoder: attention over the encoder outputs, then embedding + GRU + dense projection
class Decoder(tf.keras.Model):
    """Single-step decoder with Bahdanau attention.

    Args:
        vocab_size: Size of the embedding table and of the output projection.
        embedding_dim: Width of each embedding vector.
        dec_units: Number of GRU units; also the width of the attention projections.
        batch_sz: Stored on the instance; not used by ``call``.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Attention over the encoder outputs
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Return ``(logits, state, attention_weights)`` for one decoding step.

        NOTE(review): ``hidden`` is used only as the attention query; the GRU
        is called without an ``initial_state`` - confirm this is intended.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # Prepend the context vector to the embedded input along the feature axis
        embedded = self.embedding(x)
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        output, state = self.gru(gru_input)

        # Merge the leading axes so the dense layer sees one row per step
        flat_output = tf.reshape(output, (-1, output.shape[2]))
        logits = self.fc(flat_output)

        return logits, state, attention_weights


decoder = Decoder(
    vocab_size=vocab_tar_size,
    embedding_dim=embedding_dim,
    dec_units=units,
    batch_sz=BATCH_SIZE,
)

In [16]:
# Run the decoder once on a random single-step input and report the output shape
dummy_step_input = tf.random.uniform((BATCH_SIZE, 1))
sample_decoder_output, _, _ = decoder(dummy_step_input, sample_hidden, sample_output)
print('Decoder output shape: (batch_size, vocab size)', sample_decoder_output.shape)

Decoder output shape: (batch_size, vocab size) (64, 30)


In [17]:
# Adam optimizer and an unreduced cross-entropy (one loss value per element)
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)


def loss_function(real, pred):
    """Return the mean cross-entropy with positions where ``real == 0`` zeroed.

    The mean is taken over every position, including the zeroed ones.
    """
    per_element = loss_object(real, pred)
    is_token = tf.math.not_equal(real, 0)
    per_element = per_element * tf.cast(is_token, dtype=per_element.dtype)
    return tf.reduce_mean(per_element)

In [18]:
# Training step: one forward/backward pass over a single batch
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one optimization step and return the reported batch loss.

    Uses the module-level ``encoder``, ``decoder``, ``optimizer``,
    ``targ_lang`` and ``BATCH_SIZE``.

    Args:
        inp: Batch of input id sequences fed to the encoder.
        targ: Batch of target id sequences; axis 1 is iterated step by step.
        enc_hidden: Initial hidden state passed to the encoder.

    Returns:
        The summed per-step loss divided by ``targ.shape[1]``.
    """
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        # The decoder starts from the encoder's final state
        dec_hidden = enc_hidden
        # First decoder input: the id of '\t', the marker that `noise`
        # prepends to every target text, repeated for each row of the batch
        dec_input = tf.expand_dims([targ_lang.word_index['\t']] * BATCH_SIZE, 1)

        # Step through the target from position 1 onwards
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)  # Teacher forcing: feed the true token, not the prediction

    # Reported loss divides by the full target length, although the loop
    # above runs targ.shape[1] - 1 steps
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    # Gradients are taken of the summed loss, not of batch_loss
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [19]:
# Training loop: every epoch starts from a zero encoder state and runs steps_per_epoch batches
EPOCHS = 2
for epoch in range(EPOCHS):
    epoch_number = epoch + 1
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        # Progress report every 100 batches
        if not batch % 100:
            print(f'Epoch {epoch_number} Batch {batch} Loss {batch_loss.numpy():.4f}')

    print(f'Epoch {epoch_number} Loss {total_loss/steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 0 Loss 2.5512
Epoch 1 Batch 100 Loss 1.6371
Epoch 1 Batch 200 Loss 0.8117
Epoch 1 Batch 300 Loss 0.5870
Epoch 1 Loss 1.1844
Time taken for 1 epoch 758.07 sec

Epoch 2 Batch 0 Loss 0.2411
Epoch 2 Batch 100 Loss 0.3191
Epoch 2 Batch 200 Loss 0.1944
Epoch 2 Batch 300 Loss 0.2445
Epoch 2 Loss 0.3206
Time taken for 1 epoch 727.31 sec



### Note: The loss shown here is still high because these models were not trained on the full dataset.
### A system with more RAM can be used to train much better models.

In [26]:
# Save the encoder and the decoder, each to its own `.keras` file
for _model, _filename in (
        (encoder, 'encoder_model.keras'),
        (decoder, 'decoder_model.keras'),
):
    _model.save(_filename)