In [None]:
import io
import json
import numpy as np
import pandas as pd
import random
import re
import tensorflow as tf
import unicodedata

from google.colab import files
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Download the dataset
!wget https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt
!wget https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_val.txt
!wget https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_test.txt

--2024-09-27 16:41:44--  https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5518306 (5.3M) [text/plain]
Saving to: ‘hun_eng_pairs_train.txt.1’


2024-09-27 16:41:44 (84.3 MB/s) - ‘hun_eng_pairs_train.txt.1’ saved [5518306/5518306]

--2024-09-27 16:41:44--  https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_val.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 646226 (63

In [None]:
def _read_lines(path):
  """Return every line of the text file at `path` with trailing whitespace removed."""
  # The Hungarian side of each pair contains accented characters, so decode
  # explicitly as UTF-8 instead of relying on the platform's default encoding.
  with open(path, encoding="utf-8") as file:
    return [line.rstrip() for line in file]


# One "<hungarian><sep><english>" string per line in each split.
train = _read_lines('hun_eng_pairs_train.txt')
val = _read_lines('hun_eng_pairs_val.txt')
test = _read_lines('hun_eng_pairs_test.txt')

In [None]:
# Peek at the first raw line of every split.
for split in (train, val, test):
  print(split[0])

Teszek rá, mit mondasz!<sep>I don't care what you say.
Abbahagyhatom, ha zavar.<sep>If it bothers you, I can stop doing this.
Abbahagyhatom, ha zavar.<sep>If it bothers you, I can stop doing this.


In [None]:
SEPERATOR = "<sep>"

# Each training line is "<source><sep><target>": cut it once, then collect
# the source side (index 0) and the target side (index 1) separately.
train_pairs = [sample.split(SEPERATOR) for sample in train]
x_train = [pair[0] for pair in train_pairs]
y_train = [pair[1] for pair in train_pairs]

In [None]:
# First source sentence followed by its target sentence, one per line.
print(x_train[0], y_train[0], sep="\n")

Teszek rá, mit mondasz!
I don't care what you say.


In [None]:
# Unicode normalization
def normalize_unicode(s):
  """Return `s` with combining accent marks removed.

  The string is decomposed with NFD so that an accented letter becomes a base
  letter followed by combining marks; every character whose Unicode category
  is 'Mn' (nonspacing mark) is then dropped.
  """
  decomposed = unicodedata.normalize('NFD', s)
  base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(base_chars)

def preprocess_sentence(s):
  """Clean one raw sentence before tokenization.

  Steps, in order: strip accents via normalize_unicode, put a space on both
  sides of each of ? . ! , ¿, collapse runs of spaces, and trim both ends.
  """
  without_accents = normalize_unicode(s)
  # Surround the listed punctuation marks with spaces so they split off as separate words.
  spaced_punctuation = re.sub(r"([?.!,¿])", r" \1 ", without_accents)
  # The character class holds both the double quote and the space, so any run
  # of those characters (quotes included) is replaced by a single space.
  single_spaced = re.sub(r'[" "]+', " ", spaced_punctuation)
  return single_spaced.strip()

In [None]:
# Apply the sentence cleaner to both sides of the training pairs.
x_train_preprocessed = list(map(preprocess_sentence, x_train))
y_train_preprocessed = list(map(preprocess_sentence, y_train))

In [None]:
def tag_target_sentences(sentences):
  """Return a new list where each sentence is wrapped as '<sos> sentence <eos>'.

  The markers are joined to the sentence with single spaces.
  """
  return [' '.join(('<sos>', sentence, '<eos>')) for sentence in sentences]

In [None]:
# Wrap every cleaned target sentence in <sos> ... <eos> markers.
y_train_preprocessed_tagged = tag_target_sentences(y_train_preprocessed)

In [None]:
# Create tokenizers
# Characters stripped by both tokenizers. The set deliberately leaves out
# < > ! , . ? so the <sos>/<eos> markers and the punctuation that
# preprocess_sentence spaced apart are kept as tokens.
TOKENIZER_FILTERS = '"#$%&()*+-/:;=@[\\]^_`{|}~\t\n'

source_lang_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    oov_token="<oov>",
    filters=TOKENIZER_FILTERS,
)
target_lang_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    oov_token="<oov>",
    filters=TOKENIZER_FILTERS,
)

# Vocabularies are built from the training split only.
source_lang_tokeniser.fit_on_texts(x_train_preprocessed)
target_lang_tokeniser.fit_on_texts(y_train_preprocessed_tagged)

In [None]:
# Turn each cleaned source sentence into its sequence of token ids from the source tokenizer.
train_encoder_input_sequences = source_lang_tokeniser.texts_to_sequences(x_train_preprocessed)

In [None]:
def generate_decoder_inputs_targets(sentences, tokenizer):
  """Build the decoder input and target id sequences for `sentences`.

  Each sentence is converted with `tokenizer.texts_to_sequences`. The decoder
  input is that sequence without its last id and the target is the same
  sequence without its first id, so target position t holds the token that
  follows input position t.

  Returns:
    (decoder_input_sequences, decoder_target_sequences), two lists with one
    entry per sentence.
  """
  token_id_sequences = tokenizer.texts_to_sequences(sentences)

  decoder_input_sequences = [ids[:-1] for ids in token_id_sequences]
  decoder_target_sequences = [ids[1:] for ids in token_id_sequences]

  return decoder_input_sequences, decoder_target_sequences

In [None]:
# NOTE(review): despite the `test_` prefix, this call is fed the tagged *training* targets
# (the same arguments as the train_decoder_* call in the following cell), and no cell
# visible here reads test_decoder_input / test_decoder_target afterwards — confirm it is still needed.
test_decoder_input, test_decoder_target = generate_decoder_inputs_targets(y_train_preprocessed_tagged, target_lang_tokeniser)

In [None]:
# Decoder inputs (sequence minus last id) and targets (sequence minus first id) for the training targets.
train_decoder_input_sequences, train_decoder_target_sequences = generate_decoder_inputs_targets(y_train_preprocessed_tagged, target_lang_tokeniser)

In [None]:
# Longest source / target id sequence in the training split; used as the padding lengths.
MAX_ENC_LEN = max(map(len, train_encoder_input_sequences))
MAX_DEC_LEN = max(map(len, train_decoder_target_sequences))

print(MAX_ENC_LEN, MAX_DEC_LEN, sep="\n")

37
34


In [None]:
# Right-pad every training sequence: encoder inputs to MAX_ENC_LEN,
# decoder inputs and targets to MAX_DEC_LEN.
(train_encoder_input_sequences_padded,
 train_decoder_input_sequences_padded,
 train_decoder_target_sequences_padded) = (
    pad_sequences(sequences, maxlen=length, padding='post')
    for sequences, length in (
        (train_encoder_input_sequences, MAX_ENC_LEN),
        (train_decoder_input_sequences, MAX_DEC_LEN),
        (train_decoder_target_sequences, MAX_DEC_LEN),
    )
)

In [None]:
def preprocess_dataset(dataset, SEPERATOR="<sep>"):
  """Run raw "<source><sep><target>" lines through the full input pipeline.

  Uses the module-level tokenizers (source_lang_tokeniser / target_lang_tokeniser)
  and padding lengths (MAX_ENC_LEN / MAX_DEC_LEN), so it must be called after
  those have been created.

  Args:
    dataset: iterable of strings, each containing SEPERATOR between source and target.
    SEPERATOR: the delimiter between the two sides of a line.

  Returns:
    (encoder_input, decoder_input, decoder_target) as post-padded arrays.
  """
  pairs = [sample.split(SEPERATOR) for sample in dataset]

  source_clean = [preprocess_sentence(pair[0]) for pair in pairs]
  target_clean = [preprocess_sentence(pair[1]) for pair in pairs]
  target_tagged = tag_target_sentences(target_clean)

  encoder_ids = source_lang_tokeniser.texts_to_sequences(source_clean)
  decoder_input_ids, decoder_target_ids = generate_decoder_inputs_targets(target_tagged, target_lang_tokeniser)

  # NOTE(review): no `truncating` argument is given, so a sequence longer than the
  # training maxima is shortened by pad_sequences' default policy — confirm that is intended.
  def _pad(sequences, length):
    return pad_sequences(sequences, maxlen=length, padding='post')

  return (
      _pad(encoder_ids, MAX_ENC_LEN),
      _pad(decoder_input_ids, MAX_DEC_LEN),
      _pad(decoder_target_ids, MAX_DEC_LEN),
  )

In [None]:
# Run the validation lines through the same pipeline; preprocess_dataset reuses the
# tokenizers fitted on the training split and the training padding lengths.
val_encoder_input_sequences_padded, val_decoder_input_sequences_padded, val_decoder_target_sequences_padded = preprocess_dataset(val)

In [None]:
# Vocabulary sizes handed to the Encoder/Decoder constructors.
# The <oov> token is already an entry of word_index; the extra slot covers index 0,
# which the tokenizer never assigns to a word and which serves as the padding id here
# (loss_func masks targets equal to 0).
source_vocab_size = len(source_lang_tokeniser.word_index) + 1 # +1 for the reserved index 0 (padding), not for <oov>
target_vocab_size = len(target_lang_tokeniser.word_index) + 1

embedding_dim = 128   # size of each token embedding vector
hidden_units = 256    # LSTM state size, also the width of the attention Dense layers
batch_size = 32       # examples per batch in the tf.data pipelines
# NOTE(review): dropout_rate is not referenced by any cell visible here — confirm whether it is still needed.
dropout_rate = 0.2

In [None]:
class Encoder(tf.keras.Model):
  """Source-side encoder: token ids -> embeddings -> single LSTM layer.

  Calling the model returns three tensors: the LSTM output at every time
  step, plus the final hidden state and the final cell state.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_units):
    super().__init__()

    self.embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name="encoder_embedding_layer",
    )
    # return_sequences exposes the per-step outputs (consumed by the attention layer),
    # return_state exposes the final states (used to initialise the decoder).
    self.lstm = tf.keras.layers.LSTM(
        units=hidden_units,
        return_sequences=True,
        return_state=True,
        name="encoder_lstm_layer",
    )

  def call(self, input):
    embedded_tokens = self.embedding(input)
    per_step_outputs, final_hidden_state, final_cell_state = self.lstm(embedded_tokens)
    return per_step_outputs, final_hidden_state, final_cell_state

In [None]:
# Stand-alone encoder instance used by the shape check in the next cell
# (the training cell further down constructs its own `encoder`).
source_encoder = Encoder(source_vocab_size, embedding_dim, hidden_units)

In [None]:
# Smoke test: push the first three padded training sentences through the encoder
# and print the shape of each returned tensor.
sample_input = train_encoder_input_sequences_padded[:3]
sample_output_sequences, sample_hidden_state, sample_cell_state = source_encoder(sample_input)

for sample_tensor in (sample_output_sequences, sample_hidden_state, sample_cell_state):
  print(sample_tensor.shape)

(3, 37, 256)
(3, 256)
(3, 256)


In [None]:
class LuongAttention(tf.keras.Model):
  """Multiplicative attention over the encoder outputs.

  `call` takes the pair [encoder_output_sequences, decoder_output] and returns
  (context_vector, attn_weights). Scores are the product of the decoder output
  with a Dense projection of the encoder outputs; they are soft-maxed over the
  last axis and used to take a weighted sum of the encoder outputs.
  """

  def __init__(self, hidden_units):
    super().__init__()

    self.W_scoring_function = tf.keras.layers.Dense(
        units=hidden_units,
        name="W_scoring_function",
    )

  def call(self, inputs):
    encoder_output_sequences, decoder_output = inputs

    # Project the encoder outputs, then score each one against the decoder output.
    projected_encoder_outputs = self.W_scoring_function(encoder_output_sequences)
    scores = tf.matmul(decoder_output, projected_encoder_outputs, transpose_b=True)

    # Normalise over the last axis, i.e. across encoder positions.
    weights = tf.keras.activations.softmax(scores, axis=-1)

    # Attention-weighted sum of the (unprojected) encoder outputs.
    context = tf.matmul(weights, encoder_output_sequences)
    return context, weights

In [None]:
class Decoder(tf.keras.Model):
  """Target-side decoder with Luong attention, run one time step per call.

  `call` takes [decoder_input, encoder_output_sequences, hidden_state, cell_state]
  and returns (logits, decoder_hidden_state, decoder_cell_state, attention_weights).
  Axis 1 of both the LSTM output and the context vector is squeezed away, so
  the decoder input is expected to carry exactly one time step.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_units):
    super().__init__()

    self.attention = LuongAttention(hidden_units)
    # Combines [context ; LSTM output] through a tanh layer.
    self.w = tf.keras.layers.Dense(
        units=hidden_units,
        activation="tanh",
        name="attention_tanh_output_layer",
    )
    # Maps the combined vector to one logit per target-vocabulary entry.
    self.dense = tf.keras.layers.Dense(units=vocab_size, name="decoder_output_layer")

    self.embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name="decoder_embedding_layer",
    )
    self.lstm = tf.keras.layers.LSTM(
        units=hidden_units,
        return_sequences=True,
        return_state=True,
        name="decoder_lstm_layer",
    )

  def call(self, inputs):
    decoder_input, encoder_output_sequences, hidden_state, cell_state = inputs

    embedded_tokens = self.embedding(decoder_input)
    decoder_output, decoder_hidden_state, decoder_cell_state = self.lstm(
        embedded_tokens,
        initial_state=[hidden_state, cell_state],
    )

    context_vector, attention_weights = self.attention(
        [encoder_output_sequences, decoder_output]
    )

    # Drop the length-1 time axis from both tensors and join them feature-wise.
    context_and_output = tf.concat(
        [tf.squeeze(context_vector, 1), tf.squeeze(decoder_output, 1)],
        axis=1,
    )
    attentional_vector = self.w(context_and_output)
    logits = self.dense(attentional_vector)

    return logits, decoder_hidden_state, decoder_cell_state, attention_weights

In [None]:
# Stand-alone decoder instance used by the single-step check in the next cell
# (the training cell further down constructs its own `decoder`).
target_decoder = Decoder(target_vocab_size, embedding_dim, hidden_units)

In [None]:
# Smoke test: feed one decoder time step (column 1 of the first three padded
# decoder inputs, reshaped to (3, 1)) together with the sample encoder results.
sample_decoder_input = tf.expand_dims(train_decoder_input_sequences_padded[:3, 1], 1)
logits, decoder_hidden_state, decoder_cell_state, attention_weights = target_decoder(
    [sample_decoder_input, sample_output_sequences, sample_hidden_state, sample_cell_state]
)

In [None]:
class LanguageTranslator(tf.keras.Model):
  """Couples an encoder and a decoder and defines the custom training step.

  The decoder is unrolled manually, one target position at a time, and is fed
  the token from `decoder_input_sequences` at every step (teacher forcing).
  """

  def __init__(self, source_encoder, target_decoder):
    super().__init__()

    self.source_encoder = source_encoder
    self.target_decoder = target_decoder

  # This method will be called by model.fit for each batch.
  @tf.function
  def train_step(self, inputs):
    encoder_input_sequences, decoder_input_sequences, decoder_target_sequences = inputs
    num_steps = decoder_target_sequences.shape[1]

    with tf.GradientTape() as tape:
      encoder_output_sequences, state_h, state_c = self.source_encoder(encoder_input_sequences)

      total_loss = 0

      # Walk over the target positions; the decoder starts from the encoder's
      # final states and then carries its own states from step to step.
      for t in range(num_steps):
        # One target-language token per example, shaped (batch, 1).
        step_input = tf.expand_dims(decoder_input_sequences[:, t], 1)
        logits, state_h, state_c, _ = self.target_decoder(
            [step_input, encoder_output_sequences, state_h, state_c]
        )

        # Loss of this position across the batch, added to the running sum.
        total_loss += self.loss(decoder_target_sequences[:, t], logits)

    # Every forward op has been recorded; differentiate and update both sub-models.
    variables = self.source_encoder.trainable_variables + self.target_decoder.trainable_variables
    gradients = tape.gradient(total_loss, variables)
    self.optimizer.apply_gradients(zip(gradients, variables))

    # Report the summed loss divided by the number of decoder positions.
    return {"loss": total_loss / num_steps}


In [None]:
def loss_func(targets, logits):
  """Sparse categorical cross-entropy on logits that ignores padded targets.

  Positions whose target id is 0 get a sample weight of 0.0 and all other
  positions a weight of 1.0.
  """
  is_real_token = tf.math.not_equal(targets, 0)
  padding_mask = tf.cast(is_real_token, tf.float32)

  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  return cross_entropy(targets, logits, sample_weight=padding_mask)

In [None]:
# Batched (encoder_input, decoder_input, decoder_target) pipelines.
# drop_remainder=True discards the final partial batch so every batch has batch_size rows.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((
        train_encoder_input_sequences_padded,
        train_decoder_input_sequences_padded,
        train_decoder_target_sequences_padded,
    ))
    .batch(batch_size, drop_remainder=True)
)

val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((
        val_encoder_input_sequences_padded,
        val_decoder_input_sequences_padded,
        val_decoder_target_sequences_padded,
    ))
    .batch(batch_size, drop_remainder=True)
)

In [None]:
# Fresh encoder/decoder pair for training, wrapped in the translator model
# and compiled with Adam and the padding-aware loss.
optimizer = tf.keras.optimizers.Adam()
encoder = Encoder(vocab_size=source_vocab_size, embedding_dim=embedding_dim, hidden_units=hidden_units)
decoder = Decoder(vocab_size=target_vocab_size, embedding_dim=embedding_dim, hidden_units=hidden_units)

translator = LanguageTranslator(source_encoder=encoder, target_decoder=decoder)
translator.compile(loss=loss_func, optimizer=optimizer)

In [None]:
# Number of full passes over train_dataset.
epochs = 10

# Only the training pipeline is passed to fit(); val_dataset is not used by this call.
history = translator.fit(train_dataset, epochs=epochs)

Epoch 1/10
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 66ms/step - loss: 0.8876
Epoch 2/10
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 64ms/step - loss: 0.6152
Epoch 3/10
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 64ms/step - loss: 0.4601
Epoch 4/10
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 64ms/step - loss: 0.3447
Epoch 5/10
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 64ms/step - loss: 0.2615
Epoch 6/10
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 64ms/step - loss: 0.2016
Epoch 7/10
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 64ms/step - loss: 0.1583
Epoch 8/10
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 64ms/step - loss: 0.1270
Epoch 9/10
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 64ms/step - loss: 0.1042
Epoch 10/10
[1m2770/2770[0m [32m━━━━━━━━━━━

In [None]:
# Save the encoder and decoder weights, then download them from the Colab VM.
# `files` is the google.colab helper already imported in the notebook's first
# cell, so it is not imported a second time here.
models_by_weights_path = {
    'encoder.weights.h5': encoder,
    'decoder.weights.h5': decoder,
}

for weights_path, model in models_by_weights_path.items():
  model.save_weights(weights_path)

for weights_path in models_by_weights_path:
  files.download(weights_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def translate(encoder, decoder, sentence, maxLen=30):
  """Greedily translate one source-language sentence.

  Relies on the module-level source_lang_tokeniser, target_lang_tokeniser and
  MAX_ENC_LEN created during training-data preparation.

  Args:
    encoder: callable returning (output_sequences, hidden_state, cell_state)
      for a batch of padded source token ids.
    decoder: callable taking [decoder_input, encoder_output_sequences,
      hidden_state, cell_state] and returning
      (logits, hidden_state, cell_state, attention_weights) for one time step.
    sentence: raw source sentence as a single string.
    maxLen: maximum number of words to generate.

  Returns:
    List of predicted target words, without the <sos>/<eos> markers.
  """
  x_preprocessed = preprocess_sentence(sentence)

  # texts_to_sequences expects a list of texts. Given a bare string it iterates
  # over the characters and returns one sequence per character, so the sentence
  # must be wrapped in a list to obtain a single-row batch.
  encoder_sequences = source_lang_tokeniser.texts_to_sequences([x_preprocessed])
  encoder_input_sequences_padded = pad_sequences(encoder_sequences, maxlen=MAX_ENC_LEN, padding='post')

  encoder_output_sequences, current_hidden_state, current_cell_state = encoder(encoder_input_sequences_padded)

  # Decoding starts from the <sos> token and the encoder's final states.
  current_token_id = target_lang_tokeniser.word_index['<sos>']
  translated_sentence = []

  for _step in range(maxLen):
    # One token for one sentence: shape (1, 1), integer ids for the embedding lookup.
    decoder_input = np.array([[current_token_id]], dtype=np.int32)
    logits, current_hidden_state, current_cell_state, _attention = decoder(
        [decoder_input, encoder_output_sequences, current_hidden_state, current_cell_state]
    )

    # Greedy choice: the highest-scoring vocabulary entry becomes the next input.
    current_token_id = int(tf.argmax(logits, axis=-1).numpy()[0])

    # Index 0 (padding) has no entry in index_word; treat it like <eos>
    # instead of failing with a KeyError.
    current_word = target_lang_tokeniser.index_word.get(current_token_id)
    if current_word is None or current_word == '<eos>':
      break

    translated_sentence.append(current_word)

  return translated_sentence

In [None]:
# Translate two sample Hungarian sentences with the trained model and show the word lists.
hungarian_sentence1, hungarian_sentence2 = "Teszek rá, mit mondasz!", "Abbahagyhatom, ha zavar."

english_sentence1, english_sentence2 = (
    translate(encoder, decoder, hungarian_sentence)
    for hungarian_sentence in (hungarian_sentence1, hungarian_sentence2)
)

print(english_sentence1, english_sentence2, sep="\n")