# Assignment 6

Import the necessary libraries



In [81]:
from tensorflow import keras
from keras import ops
import string
import re
import numpy as np
import tensorflow.keras.layers as layers
import tensorflow as tf
import random

## Load Data
the the Finnish-Enlish translation dataset

In [82]:
text_file = "../Datasets/fin.txt"

with open(text_file, encoding='utf-8') as f:
    lines = f.read().split("\n")[:-1]
text_pairs = []
for line in lines:
    english, finnish, rest = line.split("\t")
    finnish = "[start] " + finnish + " [end]"
    text_pairs.append((finnish, english))

## Preprocess Data

Shuffle the dataset and split it into training, validation, and test sets. The portions of the dataset are 70% training, 15% validation, and 15% test.

In [83]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

Strip the punctuation from the text and remove "[" and "]" from the punctuation list. Define a custom standardization function that converts the text to lowercase and removes the punctuation.

In [84]:
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")

## Vectorization

Define the vocabulary size as 15 000 and sequence length as 40. Create two `TextVectorization` layers, one for the source language (Finnish) and one for the target language (English). The `TextVectorization` layer will be used to convert the text into integer sequences. The `adapt` method is called on both layers to fit them to the training data.

In [85]:
vocab_size = 15000
sequence_length = 40

source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
    standardize=custom_standardization,
)
target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_finnish_texts = [pair[0] for pair in train_pairs]
train_english_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_finnish_texts)
target_vectorization.adapt(train_english_texts)

## Creation Of The Dataset

The batch size is defined to be 64. Define a function `format_dataset` that takes the Finnish and English text as input and returns a dictionary with the Finnish text and the English text shifted by one position. The `make_dataset` function creates a TensorFlow dataset from the pairs of Finnish and English text, shuffles it, and caches it for performance.

In [None]:
batch_size = 64

def format_dataset(fin, eng):
    fin = source_vectorization(fin)
    eng = target_vectorization(eng)
    return ({
                "finnish": fin,
                "english": eng[:, :-1],
            }, eng[:, 1:])

def make_dataset(pairs):
    fin_texts, eng_texts = zip(*pairs)
    fin_texts = list(fin_texts)
    eng_texts = list(eng_texts)
    dataset = tf.data.Dataset.from_tensor_slices((fin_texts, eng_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

Print the shapes of the inputs and target in the dataset.

In [87]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['finnish'].shape: {inputs['finnish'].shape}")
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['finnish'].shape: (64, 40)
inputs['english'].shape: (64, 40)
targets.shape: (64, 40)


2025-04-26 12:15:16.769401: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Positional Embedding Layer
The `PositionalEmbedding` layer is defined to add positional information to the token embeddings. The `call` method computes the token and position embeddings and returns their sum. The `compute_mask` method creates a mask for the input sequences so that the padding tokens are ignored during training. The `get_config` method returns the configuration of the layer.

In [88]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        positions = tf.range(start=0, limit=self.sequence_length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return ops.not_equal(inputs, 0)

    def get_config(self):
        config = super().get_config()
        config.update({
        "output_dim": self.output_dim,
        "sequence_length": self.sequence_length,
        "input_dim": self.input_dim,
        })
        return config

## Encoder and Decoder Layers

The `TransformerEncoder` is defined to create the encoder part of the transformer model. It uses multi-head attention and a feed-forward network. The `call` method computes the attention output and applies layer normalization.

In [89]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

`TransformerDecoder` is defined to create the decoder part of the transformer model. It uses multi-head attention and a feed-forward network. The `call` method computes the attention output and applies layer normalization. The `get_causal_attention_mask` method creates a causal attention mask for the decoder.

In [90]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim)])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        padding_mask = None
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        attention_output_1 = self.attention_1(query=inputs, value=inputs, key=inputs, attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(query=attention_output_1, value=encoder_outputs, key=encoder_outputs, attention_mask=padding_mask)
        attention_output_2 = self.layernorm_2(attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)


## Creating The Transformer Model

The transformer model is created by defining the encoder and decoder inputs. The `PositionalEmbedding` layer is applied to both the encoder and decoder inputs. The `TransformerEncoder` and `TransformerDecoder` layers are applied to the inputs. Finally, dropout of 50% and a dense layer with softmax activation is added to the decoder outputs.

In [91]:
embed_dim = 256
dense_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="finnish")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)

transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)



## Compile And Train The Model

The model is compiled with the RMSprop optimizer and sparse categorical crossentropy loss. The model is trained for 30 epochs with a batch size of 64. The validation data is used to evaluate the model during training.

In [92]:
transformer.compile(
 optimizer="rmsprop",
 loss="sparse_categorical_crossentropy",
 metrics=["accuracy"])
transformer.fit(train_ds, epochs=30, validation_data=val_ds)

Epoch 1/30


I0000 00:00:1745658918.953806    1618 service.cc:152] XLA service 0x2ca9fe40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745658918.953951    1618 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 SUPER, Compute Capability 8.9
2025-04-26 12:15:19.090850: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-04-26 12:15:19.212824: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
















































I0000 00:00:1745658931.840450    1618 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m380/791[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m25s[0m 61ms/step - accuracy: 0.8588 - loss: 6.0763

2025-04-26 12:15:49.667080: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


















































[1m661/791[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m9s[0m 73ms/step - accuracy: 0.8535 - loss: 5.7258

2025-04-26 12:16:18.233414: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
2025-04-26 12:16:20.404425: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 63ms/step - accuracy: 0.8489 - loss: 5.6177 - val_accuracy: 0.6241 - val_loss: 4.5230
Epoch 2/30
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 46ms/step - accuracy: 0.7841 - loss: 4.4282 - val_accuracy: 0.8215 - val_loss: 3.7718
Epoch 3/30
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 48ms/step - accuracy: 0.7467 - loss: 3.6896 - val_accuracy: 0.7917 - val_loss: 3.0303
Epoch 4/30
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 48ms/step - accuracy: 0.7614 - loss: 2.9850 - val_accuracy: 0.8322 - val_loss: 2.7407
Epoch 5/30
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 49ms/step - accuracy: 0.7689 - loss: 2.5443 - val_accuracy: 0.7849 - val_loss: 2.5761
Epoch 6/30
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 48ms/step - accuracy: 0.7864 - loss: 2.2523 - val_accuracy: 0.8555 - val_loss: 2.5222
Epoch 7/30
[1m791/791[0m 

<keras.src.callbacks.history.History at 0x7f0f78316e10>

## Evaluate The Model

The model is evaluated on the test set. The `decode_sequence` function is defined to decode the input sentence and generate the translated output. The function uses the trained transformer model to predict the output sequence. Few examples of translations are printed that show that the model is able to translate some sentences accurately, but some translations are short and not accurate.

In [101]:
eng_vocab = target_vectorization.get_vocabulary()
eng_index_lookup = dict(zip(range(len(eng_vocab)), eng_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization(
            [decoded_sentence]
        )[:, :-1]
        predictions = transformer(
            [tokenized_input_sentence, tokenized_target_sentence]
        )
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = eng_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

test_fin_texts = [pair[0] for pair in test_pairs]
for i in range(10):
    print("Translation " + str(i + 1) + ":")
    input_sentence = random.choice(test_fin_texts)
    print(input_sentence)
    print(decode_sequence(input_sentence))
    print("")

Translation 1:
[start] Tomi hämmentyi. [end]
[start] to my dog                 

Translation 2:
[start] Voisin lähteä pizzalle nyt heti. [end]
[start] can i leave right away  now     away   to from    

Translation 3:
[start] Minä alan väsyä. [end]
[start] are you starting                 

Translation 4:
[start] Jatketaan peliä lounaan jälkeen. [end]
[start] the game after lunch                

Translation 5:
[start] Minkä takia ostit tämän kalliin sanakirjan? [end]
[start] why did you buy this expensive dictionary for            

