In [1]:
!pip install miditok
!pip install symusic
!pip install torch
!pip install transformers
!pip install accelerate
!pip install evaluate
!pip install tensorboard
!pip install scikit-learn
!pip install pretty_midi


!wget https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip
!unzip 'maestro-v3.0.0-midi.zip'
!rm 'maestro-v3.0.0-midi.zip'
!mv 'maestro-v3.0.0' 'Maestro'

from copy import deepcopy
from pathlib import Path
from random import shuffle

from evaluate import load as load_metric
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from miditok.data_augmentation import augment_dataset
from torch import Tensor, argmax
from torch.utils.data import DataLoader
from torch.cuda import is_available as cuda_available, is_bf16_supported
from torch.backends.mps import is_available as mps_available
from transformers import AutoModelForCausalLM, MistralConfig, Trainer, TrainingArguments, GenerationConfig
from transformers.trainer_utils import set_seed
from tqdm import tqdm

Collecting miditok
  Downloading miditok-3.0.4-py3-none-any.whl.metadata (10 kB)
Collecting symusic>=0.5.0 (from miditok)
  Downloading symusic-0.5.5-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting pySmartDL (from symusic>=0.5.0->miditok)
  Downloading pySmartDL-1.3.4-py3-none-any.whl.metadata (2.8 kB)
Downloading miditok-3.0.4-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.2/157.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading symusic-0.5.5-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading pySmartDL-1.3.4-py3-none-any.whl (20 kB)
Installing collected packages: pySmartDL, symusic, miditok
Successfully installed miditok-3.0.4 pySmartDL-1.3.4 symusic-0.5.5
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metada

In [2]:
import json
import pretty_midi

In [3]:
import os
from tqdm import tqdm

def find_midi_files(directory):
    """Trova ricorsivamente tutti i file MIDI nella directory."""
    midi_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith((".mid", ".midi")):
                midi_files.append(os.path.join(root, file))
    return midi_files

def midi_to_note_representation(file_path):
    """Converte un file MIDI in una rappresentazione nota-durata."""
    try:
        midi_data = pretty_midi.PrettyMIDI(file_path)
        note_events = []

        for instrument in midi_data.instruments:
            for note in instrument.notes:
                # Convert pitch to note name
                note_name = pretty_midi.note_number_to_name(note.pitch)
                duration = note.end - note.start
                note_events.append(f"{note_name}-{duration:.1f}")

        return ", ".join(note_events)
    except Exception as e:
        print(f"Errore nella conversione del file {file_path}: {e}")
        return None

def create_dataset_from_midi(directory, output_file):
    """Crea un dataset JSON con la rappresentazione delle note dai file MIDI."""
    dataset = {}
    midi_files = find_midi_files(directory)

    for midi_file in tqdm(midi_files):
        note_representation = midi_to_note_representation(midi_file)
        if note_representation:
            dataset[midi_file] = note_representation

    # Salva il dataset in un file JSON
    with open(output_file, "w") as json_file:
        json.dump(dataset, json_file, indent=4)

    print(f"Dataset creato e salvato in {output_file}")

# Specifica la directory e il file di output
midi_directory = "./Maestro"
output_dataset_file = "midi_dataset.json"

create_dataset_from_midi(midi_directory, output_dataset_file)

100%|██████████| 1276/1276 [05:39<00:00,  3.76it/s]


Dataset creato e salvato in midi_dataset.json


In [4]:
dataset_file = "midi_dataset.json"

with open(dataset_file, "r") as json_file:
    dataset = json.load(json_file)

In [5]:
import numpy as np
maestro_dataset = list(dataset.values())

In [6]:
with open("midi_dataset_nolabel.json", "w") as json_file:
    json.dump(maestro_dataset[:10], json_file, indent=4)

In [7]:
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy # Change from keras to tensorflow.keras
from tensorflow.keras.optimizers import Adam # Change from keras to tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer # Change from keras to tensorflow.keras
from tensorflow.keras.utils import pad_sequences # Change from keras to tensorflow.keras
from tqdm import tqdm

In [36]:
# Global parameters
EPOCHS = 50
BATCH_SIZE = 32
DATA_PATH = "midi_dataset_nolabel.json"
MAX_POSITIONS_IN_POSITIONAL_ENCODING = 200

# Loss function and optimizer
sparse_categorical_crossentropy = SparseCategoricalCrossentropy(
    from_logits=True, reduction="none"
)
optimizer = Adam()


def train(train_dataset, transformer, epochs):
    """
    Trains the Transformer model on a given dataset for a specified number of epochs.

    Parameters:
        train_dataset (tf.data.Dataset): The training dataset.
        transformer (Transformer): The Transformer model instance.
        epochs (int): The number of epochs to train the model.
    """
    print("Training the model...")
    for epoch in tqdm(range(epochs)):
        total_loss = 0
        # Iterate over each batch in the training dataset
        for (batch, (input, target)) in enumerate(train_dataset):
            # Perform a single training step
            batch_loss = _train_step(input, target=target, transformer=transformer)
            total_loss += batch_loss
            #print(f"Epoch {epoch + 1} Batch {batch + 1} Loss {batch_loss.numpy()}")


@tf.function
def _train_step(input, target, transformer):
    """
    Performs a single training step for the Transformer model.

    Parameters:
        input (tf.Tensor): The input sequences.
        target (tf.Tensor): The target sequences.
        transformer (Transformer): The Transformer model instance.

    Returns:
        tf.Tensor: The loss value for the training step.
    """
    # Prepare the target input and real output for the decoder
    # Pad the sequences on the right by one position
    target_input = _right_pad_sequence_once(target[:, :-1])
    target_real = _right_pad_sequence_once(target[:, 1:])

    # Open a GradientTape to record the operations run
    # during the forward pass, which enables auto-differentiation
    with tf.GradientTape() as tape:
        # Forward pass through the transformer model
        # TODO: Add padding mask for encoder + decoder and look-ahead mask
        # for decoder
        predictions = transformer(input, target=target_input, training=True)

        # Compute loss between the real output and the predictions
        loss = _calculate_loss(real=target_real, pred=predictions)

    # Calculate gradients with respect to the model's trainable variables
    gradients = tape.gradient(loss, transformer.trainable_variables)

    # Apply gradients to update the model's parameters
    gradient_variable_pairs = zip(gradients, transformer.trainable_variables)
    optimizer.apply_gradients(gradient_variable_pairs)

    # Return the computed loss for this training step
    return loss


def _calculate_loss(real, pred):
    """
    Computes the loss between the real and predicted sequences.

    Parameters:
        real (tf.Tensor): The actual target sequences.
        pred (tf.Tensor): The predicted sequences by the model.

    Returns:
        average_loss (tf.Tensor): The computed loss value.
    """

    # Compute loss using the Sparse Categorical Crossentropy
    loss_ = sparse_categorical_crossentropy(real, pred)

    # Create a mask to filter out zeros (padded values) in the real sequences
    boolean_mask = tf.math.equal(real, 0)
    mask = tf.math.logical_not(boolean_mask)

    # Convert mask to the same dtype as the loss for multiplication
    mask = tf.cast(mask, dtype=loss_.dtype)

    # Apply the mask to the loss, ignoring losses on padded positions
    loss_ *= mask

    # Calculate average loss, excluding the padded positions
    total_loss = tf.reduce_sum(loss_)
    number_of_non_padded_elements = tf.reduce_sum(mask)
    average_loss = total_loss / number_of_non_padded_elements

    return average_loss


def _right_pad_sequence_once(sequence):
    """
    Pads a sequence with a single zero at the end.

    Parameters:
        sequence (tf.Tensor): The sequence to be padded.

    Returns:
        tf.Tensor: The padded sequence.
    """
    return tf.pad(sequence, [[0, 0], [0, 1]], "CONSTANT")

In [33]:
melody_preprocessor = MelodyPreprocessor(DATA_PATH, batch_size=BATCH_SIZE, max_length=MAX_POSITIONS_IN_POSITIONAL_ENCODING)
train_dataset = melody_preprocessor.create_training_dataset()
vocab_size = melody_preprocessor.number_of_tokens_with_padding

In [37]:
transformer_model = Transformer(
    num_layers=20,
    d_model=256,
    num_heads=8,
    d_feedforward=1024,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    max_num_positions_in_pe_encoder=MAX_POSITIONS_IN_POSITIONAL_ENCODING,
    max_num_positions_in_pe_decoder=MAX_POSITIONS_IN_POSITIONAL_ENCODING,
    dropout_rate=0.1,
)

train(train_dataset, transformer_model, EPOCHS)

Training the model...


100%|██████████| 50/50 [06:14<00:00,  7.48s/it]  


In [38]:
print("Generating a melody...")
melody_generator = MelodyGenerator(
    transformer_model, melody_preprocessor.tokenizer, max_length=MAX_POSITIONS_IN_POSITIONAL_ENCODING
)
start_sequence = ["C5-2.4", "D4-0.1", "A2-0.1", "E2-0.1"] # TODO: pick this random from 1 song in dataset
new_melody = melody_generator.generate(start_sequence)

Generating a melody...


In [39]:
def parse_note_string(note_string):
    """
    Converte una stringa di note in una lista di tuple (nota, durata).
    Esempio: "C4-1.0, G4-1.0" -> [("C4", 1.0), ("G4", 1.0)]
    """
    notes = []
    for note in note_string.split(", "):
        note_name, duration = note.split("-")
        notes.append((note_name, float(duration)))
    return notes

def create_midi_from_notes(note_string, output_file):
    """
    Crea un file MIDI a partire da una stringa di note.
    """
    # Parse la stringa in una lista di note
    notes = parse_note_string(note_string)

    # Crea un oggetto PrettyMIDI e uno strumento
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  # 0 = piano

    current_time = 0.0  # Tempo iniziale

    for note_name, duration in notes:
        # Converti la nota in un numero MIDI
        note_number = pretty_midi.note_name_to_number(note_name)
        # Crea la nota MIDI
        note = pretty_midi.Note(velocity=100, pitch=note_number,
                                start=current_time, end=current_time + duration)
        # Aggiungi la nota allo strumento
        instrument.notes.append(note)
        # Aggiorna il tempo corrente
        current_time += duration

    # Aggiungi lo strumento al MIDI
    midi.instruments.append(instrument)

    # Salva il file MIDI
    midi.write(output_file)
    print(f"File MIDI creato: {output_file}")


In [40]:
note_string = new_melody.replace(" ", ", ")
output_midi_file = "output.mid"
create_midi_from_notes(note_string, output_midi_file)

File MIDI creato: output.mid


In [8]:
"""
melody_generator.py

This script defines the MelodyGenerator class, which is responsible for generating
melodies using a trained Transformer model. The class offers functionality to produce
a sequence of musical notes, starting from a given seed sequence and extending it
to a specified maximum length.

The MelodyGenerator class leverages the trained Transformer model's ability to
predict subsequent notes in a melody based on the current sequence context. It
achieves this by iteratively appending each predicted note to the existing sequence
and feeding this extended sequence back into the model for further predictions.

This iterative process continues until the generated melody reaches the desired length
or an end-of-sequence token is predicted. The class utilizes a tokenizer to encode and
decode note sequences to and from the format expected by the Transformer model.

Key Components:
- MelodyGenerator: The primary class defined in this script, responsible for the
  generation of melodies.

Usage:
The MelodyGenerator class can be instantiated with a trained Transformer model
and an appropriate tokenizer. Once instantiated, it can generate melodies by
calling the `generate` method with a starting note sequence.

Note:
This class is intended to be used with a Transformer model that has been
specifically trained for melody generation tasks.
"""

import tensorflow as tf


class MelodyGenerator:
    """
    Class to generate melodies using a trained Transformer model.

    This class encapsulates the inference logic for generating melodies
    based on a starting sequence.
    """

    def __init__(self, transformer, tokenizer, max_length=50):
        """
        Initializes the MelodyGenerator.

        Parameters:
            transformer (Transformer): The trained Transformer model.
            tokenizer (Tokenizer): Tokenizer used for encoding melodies.
            max_length (int): Maximum length of the generated melodies.
        """
        self.transformer = transformer
        self.tokenizer = tokenizer
        self.max_length = max_length

    def generate(self, start_sequence):
        """
        Generates a melody based on a starting sequence.

        Parameters:
            start_sequence (list of str): The starting sequence of the melody.

        Returns:
            str: The generated melody.
        """
        input_tensor = self._get_input_tensor(start_sequence)

        num_notes_to_generate = self.max_length - len(input_tensor[0])

        for _ in range(num_notes_to_generate):
            predictions = self.transformer(
                input_tensor, target=input_tensor, training=False
            )
            predicted_note = self._get_note_with_highest_score(predictions)
            input_tensor = self._append_predicted_note(
                input_tensor, predicted_note
            )

        generated_melody = self._decode_generated_sequence(input_tensor)

        return generated_melody

    def _get_input_tensor(self, start_sequence):
        """
        Gets the input tensor for the Transformer model.

        Parameters:
            start_sequence (list of str): The starting sequence of the melody.

        Returns:
            input_tensor (tf.Tensor): The input tensor for the model.
        """
        input_sequence = self.tokenizer.texts_to_sequences([start_sequence])
        input_tensor = tf.convert_to_tensor(input_sequence, dtype=tf.int64)
        return input_tensor

    def _get_note_with_highest_score(self, predictions):
        """
        Gets the note with the highest score from the predictions.

        Parameters:
            predictions (tf.Tensor): The predictions from the model.

        Returns:
            predicted_note (int): The index of the predicted note.
        """
        latest_predictions = predictions[:, -1, :]
        predicted_note_index = tf.argmax(latest_predictions, axis=1)
        predicted_note = predicted_note_index.numpy()[0]
        return predicted_note

    def _append_predicted_note(self, input_tensor, predicted_note):
        """
        Appends the predicted note to the input tensor.

        Parameters:
            input_tensor (tf.Tensor): The input tensor for the model.

        Returns:
            (tf.Tensor): The input tensor with the predicted note
        """
        return tf.concat([input_tensor, [[predicted_note]]], axis=-1)

    def _decode_generated_sequence(self, generated_sequence):
        """
        Decodes the generated sequence of notes.

        Parameters:
            generated_sequence (tf.Tensor): Tensor with note indexes generated.

        Returns:
            generated_melody (str): The decoded sequence of notes.
        """
        generated_sequence_array = generated_sequence.numpy()
        generated_melody = self.tokenizer.sequences_to_texts(
            generated_sequence_array
        )[0]
        return generated_melody


In [30]:
"""
melody_preprocessor.py

This script defines the MelodyPreprocessor class, a utility for preparing melody
datasets for training in a sequence-to-sequence Transformer model. The class
focuses on processing melody data by tokenizing and encoding the melodies, and
subsequently creating TensorFlow datasets suitable for training sequence-to-sequence
models.

The MelodyPreprocessor handles the entire preprocessing pipeline including loading
melodies from a dataset file, parsing the melodies into individual notes, tokenizing
and encoding these notes, and forming input-target pairs for model training. It
also includes functionality for padding sequences to a uniform length.

Key Features:
- Tokenization and encoding of melodies.
- Dynamic calculation of maximum sequence length based on the dataset.
- Creation of input-target pairs for sequence-to-sequence training.
- Conversion of processed data into TensorFlow datasets.

Usage:
To use the MelodyPreprocessor, initialize it with the path to a dataset containing
melodies and the desired batch size. Then call `create_training_dataset` to prepare
the dataset for training a Transformer model.


Note:
This script is intended to be used with datasets containing melody sequences in a
specific format, where each melody is represented as a string of comma-separated
musical notes (pitch with octave + duration in quarter length).
"""


import json

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer


class MelodyPreprocessor:
    """
    A class for preprocessing melodies for a Transformer model.

    This class takes melodies, tokenizes and encodes them, and prepares
    TensorFlow datasets for training sequence-to-sequence models.
    """

    def __init__(self, dataset_path, batch_size=32, max_length=200):
        """
        Initializes the MelodyPreprocessor.

        Parameters:
            dataset_path (str): Path to the dataset file.
            max_melody_length (int): Maximum length of the sequences.
            batch_size (int): Size of each batch in the dataset.
        """
        self.dataset_path = dataset_path
        self.max_length=max_length
        self.batch_size = batch_size
        self.tokenizer = Tokenizer(filters="", lower=False, split=",")
        self.max_melody_length = None
        self.number_of_tokens = None

    @property
    def number_of_tokens_with_padding(self):
        """
        Returns the number of tokens in the vocabulary including padding.

        Returns:
            int: The number of tokens in the vocabulary including padding.
        """
        return self.number_of_tokens + 1

    def create_training_dataset(self):
        """
        Preprocesses the melody dataset and creates sequence-to-sequence
        training data.

        Returns:
            tf_training_dataset: A TensorFlow dataset containing input-target
                pairs suitable for training a sequence-to-sequence model.
        """
        dataset = self._load_dataset()
        parsed_melodies = [self._parse_melody(melody[:self.max_length]) for melody in dataset]
        tokenized_melodies = self._tokenize_and_encode_melodies(
            parsed_melodies
        )
        self._set_max_melody_length(tokenized_melodies)
        self._set_number_of_tokens()
        input_sequences, target_sequences = self._create_sequence_pairs(
            tokenized_melodies
        )
        tf_training_dataset = self._convert_to_tf_dataset(
            input_sequences, target_sequences
        )
        return tf_training_dataset

    def _load_dataset(self):
        """
        Loads the melody dataset from a JSON file.

        Returns:
            list: A list of melodies from the dataset.
        """
        with open(self.dataset_path, "r") as f:
            return json.load(f)

    def _parse_melody(self, melody_str):
        """
        Parses a single melody string into a list of notes.

        Parameters:
            melody_str (str): A string representation of a melody.

        Returns:
            list: A list of notes extracted from the melody string.
        """
        return melody_str.split(", ")

    def _tokenize_and_encode_melodies(self, melodies):
        """
        Tokenizes and encodes a list of melodies.

        Parameters:
            melodies (list): A list of melodies to be tokenized and encoded.

        Returns:
            tokenized_melodies: A list of tokenized and encoded melodies.
        """
        self.tokenizer.fit_on_texts(melodies)
        tokenized_melodies = self.tokenizer.texts_to_sequences(melodies)
        return tokenized_melodies

    def _set_max_melody_length(self, melodies):
        """
        Sets the maximum melody length based on the dataset.

        Parameters:
            melodies (list): A list of tokenized melodies.
        """
        self.max_melody_length = max([len(melody) for melody in melodies])

    def _set_number_of_tokens(self):
        """
        Sets the number of tokens based on the tokenizer.
        """
        self.number_of_tokens = len(self.tokenizer.word_index)

    def _create_sequence_pairs(self, melodies):
        """
        Creates input-target pairs from tokenized melodies.

        Parameters:
            melodies (list): A list of tokenized melodies.

        Returns:
            tuple: Two numpy arrays representing input sequences and target sequences.
        """
        input_sequences, target_sequences = [], []
        for melody in melodies:
            for i in range(1, len(melody)):
                input_seq = melody[:i]
                target_seq = melody[1 : i + 1]  # Shifted by one time step
                padded_input_seq = self._pad_sequence(input_seq)
                padded_target_seq = self._pad_sequence(target_seq)
                input_sequences.append(padded_input_seq)
                target_sequences.append(padded_target_seq)
        return np.array(input_sequences), np.array(target_sequences)

    def _pad_sequence(self, sequence):
        """
        Pads a sequence to the maximum sequence length.

        Parameters:
            sequence (list): The sequence to be padded.

        Returns:
            list: The padded sequence.
        """
        return sequence + [0] * (self.max_melody_length - len(sequence))

    def _convert_to_tf_dataset(self, input_sequences, target_sequences):
        """
        Converts input and target sequences to a TensorFlow Dataset.

        Parameters:
            input_sequences (list): Input sequences for the model.
            target_sequences (list): Target sequences for the model.

        Returns:
            batched_dataset (tf.data.Dataset): A batched and shuffled
                TensorFlow Dataset.
        """
        dataset = tf.data.Dataset.from_tensor_slices(
            (input_sequences, target_sequences)
        )
        shuffled_dataset = dataset.shuffle(buffer_size=1000)
        batched_dataset = shuffled_dataset.batch(self.batch_size)
        return batched_dataset

In [10]:
"""
This script defines the Transformer model, a state-of-the-art model architecture used
for a variety of natural language processing tasks, adapted here for music generation.
The Transformer model relies on the mechanism of attention, differentially weighting
the significance of different input elements.

The script includes the implementation of various components of the Transformer
architecture, including the Encoder, Decoder, and their respective layers. It also
defines a sinusoidal positional encoding function that provides the model with
information about the relative position of tokens in the sequence.

Key Components:
- Transformer: The main model class combining the Encoder and Decoder.
- Encoder: Processes the input sequence and generates a context-rich representation.
- Decoder: Generates the output sequence based on the Encoder's output and its own
  input.
- EncoderLayer and DecoderLayer: Individual layers used in the Encoder and Decoder.
- _get_angles and sinusoidal_position_encoding: Functions to generate positional
  encoding based on the sequence length and model dimensionality.

Usage:
To use the Transformer model, instantiate it with the required dimensions, number
of layers, vocabulary sizes, and other parameters. The model can then be used for
training or inference tasks in music generation or other sequence-to-sequence
transformations.

Note:
This implementation of the Transformer model is designed for flexibility and can be
adapted for various sequence-to-sequence tasks beyond music generation.
"""

import numpy as np
import tensorflow as tf
from keras.layers import (
    Dense,
    Dropout,
    Embedding,
    LayerNormalization,
    MultiHeadAttention,
)


def sinusoidal_position_encoding(num_positions, d_model):
    """
    Compute positional encoding for a given position and dimension.

    Parameters:
        num_positions (int): Number of positions.
        d_model (int): Dimension of the model.

    Returns:
        Tensor: Positional encoding for the given position and dimension.
    """

    angles = _get_angles(
        np.arange(num_positions)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model,
    )

    # Apply sin to even indices in the array; 2i
    sines = np.sin(angles[:, 0::2])

    # Apply cos to odd indices in the array; 2i+1
    cosines = np.cos(angles[:, 1::2])

    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[np.newaxis, ...]  # (1, position, d_model)

    return tf.cast(pos_encoding, dtype=tf.float32)


def _get_angles(pos, i, d_model):
    """
    Compute the angles for the positional encoding.

    Parameters:
        pos (np.ndarray): Positions.
        i (np.ndarray): Indices.
        d_model (int): Dimension of the model.

    Returns:
        np.ndarray: Angles for the positional encoding.
    """
    angle_dropout_rates = 1 / np.power(
        10000, (2 * (i // 2)) / np.float32(d_model)
    )
    return pos * angle_dropout_rates


class Transformer(tf.keras.Model):
    """
    The Transformer model architecture, consisting of an Encoder and Decoder.
    """

    def __init__(
        self,
        num_layers,
        d_model,
        num_heads,
        d_feedforward,
        input_vocab_size,
        target_vocab_size,
        max_num_positions_in_pe_encoder,
        max_num_positions_in_pe_decoder,
        dropout_rate=0.1,
    ):
        """
        Parameters:
            num_layers (int): Number of layers in both Encoder and Decoder.
            d_model (int): Dimension of the model.
            num_heads (int): Number of attention heads.
            d_feedforward (int): Dimension of the feed forward network.
            input_vocab_size (int): Size of the input vocabulary.
            target_vocab_size (int): Size of the target vocabulary.
            max_num_positions_in_pe_encoder (int): The maximum positions for input.
            max_num_positions_in_pe_decoder (int): The maximum positions for
                target.
            dropout_rate (float): Dropout dropout_rate.
        """
        super(Transformer, self).__init__()
        self.encoder = Encoder(
            num_layers,
            d_model,
            num_heads,
            d_feedforward,
            input_vocab_size,
            max_num_positions_in_pe_encoder,
            dropout_rate,
        )
        self.decoder = Decoder(
            num_layers,
            d_model,
            num_heads,
            d_feedforward,
            target_vocab_size,
            max_num_positions_in_pe_decoder,
            dropout_rate,
        )

        self.final_layer = Dense(target_vocab_size)

    def call(
        self,
        input,
        target,
        training,
        enc_padding_mask=None,
        look_ahead_mask=None,
        dec_padding_mask=None,
    ):
        """
        Process the input through the Transformer model.

        Parameters:
            input (Tensor): Input tensor to the Encoder.
            target (Tensor): Target tensor for the Decoder.
            training (bool): Whether the layer should behave in training mode.
            enc_padding_mask (Tensor): Padding mask for the Encoder.
            look_ahead_mask (Tensor): Look-ahead mask for the Decoder.
            dec_padding_mask (Tensor): Padding mask for the Decoder.

        Returns:
            Tensor: The final output of the Transformer.
            dict: Attention weights from the Decoder layers.
        """
        enc_output = self.encoder(
            input, training=training, mask=enc_padding_mask
        )  # (batch_size, input_seq_len, d_model)

        dec_output = self.decoder(
            target, enc_output=enc_output, training=training, look_ahead_mask=look_ahead_mask, padding_mask=dec_padding_mask
        )  # (batch_size, tar_seq_len, d_model)

        logits = self.final_layer(
            dec_output
        )  # (batch_size, target_seq_len, target_vocab_size)

        return logits


class Encoder(tf.keras.layers.Layer):
    """
    The Encoder of a Transformer model, consisting of multiple EncoderLayers.
    """

    def __init__(
        self,
        num_layers,
        d_model,
        num_heads,
        d_feedforward,
        input_vocab_size,
        maximum_positions_in_pe,
        dropout_rate=0.1,
    ):
        """
        Parameters
            num_layers (int): Number of EncoderLayers.
            d_model (int): Dimension of the model.
            num_heads (int): Number of attention heads.
            d_feedforward (int): Dimension of the feed forward network.
            input_vocab_size (int): Size of the input vocabulary.
            maximum_positions_in_pe (int): The maximum sequence length that
                this model might ever be used with.
            dropout_rate (float): Dropout dropout_rate.
        """
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embedding(input_vocab_size, d_model)
        self.pos_encoding = sinusoidal_position_encoding(
            maximum_positions_in_pe, d_model
        )
        self.enc_layers = [
            EncoderLayer(d_model, num_heads, d_feedforward, dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = Dropout(dropout_rate)

    def call(self, x, training, mask):
        """
        Process the input through the Encoder.

        Args:
            x (Tensor): Input tensor.
            training (bool): Whether the layer should behave in training mode.
            mask (Tensor): Mask to be applied on attention weights.

        Returns:
            Tensor: Output of the Encoder.
        """
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        sliced_pos_encoding = self._get_sliced_positional_encoding(x)
        x += sliced_pos_encoding

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)

        return x  # (batch_size, input_seq_len, d_model)

    def _get_sliced_positional_encoding(self, x):
        """
        Get a slice of the full positional encoding.

        Patameters:
            x (Tensor): Input tensor.

        Returns:
            Tensor: A slice of the full positional encoding.
        """
        number_of_tokens = x.shape[1]
        return self.pos_encoding[:, :number_of_tokens, :]


class Decoder(tf.keras.layers.Layer):
    """
    The Decoder of a Transformer model, consisting of multiple DecoderLayers.
    """

    def __init__(
        self,
        num_layers,
        d_model,
        num_heads,
        d_feedforward,
        target_vocab_size,
        maximum_positions_in_pe,
        dropout_rate=0.1,
    ):
        """
        Parameters:
            num_layers (int): Number of DecoderLayers.
            d_model (int): Dimension of the model.
            num_heads (int): Number of attention heads.
            d_feedforward (int): Dimension of the feed forward network.
            target_vocab_size (int): Size of the target vocabulary.
            maximum_positions_in_pe (int): The maximum sequence length that
                this model might ever be used with.
            dropout_rate (float): Dropout dropout_rate.
        """
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embedding(target_vocab_size, d_model)
        self.pos_encoding = sinusoidal_position_encoding(
            maximum_positions_in_pe, d_model
        )

        self.dec_layers = [
            DecoderLayer(d_model, num_heads, d_feedforward, dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """
        Process the input through the Decoder.

        Parameters:
            x (Tensor): Input tensor to the Decoder.
            enc_output (Tensor): Output from the Encoder.
            training (bool): Whether the layer should behave in training mode.
            look_ahead_mask (Tensor): Mask for the first MultiHeadAttention layer.
            padding_mask (Tensor): Mask for the second MultiHeadAttention layer.

        Returns:
            Tensor: The output of the Decoder.
        """

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        sliced_pos_encoding = self._get_sliced_positional_encoding(x)
        x += sliced_pos_encoding

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.dec_layers[i](
                x, enc_output=enc_output, training=training, look_ahead_mask=look_ahead_mask, padding_mask=padding_mask
            )

        return x

    def _get_sliced_positional_encoding(self, x):
        """
        Get a slice of the full positional encoding.

        Patameters:
            x (Tensor): Input tensor.

        Returns:
            Tensor: A slice of the full positional encoding.
        """
        number_of_tokens = x.shape[1]
        return self.pos_encoding[:, :number_of_tokens, :]


class EncoderLayer(tf.keras.layers.Layer):
    """
    Encoder Layer of a Transformer, consisting of MultiHeadAttention and
    Feed Forward Neural Network.
    """

    def __init__(self, d_model, num_heads, d_feedforward, dropout_rate=0.1):
        """
        Parameters:
            d_model (int): Dimension of the model.
            num_heads (int): Number of attention heads.
            d_feedforward (int): Dimension of the feed forward network.
            dropout_rate (float): Dropout dropout_rate.
        """
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(key_dim=d_model, num_heads=num_heads)
        self.ffn = tf.keras.Sequential(
            [Dense(d_feedforward, activation="relu"), Dense(d_model)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, training, mask):
        """
        Process the input through the Encoder layer.

        Parameters:
            x (Tensor): Input tensor.
            training (bool): Whether the layer should behave in training mode.
            mask (Tensor): Mask to be applied on attention weights.

        Returns:
            Tensor: Output of the Encoder layer.
        """
        attn_output = self.mha(x, x, x, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2


class DecoderLayer(tf.keras.layers.Layer):
    """
    Decoder Layer of a Transformer, consisting of two MultiHeadAttention
    layers and a Feed Forward Neural Network.
    """

    def __init__(self, d_model, num_heads, d_feedforward, dropout_rate=0.1):
        """
        Parameters:
            d_model (int): Dimension of the model.
            num_heads (int): Number of attention heads.
            d_feedforward (int): Dimension of the feed forward network.
            dropout_rate (float): Dropout dropout_rate.
        """
        super(DecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(key_dim=d_model, num_heads=num_heads)
        self.mha2 = MultiHeadAttention(key_dim=d_model, num_heads=num_heads)

        self.ffn = tf.keras.Sequential(
            [Dense(d_feedforward, activation="relu"), Dense(d_model)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)
        self.dropout3 = Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """
        Process the input through the Decoder layer.

        Parameters
            x (Tensor): Input tensor to the Decoder layer.
            enc_output (Tensor): Output from the Encoder.
            training (bool): Whether the layer should behave in training mode.
            look_ahead_mask (Tensor): Mask for the first MultiHeadAttention layer.
            padding_mask (Tensor): Mask for the second MultiHeadAttention layer.

        Returns:
            Tensor: The output of the Decoder layer.
        """
        attn1 = self.mha1(x, x, x, attention_mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2 = self.mha2(
            out1, enc_output, enc_output, attention_mask=padding_mask
        )
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3