In [None]:
%pip install datasets
%pip install tokenizers
%pip install torch torchvision torchaudio
%pip install --upgrade pip


In [16]:
# Inspect the structure of the first few entries in the dataset
for i in range(3):  # Check the first 3 samples
    print(raw_train_dataset[i])


{'Unnamed: 0.1': 7414, 'Unnamed: 0': 6263, 'Review_Date': ' on 01/16/10 18:39 PM (PST)', 'Author_Name': 'Frank ', 'Vehicle_Title': '2006 Nissan Frontier King Cab SE 4dr King Cab SB (4.0L 6cyl 5A)', 'Review_Title': 'Great truck', 'Review': ' I have owned this vehicle for 3 years and have 64K miles. I have only done normal maintenance and the first set of brake pads. Absolutely no issues at all. Plenty of power, great off road and long distance truck. ', 'Rating': 5}
{'Unnamed: 0.1': 4197, 'Unnamed: 0': 3346, 'Review_Date': ' on 10/25/10 00:00 AM (PDT)', 'Author_Name': 'lawrence ', 'Vehicle_Title': '2003 BMW 5 Series Sedan 530i 4dr Sedan (3.0L 6cyl 5M)', 'Review_Title': 'Transmission problem', 'Review': " Once the 6 year 100k miles warranty is gone, surprises! Maintenance very expensive and for those equipped with lifetime ZF HP19 transmission you'll be lucky if it last past 100K mark. Especially in New England weather. ", 'Rating': 4}
{'Unnamed: 0.1': 12504, 'Unnamed: 0': 9940, 'Review_

In [5]:
#Step1: Load the data and separate into train, validation and test data
import os
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm

os.mkdir("./reviewcar")
os.mkdir("./tokenizer_en")
os.mkdir("./tokenizer_my")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the dataset
train_dataset = load_dataset("florentgbelidji/car-reviews")

# Get the total length of the dataset
dataset_length = len(train_dataset['train'])

# Limit the number of data in dataset for faster training purpose
split_lengths = [1500, dataset_length - 1500]

# Split the dataset
raw_train_dataset, rt_to_skip = random_split(train_dataset['train'], split_lengths)


In [11]:
# Step 2: Create tokenizers

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Function to get dataset iterator for reviews
def get_ds_iterator(raw_train_dataset):
    for data in raw_train_dataset:
        yield data['Review']  # Use the 'Review' key to access the text

# Create Tokenizer - English (since reviews are in English)
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
trainer_en = BpeTrainer(min_frequency=2, special_tokens=["[PAD]","[UNK]","[CLS]", "[SEP]", "[MASK]"])
tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_en.train_from_iterator(get_ds_iterator(raw_train_dataset), trainer=trainer_en)
tokenizer_en.save("./tokenizer_en/tokenizer_en.json")

# Reload tokenizer from saved file
tokenizer_en = Tokenizer.from_file("./tokenizer_en/tokenizer_en.json")

# Get vocabulary size
source_vocab_size = tokenizer_en.get_vocab_size()

# Calculate max sequence length in the training dataset for reviews
max_seq_len_source = 0

for data in raw_train_dataset:
    enc_ids = tokenizer_en.encode(data['Review']).ids
    max_seq_len_source = max(max_seq_len_source, len(enc_ids))

print(f'max_seqlen_source: {max_seq_len_source}')  # Check the max sequence length

# Set max sequence length for training by adding a buffer to cover additional tokens like PAD, CLS, SEP
max_seq_len = max_seq_len_source + 20





max_seqlen_source: 1116


In [12]:
from torch.utils.data import Dataset, DataLoader
import torch

# Transform raw dataset to the encoded dataset that can be processed by the model
class EncodeDataset(Dataset):
    def __init__(self, raw_dataset, max_seq_len):
        super().__init__()
        self.raw_dataset = raw_dataset
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.raw_dataset)

    def __getitem__(self, index):
        # Fetching the single data for the given index value that consists of the car review.
        raw_text = self.raw_dataset[index]

        # Extracting the review text
        review_text = raw_text['Review']

        # Encoding the review text using the tokenizer
        review_text_encoded = tokenizer_en.encode(review_text).ids

        # Convert the CLS, SEP, and PAD tokens to their corresponding index id in the vocabulary
        CLS_ID = tokenizer_en.token_to_id("[CLS]")
        SEP_ID = tokenizer_en.token_to_id("[SEP]")
        PAD_ID = tokenizer_en.token_to_id("[PAD]")

        # Calculate the number of padding tokens required
        num_padding = self.max_seq_len - len(review_text_encoded) - 2

        # Ensure the padding size is non-negative
        num_padding = max(0, num_padding)

        # Create padding tensors
        padding = torch.tensor([PAD_ID] * num_padding, dtype=torch.int64)

        # Prepare encoder input (adding CLS and SEP tokens)
        encoder_input = torch.cat([torch.tensor([CLS_ID]), torch.tensor(review_text_encoded, dtype=torch.int64), torch.tensor([SEP_ID]), padding], dim=0)

        # In this setup, decoder_input and target_label are usually used for sequence-to-sequence tasks
        # If you're doing something else (like classification), you may not need decoder_input and target_label.

        # Create masks
        encoder_mask = (encoder_input != PAD_ID).unsqueeze(0).unsqueeze(0).int()

        return {
            'encoder_input': encoder_input,
            'encoder_mask': encoder_mask,
            'review_text': review_text
        }

# Causal mask ensures future tokens are masked in the decoder
def causal_mask(size):
    mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    return mask == 0

# Create DataLoader instance
train_dataloader = DataLoader(EncodeDataset(raw_train_dataset, max_seq_len), batch_size=5, shuffle=True)


In [13]:
import torch
import torch.nn as nn
import math

# Step 4: Input embedding and positional encoding
class EmbeddingLayer(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        # Embedding layer to map token id to embedding vector (vocab_size, d_model)
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, input):
        # Normalize the embedding layer output
        embedding_output = self.embedding(input) * math.sqrt(self.d_model)
        return embedding_output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_seq_len: int, dropout_rate: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        pe = torch.zeros(max_seq_len, d_model)

        pos = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)

        # Add a batch dimension
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, input_embedding):
        input_embedding = input_embedding + self.pe[:, :input_embedding.shape[1], :].requires_grad_(False)
        return self.dropout(input_embedding)

# Step 5: Multihead Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, num_heads: int, dropout_rate: float):
        super().__init__()
        self.num_heads = num_heads
        assert d_model % num_heads == 0, "d_model must be divisible by number of heads"

        # d_k is the dimension of each self-attention head
        self.d_k = d_model // num_heads

        # Weight matrices
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)
        self.W_o = nn.Linear(d_model, d_model, bias=False)

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, q, k, v, encoder_mask):
        # Calculate queries, keys, values
        query = self.W_q(q)
        key = self.W_k(k)
        value = self.W_v(v)

        # Split by number of heads
        query = query.view(query.shape[0], query.shape[1], self.num_heads, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.num_heads, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.num_heads, self.d_k).transpose(1, 2)

        # Calculate attention scores
        attention_score = (query @ key.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Apply mask
        if encoder_mask is not None:
            attention_score.masked_fill_(encoder_mask == 0, -1e9)

        # Calculate attention weights
        attention_score = attention_score.softmax(dim=-1)

        # Apply dropout
        attention_score = self.dropout(attention_score)

        # Calculate attention output
        attention_output = attention_score @ value

        # Concatenate heads
        attention_output = attention_output.transpose(1, 2).contiguous().view(attention_output.shape[0], -1, self.num_heads * self.d_k)

        # Final linear transformation
        multihead_output = self.W_o(attention_output)

        return multihead_output

# Step 6: Feedforward Network, Layer Normalization, and AddAndNorm

class FeedForward(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout_rate: float):
        super().__init__()
        self.layer_1 = nn.Linear(d_model, d_ff)
        self.layer_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input):
        return self.layer_2(self.dropout(torch.relu(self.layer_1(input))))

class LayerNorm(nn.Module):
    def __init__(self, d_model: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(d_model))  
        self.beta = nn.Parameter(torch.zeros(d_model))

    def forward(self, input):
        mean = input.mean(dim=-1, keepdim=True)
        std = input.std(dim=-1, keepdim=True)
        return self.gamma * (input - mean) / (std + self.eps) + self.beta

class AddAndNorm(nn.Module):
    def __init__(self, d_model: int, dropout_rate: float):
        super().__init__()
        self.layer_norm = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input, sub_layer):
        return input + self.dropout(sub_layer(self.layer_norm(input)))

# Step 7: Encoder Block and Encoder

class EncoderBlock(nn.Module):
    def __init__(self, multihead_attention: MultiHeadAttention, feed_forward: FeedForward, d_model: int, dropout_rate: float):
        super().__init__()
        self.multihead_attention = multihead_attention
        self.feed_forward = feed_forward
        self.addnorm_1 = AddAndNorm(d_model, dropout_rate)
        self.addnorm_2 = AddAndNorm(d_model, dropout_rate)

    def forward(self, encoder_input, encoder_mask):
        encoder_input = self.addnorm_1(encoder_input, lambda x: self.multihead_attention(x, x, x, encoder_mask))
        encoder_input = self.addnorm_2(encoder_input, self.feed_forward)
        return encoder_input

class Encoder(nn.Module):
    def __init__(self, encoderblocklist: nn.ModuleList, d_model: int):
        super().__init__()
        self.encoderblocklist = encoderblocklist
        self.layer_norm = LayerNorm(d_model)

    def forward(self, encoder_input, encoder_mask):
        for encoderblock in self.encoderblocklist:
            encoder_input = encoderblock(encoder_input, encoder_mask)
        encoder_output = self.layer_norm(encoder_input)
        return encoder_output

# Step 8: Decoder Block, Decoder, and the Projection Layer

class DecoderBlock(nn.Module):
    def __init__(self, masked_multihead_attention: MultiHeadAttention, cross_multihead_attention: MultiHeadAttention, feed_forward: FeedForward, d_model: int, dropout_rate: float):
        super().__init__()
        self.masked_multihead_attention = masked_multihead_attention
        self.cross_multihead_attention = cross_multihead_attention
        self.feed_forward = feed_forward
        self.addnorm_1 = AddAndNorm(d_model, dropout_rate)
        self.addnorm_2 = AddAndNorm(d_model, dropout_rate)
        self.addnorm_3 = AddAndNorm(d_model, dropout_rate)

    def forward(self, decoder_input, encoder_output, encoder_mask, decoder_mask):
        decoder_input = self.addnorm_1(decoder_input, lambda x: self.masked_multihead_attention(x, x, x, decoder_mask))
        decoder_input = self.addnorm_2(decoder_input, lambda x: self.cross_multihead_attention(x, encoder_output, encoder_output, encoder_mask))
        decoder_input = self.addnorm_3(decoder_input, self.feed_forward)
        return decoder_input

class Decoder(nn.Module):
    def __init__(self, decoderblocklist: nn.ModuleList, d_model: int):
        super().__init__()
        self.decoderblocklist = decoderblocklist
        self.layer_norm = LayerNorm(d_model)

    def forward(self, decoder_input, encoder_output, encoder_mask, decoder_mask):
        for decoderblock in self.decoderblocklist:
            decoder_input = decoderblock(decoder_input, encoder_output, encoder_mask, decoder_mask)
        decoder_output = self.layer_norm(decoder_input)
        return decoder_output

class ProjectionLayer(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.projection_layer = nn.Linear(d_model, vocab_size)

    def forward(self, decoder_output):
        output = self.projection_layer(decoder_output)
        return output

# Step 9: Create and Build Transformer

class Transformer(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, source_embed: EmbeddingLayer, target_embed: EmbeddingLayer, source_pos: PositionalEncoding, target_pos: PositionalEncoding, projection_layer: ProjectionLayer):
        super().__init__()
        self.source_embed = source_embed
        self.target_embed = target_embed
        self.source_pos = source_pos
        self.target_pos = target_pos
        self.encoder = encoder
        self.decoder = decoder
        self.projection_layer = projection_layer

    def forward(self, source_input, target_input, encoder_mask, decoder_mask):
        # Encoder: Source input to encoder
        encoder_input = self.source_embed(source_input)
        encoder_input = self.source_pos(encoder_input)
        encoder_output = self.encoder(encoder_input, encoder_mask)

        # Decoder: Target input to decoder
        decoder_input = self.target_embed(target_input)
        decoder_input = self.target_pos(decoder_input)
        decoder_output = self.decoder(decoder_input, encoder_output, encoder_mask, decoder_mask)

        # Projection Layer: Decoder output to projected output
        output = self.projection_layer(decoder_output)
        return output


In [15]:
def build_model(source_vocab_size, target_vocab_size, max_seq_len, d_model):
    # Embedding and positional encoding layers for the source and target
    source_embed = EmbeddingLayer(d_model, source_vocab_size)
    target_embed = EmbeddingLayer(d_model, target_vocab_size)
    source_pos = PositionalEncoding(d_model, max_seq_len, dropout_rate=0.1)
    target_pos = PositionalEncoding(d_model, max_seq_len, dropout_rate=0.1)
    
    # Multihead attention and feed-forward layers for encoder and decoder
    multihead_attention = MultiHeadAttention(d_model=d_model, num_heads=8, dropout_rate=0.1)
    feed_forward = FeedForward(d_model=d_model, d_ff=2048, dropout_rate=0.1)
    
    # Building the encoder and decoder blocks
    encoder_blocks = nn.ModuleList([EncoderBlock(multihead_attention, feed_forward, d_model, dropout_rate=0.1) for _ in range(6)])
    decoder_blocks = nn.ModuleList([DecoderBlock(multihead_attention, multihead_attention, feed_forward, d_model, dropout_rate=0.1) for _ in range(6)])
    
    encoder = Encoder(encoder_blocks, d_model)
    decoder = Decoder(decoder_blocks, d_model)
    
    # Projection layer
    projection_layer = ProjectionLayer(d_model, target_vocab_size)
    
    # Building the Transformer model
    model = Transformer(encoder, decoder, source_embed, target_embed, source_pos, target_pos, projection_layer)
    
    return model

# Assuming you have a tokenizer for a different language
from some_tokenizer_library import Tokenizer

# Define both tokenizers
tokenizer_en = Tokenizer(vocab_file='path/to/english_vocab/file')
tokenizer_my = Tokenizer(vocab_file='path/to/my_vocab/file')

# Then build the model
model = build_model(
    source_vocab_size=tokenizer_en.get_vocab_size(),
    target_vocab_size=tokenizer_my.get_vocab_size(),
    max_seq_len=max_seq_len,
    d_model=512
).to(device)

print(model)


ModuleNotFoundError: No module named 'some_tokenizer_library'

In [None]:
#Step 10: Training and Validation of malayGPT

def run_validation(model, validation_ds, tokenizer_en, tokenizer_my, max_seq_len, device, print_msg, global_step):
    model.eval()
    count = 0

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device)
            encoder_mask = batch["encoder_mask"].to(device)

            cls_id = tokenizer_my.token_to_id('[CLS]')
            sep_id = tokenizer_my.token_to_id('[SEP]')

            # Computing the output of the encoder for the source sequence
            encoder_output = model.encode(encoder_input, encoder_mask)
            # for prediction task, the first token that goes in decoder input is the [CLS] token
            decoder_input = torch.empty(1, 1).fill_(cls_id).type_as(encoder_input).to(device)
            # since we need to keep adding the output back to the input until the [SEP] - end token is received.
            while True:
                # check if the max length is received
                if decoder_input.size(1) == max_seq_len:
                    break

                # recreate mask each time the new output is added the decoder input for next token prediction
                decoder_mask = causal_mask(decoder_input.size(1)).type_as(encoder_mask).to(device)

                # apply projection only to the next token
                out = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)

                # apply projection only to the next token
                prob = model.project(out[:, -1])

                # select the token with highest probablity which is a greedy search implementation
                _, next_word = torch.max(prob, dim=1)
                decoder_input = torch.cat(
                    [decoder_input, torch.empty(1, 1).type_as(encoder_input).fill_(next_word.item()).to(device)], dim=1
                )
                # check if the new token is the end of token
                if next_word == sep_id:
                    break
            # final output is the concatinated decoder input till the end token is reached
            model_out = decoder_input.squeeze(0)

            source_text = batch["source_text"][0]
            target_text = batch["target_text"][0]
            model_out_text = tokenizer_my.decode(model_out.detach().cpu().numpy())

            # Print the source, target and model output
            print_msg('-'*55)
            # print_msg(f"{f'SOURCE: ':>12}{source_text}")
            # print_msg(f"{f'TARGET: ':>12}{target_text}")
            # print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")
            print_msg(f'Source Text: {source_text}')
            print_msg(f'Target Text: {target_text}')
            print_msg(f'Predicted by MalayGPT: {model_out_text}')

            if count == 2:
                break

def train_model(preload_epoch=None):
    # The entire training, validation cycle will run for 20 cycles or epochs.
    EPOCHS = 10
    initial_epoch = 0
    global_step = 0

    # Adam is one of the most commonly used optimization algorithms that hold the current state and will update the parameters based on the computed gradients.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, eps=1e-9)

    # If the preload_epoch is not none, that means the training will start with the weights, optimizer that has been last saved and start with preload epoch + 1
    if preload_epoch is not None:
      model_filename = f"./malaygpt/model_{preload_epoch}.pt"
      state = torch.load(model_filename)
      model.load_state_dict(state['model_state_dict'])
      initial_epoch = state['epoch'] + 1
      optimizer.load_state_dict(state['optimizer_state_dict'])
      global_step = state['global_step']

    # The CrossEntropyLoss loss function computes the difference between the projection output and target label.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_en.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    for epoch in range(initial_epoch, EPOCHS):
        # torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:
            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)
            target_label = batch['target_label'].to(device) # (B, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            projection_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(projection_output.view(-1, tokenizer_my.get_vocab_size()), target_label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # VALIDATION BLOCK STARTS HERE [Runs every epoch after the training block is complete]
        run_validation(model, val_dataloader, tokenizer_en, tokenizer_my, max_seq_len, device, lambda msg: batch_iterator.write(msg), global_step)

        # Save the model at the end of every epoch
        model_filename = f"./malaygpt/model_{epoch}.pt"
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)



In [None]:
# Train our model
train_model(preload_epoch=None)

In [None]:
#Step 11: Finally testing our malayGPT model to translated new sentences. Let's give it a try.

def malaygpt(user_input_text):

    # validation using input text
    user_input_text = str(user_input_text).strip()

    # Let's get the model Define the device, tokenizers, and model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer_en = Tokenizer.from_file("./tokenizer_en/tokenizer_en.json")
    tokenizer_my = Tokenizer.from_file("./tokenizer_my/tokenizer_my.json")

    # Build our model
    # model = build_model(tokenizer_en.get_vocab_size(), tokenizer_my.get_vocab_size(), max_seq_len, max_seq_len, d_model=512).to(device)
    # model = get_model(tokenizer_en.get_vocab_size(), tokenizer_my.get_vocab_size()).to(device)
    model = build_model(tokenizer_en.get_vocab_size(), tokenizer_my.get_vocab_size(),max_seq_len, max_seq_len, d_model=512).to(device)

    # Load the specific checkpoint of the model that you've saved during training.
    checkpoint_number = 9    # for this test, I am taking checkpoint number 10
    model_filename = f"./malaygpt/model_{checkpoint_number}.pt"
    state = torch.load(model_filename)
    model.load_state_dict(state['model_state_dict'])

    # Lets beging the inferencing
    model.eval()
    with torch.no_grad():
        # Precompute the encoder output and reuse it for every generation step
        source_text_encoding = tokenizer_en.encode(user_input_text)
        source_text_encoding = torch.cat([
            torch.tensor([tokenizer_en.token_to_id('[CLS]')], dtype=torch.int64),
            torch.tensor(source_text_encoding.ids, dtype=torch.int64),
            torch.tensor([tokenizer_en.token_to_id('[SEP]')], dtype=torch.int64),
            torch.tensor([tokenizer_en.token_to_id('[PAD]')] * (max_seq_len - len(source_text_encoding.ids) - 2), dtype=torch.int64)
        ], dim=0).to(device)
        source_mask = (source_text_encoding != tokenizer_en.token_to_id('[PAD]')).unsqueeze(0).unsqueeze(0).int().to(device)
        encoder_output = model.encode(source_text_encoding, source_mask)

        # Initialize the decoder input with the sos token
        decoder_input = torch.empty(1, 1).fill_(tokenizer_my.token_to_id('[CLS]')).type_as(source_text_encoding).to(device)

        # Generate the translation word by word
        while decoder_input.size(1) < max_seq_len:
            # build mask for target and calculate output
            decoder_mask = torch.triu(torch.ones((1, decoder_input.size(1), decoder_input.size(1))), diagonal=1).type(torch.int).type_as(source_mask).to(device)
            out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

            # project next token
            prob = model.project(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            decoder_input = torch.cat([decoder_input, torch.empty(1, 1).type_as(source_text_encoding).fill_(next_word.item()).to(device)], dim=1)

            # print the translated word
            print(f"{tokenizer_my.decode([next_word.item()])}", end=' ')

            # break if we predict the end of sentence token
            if next_word == tokenizer_my.token_to_id('[SEP]'):
                break

    # convert ids to tokens
    return tokenizer_my.decode(decoder_input[0].tolist())

In [None]:
# Test 1: Translation using MalayGPT
user_input = "Good Morning"
transalated_text = malaygpt(user_input)

print(f"User input (in English): {user_input}")
print(f"Translation (in Malay): {transalated_text}")

In [None]:
# Test 2: Translation using MalayGPT
user_input = "what"
transalated_text = malaygpt(user_input)

print(f"User input (in English): {user_input}")
print(f"Translation (in Malay): {transalated_text}")