In [None]:
%pip install -U pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.3-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.8.1-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.1.0->pytorch-lightning)
  Downloadin

In [None]:
import torch ## torch let's us create tensors and also provides helper functions
import torch.nn as nn ## torch.nn gives us nn.Module, nn.Embedding() and nn.Linear()
import torch.nn.functional as F # This gives us the softmax() and argmax()
from torch.optim import Adam # This is the optimizer we will use
import lightning as L # Lightning makes it easier to write, optimize and scale our code"
from torch.utils.data import TensorDataset, DataLoader # We'll store our data in DataLoaders

## Prepare Input & Output Data

In [None]:
# Vocabularies and sequence-length limits for the Indonesian -> English toy task.
# Token ids are assigned by position in each list, so the three special tokens
# share the same ids (0, 1, 2) in both vocabularies.
input_vocab_id = {
    token: token_id
    for token_id, token in enumerate([
        '<SOS>', '<EOS>', '<PAD>',
        'saya', 'kamu', 'dia',
        'makan', 'minum', 'air', 'nasi', 'ikan',
        'suka', 'ini', 'itu',
        'pergi', 'sekolah', 'rumah', 'ke',
    ])
}
output_vocab_en = {
    token: token_id
    for token_id, token in enumerate([
        '<SOS>', '<EOS>', '<PAD>',
        'I', 'you', 'he', 'she',
        'eat', 'drink', 'water', 'rice', 'fish',
        'like', 'this', 'that',
        'go', 'to', 'school', 'home',
    ])
}
# Padding id of the target vocabulary.
PAD_IDX = output_vocab_en['<PAD>']
# Padded lengths of encoder inputs and of decoder inputs/labels.
MAX_LEN_INPUT = 5
MAX_LEN_OUTPUT = 6

def tokenize_and_pad(sentence, vocab, max_len, is_input=True):
    """Convert a whitespace-separated sentence to a padded list of token ids.

    Args:
        sentence: Text whose words are all keys of ``vocab``.
        vocab: Mapping from token to integer id; must contain ``'<PAD>'`` and,
            when ``is_input`` is true, ``'<EOS>'``.
        max_len: Target length of the returned list.
        is_input: When true, ``'<EOS>'`` is appended before padding.

    Returns:
        The token ids followed by ``'<PAD>'`` ids up to ``max_len``. If the
        tokens already reach or exceed ``max_len`` nothing is padded or
        truncated, so the result can be longer than ``max_len``.

    Raises:
        KeyError: If a word or a required special token is missing from ``vocab``.
    """
    token_ids = []
    for word in sentence.split():
        token_ids.append(vocab[word])
    if is_input:
        token_ids.append(vocab['<EOS>'])
    # A non-positive count yields an empty padding list.
    missing = max_len - len(token_ids)
    return token_ids + [vocab['<PAD>']] * missing

# Parallel corpus: indonesian_sentences[i] is translated by english_translations[i].
indonesian_sentences = ['saya suka nasi', 'kamu makan ikan', 'dia minum air', 'saya pergi ke sekolah', 'kamu pergi ke rumah', 'dia makan itu', 'saya suka ini', 'kamu suka ikan', 'dia pergi', 'saya makan']
english_translations = ['I like rice', 'you eat fish', 'she drink water', 'I go to school', 'you go to home', 'she eat that', 'I like this', 'you like fish', 'she go', 'I eat']
assert len(indonesian_sentences) == len(english_translations), "corpus sides must have the same length"

inputs_list, decoder_inputs_list, labels_list = [], [], []
for source_sentence, target_sentence in zip(indonesian_sentences, english_translations):
    # Encoder input: source tokens + <EOS>, padded to MAX_LEN_INPUT.
    input_tokens = tokenize_and_pad(source_sentence, input_vocab_id, MAX_LEN_INPUT, is_input=True)
    inputs_list.append(input_tokens)

    # Decoder input (teacher forcing): <SOS> + target tokens, padded to MAX_LEN_OUTPUT.
    decoder_in = [output_vocab_en['<SOS>']] + tokenize_and_pad(
        target_sentence, output_vocab_en, MAX_LEN_OUTPUT - 1, is_input=False
    )
    decoder_inputs_list.append(decoder_in)

    # Labels: target tokens + <EOS>, THEN padding. <EOS> must directly follow the
    # last real word: the loss ignores <PAD> positions, so an <EOS> placed after
    # the padding never teaches the model when to stop.
    label_out = tokenize_and_pad(target_sentence, output_vocab_en, MAX_LEN_OUTPUT, is_input=True)
    labels_list.append(label_out)

inputs_tensor = torch.tensor(inputs_list)
decoder_inputs_tensor = torch.tensor(decoder_inputs_list)
labels_tensor = torch.tensor(labels_list)

# Each dataset item is (encoder input, decoder input, labels).
dataset = TensorDataset(inputs_tensor, decoder_inputs_tensor, labels_tensor)
dataloader = DataLoader(dataset, batch_size=2)

## Positional Encoding

In [None]:
class PositionEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence of embeddings.

    Args:
        d_model: Embedding width (even or odd).
        max_len: Number of positions precomputed and stored in the ``pe`` buffer.
    """

    def __init__(self, d_model, max_len):
        super().__init__()
        # A buffer moves with the module across devices but is not trained.
        self.register_buffer('pe', self._build_table(max_len, d_model))

    @staticmethod
    def _build_table(num_positions, d_model):
        """Return a (num_positions, d_model) table: sin in even columns, cos in odd."""
        table = torch.zeros(num_positions, d_model)
        position = torch.arange(start=0, end=num_positions, step=1).float().unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return table

    def forward(self, x):
        """Return ``x`` plus the encodings of its positions.

        ``x`` has the sequence on its second-to-last axis and ``d_model`` on the
        last, e.g. (batch, seq_len, d_model); the table broadcasts over any
        leading axes.
        """
        seq_len = x.size(-2)
        if seq_len <= self.pe.size(0):
            encodings = self.pe[:seq_len, :]
        else:
            # Longer than the precomputed table: build the encodings on the fly
            # rather than failing with a broadcasting error.
            encodings = self._build_table(seq_len, self.pe.size(1)).to(
                device=self.pe.device, dtype=self.pe.dtype
            )
        return x + encodings

## Multi-head Attention

In [None]:
class Attention(nn.Module):
    """Single attention head with bias-free query/key/value projections.

    Args:
        d_model: Width of the incoming encodings and of each projection.
        row_dim: Axis of the projected keys swapped with ``col_dim`` before the
            query-key product.
        col_dim: Axis used for the scaling factor and for the softmax.
    """

    def __init__(self, d_model, row_dim=0, col_dim=1):
        super().__init__()
        self.row_dim = row_dim
        self.col_dim = col_dim
        # Created in q, k, v order so seeded initialisation stays the same.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, encodings_q, encodings_k, encodings_v, mask=None):
        """Return the attention-weighted combination of the projected values.

        ``mask`` is optional; entries where it is True are filled with -1e9
        before the softmax, which makes their attention weight effectively zero.
        """
        queries = self.W_q(encodings_q)
        keys = self.W_k(encodings_k)
        values = self.W_v(encodings_v)

        # Divide the similarities by sqrt(size of the keys along col_dim).
        scale = torch.tensor(keys.size(self.col_dim) ** 0.5)
        keys_t = keys.transpose(dim0=self.row_dim, dim1=self.col_dim)
        scores = torch.matmul(queries, keys_t) / scale

        if mask is not None:
            scores = scores.masked_fill(mask=mask, value=-1e9)

        weights = F.softmax(scores, dim=self.col_dim)
        return torch.matmul(weights, values)

class MultiHeadAttention(nn.Module):
    """Runs several ``Attention`` heads and merges them back to ``d_model``.

    Every head works at full ``d_model`` width, so the concatenated heads are
    ``num_heads * d_model`` wide before the final linear layer.

    Args:
        d_model: Width of the encodings.
        row_dim, col_dim: Passed through to each ``Attention`` head.
        num_heads: Number of heads.
    """

    def __init__(self, d_model, row_dim=1, col_dim=2, num_heads=1):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(Attention(d_model, row_dim, col_dim))
        self.heads = nn.ModuleList(head_modules)
        self.reduce_attention_dim = nn.Linear(num_heads * d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Apply every head to (q, k, v, mask) and project the concatenation."""
        per_head = [
            head(encodings_q=q, encodings_k=k, encodings_v=v, mask=mask)
            for head in self.heads
        ]
        # Concatenate along the last (feature) axis.
        merged = torch.cat(per_head, dim=-1)
        return self.reduce_attention_dim(merged)

### Feed Forward Network

In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> activation -> dropout -> Linear.

    Args:
        d_model: Input and output width.
        d_ff: Hidden width.
        dropout: Dropout probability applied to the hidden activations. The
            value 0.0 disables dropout.
        activation: Activation class, instantiated without arguments.
    """

    def __init__(self, d_model, d_ff, dropout, activation=nn.ReLU):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.act = activation()
        # Previously `dropout` was accepted but never used. nn.Dropout has no
        # parameters, so the state_dict layout is unchanged.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the network to the last axis of ``x``; the shape is preserved."""
        hidden = self.act(self.fc1(x))
        hidden = self.dropout(hidden)  # active only in train mode
        return self.fc2(hidden)

## Encoder

In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and feed-forward, each followed by a
    residual connection and layer normalisation (post-LN).

    Args:
        d_model: Width of the encodings.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Forwarded to ``FeedForward``.
    """

    def __init__(self, d_model, num_heads=1, d_ff=8, dropout=0.0):
        super().__init__()
        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, activation=nn.ReLU)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Transform ``x`` (annotated in the notebook as (B, L, d_model))."""
        # Unmasked self-attention: queries, keys and values all come from x.
        attended = self.mha(x, x, x, mask=None)
        x = self.norm1(x + attended)
        return self.norm2(x + self.ffn(x))

class Encoder(nn.Module):
    """Token embedding + position encoding followed by a stack of encoder blocks.

    Args:
        num_tokens: Size of the input vocabulary.
        d_model: Embedding width.
        max_len: Number of positions given to ``PositionEncoding``.
        n_blocks: Number of stacked ``EncoderBlock`` layers.
        num_heads: Attention heads per block.
    """

    def __init__(self, num_tokens, d_model, max_len, n_blocks=1, num_heads=1):
        super().__init__()
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
        self.pe = PositionEncoding(d_model=d_model, max_len=max_len)
        # The feed-forward width is fixed at 8 for every block.
        blocks = []
        for _ in range(n_blocks):
            blocks.append(EncoderBlock(d_model=d_model, num_heads=num_heads, d_ff=8))
        self.encoder_blocks = nn.ModuleList(blocks)

    def forward(self, token_ids):
        """Embed ``token_ids`` and pass the result through every block in order."""
        hidden = self.pe(self.we(token_ids))
        for layer in self.encoder_blocks:
            # Each block consumes the previous block's output.
            hidden = layer(hidden)
        return hidden

## Decoder

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, encoder-decoder attention and a
    feed-forward network, each followed by a residual connection and LayerNorm.

    Args:
        d_model: Width of the encodings.
        num_heads: Attention heads in both attention sub-layers.
        d_ff: Hidden width of the feed-forward network.
        dropout: Forwarded to ``FeedForward``.
    """

    def __init__(self, d_model, num_heads=1, d_ff=8, dropout=0.0):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.mha2 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, activation=nn.ReLU)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, encoder_out, mask):
        """Run the three sub-layers on ``x``.

        ``mask`` is applied only to the self-attention; the attention over
        ``encoder_out`` is unmasked.
        """
        self_attended = self.mha1(q=x, k=x, v=x, mask=mask)
        x = self.norm1(x + self_attended)

        # Queries come from the decoder, keys and values from the encoder.
        cross_attended = self.mha2(q=x, k=encoder_out, v=encoder_out, mask=None)
        x = self.norm2(x + cross_attended)

        return self.norm3(x + self.ffn(x))

class Decoder(nn.Module):
    """Token embedding + position encoding, a stack of decoder blocks and a
    final linear layer producing one logit per output token.

    Args:
        num_tokens: Size of the output vocabulary (also the number of logits).
        d_model: Embedding width.
        max_len: Number of positions given to ``PositionEncoding``.
        n_blocks: Number of stacked ``DecoderBlock`` layers.
        num_heads: Attention heads per block.
    """

    def __init__(self, num_tokens, d_model, max_len, n_blocks=1, num_heads=1):
        super().__init__()
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
        self.pe = PositionEncoding(d_model=d_model, max_len=max_len)
        self.decoder_blocks = nn.ModuleList([
            DecoderBlock(d_model=d_model, num_heads=num_heads, d_ff=8)
            for _ in range(n_blocks)
        ])
        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)
        # No device is stored here: the mask is created on the input's device,
        # because picking "cuda" whenever it is available breaks a CPU-resident
        # model on a CUDA machine.

    def forward(self, token_ids, encoder_out):
        """Return logits of shape (batch, seq_len, num_tokens).

        Args:
            token_ids: Integer tensor of shape (batch, seq_len).
            encoder_out: Encoder output attended to by every block.
        """
        seq_len = token_ids.size(1)
        x = self.pe(self.we(token_ids))

        # Causal mask: True strictly above the diagonal, i.e. the future
        # positions a token must not attend to.
        mask = torch.tril(torch.ones(seq_len, seq_len, device=token_ids.device)) == 0

        for block in self.decoder_blocks:
            x = block(x, encoder_out, mask)
        return self.fc_layer(x)

## Transformer

In [None]:
class Transformer(L.LightningModule):
    """Encoder-decoder translation model packaged as a LightningModule.

    The output vocabulary is taken from the notebook-level ``output_vocab_en``.

    Args:
        input_vocab_size: Number of tokens in the source vocabulary.
        output_vocab_size: Number of tokens in the target vocabulary.
        d_model: Embedding width shared by encoder and decoder.
        max_len_input: Positions available to the encoder.
        max_len_output: Positions available to the decoder.
    """

    def __init__(self, input_vocab_size, output_vocab_size, d_model, max_len_input, max_len_output):
        super().__init__()
        self.encoder = Encoder(num_tokens=input_vocab_size, d_model=d_model, max_len=max_len_input)
        self.decoder = Decoder(num_tokens=output_vocab_size, d_model=d_model, max_len=max_len_output)
        self.output_vocab = output_vocab_en
        # Take the padding id from the vocabulary this model decodes with. The
        # global PAD_IDX is reassigned later in the notebook, so relying on it
        # gives a wrong ignore_index when cells are re-run out of order.
        self.loss = nn.CrossEntropyLoss(ignore_index=self.output_vocab['<PAD>'])
        self.max_len_input = max_len_input
        self.max_len_output = max_len_output

    def forward(self, inputs, labels):
        """Single teacher-forced pass: encode ``inputs``, decode ``labels``.

        Returns the decoder logits (before softmax).
        """
        encoder_values = self.encoder(inputs)
        return self.decoder(labels, encoder_values)

    def configure_optimizers(self):
        """Adam over all parameters with a learning rate of 0.005."""
        return Adam(self.parameters(), lr=0.005)

    def training_step(self, batch, batch_idx):
        """Cross-entropy between teacher-forced logits and the target ids.

        ``batch`` is (encoder input, decoder input, decoder target).
        """
        encoder_input, decoder_input_teacher_forcing, decoder_target = batch
        output = self.forward(encoder_input, decoder_input_teacher_forcing)
        # Flatten (batch, seq, vocab) -> (batch * seq, vocab) for the loss.
        loss = self.loss(output.reshape(-1, output.size(-1)), decoder_target.reshape(-1))
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def predict(self, input_sentence_tensor, max_output_len=5):
        """Greedy autoregressive translation of a single sentence.

        Args:
            input_sentence_tensor: Source token ids, shape (seq_len,) or (1, seq_len).
            max_output_len: Upper bound on generated tokens. It is capped at
                ``max_len_output``, the number of positions the decoder was
                configured with.

        Returns:
            List of predicted target words, without ``<SOS>``/``<EOS>``.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                if input_sentence_tensor.dim() == 1:
                    input_sentence_tensor = input_sentence_tensor.unsqueeze(0)

                encoder_output = self.encoder(input_sentence_tensor.to(self.device))
                decoder_input = torch.tensor([[self.output_vocab['<SOS>']]], device=self.device)
                predicted_tokens = []

                # The decoder input grows by one token per step and must not
                # outgrow the decoder's positional table.
                max_steps = min(max_output_len, self.max_len_output)
                for _ in range(max_steps):
                    output_logits = self.decoder(decoder_input, encoder_output)
                    # Only the logits of the newest position decide the next token.
                    predicted_token_id = torch.argmax(output_logits[:, -1, :], dim=-1)

                    if predicted_token_id.item() == self.output_vocab['<EOS>']:
                        break

                    predicted_tokens.append(predicted_token_id.item())
                    decoder_input = torch.cat([decoder_input, predicted_token_id.unsqueeze(0)], dim=1)

                id_to_word = {token_id: word for word, token_id in self.output_vocab.items()}
                return [id_to_word[token_id] for token_id in predicted_tokens]
        finally:
            # Restore the mode the module was in instead of leaving it in eval.
            self.train(was_training)

## Train

In [None]:
# Model initialisation and training.
# MAX_LEN_INPUT / MAX_LEN_OUTPUT come from the data-preparation cell; they are
# not re-declared here so the model cannot drift from the padded data.

# Seed before building the model so weight initialisation and training are
# reproducible across "Restart & Run All".
L.seed_everything(42, workers=True)

transformer = Transformer(
    input_vocab_size=len(input_vocab_id),
    output_vocab_size=len(output_vocab_en),
    d_model=16,
    max_len_input=MAX_LEN_INPUT,
    max_len_output=MAX_LEN_OUTPUT
)

trainer = L.Trainer(max_epochs=30, enable_progress_bar=True)
trainer.fit(transformer, train_dataloaders=dataloader)

INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: 
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | encoder | Encoder          | 1.7 K  | train
1 | decoder | D

Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=30` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=30` reached.


## Inference

In [None]:
# Inference example.
input_sentence = "kamu makan ikan"
# Preprocess exactly like the training inputs (tokens + <EOS>, padded to
# MAX_LEN_INPUT); the encoder never saw a bare, unpadded sentence in training.
input_tokens = tokenize_and_pad(input_sentence, input_vocab_id, MAX_LEN_INPUT, is_input=True)
input_tensor = torch.tensor(input_tokens)

print(f"Menerjemahkan: '{input_sentence}' -> {input_tensor}")
predicted_translation = transformer.predict(input_tensor)
print(f"Hasil prediksi: {predicted_translation}")

Menerjemahkan: 'kamu makan ikan' -> tensor([ 4,  6, 10])
Hasil prediksi: ['you', 'eat', 'fish', 'fish', 'fish']


## Basic Transformer With Auto Regressive Inference

In [None]:
# Toy English -> Spanish task. Ids follow the order of each token list.
# NOTE: this cell rebinds PAD_IDX, dataset and dataloader from the first section.

# Input vocabulary (<SOS> = start of sequence).
input_vocab = {
    token: token_id
    for token_id, token in enumerate(['<SOS>', 'lets', 'to', 'go'])
}

# Output vocabulary, including the end-of-sequence and padding tokens.
output_vocab = {
    token: token_id
    for token_id, token in enumerate(['<SOS>', 'ir', 'vamos', 'y', '<EOS>', '<PAD>'])
}

# Padding id of the output vocabulary.
PAD_IDX = output_vocab['<PAD>']

# Training pairs:
#   "lets go" -> "vamos y"
#   "to go"   -> "ir"
inputs = torch.tensor([
    [input_vocab['lets'], input_vocab['go']],
    [input_vocab['to'], input_vocab['go']],
])

# The longest target has 3 positions, so the shorter one is padded to match.
decoder_inputs = torch.tensor([
    [output_vocab['<SOS>'], output_vocab['vamos'], output_vocab['y']],
    [output_vocab['<SOS>'], output_vocab['ir'], PAD_IDX],
])

labels = torch.tensor([
    [output_vocab['vamos'], output_vocab['y'], output_vocab['<EOS>']],
    [output_vocab['ir'], output_vocab['<EOS>'], PAD_IDX],
])

# Only (inputs, labels) go into the dataset; decoder_inputs is not part of it.
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset)

In [None]:
class PositionEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence of embeddings.

    Args:
        d_model: Embedding width (even or odd).
        max_len: Number of positions precomputed and stored in the ``pe`` buffer.
    """

    def __init__(self, d_model=2, max_len=3):
        super().__init__()
        # A buffer moves with the module across devices but is not trained.
        self.register_buffer('pe', self._build_table(max_len, d_model))

    @staticmethod
    def _build_table(num_positions, d_model):
        """Return a (num_positions, d_model) table: sin in even columns, cos in odd."""
        table = torch.zeros(num_positions, d_model)
        position = torch.arange(start=0, end=num_positions, step=1).float().unsqueeze(1)
        div_term = 1/torch.tensor(10000.0)**(torch.arange(start=0, end=d_model, step=2).float() / d_model)
        angles = position * div_term
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return table

    def forward(self, x):
        """Return ``x`` plus the encodings of its positions.

        The sequence axis is the second-to-last one, so both unbatched
        (seq_len, d_model) and batched (batch, seq_len, d_model) inputs work.
        Slicing by ``x.size(0)`` would use the batch size for batched input and
        give every token the position-0 encoding when the batch size is 1.
        """
        seq_len = x.size(-2)
        if seq_len <= self.pe.size(0):
            encodings = self.pe[:seq_len, :]
        else:
            # Longer than the precomputed table: build the encodings on the fly
            # rather than failing with a broadcasting error.
            encodings = self._build_table(seq_len, self.pe.size(1)).to(
                device=self.pe.device, dtype=self.pe.dtype
            )
        return x + encodings

class Attention(nn.Module):
    """Single attention head with bias-free query/key/value projections.

    Args:
        d_model: Width of the incoming encodings and of each projection.
        row_dim: Axis of the projected keys swapped with ``col_dim`` before the
            query-key product.
        col_dim: Axis used for the scaling factor and for the softmax.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        super().__init__()
        self.row_dim = row_dim
        self.col_dim = col_dim
        # Created in q, k, v order so seeded initialisation stays the same.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, encodings_q, encodings_k, encodings_v, mask=None):
        """Return the attention-weighted combination of the projected values.

        ``mask`` is optional; entries where it is True are filled with -1e9
        before the softmax, which makes their attention weight effectively zero.
        """
        queries = self.W_q(encodings_q)
        keys = self.W_k(encodings_k)
        values = self.W_v(encodings_v)

        # Divide the similarities by sqrt(size of the keys along col_dim).
        scale = torch.tensor(keys.size(self.col_dim) ** 0.5)
        keys_t = keys.transpose(dim0=self.row_dim, dim1=self.col_dim)
        scores = torch.matmul(queries, keys_t) / scale

        if mask is not None:
            scores = scores.masked_fill(mask=mask, value=-1e9)

        weights = F.softmax(scores, dim=self.col_dim)
        return torch.matmul(weights, values)

class MultiHeadAttention(nn.Module):
    """Runs several ``Attention`` heads and merges them back to ``d_model``.

    Every head works at full ``d_model`` width, so the concatenated heads are
    ``num_heads * d_model`` wide before the final linear layer. The default
    ``row_dim=1, col_dim=2`` match (B, L, D) inputs.
    """

    def __init__(self, d_model=2, row_dim=1, col_dim=2, num_heads=1):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(Attention(d_model, row_dim, col_dim))
        self.heads = nn.ModuleList(head_modules)
        self.reduce_attention_dim = nn.Linear(num_heads * d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Apply every head to (q, k, v, mask) and project the concatenation."""
        per_head = [
            head(encodings_q=q, encodings_k=k, encodings_v=v, mask=mask)
            for head in self.heads
        ]
        # Concatenating on the last (feature) axis gives a width of
        # num_heads * d_model, which is what reduce_attention_dim expects.
        merged = torch.cat(per_head, dim=-1)
        return self.reduce_attention_dim(merged)

class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> activation -> dropout -> Linear.

    Args:
        d_model: Input and output width.
        d_ff: Hidden width.
        dropout: Dropout probability applied to the hidden activations. The
            value 0.0 disables dropout.
        activation: Activation class, instantiated without arguments.
    """

    def __init__(self, d_model, d_ff, dropout, activation=nn.ReLU):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.act = activation()
        # Previously `dropout` was accepted but never used. nn.Dropout has no
        # parameters, so the state_dict layout is unchanged.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the network to the last axis of ``x``; the shape is preserved."""
        hidden = self.act(self.fc1(x))
        hidden = self.dropout(hidden)  # active only in train mode
        return self.fc2(hidden)

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and feed-forward, each followed by a
    residual connection and layer normalisation (post-LN).

    Args:
        d_model: Width of the encodings.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Forwarded to ``FeedForward``.
    """

    def __init__(self, d_model=2, num_heads=1, d_ff=8, dropout=0.0):
        super().__init__()
        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, activation=nn.ReLU)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Transform ``x`` (annotated in the notebook as (B, L, d_model))."""
        # Unmasked self-attention: queries, keys and values all come from x.
        attended = self.mha(x, x, x, mask=None)
        x = self.norm1(x + attended)
        return self.norm2(x + self.ffn(x))

class Encoder(nn.Module):
    """Token embedding + position encoding followed by a stack of encoder blocks.

    Args:
        num_tokens: Size of the input vocabulary.
        d_model: Embedding width.
        max_len: Number of positions given to ``PositionEncoding``.
        n_blocks: Number of stacked ``EncoderBlock`` layers.
        num_heads: Attention heads per block.
    """

    def __init__(self, num_tokens=4, d_model=2, max_len=3, n_blocks=1, num_heads=1):
        super().__init__()
        # Embedding and positional encoding.
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
        self.pe = PositionEncoding(d_model=d_model, max_len=max_len)
        # Stacked encoder blocks; the feed-forward width is fixed at 8.
        blocks = []
        for _ in range(n_blocks):
            blocks.append(EncoderBlock(d_model=d_model, num_heads=num_heads, d_ff=8))
        self.encoder_blocks = nn.ModuleList(blocks)

    def forward(self, token_ids):
        """Embed ``token_ids`` and pass the result through every block in order."""
        hidden = self.pe(self.we(token_ids))
        for layer in self.encoder_blocks:
            # Each block consumes the previous block's output.
            hidden = layer(hidden)
        return hidden

class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, encoder-decoder attention and a
    feed-forward network, each followed by a residual connection and LayerNorm.

    Args:
        d_model: Width of the encodings.
        num_heads: Attention heads in both attention sub-layers.
        d_ff: Hidden width of the feed-forward network.
        dropout: Forwarded to ``FeedForward``.
    """

    def __init__(self, d_model=2, num_heads=1, d_ff=8, dropout=0.0):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.mha2 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, activation=nn.ReLU)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, encoder_out, mask):
        """Run the three sub-layers on ``x``.

        ``mask`` is applied only to the self-attention; the attention over
        ``encoder_out`` is unmasked.
        """
        self_attended = self.mha1(q=x, k=x, v=x, mask=mask)
        x = self.norm1(x + self_attended)

        # Queries come from the decoder, keys and values from the encoder.
        cross_attended = self.mha2(q=x, k=encoder_out, v=encoder_out, mask=None)
        x = self.norm2(x + cross_attended)

        return self.norm3(x + self.ffn(x))

class Decoder(nn.Module):
    """Token embedding + position encoding, a stack of decoder blocks and a
    final linear layer producing one logit per output token.

    Args:
        num_tokens: Size of the output vocabulary (also the number of logits).
            The default of 5 is kept for compatibility; the ``output_vocab``
            defined in this notebook has 6 entries, so pass its length.
        d_model: Embedding width.
        max_len: Number of positions given to ``PositionEncoding``.
        n_blocks: Number of stacked ``DecoderBlock`` layers.
        num_heads: Attention heads per block.
    """

    def __init__(self, num_tokens=5, d_model=2, max_len=3, n_blocks=1, num_heads=1):
        super().__init__()
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
        self.pe = PositionEncoding(d_model=d_model, max_len=max_len)
        self.decoder_blocks = nn.ModuleList([
            DecoderBlock(d_model=d_model, num_heads=num_heads, d_ff=8)
            for _ in range(n_blocks)
        ])
        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)
        # No device is stored here: the mask is created on the input's device,
        # because picking "cuda" whenever it is available breaks a CPU-resident
        # model on a CUDA machine.

    def forward(self, token_ids, encoder_out):
        """Return logits of shape (batch, seq_len, num_tokens).

        Args:
            token_ids: Integer tensor of shape (batch_size, seq_len).
            encoder_out: Encoder output attended to by every block.
        """
        # The mask depends on the sequence length (axis 1), not the batch size.
        seq_len = token_ids.size(1)
        x = self.pe(self.we(token_ids))

        # Causal mask of shape (seq_len, seq_len): True strictly above the
        # diagonal, i.e. the future positions a token must not attend to.
        mask = torch.tril(torch.ones(seq_len, seq_len, device=token_ids.device)) == 0

        for block in self.decoder_blocks:
            x = block(x, encoder_out, mask)
        return self.fc_layer(x)

class Transformer(L.LightningModule):
    """Encoder-decoder translation model packaged as a LightningModule.

    The output vocabulary is taken from the notebook-level ``output_vocab``.

    Args:
        input_size: Number of tokens in the input vocabulary.
        output_size: Number of tokens in the output vocabulary.
        d_model: Embedding width shared by encoder and decoder.
        max_len: Positions given to both encoder and decoder.
    """

    def __init__(self, input_size, output_size, d_model=2, max_len=3):
        super().__init__()
        # Use the sizes the caller passed in; they used to be ignored in favour
        # of the global vocabularies.
        self.encoder = Encoder(num_tokens=input_size, d_model=d_model, max_len=max_len)
        self.decoder = Decoder(num_tokens=output_size, d_model=d_model, max_len=max_len)
        self.output_vocab = output_vocab
        # Take the padding id from the vocabulary this model decodes with
        # rather than from the mutable global PAD_IDX.
        self.loss = nn.CrossEntropyLoss(ignore_index=self.output_vocab['<PAD>'])
        self.max_len = max_len

    def forward(self, inputs, labels):
        """Encode ``inputs`` and decode ``labels``; returns pre-softmax logits.

        No explicit ``.to(self.device)`` is used here; the notebook relies on
        Lightning to place batches on the module's device during training.
        """
        encoder_values = self.encoder(inputs)
        return self.decoder(labels, encoder_values)

    def configure_optimizers(self):
        """Adam over all parameters with a learning rate of 0.005."""
        return Adam(self.parameters(), lr=0.005)

    def training_step(self, batch, batch_idx):
        """Teacher-forced cross-entropy for a batch of (encoder input, target)."""
        encoder_input, decoder_target = batch

        if encoder_input.dim() == 1:
            encoder_input = encoder_input.unsqueeze(0)
            decoder_target = decoder_target.unsqueeze(0)

        # Decoder input = <SOS> followed by the target shifted right by one.
        # One <SOS> per row, so any batch size works (a fixed (1, 1) tensor
        # could only be concatenated when the batch size is 1).
        sos_column = torch.full(
            (decoder_target.size(0), 1),
            self.output_vocab['<SOS>'],
            dtype=decoder_target.dtype,
            device=decoder_target.device,
        )
        decoder_input = torch.cat([sos_column, decoder_target[:, :-1]], dim=1)

        output = self.forward(encoder_input, decoder_input)

        # Flatten (batch, seq, vocab) -> (batch * seq, vocab) for the loss.
        loss = self.loss(output.reshape(-1, output.size(-1)), decoder_target.reshape(-1))
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def predict(self, input_sentence_tensor, max_output_len=10):
        """Greedy autoregressive translation of a single sentence.

        Args:
            input_sentence_tensor: Input token ids, shape (seq_len,) or (1, seq_len).
            max_output_len: Maximum number of tokens to generate.

        Returns:
            List of predicted output words, without ``<SOS>``/``<EOS>``.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                if input_sentence_tensor.dim() == 1:
                    input_sentence_tensor = input_sentence_tensor.unsqueeze(0)

                encoder_output = self.encoder(input_sentence_tensor.to(self.device))
                decoder_input = torch.tensor([[self.output_vocab['<SOS>']]], device=self.device)
                predicted_tokens = []

                for _ in range(max_output_len):
                    output_logits = self.decoder(decoder_input, encoder_output)
                    # Only the logits of the newest position decide the next token.
                    predicted_token_id = torch.argmax(output_logits[:, -1, :], dim=-1)

                    if predicted_token_id.item() == self.output_vocab['<EOS>']:
                        break

                    predicted_tokens.append(predicted_token_id.item())
                    decoder_input = torch.cat([decoder_input, predicted_token_id.unsqueeze(0)], dim=1)

                id_to_word = {token_id: word for word, token_id in self.output_vocab.items()}
                return [id_to_word[token_id] for token_id in predicted_tokens]
        finally:
            # Restore the mode the module was in instead of leaving it in eval.
            self.train(was_training)

# Model initialisation and training.
# Seed before building the model so weight initialisation and training are
# reproducible across "Restart & Run All".
L.seed_everything(42, workers=True)
transformer = Transformer(len(input_vocab), len(output_vocab), d_model=16, max_len=3)
trainer = L.Trainer(max_epochs=50, enable_progress_bar=True)
trainer.fit(transformer, train_dataloaders=dataloader)

INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: 
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | encoder | Encoder          | 1.4 K  | train
1 | decoder | D

Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=50` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=50` reached.


In [None]:
# Inference example: translate one sentence with greedy decoding.
input_sentence = "lets go"
input_tokens = []
for word in input_sentence.split():
    input_tokens.append(input_vocab[word])
input_tensor = torch.tensor(input_tokens)

print(f"Menerjemahkan: '{input_sentence}' -> {input_tensor}")
predicted_translation = transformer.predict(input_tensor)
print(f"Hasil prediksi: {predicted_translation}")

Menerjemahkan: 'lets go' -> tensor([1, 3])
Hasil prediksi: ['vamos', 'y']
