# Machine Translation - NLP - Pytorch 

In [1]:
import os
from tokenizers import ByteLevelBPETokenizer
from sklearn.model_selection import train_test_split

In [4]:
data_path = '/kaggle/input/en-id-dataset/ind.txt'

## Load & Bersihkan Dataset

In [5]:
en_sents = []
id_sents = []

try:
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Pisahkan berdasarkan tab dan ambil dua elemen pertama
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                en_sents.append(parts[0])
                id_sents.append(parts[1])
except FileNotFoundError:
    print(f"Error: File tidak ditemukan di path '{data_path}'")
    print("Pastikan path dataset sudah benar.")
    # Hentikan eksekusi jika file tidak ada
    exit()


print(f"Total pasangan kalimat: {len(en_sents)}")
print("\nContoh data:")
for i in range(5):
    print(f"EN: {en_sents[i]}")
    print(f"ID: {id_sents[i]}")

Total pasangan kalimat: 14881

Contoh data:
EN: Hi.
ID: Hai.
EN: Run!
ID: Lari!
EN: Run.
ID: Lari!
EN: Who?
ID: Siapa?
EN: Wow!
ID: Wow!


## Latih Tokenizer Subward

In [7]:
output_dir = 'tokenizers'
os.makedirs(output_dir, exist_ok=True)

# Ukuran vocabulary
VOCAB_SIZE = 16000 
MIN_FREQUENCY = 2

# Tokenizer Bahasa Inggris
en_tokenizer = ByteLevelBPETokenizer()
en_tokenizer.train_from_iterator(en_sents, vocab_size=VOCAB_SIZE, min_frequency=MIN_FREQUENCY, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
en_tokenizer.save_model(output_dir, "en")

# Tokenizer Bahasa Indonesia
id_tokenizer = ByteLevelBPETokenizer()
id_tokenizer.train_from_iterator(id_sents, vocab_size=VOCAB_SIZE, min_frequency=MIN_FREQUENCY, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
id_tokenizer.save_model(output_dir, "id")

print(f"\nTokenizer dilatih dan disimpan di '{output_dir}'.")
print(f"Ukuran Vocab: {VOCAB_SIZE}")








Tokenizer dilatih dan disimpan di 'tokenizers'.
Ukuran Vocab: 16000


## Bagi Data Menjadi Train, Validation & Test Set  

In [8]:
# pisah kadi data latih dan temp set (validasi + uji)
train_en, temp_en, train_id, temp_id = train_test_split(
    en_sents, id_sents, test_size=0.2, random_state=42
)

# pisah temp set jadi validasi dan uji
val_en, test_en, val_id, test_id = train_test_split(
    temp_en, temp_id, test_size=0.5, random_state=42
)

print(f"\nData berhasil dibagi:")
print(f"Ukuran set Latih: {len(train_en)}")
print(f"Ukuran set Validasi: {len(val_en)}")
print(f"Ukuran set Uji: {len(test_en)}")


Data berhasil dibagi:
Ukuran set Latih: 11904
Ukuran set Validasi: 1488
Ukuran set Uji: 1489


## Simpan Data

In [9]:
data_dir = 'data_split'
os.makedirs(data_dir, exist_ok=True)

def save_sents(sents, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        for sent in sents:
            f.write(sent + '\n')

save_sents(train_en, os.path.join(data_dir, 'train.en'))
save_sents(train_id, os.path.join(data_dir, 'train.id'))
save_sents(val_en, os.path.join(data_dir, 'val.en'))
save_sents(val_id, os.path.join(data_dir, 'val.id'))
save_sents(test_en, os.path.join(data_dir, 'test.en'))
save_sents(test_id, os.path.join(data_dir, 'test.id'))

print(f"\nData yang sudah dibagi disimpan di direktori '{data_dir}'.")
print("\nTahap persiapan data selesai!")


Data yang sudah dibagi disimpan di direktori 'data_split'.

Tahap persiapan data selesai!


## Setup

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
from tokenizers.models import BPE
import os
import random

In [31]:
# setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


## Tokenizer

In [32]:
# kongifurasi & hyperparameter
TOKENIZER_DIR = 'tokenizers'
DATA_DIR = 'data_split'

In [33]:
# English tokenizer
en_tokenizer = Tokenizer(BPE(
    vocab=os.path.join(TOKENIZER_DIR, "en-vocab.json"),
    merges=os.path.join(TOKENIZER_DIR, "en-merges.txt")
))
en_tokenizer.save(os.path.join(TOKENIZER_DIR, "en-tokenizer.json"))

# Indonesian tokenizer
id_tokenizer = Tokenizer(BPE(
    vocab=os.path.join(TOKENIZER_DIR, "id-vocab.json"),
    merges=os.path.join(TOKENIZER_DIR, "id-merges.txt")
))
en_tokenizer.save(os.path.join(TOKENIZER_DIR, "en-tokenizer.json"))

  en_tokenizer = Tokenizer(BPE(
  id_tokenizer = Tokenizer(BPE(


In [13]:
# Hyperparameters
INPUT_DIM = 16000   # Ukuran vocab EN 
OUTPUT_DIM = 16000  # Ukuran vocab ID 
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
BATCH_SIZE = 128

In [36]:
# load tokenizer
en_tokenizer = Tokenizer.from_file(os.path.join(TOKENIZER_DIR, "en-tokenizer.json"))
id_tokenizer = Tokenizer.from_file(os.path.join(TOKENIZER_DIR, "id-tokenizer.json"))

In [38]:
# get id token spesial
SRC_PAD_IDX = en_tokenizer.token_to_id('<pad>')
TRG_PAD_IDX = id_tokenizer.token_to_id('<pad>')
TRG_SOS_IDX = id_tokenizer.token_to_id('<s>')
TRG_EOS_IDX = id_tokenizer.token_to_id('</s>')

## Dataset Class

In [42]:
class TranslationDataset(Dataset):
    def __init__(self, data_dir, lang_pair='en-id', split='train'):
        self.src_sents = self._load_sentences(os.path.join(data_dir, f'{split}.{lang_pair.split("-")[0]}'))
        self.trg_sents = self._load_sentences(os.path.join(data_dir, f'{split}.{lang_pair.split("-")[1]}'))

    def _load_sentences(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    def __len__(self):
        return len(self.src_sents)

    def __getitem__(self, idx):
        return self.src_sents[idx], self.trg_sents[idx]

## Definisikan Function Collate Untuk Data Loader

In [44]:
def collate_fn(batch, src_tokenizer, trg_tokenizer, src_pad_idx, trg_pad_idx, trg_sos_idx, trg_eos_idx, device):
    src_batch, trg_batch = [], []
    for src_sample, trg_sample in batch:
        src_batch.append(torch.tensor(src_tokenizer.encode(src_sample).ids, dtype=torch.long))
        trg_batch.append(torch.tensor([trg_sos_idx] + trg_tokenizer.encode(trg_sample).ids + [trg_eos_idx], dtype=torch.long))

    # Pad sequences
    src_padded = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=src_pad_idx)
    trg_padded = nn.utils.rnn.pad_sequence(trg_batch, batch_first=True, padding_value=trg_pad_idx)

    return src_padded.to(device), trg_padded.to(device)

## Arsitektur Model

In [45]:
# Encoderr
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src_len, batch_size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src_len, batch_size, emb_dim]
        outputs, hidden = self.rnn(embedded)
        # outputs = [src_len, batch_size, hid_dim * num_directions]
        # hidden = [n_layers * num_directions, batch_size, hid_dim]
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        # hidden = [batch_size, dec_hid_dim]
        return outputs, hidden

In [46]:
# ATTENTION
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [batch_size, dec_hid_dim]
        # encoder_outputs = [src_len, batch_size, enc_hid_dim * 2]
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # hidden = [batch_size, src_len, dec_hid_dim]
        # encoder_outputs = [batch_size, src_len, enc_hid_dim * 2]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))
        # energy = [batch_size, src_len, dec_hid_dim]
        attention = self.v(energy).squeeze(2)
        # attention = [batch_size, src_len]
        return torch.softmax(attention, dim=1)


In [47]:
# DECODER
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch_size]
        # hidden = [batch_size, dec_hid_dim]
        # encoder_outputs = [src_len, batch_size, enc_hid_dim * 2]
        input = input.unsqueeze(0)
        # input = [1, batch_size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch_size, emb_dim]
        a = self.attention(hidden, encoder_outputs)
        # a = [batch_size, src_len]
        a = a.unsqueeze(1)
        # a = [batch_size, 1, src_len]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs = [batch_size, src_len, enc_hid_dim * 2]
        weighted = torch.bmm(a, encoder_outputs)
        # weighted = [batch_size, 1, enc_hid_dim * 2]
        weighted = weighted.permute(1, 0, 2)
        # weighted = [1, batch_size, enc_hid_dim * 2]
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        # rnn_input = [1, batch_size, (enc_hid_dim * 2) + emb_dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        # output = [1, batch_size, dec_hid_dim]
        # hidden = [1, batch_size, dec_hid_dim]
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        # prediction = [batch_size, output_dim]
        return prediction, hidden.squeeze(0)

In [48]:
# SEQ2SEQ WRAPPER
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        # src = [src_len, batch_size]
        # trg = [trg_len, batch_size]
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

## Inisialisasi Model $ Training Komponen

In [49]:
# Buat instance model
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)

In [50]:
# Inisialisasi weights
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
model.apply(init_weights) 

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(16000, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(16000, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=16000, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [51]:
# Optimizer
optimizer = optim.Adam(model.parameters())

In [52]:
# Loss function
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [54]:
# DataLoader
train_dataset = TranslationDataset(DATA_DIR, split='train')
valid_dataset = TranslationDataset(DATA_DIR, split='val')

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              collate_fn=lambda b: collate_fn(b, en_tokenizer, id_tokenizer, SRC_PAD_IDX, TRG_PAD_IDX, TRG_SOS_IDX, TRG_EOS_IDX, device))
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False,
                              collate_fn=lambda b: collate_fn(b, en_tokenizer, id_tokenizer, SRC_PAD_IDX, TRG_PAD_IDX, TRG_SOS_IDX, TRG_EOS_IDX, device))

In [55]:
print("\nModel, Optimizer, Loss Function, dan DataLoaders berhasil dibuat.")
print(f'Model memiliki {sum(p.numel() for p in model.parameters() if p.requires_grad):,} parameter yang dapat dilatih.')


Model, Optimizer, Loss Function, dan DataLoaders berhasil dibuat.
Model memiliki 43,313,280 parameter yang dapat dilatih.


In [56]:
# tes ambil 1 batch dari train dataloader
src, trg = next(iter(train_dataloader))
print(f"\nContoh ukuran batch sumber (source): {src.shape}")
print(f"Contoh ukuran batch target: {trg.shape}")


Contoh ukuran batch sumber (source): torch.Size([128, 34])
Contoh ukuran batch target: torch.Size([128, 32])


In [57]:
import torch.optim as optim
from tqdm import tqdm
import time
import math

In [66]:
# fungsi training
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    
    # Menggunakan tqdm untuk progress bar
    for i, batch in enumerate(tqdm(iterator, desc="Training")):
        src, trg = batch
        
        # Transpose batch karena model RNN/GRU di PyTorch
        # secara default mengharapkan input: [seq_len, batch_size]
        src = src.permute(1, 0)
        trg = trg.permute(1, 0)
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        # trg = [trg_len, batch_size]
        # output = [trg_len, batch_size, output_dim]
        
        output_dim = output.shape[-1]
        
        # Reshape output dan target untuk loss function
        # Abaikan token <sos> di awal
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        # Mencegah 'exploding gradients'
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [67]:
# fungsi evaluasi
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for i, batch in enumerate(tqdm(iterator, desc="Evaluating")):
            src, trg = batch
            src = src.permute(1, 0)
            trg = trg.permute(1, 0)

            # Matikan teacher forcing untuk evaluasi
            output = model(src, trg, 0) 
            
            output_dim = output.shape[-1]
            
            output = output[1:].reshape(-1, output_dim)
            trg = trg[1:].reshape(-1)

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)


In [68]:
# helper function untuk menghitung waktu
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Training Main Loop

In [69]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_dataloader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Simpan model jika validation loss membaik
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'baseline-rnn-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

print("\nTraining selesai!")
print(f"Model terbaik disimpan sebagai 'baseline-rnn-model.pt' dengan validation loss: {best_valid_loss:.3f}")

Training: 100%|██████████| 93/93 [00:49<00:00,  1.87it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.58it/s]


Epoch: 01 | Time: 0m 53s
	Train Loss: 5.668 | Train PPL: 289.582
	 Val. Loss: 5.209 |  Val. PPL: 182.890


Training: 100%|██████████| 93/93 [00:49<00:00,  1.88it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.58it/s]


Epoch: 02 | Time: 0m 52s
	Train Loss: 4.833 | Train PPL: 125.621
	 Val. Loss: 5.267 |  Val. PPL: 193.824


Training: 100%|██████████| 93/93 [00:49<00:00,  1.88it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.54it/s]


Epoch: 03 | Time: 0m 52s
	Train Loss: 4.520 | Train PPL:  91.863
	 Val. Loss: 5.013 |  Val. PPL: 150.367


Training: 100%|██████████| 93/93 [00:48<00:00,  1.90it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.54it/s]


Epoch: 04 | Time: 0m 52s
	Train Loss: 4.267 | Train PPL:  71.301
	 Val. Loss: 4.956 |  Val. PPL: 142.091


Training: 100%|██████████| 93/93 [00:48<00:00,  1.92it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.59it/s]


Epoch: 05 | Time: 0m 51s
	Train Loss: 4.082 | Train PPL:  59.287
	 Val. Loss: 4.847 |  Val. PPL: 127.372


Training: 100%|██████████| 93/93 [00:48<00:00,  1.92it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.56it/s]


Epoch: 06 | Time: 0m 51s
	Train Loss: 3.885 | Train PPL:  48.648
	 Val. Loss: 4.794 |  Val. PPL: 120.792


Training: 100%|██████████| 93/93 [00:49<00:00,  1.89it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.56it/s]


Epoch: 07 | Time: 0m 52s
	Train Loss: 3.693 | Train PPL:  40.147
	 Val. Loss: 4.681 |  Val. PPL: 107.827


Training: 100%|██████████| 93/93 [00:48<00:00,  1.91it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.53it/s]


Epoch: 08 | Time: 0m 52s
	Train Loss: 3.503 | Train PPL:  33.208
	 Val. Loss: 4.612 |  Val. PPL: 100.678


Training: 100%|██████████| 93/93 [00:48<00:00,  1.93it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.57it/s]


Epoch: 09 | Time: 0m 51s
	Train Loss: 3.301 | Train PPL:  27.127
	 Val. Loss: 4.578 |  Val. PPL:  97.289


Training: 100%|██████████| 93/93 [00:50<00:00,  1.85it/s]
Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.57it/s]


Epoch: 10 | Time: 0m 53s
	Train Loss: 3.055 | Train PPL:  21.229
	 Val. Loss: 4.482 |  Val. PPL:  88.442

Training selesai!
Model terbaik disimpan sebagai 'baseline-rnn-model.pt' dengan validation loss: 4.482


## Import Module SacreBLEU 

In [72]:
!pip install sacrebleu

import spacy
from tqdm import tqdm
import sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m592.7 kB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1


In [73]:
# laod model
model.load_state_dict(torch.load('baseline-rnn-model.pt'))
print("Model 'baseline-rnn-model.pt' berhasil dimuat.")

Model 'baseline-rnn-model.pt' berhasil dimuat.


## Fungsi Terjemahan

In [74]:
def translate_sentence(sentence, src_tokenizer, trg_tokenizer, model, device, max_len=50):
    model.eval()

    # Tokenisasi kalimat sumber
    src_tokens = src_tokenizer.encode(sentence.lower()).ids
    src_tensor = torch.LongTensor(src_tokens).unsqueeze(1).to(device) # [src_len, 1]

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

    # Dapatkan token <sos> dan <eos> dari tokenizer target
    trg_sos_idx = trg_tokenizer.token_to_id('<s>')
    trg_eos_idx = trg_tokenizer.token_to_id('</s>')
    
    # Mulai output dengan token <sos>
    trg_indexes = [trg_sos_idx]

    for i in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        
        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)
        
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)

        if pred_token == trg_eos_idx:
            break
    
    # Konversi kembali dari index ke token
    trg_tokens = trg_tokenizer.decode(trg_indexes, skip_special_tokens=True)
    
    return trg_tokens

In [75]:
# fungsi evaluasi kuantiatif dengan SacreBLEU
def calculate_bleu(dataset, src_tokenizer, trg_tokenizer, model, device):
    trgs = []
    preds = []
    
    for src_sent, trg_sent in tqdm(dataset, desc="Calculating BLEU"):
        pred_trg = translate_sentence(src_sent, src_tokenizer, trg_tokenizer, model, device)
        
        preds.append(pred_trg)
        trgs.append(trg_sent)
        
    # Sacrebleu mengharapkan list of references, kita bungkus dalam list tambahan
    bleu = sacrebleu.corpus_bleu(preds, [trgs])
    
    return bleu

In [76]:
# load dataset 
test_dataset = TranslationDataset(DATA_DIR, split='test')

In [77]:
# hitung score bleu
bleu_score = calculate_bleu(test_dataset, en_tokenizer, id_tokenizer, model, device)
print(f'\nBLEU score on test set = {bleu_score.score:.2f}')

Calculating BLEU: 100%|██████████| 1489/1489 [00:18<00:00, 78.40it/s]



BLEU score on test set = 0.07


In [81]:
# fungsi evaluasi kualitatif 
def show_random_examples(dataset, num_examples=5):
    model.eval()
    
    print("\n--- Contoh Hasil Terjemahan ---")
    for _ in range(num_examples):
        src, trg = random.choice(dataset)
        
        translated_sentence = translate_sentence(src, en_tokenizer, id_tokenizer, model, device)
        
        print(f"\nSumber (EN)      : {src}")
        print(f"Target (ID)      : {trg}")
        print(f"Prediksi Model   : {translated_sentence}")

valid_dataset = TranslationDataset(DATA_DIR, split='val')
show_random_examples(valid_dataset)


--- Contoh Hasil Terjemahan ---

Sumber (EN)      : Are you talking to me?
Target (ID)      : Apakah kau sedang bicara padaku?
Prediksi Model   : <s> Kamu m el ihat b el ihat b erapa ? </s>

Sumber (EN)      : I won't go there anymore.
Target (ID)      : Aku tidak akan pergi ke sana lagi.
Prediksi Model   : <s> Di at idak akan m em akai p ad anya . </s>

Sumber (EN)      : Tom saw Mary eating an apple.
Target (ID)      : Tom melihat Mary memakan apel.
Prediksi Model   : <s> Izinkan m em buat m em buat m em buat m em buat m em buat m em buat k enal . </s>

Sumber (EN)      : You're not the only one who's hungry.
Target (ID)      : Bukan hanya kamu saja yang merasa lapar.
Prediksi Model   : <s> Kamu s idak m eng h ar us m el ajar b agi . </s>

Sumber (EN)      : What have you got?
Target (ID)      : Kalian punya apa?
Prediksi Model   : <s> Kamu m anak amu akan k amu ? </s>


## Arsitektur Model

In [82]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_dim, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, emb_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2).float() * (-math.log(10000.0) / emb_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [83]:
class Transformer(nn.Module):
    def __init__(self, 
                 src_vocab_size, 
                 trg_vocab_size, 
                 src_pad_idx, 
                 trg_pad_idx,
                 emb_dim=256, 
                 nhead=8, 
                 num_encoder_layers=3,
                 num_decoder_layers=3, 
                 dim_feedforward=512, 
                 dropout=0.1,
                 device='cpu'):
        super().__init__()

        self.device = device
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_dim)
        self.trg_tok_emb = nn.Embedding(trg_vocab_size, emb_dim)
        self.positional_encoding = PositionalEncoding(emb_dim, dropout)
        
        # PyTorch sudah menyediakan implementasi Transformer standar
        self.transformer = nn.Transformer(d_model=emb_dim,
                                          nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout,
                                          batch_first=False) # Kita set False agar sesuai dengan RNN
        
        self.generator = nn.Linear(emb_dim, trg_vocab_size)
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz, device=self.device)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def _create_padding_mask(self, pad_idx, sequence):
        return (sequence == pad_idx).transpose(0, 1)

    def forward(self, src, trg):
        # src: [src_len, batch_size]
        # trg: [trg_len, batch_size]

        src_seq_len = src.shape[0]
        trg_seq_len = trg.shape[0]

        src_padding_mask = self._create_padding_mask(self.src_pad_idx, src)
        trg_padding_mask = self._create_padding_mask(self.trg_pad_idx, trg)

        trg_mask = self._generate_square_subsequent_mask(trg_seq_len)

        src_emb = self.positional_encoding(self.src_tok_emb(src))
        trg_emb = self.positional_encoding(self.trg_tok_emb(trg))

        output = self.transformer(src_emb, trg_emb,
                                  src_mask=None, # Tidak perlu untuk encoder
                                  tgt_mask=trg_mask,
                                  memory_mask=None, # Tidak perlu
                                  src_key_padding_mask=src_padding_mask,
                                  tgt_key_padding_mask=trg_padding_mask,
                                  memory_key_padding_mask=src_padding_mask)
        
        return self.generator(output)

## Inisialisai Model, Optimizer, Loss

In [84]:
# Hyperparameter
INPUT_DIM = 16000
OUTPUT_DIM = 16000
EMB_DIM = 256
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
DROPOUT = 0.1

In [85]:
# Model
transformer_model = Transformer(src_vocab_size=INPUT_DIM,
                                trg_vocab_size=OUTPUT_DIM,
                                src_pad_idx=SRC_PAD_IDX,
                                trg_pad_idx=TRG_PAD_IDX,
                                emb_dim=EMB_DIM,
                                nhead=NHEAD,
                                num_encoder_layers=NUM_ENCODER_LAYERS,
                                num_decoder_layers=NUM_DECODER_LAYERS,
                                dim_feedforward=FFN_HID_DIM,
                                dropout=DROPOUT,
                                device=device).to(device)



In [86]:
# Weight
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)
transformer_model.apply(initialize_weights)

Transformer(
  (src_tok_emb): Embedding(16000, 256)
  (trg_tok_emb): Embedding(16000, 256)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=1e-05, elementwise

In [88]:
# optimizer 
transformer_optimizer = optim.Adam(transformer_model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [89]:
# Loss function tetap sama
print("Model Transformer, Optimizer, dan Loss Function berhasil dibuat.")
print(f'Model memiliki {sum(p.numel() for p in transformer_model.parameters() if p.requires_grad):,} parameter yang dapat dilatih.')

Model Transformer, Optimizer, dan Loss Function berhasil dibuat.
Model memiliki 16,258,688 parameter yang dapat dilatih.


## Fungsi Training & Evaluasi Untuk Transformer

In [96]:
# fungsi training
def train_transformer(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(tqdm(iterator, desc="Training")):
        src, trg = batch
        
        # Transpose batch agar sesuai dengan input model [seq_len, batch_size]
        src = src.permute(1, 0)
        trg = trg.permute(1, 0)
        
        optimizer.zero_grad()
        
        # Siapkan input dan target untuk decoder
        # Input tidak menyertakan token <eos> terakhir
        trg_input = trg[:-1, :]
        
        # Model memprediksi berdasarkan src dan trg_input
        output = model(src, trg_input)
        
        output_dim = output.shape[-1]
        
        # Reshape output untuk loss function
        # Target untuk loss tidak menyertakan token <sos> pertama
        output = output.reshape(-1, output_dim)
        trg_output = trg[1:, :].reshape(-1)
        
        loss = criterion(output, trg_output)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [97]:
def evaluate_transformer(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for i, batch in enumerate(tqdm(iterator, desc="Evaluating")):
            src, trg = batch
            src = src.permute(1, 0)
            trg = trg.permute(1, 0)

            trg_input = trg[:-1, :]
            
            output = model(src, trg_input)
            
            output_dim = output.shape[-1]
            
            output = output.reshape(-1, output_dim)
            trg_output = trg[1:, :].reshape(-1)

            loss = criterion(output, trg_output)
            
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)

In [98]:
# fungsi helper untuk hitung waktu
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Training Loop Utama

In [99]:
N_EPOCHS = 15
CLIP = 1

best_valid_loss = float('inf')

In [100]:
for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # Gunakan model dan optimizer Transformer
    train_loss = train_transformer(transformer_model, train_dataloader, transformer_optimizer, criterion, CLIP)
    valid_loss = evaluate_transformer(transformer_model, valid_dataloader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Simpan model jika validation loss membaik
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(transformer_model.state_dict(), 'transformer-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

print("\nTraining Transformer selesai!")
print(f"Model terbaik disimpan sebagai 'transformer-model.pt' dengan validation loss: {best_valid_loss:.3f}")

Training: 100%|██████████| 93/93 [00:06<00:00, 15.01it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 41.31it/s]


Epoch: 01 | Time: 0m 6s
	Train Loss: 7.899 | Train PPL: 2695.223
	 Val. Loss: 6.434 |  Val. PPL: 622.452


Training: 100%|██████████| 93/93 [00:06<00:00, 15.12it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 41.44it/s]


Epoch: 02 | Time: 0m 6s
	Train Loss: 5.861 | Train PPL: 351.058
	 Val. Loss: 5.507 |  Val. PPL: 246.397


Training: 100%|██████████| 93/93 [00:06<00:00, 15.02it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 39.87it/s]


Epoch: 03 | Time: 0m 6s
	Train Loss: 5.359 | Train PPL: 212.479
	 Val. Loss: 5.248 |  Val. PPL: 190.145


Training: 100%|██████████| 93/93 [00:06<00:00, 15.23it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 39.52it/s]


Epoch: 04 | Time: 0m 6s
	Train Loss: 5.136 | Train PPL: 170.028
	 Val. Loss: 5.019 |  Val. PPL: 151.196


Training: 100%|██████████| 93/93 [00:06<00:00, 15.27it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 40.65it/s]


Epoch: 05 | Time: 0m 6s
	Train Loss: 4.956 | Train PPL: 141.989
	 Val. Loss: 4.921 |  Val. PPL: 137.147


Training: 100%|██████████| 93/93 [00:06<00:00, 15.21it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 40.54it/s]


Epoch: 06 | Time: 0m 6s
	Train Loss: 4.871 | Train PPL: 130.469
	 Val. Loss: 4.852 |  Val. PPL: 127.998


Training: 100%|██████████| 93/93 [00:06<00:00, 14.94it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 41.10it/s]


Epoch: 07 | Time: 0m 6s
	Train Loss: 4.797 | Train PPL: 121.100
	 Val. Loss: 4.768 |  Val. PPL: 117.688


Training: 100%|██████████| 93/93 [00:06<00:00, 15.21it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 40.22it/s]


Epoch: 08 | Time: 0m 6s
	Train Loss: 4.698 | Train PPL: 109.781
	 Val. Loss: 4.664 |  Val. PPL: 106.033


Training: 100%|██████████| 93/93 [00:06<00:00, 15.04it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 41.12it/s]


Epoch: 09 | Time: 0m 6s
	Train Loss: 4.601 | Train PPL:  99.595
	 Val. Loss: 4.584 |  Val. PPL:  97.877


Training: 100%|██████████| 93/93 [00:06<00:00, 15.28it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 41.27it/s]


Epoch: 10 | Time: 0m 6s
	Train Loss: 4.520 | Train PPL:  91.854
	 Val. Loss: 4.521 |  Val. PPL:  91.946


Training: 100%|██████████| 93/93 [00:06<00:00, 15.04it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 41.37it/s]


Epoch: 11 | Time: 0m 6s
	Train Loss: 4.455 | Train PPL:  86.016
	 Val. Loss: 4.467 |  Val. PPL:  87.122


Training: 100%|██████████| 93/93 [00:06<00:00, 15.14it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 40.56it/s]


Epoch: 12 | Time: 0m 6s
	Train Loss: 4.393 | Train PPL:  80.896
	 Val. Loss: 4.416 |  Val. PPL:  82.765


Training: 100%|██████████| 93/93 [00:06<00:00, 15.04it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 39.98it/s]


Epoch: 13 | Time: 0m 6s
	Train Loss: 4.334 | Train PPL:  76.232
	 Val. Loss: 4.337 |  Val. PPL:  76.511


Training: 100%|██████████| 93/93 [00:06<00:00, 15.14it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 41.14it/s]


Epoch: 14 | Time: 0m 6s
	Train Loss: 4.272 | Train PPL:  71.688
	 Val. Loss: 4.285 |  Val. PPL:  72.627


Training: 100%|██████████| 93/93 [00:06<00:00, 15.06it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 41.00it/s]


Epoch: 15 | Time: 0m 6s
	Train Loss: 4.208 | Train PPL:  67.207
	 Val. Loss: 4.213 |  Val. PPL:  67.586

Training Transformer selesai!
Model terbaik disimpan sebagai 'transformer-model.pt' dengan validation loss: 4.213


## Load Model Transformasi Terbaik

In [101]:
transformer_model.load_state_dict(torch.load('transformer-model.pt'))
print("Model Transformer terbaik 'transformer-model.pt' berhasil dimuat.")

Model Transformer terbaik 'transformer-model.pt' berhasil dimuat.


## Fungsi Terjemahan Untuk Transform

In [102]:
def translate_sentence_transformer(sentence, src_tokenizer, trg_tokenizer, model, device, max_len=50):
    model.eval()

    # Tokenisasi kalimat sumber
    src_tokens = src_tokenizer.encode(sentence.lower()).ids
    src_tensor = torch.LongTensor(src_tokens).unsqueeze(1).to(device) # [src_len, 1]

    # Buat padding mask untuk source
    src_padding_mask = model._create_padding_mask(model.src_pad_idx, src_tensor)

    with torch.no_grad():
        # Encoder memproses seluruh kalimat sumber sekali
        memory = model.transformer.encoder(model.positional_encoding(model.src_tok_emb(src_tensor)), 
                                            src_key_padding_mask=src_padding_mask)
    
    # Dapatkan token <sos> dan <eos>
    trg_sos_idx = trg_tokenizer.token_to_id('<s>')
    trg_eos_idx = trg_tokenizer.token_to_id('</s>')
    
    # Mulai output dengan token <sos>
    trg_indexes = [trg_sos_idx]

    for i in range(max_len):
        # Buat tensor dari output sejauh ini
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device) # [trg_len, 1]
        
        # Buat subsequent mask untuk target
        trg_mask = model._generate_square_subsequent_mask(trg_tensor.size(0))

        with torch.no_grad():
            # Decoder memprediksi token berikutnya
            output = model.transformer.decoder(model.positional_encoding(model.trg_tok_emb(trg_tensor)), 
                                               memory, 
                                               tgt_mask=trg_mask)
            
            # Ambil prediksi dari token terakhir saja
            pred = model.generator(output[-1, :, :])
        
        pred_token = pred.argmax(1).item()
        trg_indexes.append(pred_token)

        if pred_token == trg_eos_idx:
            break
    
    # Konversi kembali dari index ke token
    trg_tokens = trg_tokenizer.decode(trg_indexes, skip_special_tokens=True)
    
    return trg_tokens

In [103]:
# fungsi calculate
def calculate_bleu_transformer(dataset, src_tokenizer, trg_tokenizer, model, device):
    trgs = []
    preds = []
    
    for src_sent, trg_sent in tqdm(dataset, desc="Calculating BLEU"):
        pred_trg = translate_sentence_transformer(src_sent, src_tokenizer, trg_tokenizer, model, device)
        
        preds.append(pred_trg)
        trgs.append(trg_sent)
        
    bleu = sacrebleu.corpus_bleu(preds, [trgs])
    
    return bleu

In [104]:
# Muat data uji
test_dataset = TranslationDataset(DATA_DIR, split='test')

In [105]:
# Hitung skor BLEU
bleu_score_transformer = calculate_bleu_transformer(test_dataset, en_tokenizer, id_tokenizer, transformer_model, device)
print(f'\nBLEU score (Transformer) on test set = {bleu_score_transformer.score:.2f}')

Calculating BLEU: 100%|██████████| 1489/1489 [01:05<00:00, 22.88it/s]



BLEU score (Transformer) on test set = 0.01


## Evaluasi Kualitatif

In [106]:
def show_random_examples_transformer(dataset, num_examples=5):
    print("\n--- Contoh Hasil Terjemahan (Transformer) ---")
    for _ in range(num_examples):
        src, trg = random.choice(dataset)
        
        translated_sentence = translate_sentence_transformer(src, en_tokenizer, id_tokenizer, transformer_model, device)
        
        print(f"\nSumber (EN)      : {src}")
        print(f"Target (ID)      : {trg}")
        print(f"Prediksi Model   : {translated_sentence}")

# Ambil dari data validasi
valid_dataset = TranslationDataset(DATA_DIR, split='val')
show_random_examples_transformer(valid_dataset)


--- Contoh Hasil Terjemahan (Transformer) ---

Sumber (EN)      : Tom isn't afraid of death.
Target (ID)      : Tom tidak takut mati.
Prediksi Model   : <s> Aku t idak m emb el akukan m ang m ang . </s>

Sumber (EN)      : A horse is an animal.
Target (ID)      : Kuda adalah binatang.
Prediksi Model   : <s> Itu m ang m ang m ang . </s>

Sumber (EN)      : I have no knife to cut with.
Target (ID)      : Aku tidak punya pisau untuk memotongnya.
Prediksi Model   : <s> Di it amu m ang m ang m ang m ang . </s>

Sumber (EN)      : You must start soon.
Target (ID)      : Kamu harus mulai secepatnya.
Prediksi Model   : <s> Di ak amu m ang m ang m ang m ang . </s>

Sumber (EN)      : Tom sat down on the sand next to Mary.
Target (ID)      : Tom duduk di atas pasir di sebelah Mary.
Prediksi Model   : <s> Aku t idak m emb el akukan m ang m ang m ang m ang m ang . </s>


## Ablation Study

In [112]:
# jumlah parameter sama, jumlah layer berbeda
ABLATED_NUM_LAYERS = 1

ablated_model = Transformer(src_vocab_size=INPUT_DIM,
                            trg_vocab_size=OUTPUT_DIM,
                            src_pad_idx=SRC_PAD_IDX,
                            trg_pad_idx=TRG_PAD_IDX,
                            emb_dim=EMB_DIM,
                            nhead=NHEAD,
                            # Perubahan utama ada di sini
                            num_encoder_layers=ABLATED_NUM_LAYERS,
                            num_decoder_layers=ABLATED_NUM_LAYERS,
                            dim_feedforward=FFN_HID_DIM,
                            dropout=DROPOUT,
                            device=device).to(device)

In [113]:
# Inisialisasi weights
ablated_model.apply(initialize_weights)

Transformer(
  (src_tok_emb): Embedding(16000, 256)
  (trg_tok_emb): Embedding(16000, 256)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=1e-05, elementwise_affin

In [115]:
# optimizer baru
ablated_optimizer = optim.Adam(ablated_model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

print(f'\nModel Ablated memiliki {sum(p.numel() for p in ablated_model.parameters() if p.requires_grad):,} parameter yang dapat dilatih.')


Model Ablated memiliki 13,622,912 parameter yang dapat dilatih.


## Latih Model Ablated

In [116]:
N_EPOCHS = 10 
CLIP = 1
best_ablated_valid_loss = float('inf')

In [117]:
for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss = train_transformer(ablated_model, train_dataloader, ablated_optimizer, criterion, CLIP)
    valid_loss = evaluate_transformer(ablated_model, valid_dataloader, criterion)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_ablated_valid_loss:
        best_ablated_valid_loss = valid_loss
        torch.save(ablated_model.state_dict(), 'transformer-ablated-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} | PPL: {math.exp(valid_loss):7.3f}')

print("\nTraining model ablated selesai!")

Training: 100%|██████████| 93/93 [00:03<00:00, 23.38it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 56.04it/s]


Epoch: 01 | Time: 0m 4s
	Train Loss: 7.960 | PPL: 2864.890
	 Val. Loss: 6.483 | PPL: 653.662


Training: 100%|██████████| 93/93 [00:03<00:00, 23.66it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 55.64it/s]


Epoch: 02 | Time: 0m 4s
	Train Loss: 5.912 | PPL: 369.345
	 Val. Loss: 5.549 | PPL: 257.108


Training: 100%|██████████| 93/93 [00:03<00:00, 23.50it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 54.94it/s]


Epoch: 03 | Time: 0m 4s
	Train Loss: 5.407 | PPL: 222.869
	 Val. Loss: 5.277 | PPL: 195.829


Training: 100%|██████████| 93/93 [00:03<00:00, 23.38it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 53.18it/s]


Epoch: 04 | Time: 0m 4s
	Train Loss: 5.168 | PPL: 175.591
	 Val. Loss: 5.063 | PPL: 157.987


Training: 100%|██████████| 93/93 [00:03<00:00, 23.49it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 52.94it/s]


Epoch: 05 | Time: 0m 4s
	Train Loss: 4.976 | PPL: 144.830
	 Val. Loss: 4.933 | PPL: 138.828


Training: 100%|██████████| 93/93 [00:04<00:00, 23.05it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 55.71it/s]


Epoch: 06 | Time: 0m 4s
	Train Loss: 4.880 | PPL: 131.604
	 Val. Loss: 4.872 | PPL: 130.535


Training: 100%|██████████| 93/93 [00:03<00:00, 23.68it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 55.61it/s]


Epoch: 07 | Time: 0m 4s
	Train Loss: 4.812 | PPL: 122.933
	 Val. Loss: 4.799 | PPL: 121.399


Training: 100%|██████████| 93/93 [00:03<00:00, 23.49it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 52.20it/s]


Epoch: 08 | Time: 0m 4s
	Train Loss: 4.725 | PPL: 112.757
	 Val. Loss: 4.695 | PPL: 109.366


Training: 100%|██████████| 93/93 [00:03<00:00, 23.41it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 55.64it/s]


Epoch: 09 | Time: 0m 4s
	Train Loss: 4.610 | PPL: 100.519
	 Val. Loss: 4.562 | PPL:  95.765


Training: 100%|██████████| 93/93 [00:03<00:00, 23.75it/s]
Evaluating: 100%|██████████| 12/12 [00:00<00:00, 55.65it/s]


Epoch: 10 | Time: 0m 4s
	Train Loss: 4.495 | PPL:  89.540
	 Val. Loss: 4.458 | PPL:  86.284

Training model ablated selesai!


## Evaluasi Model Ablated

In [118]:
# muat weights terbaik dari model ablated
ablated_model.load_state_dict(torch.load('transformer-ablated-model.pt'))

# htung skor BLEU pada test set
bleu_score_ablated = calculate_bleu_transformer(test_dataset, en_tokenizer, id_tokenizer, ablated_model, device)

print("\n--- Hasil Ablation Study ---")
print(f"Model Transformer Asli (3 layers) -> BLEU Score: {bleu_score_transformer.score:.2f}")
print(f"Model Transformer Ablated (1 layer) -> BLEU Score: {bleu_score_ablated.score:.2f}")

Calculating BLEU: 100%|██████████| 1489/1489 [00:33<00:00, 44.78it/s]



--- Hasil Ablation Study ---
Model Transformer Asli (3 layers) -> BLEU Score: 0.01
Model Transformer Ablated (1 layer) -> BLEU Score: 0.01
