# Importing libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_dataset
from torchtext.vocab import build_vocab_from_iterator
import io
from collections import Counter, OrderedDict
from torchtext.vocab import Vocab

2024-04-28 17:25:35.843794: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-28 17:25:35.843824: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-28 17:25:35.844542: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-28 17:25:35.848237: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# pip install torch torchvision torchaudio torchtext --extra-index-url https://download.pytorch.org/whl/cu12.2

# Using dataset from huggingface for English to Chinese

In [3]:
dataset = load_dataset("larryvrh/WikiMatrix-v1-En_Zh-filtered")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'zh'],
        num_rows: 678099
    })
})

In [5]:
# Extract the dataset from DatasetDict
full_dataset = dataset['train']

In [6]:
# Convert to a pandas DataFrame if necessary
df = full_dataset.to_pandas()

In [7]:
df.head()

Unnamed: 0,en,zh
0,"On 14 October 2013, Foreign Minister Erlan Idr...","2013年10月14日,外交部长厄兰·伊德里索夫会见了乌克兰外长列昂里德·库扎哈。"
1,Richard's misfortunes seemed to follow him int...,理查的逃脱为他带来了好处。
2,STK has been deployed by many mobile operators...,"全球众多移动运营商已使用STK技术部署了众多应用,这些应用通常是基于菜单式操作,提供诸如手机..."
3,Due to the semantics of some programming langu...,"由于某些编程语言的语义,编译器生成的代码允许在线程A执行完变量的初始化之前,更新变量并将其指..."
4,"In the Wa language, spoken in the borderlands ...","在云南省与掸邦边境地区的佤语中,称呼中国人的词为Hox/Hawx,发音为/hɔʔ/。"


In [8]:
# Take a subset from the full data
df = df[:30000]

In [9]:
# Split into train and test+valid
train_df, test_valid_df = train_test_split(df, test_size=0.2, random_state=42)

# Split test+valid into test and valid
test_df, valid_df = train_test_split(test_valid_df, test_size=0.5, random_state=42)

# If needed, convert these DataFrames back to datasets
train_data = Dataset.from_pandas(train_df)
valid_data = Dataset.from_pandas(valid_df)
test_data = Dataset.from_pandas(test_df)

In [10]:
print(f"Training data: {len(train_data)} rows")
print(f"Validation data: {len(valid_data)} rows")
print(f"Test data: {len(test_data)} rows")


Training data: 24000 rows
Validation data: 3000 rows
Test data: 3000 rows


In [11]:
train_data[10]

{'en': 'This amendment shall not be so construed as to affect the election or term of any Senator chosen before it becomes valid as part of the Constitution.',
 'zh': '本条修正案不得作如此解释,以致影响在本条修正案作为宪法的一部分生效以前当选的任何参议员的选举或任期。',
 '__index_level_0__': 25721}

# Tokenizers

In [12]:
#!python -m spacy download en_core_web_sm


In [13]:
#!python -m spacy download zh_core_web_md

In [14]:
en_nlp = spacy.load("en_core_web_sm")
zh_nlp = spacy.load("zh_core_web_md")

In [15]:
string = "In theory, there’s no difference between theory and practice. In practice there is !"

[token.text for token in en_nlp.tokenizer(string)]

['In',
 'theory',
 ',',
 'there',
 '’s',
 'no',
 'difference',
 'between',
 'theory',
 'and',
 'practice',
 '.',
 'In',
 'practice',
 'there',
 'is',
 '!']

In [16]:
def tokenize_example(example, en_nlp, zh_nlp, max_length, lower, sos_token, eos_token):
    # Extract and check the type of English and Chinese texts
    en_text = example['en']
    zh_text = example['zh']

    # Ensure the texts are strings (you may need to adapt this based on your actual data structure)
    if isinstance(en_text, list):
        en_text = ' '.join(en_text)
    if isinstance(zh_text, list):
        zh_text = ' '.join(zh_text)

    # Tokenize texts
    en_tokens = [token.text for token in en_nlp.tokenizer(en_text)][:max_length]
    zh_tokens = [token.text for token in zh_nlp.tokenizer(zh_text)][:max_length]

    # Convert tokens to lowercase if required
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        zh_tokens = [token.lower() for token in zh_tokens]

    # Add start and end of sentence tokens
    en_tokens = [sos_token] + en_tokens + [eos_token]
    zh_tokens = [sos_token] + zh_tokens + [eos_token]

    return {'en_tokens': en_tokens, 'zh_tokens': zh_tokens}



In [17]:
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "zh_nlp": zh_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [18]:
train_data[0]

{'en': "Amy Carter, former President Jimmy Carter's daughter.",
 'zh': '艾美·卡特, 卡特总统的女儿。',
 '__index_level_0__': 21753,
 'en_tokens': ['<sos>',
  'amy',
  'carter',
  ',',
  'former',
  'president',
  'jimmy',
  'carter',
  "'s",
  'daughter',
  '.',
  '<eos>'],
 'zh_tokens': ['<sos>', '艾美·卡特', ',', '卡特', '总统', '的', '女儿', '。', '<eos>']}

# Building Vocabularies

In [19]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"
sos_token = "<sos>"
eos_token = "<eos>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

zh_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["zh_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

In [20]:
len(en_vocab)

19855

In [21]:
len(zh_vocab)

22022

In [33]:
assert en_vocab[unk_token] == zh_vocab[unk_token]
assert en_vocab[pad_token] == zh_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [34]:
en_vocab.set_default_index(unk_index)
zh_vocab.set_default_index(unk_index)

###  'numericalize_example' function which we'll use with the map method of our dataset. This will "numericalize" (a fancy way of saying convert tokens to indices) our tokens in each example using the vocabularies

In [35]:
def numericalize_example(example, en_vocab, zh_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    zh_ids = zh_vocab.lookup_indices(example["zh_tokens"])
    return {"en_ids": en_ids, "zh_ids": zh_ids}

In [36]:
fn_kwargs = {"en_vocab": en_vocab, "zh_vocab": zh_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [37]:
train_data[0]


{'en': "Amy Carter, former President Jimmy Carter's daughter.",
 'zh': '艾美·卡特, 卡特总统的女儿。',
 '__index_level_0__': 21753,
 'en_tokens': ['<sos>',
  'amy',
  'carter',
  ',',
  'former',
  'president',
  'jimmy',
  'carter',
  "'s",
  'daughter',
  '.',
  '<eos>'],
 'zh_tokens': ['<sos>', '艾美·卡特', ',', '卡特', '总统', '的', '女儿', '。', '<eos>'],
 'en_ids': [2, 6025, 5598, 5, 359, 150, 4650, 5598, 24, 1067, 6, 3],
 'zh_ids': [2, 0, 5, 9793, 230, 4, 1117, 6, 3]}

#  Data Loaders

In [38]:
data_type = "torch"
format_columns = ["en_ids", "zh_ids"]

train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

In [39]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_zh_ids = [example["zh_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_zh_ids = nn.utils.rnn.pad_sequence(batch_zh_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "zh_ids": batch_zh_ids,
        }
        return batch

    return collate_fn

In [40]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [41]:
batch_size = 128

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

# Building the Model

In [42]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # outputs are always from the top hidden layer
        return hidden, cell

In [43]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # n directions in the decoder will both always be 1, therefore:
        # hidden = [n layers, batch size, hidden dim]
        # context = [n layers, batch size, hidden dim]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # output = [seq length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # seq length and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, hidden dim]
        # hidden = [n layers, batch size, hidden dim]
        # cell = [n layers, batch size, hidden dim]
        prediction = self.fc_out(output.squeeze(0))
        # prediction = [batch size, output dim]
        return prediction, hidden, cell

In [68]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        # input = [batch size]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, hidden dim]
            # cell = [n layers, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

# Training the Model

## Model Initialization

In [69]:
input_dim = len(zh_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 1024
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2Seq(encoder, decoder, device).to(device)

In [70]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(22022, 256)
    (rnn): LSTM(256, 1024, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(19855, 256)
    (rnn): LSTM(256, 1024, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=1024, out_features=19855, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [71]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 58,367,631 trainable parameters


## Optimizer

In [72]:
optimizer = optim.Adam(model.parameters())

## Loss Function

In [73]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

## Training Loop

In [74]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["zh_ids"].to(device)
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

## Model Training

In [76]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.4

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

 10%|█         | 1/10 [01:59<17:59, 119.96s/it]

	Train Loss:   6.760 | Train PPL: 862.249
	Valid Loss:   6.508 | Valid PPL: 670.487


 20%|██        | 2/10 [04:01<16:06, 120.82s/it]

	Train Loss:   6.525 | Train PPL: 682.107
	Valid Loss:   6.499 | Valid PPL: 664.395


 30%|███       | 3/10 [06:03<14:09, 121.37s/it]

	Train Loss:   6.405 | Train PPL: 604.711
	Valid Loss:   6.487 | Valid PPL: 656.359


 40%|████      | 4/10 [08:06<12:11, 121.89s/it]

	Train Loss:   6.291 | Train PPL: 539.687
	Valid Loss:   6.482 | Valid PPL: 653.563


 50%|█████     | 5/10 [10:07<10:08, 121.70s/it]

	Train Loss:   6.167 | Train PPL: 476.960
	Valid Loss:   6.383 | Valid PPL: 591.718


 60%|██████    | 6/10 [12:07<08:04, 121.13s/it]

	Train Loss:   6.051 | Train PPL: 424.666
	Valid Loss:   6.336 | Valid PPL: 564.644


 70%|███████   | 7/10 [14:11<06:06, 122.13s/it]

	Train Loss:   5.929 | Train PPL: 375.775
	Valid Loss:   6.312 | Valid PPL: 551.087


 80%|████████  | 8/10 [16:13<04:03, 121.97s/it]

	Train Loss:   5.836 | Train PPL: 342.442
	Valid Loss:   6.308 | Valid PPL: 548.941


 90%|█████████ | 9/10 [18:16<02:02, 122.33s/it]

	Train Loss:   5.749 | Train PPL: 313.842
	Valid Loss:   6.292 | Valid PPL: 539.981


100%|██████████| 10/10 [20:17<00:00, 121.79s/it]

	Train Loss:   5.658 | Train PPL: 286.692
	Valid Loss:   6.302 | Valid PPL: 545.845





# Evaluating the Model

## Evaluation Loop

In [75]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["zh_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

## Evaluation

In [65]:
model.load_state_dict(torch.load("tut1-model.pt"))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 6.321 | Test PPL: 555.865 |


In [60]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    zh_nlp,
    en_vocab,
    zh_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in zh_nlp.tokenizer(sentence)]
        else:
            tokens = [token for token in sentence]
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]
        ids = zh_vocab.lookup_indices(tokens)
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(tensor)
        inputs = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == en_vocab[eos_token]:
                break
        tokens = en_vocab.lookup_tokens(inputs)
    return tokens

In [61]:
sentence = test_data[0]["zh"]
expected_translation = test_data[0]["en"]

sentence, expected_translation

('召至榻前,问所以匡弼储君者,对称旨。', 'They beg him for help as they melt before him.')

In [62]:
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    zh_nlp,
    en_vocab,
    zh_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [63]:
translation

['<sos>',
 'the',
 '<unk>',
 'of',
 'the',
 '<unk>',
 ',',
 'the',
 '<unk>',
 ',',
 '<unk>',
 ',',
 '<unk>',
 ',',
 'and',
 '<unk>',
 '.',
 '<eos>']

# Since we are not training the model for more epochs the model has yet to learn the nuances in language which is causing the poor results.