In [None]:
'''Adapted from https://github.com/chrisvdweth/nus-cs4248x/blob/master/3-neural-nlp/Section%204.2%20-%20Transformer%20Machine%20Translation.ipynb'''
!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download zh_core_web_sm
!pip install -U torchdata
!pip install datasets
!pip install portalocker
!pip install sentencepiece

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting zh-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl (48.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-pkuseg<0.1.0,>=0.0.27 (from

In [None]:
import math
from tqdm.notebook import tqdm
from typing import Iterable, List
from timeit import default_timer as timer
import torch
import torch.nn as nn
import pandas as pd
from torch import Tensor
from torch.nn import Transformer
import sentencepiece as spm

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

Mounted at /content/drive


In [None]:
# Prefer the first CUDA GPU when one is visible, otherwise run on the CPU.
use_cuda = torch.cuda.is_available()

# Use this line below to enforce the use of the CPU
#use_cuda = False

if use_cuda:
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

print("Available device: {}".format(DEVICE))

Available device: cuda:0


In [None]:
from datasets import load_dataset

# Hub id of the parallel corpus; all three splits come from the same dataset.
DATASET_NAME = "ngxingyu/iwslt17_google_trans_scores_sentiments"


def _load_split(split):
    """Load one named split of ``DATASET_NAME`` with ``load_dataset``."""
    # streaming=True is intentionally left off, as in the original calls.
    return load_dataset(DATASET_NAME, split=split)


train_dataset = _load_split("train")
test_dataset = _load_split("test")
validation_dataset = _load_split("validation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/733 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/53.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/231k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.97M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/229736 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/875 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8549 [00:00<?, ? examples/s]

In [None]:
# Show the summary (feature names and row counts) of the train and validation splits.
print(train_dataset, validation_dataset)

Dataset({
    features: ['bleurt_score', 'comet_score', 'en', 'google_zh', 'zh', 'en_sentiment', 'zh_sentiment'],
    num_rows: 229736
}) Dataset({
    features: ['bleurt_score', 'comet_score', 'en', 'google_zh', 'zh', 'en_sentiment', 'zh_sentiment'],
    num_rows: 875
})


In [None]:
# Translation direction: the source and target column names of the dataset.
SRC_LANGUAGE, TGT_LANGUAGE = 'en', 'zh'

# Placeholders: per-language registries, filled in by later cells
# (two independent dicts).
token_transform, vocab_transform = {}, {}

In [None]:
# Pull each language column once so rows can be paired up by position.
en_data = train_dataset[SRC_LANGUAGE]
zh_data = train_dataset[TGT_LANGUAGE]

# One {language: sentence} record per training row.
data = []
with tqdm(total=len(train_dataset)) as progress_bar:
    for row in range(len(train_dataset)):
        data.append({SRC_LANGUAGE: en_data[row], TGT_LANGUAGE: zh_data[row]})
        progress_bar.update(1)

# Joint corpus: every English sentence first, then every Chinese sentence.
all_sentences = [pair[SRC_LANGUAGE] for pair in data]
all_sentences.extend(pair[TGT_LANGUAGE] for pair in data)

# Write the corpus to disk, one sentence per line, as input for the trainer.
with open("all_sentences.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(sentence + "\n" for sentence in all_sentences)

# Prefix of the files produced by the trainer (the model is loaded later
# from f"{model_prefix}.model").
model_prefix = "spm_model"
# Train a single SentencePiece model on the combined EN+ZH corpus.
spm.SentencePieceTrainer.train(input="all_sentences.txt", model_prefix=model_prefix, vocab_size=10000)

  0%|          | 0/229736 [00:00<?, ?it/s]

In [None]:
# Load the SentencePiece model trained in the previous cell.
tokeniser = spm.SentencePieceProcessor()
tokeniser.load(f"{model_prefix}.model")

# Sanity check: tokenise both sides of the first training pair.
entry = data[0]
english_tokens, chinese_tokens = (
    tokeniser.encode_as_pieces(entry[language]) for language in ('en', 'zh')
)
print("English tokens:", english_tokens)
print("Chinese tokens:", chinese_tokens)

English tokens: ['▁Thank', '▁you', '▁so', '▁much', ',', '▁Chris', '.', '▁And', '▁it', "'", 's', '▁truly', '▁a', '▁great', '▁honor', '▁to', '▁have', '▁the', '▁opportunity', '▁to', '▁come', '▁to', '▁this', '▁stage', '▁twice', ';', '▁I', "'", 'm', '▁extremely', '▁grateful', '.']
Chinese tokens: ['▁', '非常', '谢谢', ',', '克里斯', '。', '的确', '非常', '荣幸', '▁能', '有', '第二', '次', '站在', '这个', '台', '上', '的机会', ',', '我', '真是', '非常', '感', '激', '。']


In [None]:
from torch.utils.data import Dataset
from typing import Iterable, List

class CustomDataset(Dataset):
    """Dataset of parallel ``{'en': ..., 'zh': ...}`` sentence records.

    The constructor receives a source that can be indexed by column name
    (``source['en']``, ``source['zh']``) and by ``len``; it is converted once
    into a plain list of per-row dicts stored in ``self.data_iter``.
    Every sentence tokenised through this object is also recorded in
    ``self.tokens`` under its language key.
    """

    def __init__(self, data_iter: Iterable, tokeniser):
        self.tokeniser = tokeniser
        self.data_iter = data_iter
        self.process_data_iter()
        # language -> list of token lists produced by tokenize_sentence
        self.tokens = {SRC_LANGUAGE: [], TGT_LANGUAGE: []}

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data_iter)

    def __getitem__(self, idx):
        """Return the ``{'en': ..., 'zh': ...}`` record at position ``idx``."""
        return self.data_iter[idx]

    def process_data_iter(self):
        """Replace the column-indexable source with a list of per-row dicts."""
        source = self.data_iter
        english = source['en']
        chinese = source['zh']
        self.data_iter = [
            {'en': english[row], 'zh': chinese[row]}
            for row in range(len(source))
        ]

    def tokenize_sentence(self, sentence: str, language: str):
        """Tokenise ``sentence``, remember the pieces under ``language``, return them."""
        pieces = self.tokeniser.encode_as_pieces(sentence)
        self.tokens[language].append(pieces)
        return pieces

    def yield_tokens(self, language: str):
        """Lazily yield the token list of every record's ``language`` sentence.

        A tqdm bar is advanced once per yielded sentence.
        """
        with tqdm(total=len(self.data_iter)) as bar:
            for record in self.data_iter:
                yield self.tokenize_sentence(record[language], language)
                bar.update(1)

In [None]:
# Wrap every split so that it exposes a list of {'en', 'zh'} records.
custom_train_dataset, custom_validation_dataset, custom_test_dataset = (
    CustomDataset(split, tokeniser)
    for split in (train_dataset, validation_dataset, test_dataset)
)

# Special symbols, listed in index order so they land on the indices below
# when inserted first into the vocabulary.
special_symbols = ['<PAD>', '<UNK>', '<SOS>', '<EOS>']
PAD_IDX = 0
UNK_IDX = 1
SOS_IDX = 2
EOS_IDX = 3

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Build one torchtext Vocab per language from the training sentences only.
    vocab_transform[ln] = build_vocab_from_iterator(
        custom_train_dataset.yield_tokens(ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )
    # Unknown tokens map to UNK_IDX; without a default index the lookup of an
    # unseen token raises ``RuntimeError``.
    vocab_transform[ln].set_default_index(UNK_IDX)

  0%|          | 0/229736 [00:00<?, ?it/s]

  0%|          | 0/229736 [00:00<?, ?it/s]

In [None]:
# Report how many distinct tokens (specials included) each vocabulary holds.
print('EN vocab length:', len(vocab_transform[SRC_LANGUAGE]))
print('ZH vocab length:', len(vocab_transform[TGT_LANGUAGE]))

EN vocab length: 3881
ZH vocab length: 9611


In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A ``(maxlen, 1, emb_size)`` table is precomputed: even feature columns
    hold ``sin(pos * freq)`` and odd feature columns hold ``cos(pos * freq)``.
    The table is registered as a buffer, so it follows the module across
    devices but is not a trainable parameter.

    Args:
        emb_size: size of the embedding (last) dimension; may be odd.
        dropout: dropout probability applied after the addition.
        maxlen: number of positions in the precomputed table.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For an odd emb_size there is one odd-indexed column fewer than there
        # are frequencies, so drop the surplus cosine column instead of failing
        # on a shape mismatch. For an even emb_size the slice is a no-op.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis so the table broadcasts over the middle dim.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)``.

        The table is sliced by ``token_embedding.size(0)``, i.e. the first
        dimension of the input is treated as the sequence position.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])


class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Cast ``tokens`` to long, embed them and apply the scale factor."""
        scale = math.sqrt(self.emb_size)
        embedded = self.embedding(tokens.long())
        return embedded * scale

In [None]:
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Combines scaled token embeddings and a shared positional encoding with
    ``torch.nn.Transformer`` and a linear layer that projects decoder outputs
    to target-vocabulary logits.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Submodules are created in a fixed order so that seeded parameter
        # initialisation stays reproducible.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full model and return target-vocabulary logits.

        No memory (cross-attention) mask is applied; only the padding masks
        and the two attention masks are forwarded to the Transformer.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and run only the encoder stack."""
        embedded = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(embedded, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and run only the decoder stack against ``memory``."""
        embedded = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(embedded, memory, tgt_mask=tgt_mask)

In [None]:
def generate_square_subsequent_mask(sz, device=None):
    """Return the ``(sz, sz)`` boolean causal mask.

    Entry ``[i, j]`` is ``True`` when ``j > i`` (a future position that must
    not be attended to) and ``False`` otherwise.

    Args:
        sz: sequence length.
        device: where to create the mask; defaults to the notebook-wide
            ``DEVICE`` when omitted.
    """
    if device is None:
        device = DEVICE
    # Strict upper triangle == "future" positions. This yields the same values
    # as the former float/-inf round trip followed by ``.bool()``.
    return torch.triu(torch.ones((sz, sz), device=device), diagonal=1).bool()


def create_mask(src, tgt, pad_idx=None):
    """Build the attention and padding masks for one batch.

    ``src`` and ``tgt`` are indexed as ``(sequence, batch)``: dim 0 is used as
    the sequence length and the padding masks are transposed to
    ``(batch, sequence)``.

    Args:
        src: source token indices.
        tgt: target token indices fed to the decoder.
        pad_idx: padding index; defaults to the notebook-wide ``PAD_IDX``.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is all ``False`` (nothing masked), ``tgt_mask`` is the
        causal mask, and the padding masks are ``True`` at padded positions.
    """
    if pad_idx is None:
        pad_idx = PAD_IDX

    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    # Create the masks on the same device as the data they describe, so the
    # function also works when the batch is not on the global DEVICE.
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device=tgt.device)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=src.device).type(torch.bool)

    src_padding_mask = (src == pad_idx).transpose(0, 1)
    tgt_padding_mask = (tgt == pad_idx).transpose(0, 1)

    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [None]:
# Tokenization step of the text pipeline, backed by a SentencePieceProcessor
def tokenize_with_sentencepiece(tokenizer):
    """Return a callable mapping a string to ``tokenizer.encode_as_pieces(string)``."""
    return lambda txt_input: tokenizer.encode_as_pieces(txt_input)

# Helper that chains several single-argument transforms into one callable
def sequential_transforms(*transforms):
    """Compose ``transforms`` left to right.

    The returned function feeds its input to the first transform, passes each
    result on to the next one, and returns the final value. With no
    transforms the input is returned unchanged.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result
    return func

# method to add SOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int], sos_idx=None, eos_idx=None):
    """Wrap ``token_ids`` in start/end markers and return a 1-D long tensor.

    Args:
        token_ids: vocabulary indices of one sentence (may be empty).
        sos_idx: start-of-sequence index; defaults to the global ``SOS_IDX``.
        eos_idx: end-of-sequence index; defaults to the global ``EOS_IDX``.

    Returns:
        ``[sos_idx, *token_ids, eos_idx]`` as an int64 tensor.
    """
    if sos_idx is None:
        sos_idx = SOS_IDX
    if eos_idx is None:
        eos_idx = EOS_IDX
    # Force int64 explicitly: torch.tensor([]) would otherwise be float32, so
    # an empty sentence could turn the whole concatenated sequence into floats.
    return torch.cat((torch.tensor([sos_idx], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([eos_idx], dtype=torch.long)))


# Per-language pipelines that turn a raw string into a tensor of indices:
# tokenization -> vocabulary lookup -> SOS/EOS wrapping and tensor creation
text_transform = {
    language: sequential_transforms(
        tokenize_with_sentencepiece(tokeniser),  # Tokenization
        vocab_transform[language],               # Vectorization
        tensor_transform,                        # Add SOS/EOS and create tensor
    )
    for language in [SRC_LANGUAGE, TGT_LANGUAGE]
}


# Collate a list of {'en', 'zh'} records into padded source/target batches
def collate_fn(batch):
    """Convert raw sentence pairs into two padded index tensors.

    Trailing newlines are stripped from each sentence, the per-language
    ``text_transform`` turns it into an index tensor, and the variable-length
    tensors are padded with ``PAD_IDX`` by ``pad_sequence`` (default layout).

    Returns:
        ``(src_batch, tgt_batch)``.
    """
    src_tensors = [
        text_transform[SRC_LANGUAGE](sample[SRC_LANGUAGE].rstrip("\n"))
        for sample in batch
    ]
    tgt_tensors = [
        text_transform[TGT_LANGUAGE](sample[TGT_LANGUAGE].rstrip("\n"))
        for sample in batch
    ]
    return (pad_sequence(src_tensors, padding_value=PAD_IDX),
            pad_sequence(tgt_tensors, padding_value=PAD_IDX))

In [None]:
def train_epoch(model, optimizer, criterion):
    """Train ``model`` for one pass over the training records.

    Batches of ``BATCH_SIZE`` records are drawn, in order, from
    ``custom_train_dataset.data_iter`` and collated with ``collate_fn``.

    Args:
        model: network called as ``model(src, tgt_input, *masks)``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called with flattened logits and flattened targets.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    losses = 0

    train_dataloader = DataLoader(custom_train_dataset.data_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    # Ask the loader for its batch count instead of materialising every batch
    # with list(...): that tokenised and collated the whole training set two
    # extra times per epoch just to obtain a number.
    num_batches = len(train_dataloader)

    for src, tgt in tqdm(train_dataloader, total=num_batches):
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        # Decoder input: drop the last position of every target sequence
        # (typically PAD, can be EOS)
        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        # The source padding mask is reused as the memory key padding mask.
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        # Expected output: the targets shifted by one, i.e. without <SOS>
        tgt_out = tgt[1:, :]

        loss = criterion(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / num_batches


def evaluate(model, criterion):
    """Compute the mean per-batch loss on the validation records.

    Batches of ``BATCH_SIZE`` records are drawn from
    ``custom_validation_dataset.data_iter``. The model is put in eval mode and
    no gradients are recorded.

    Args:
        model: network called as ``model(src, tgt_input, *masks)``.
        criterion: loss called with flattened logits and flattened targets.

    Returns:
        The mean of the per-batch loss values.
    """
    model.eval()
    losses = 0

    val_dataloader = DataLoader(custom_validation_dataset.data_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    # len(DataLoader) gives the batch count without running the collate
    # pipeline over the whole split just to count batches.
    num_batches = len(val_dataloader)

    # Validation never calls backward(), so skip building the autograd graph.
    with torch.no_grad():
        for src, tgt in tqdm(val_dataloader, total=num_batches):
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            # Decoder input: targets without their last position
            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

            # Expected output: targets without <SOS>
            tgt_out = tgt[1:, :]

            loss = criterion(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))  # Use the custom loss function
            losses += loss.item()

    return losses / num_batches

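Here we introduce a new loss function, the **Incomplete Trust (In-trust)** loss, which is used in place of plain cross-entropy to train and evaluate the model below.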

In [None]:
import torch.nn.functional as F

class In_trust_Loss(nn.Module):
    """Incomplete-trust loss: cross-entropy combined with a DCE term.

    ``loss = alpha * CE(logits, labels) - beta * mean(DCE)`` where, per
    position, ``DCE = -sum(p * log(delta * p + (1 - delta) * q))`` with ``p``
    the clamped softmax of the logits and ``q`` the clamped one-hot label.

    Positions whose label equals ``ignore_index`` (padding) are excluded from
    both terms.

    NOTE(review): the DCE term is subtracted, which is why the logged losses
    are negative. Confirm against the In-trust reference whether it should be
    added instead; the sign is kept as originally written.

    Args:
        num_classes: size of the class (vocabulary) dimension of the logits.
        alpha: weight of the cross-entropy term.
        beta: weight of the DCE term.
        delta: mixing factor between prediction and label inside the log.
        ignore_index: label value to leave out of the loss; ``None`` disables
            masking.
    """

    def __init__(self, num_classes, alpha=1, beta=0.8,delta=0.5, ignore_index=PAD_IDX):
        super().__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes
        self.delta = delta
        # -100 is CrossEntropyLoss' own "ignore nothing real" default and can
        # never collide with a valid class index.
        self.ignore_index = -100 if ignore_index is None else ignore_index
        self.cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=self.ignore_index)

    def forward(self, logits, labels):
        """Return the scalar loss for ``logits`` and integer ``labels``."""
        # Cross-entropy term (ignored positions are skipped by the criterion).
        ce = self.cross_entropy(logits, labels)

        # DCE term, computed on flattened (position, class) logits.
        active_logits = logits.view(-1, self.num_classes)
        active_labels = labels.view(-1)

        # Drop ignored positions before one_hot: they must not contribute, and
        # a negative ignore_index would not be a valid one_hot input.
        keep = active_labels != self.ignore_index
        active_logits = active_logits[keep]
        active_labels = active_labels[keep]

        pred = F.softmax(active_logits, dim=1)
        # Clamp both distributions away from zero to keep the log finite.
        pred = torch.clamp(pred, min=1e-7, max=1.0)
        label_one_hot = torch.nn.functional.one_hot(active_labels, self.num_classes).float()
        label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0)
        dce = (-1*torch.sum(pred * torch.log(pred*self.delta + label_one_hot*(1-self.delta)), dim=1))

        loss = self.alpha * ce - self.beta * dce.mean()
        return loss

In [None]:
# Fix the RNG so that parameter initialisation is reproducible.
torch.manual_seed(0)

# Model hyperparameters
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 32
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

# Create model
transformer = Seq2SeqTransformer(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
)

# Initialize weights: Xavier-uniform for every parameter with more than one
# dimension; 1-D parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

# Move model to device (ideally GPU, otherwise CPU)
transformer = transformer.to(DEVICE)

# Define the custom loss function
custom_criterion = In_trust_Loss(num_classes=TGT_VOCAB_SIZE, ignore_index=PAD_IDX)
# Define optimizer
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)



In [None]:
# Number of full passes over the training data.
NUM_EPOCHS = 2

for epoch in range(1, NUM_EPOCHS+1):
    # The timer brackets train_epoch only, so the reported "Epoch time" does
    # not include the validation pass that follows.
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer, custom_criterion)
    end_time = timer()
    # Validation loss is computed with the same criterion as training.
    val_loss = evaluate(transformer, custom_criterion)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time (total) = {(end_time - start_time):.3f}s"))

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 1, Train loss: -3.139, Val loss: -3.230, Epoch time (total) = 615.455s


  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 2, Train loss: -3.388, Val loss: -3.369, Epoch time (total) = 615.287s


In [None]:
# Method to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode one target sequence for a single source sequence.

    Starting from ``start_symbol``, the highest-scoring next token is appended
    at every step until ``EOS_IDX`` is produced or the sequence holds
    ``max_len`` tokens.

    Args:
        model: object providing ``encode``, ``decode`` and ``generator``.
        src: source token indices (moved to ``DEVICE``).
        src_mask: source attention mask (moved to ``DEVICE``).
        max_len: maximum number of tokens in the result, start symbol included.
        start_symbol: index of the first decoder token.

    Returns:
        Long tensor of generated indices, one token per row, beginning with
        ``start_symbol``.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Pure inference: no backward pass follows, so do not record gradients.
    with torch.no_grad():
        # The encoder output does not change between steps; compute and move
        # it once instead of on every iteration.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
        for _ in range(max_len-1):
            tgt_mask = (generate_square_subsequent_mask(ys.size(0)).type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Scores over the target vocabulary for the last generated position.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

# Translate one source sentence into the target language
def translate(model: torch.nn.Module, src_sentence: str):
    """Return the greedy translation of ``src_sentence`` as a string.

    The sentence is converted to indices, decoded for at most
    ``num_tokens + 5`` steps, and the resulting target tokens are joined with
    single spaces after which the ``<SOS>`` / ``<EOS>`` markers are removed.
    """
    model.eval()

    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    # All-False mask: no source position is hidden from any other.
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)

    decoded = greedy_decode(model, src, src_mask, max_len=num_tokens+5, start_symbol=SOS_IDX)
    tgt_tokens = decoded.flatten()

    pieces = vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))
    sentence = " ".join(pieces)
    return sentence.replace("<SOS>", "").replace("<EOS>", "")

def translate_dataset(model, validation_dataset):
    """Translate every record of ``validation_dataset``.

    Args:
        model: translation model handed to ``translate``.
        validation_dataset: sized iterable of records holding the source and
            target sentence under ``SRC_LANGUAGE`` / ``TGT_LANGUAGE``.

    Returns:
        ``pandas.DataFrame`` with columns ``en`` (source sentence), ``zh``
        (reference sentence) and ``translated_output`` (model output).
    """
    rows = [
        {
            'en': sample[SRC_LANGUAGE],
            'zh': sample[TGT_LANGUAGE],
            # Translate the source sentence using the model
            'translated_output': translate(model, sample[SRC_LANGUAGE]),
        }
        for sample in tqdm(validation_dataset, total=len(validation_dataset))
    ]
    return pd.DataFrame(rows)

In [None]:
# Smoke-test the trained model on two short English sentences.
print(translate(transformer, "Thank you very much."))
print(translate(transformer, "And it was a huge success."))

 ▁谢谢 。 
 ▁它 是一个 成功 。 


In [None]:
# Translate every record of the test split (despite the parameter name, the
# helper accepts any list of {'en', 'zh'} records).
translations = translate_dataset(transformer, custom_test_dataset.data_iter)

  0%|          | 0/8549 [00:00<?, ?it/s]

In [None]:
def clean_tranlsations(text):
    """Turn a space-joined SentencePiece string into plain text.

    Every "▁" marker is replaced by a space and surrounding whitespace is
    removed; spacing between tokens is left as it is.

    Args:
        text: model output as produced by ``translate``.

    Returns:
        The cleaned string.
    """
    # "▁" (U+2581) is the word-boundary marker that appears in the tokeniser
    # pieces; it is not an ASCII underscore.
    cleaned_text = text.replace("▁", " ")
    # Always trim. The previous guard only trimmed inputs that had no
    # surrounding whitespace to begin with, so padded model output such as
    # " ▁谢谢 。 " kept its leading and trailing spaces.
    return cleaned_text.strip()

# Clean the model-output column in place, then export the table; the batch
# size is embedded in the file name.
translations['translated_output'] = translations['translated_output'].apply(clean_tranlsations)
translations.to_csv(f'transformer_translations_in_trust_{BATCH_SIZE}.csv', index=True)

In [None]:
# Preview the first rows: source, reference and cleaned model translation.
translations.head()

Unnamed: 0,en,zh,translated_output
0,"Several years ago here at TED, Peter Skillman ...",几年前，在TED大会上， Peter Skillman 介绍了一个设计挑战 叫做“棉花糖挑战”,"几年前 , TED 演讲 者 的 TED 演讲 者 叫做 “ 挑战 ” 的 挑..."
1,And the idea's pretty simple: Teams of four h...,是个非常简单的主意 要求一组四人的团队搭建一个独立的最高建筑 材料是20根意大利面条 一段胶...,想法 是 : 简单的 : 四 种 子 的 四 种 子 使用 的 到 2...
2,The marshmallow has to be on top.,棉花糖必须放在最上面,海 里 有 的 路 线 上 必须 在 上 。
3,"And, though it seems really simple, it's actua...",这虽然看似简单，其实并不容易 因为它要求人们 迅速地合作,"但是 , 这 真的 真的 很简单 , 因为 人们 非常 困难 , 非常 困难 。"
4,"And so, I thought this was an interesting idea...",我觉得这是个有趣的主意 我把它放到了设计专题讨论会上,"所以 , 我 想 一下 这个 有趣的 有趣的 想法 , 我 在 设计 了一个 设计 设..."
