In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
  
# Load the pretrained tokenizer published under the name "bert-base-chinese".
# NOTE(review): `tokenizer` is not referenced again in the visible cells; later
# code goes through `token_transform[...]` instead.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
# The masked-LM weights are not loaded; the line is kept only for reference.
# model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

In [None]:
! wget http://data.statmt.org/news-commentary/v16/training/news-commentary-v16.en-zh.tsv.gz
! gunzip news-commentary-v16.en-zh.tsv.gz

In [None]:
import pandas as pd
# Read the tab-separated parallel corpus (no header row), drop rows with a
# missing side and renumber the index.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed rows are still skipped.
df = (
    pd.read_csv(
        'news-commentary-v16.en-zh.tsv',
        sep='\t',
        header=None,
        on_bad_lines='skip',
    )
    .dropna()
    .reset_index(drop=True)
)
# Shuffle once with a fixed seed so later positional train/val/test slices are random but repeatable.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
print("Len df is:", len(df))
df.head()

In [None]:
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
from transformers import AutoTokenizer


# Keys used to look up per-language resources below.
SRC_LANGUAGE = 'chinese'
TGT_LANGUAGE = 'english'

# Number of rows of `df` to use (currently all of them).
LEN_DATA = len(df)

# Per-language slots, filled in further down (vocab_transform stays empty here).
token_transform = dict.fromkeys((SRC_LANGUAGE, TGT_LANGUAGE))
vocab_transform = dict.fromkeys((SRC_LANGUAGE, TGT_LANGUAGE))

# (source, target) sentence pairs: column 1 of `df` is used as the source side
# and column 0 as the target side.
dataset = list(zip(list(df.iloc[:LEN_DATA][1]), list(df.iloc[:LEN_DATA][0])))

# One pretrained tokenizer per language.
token_transform[SRC_LANGUAGE] = AutoTokenizer.from_pretrained("bert-base-chinese")
token_transform[TGT_LANGUAGE] = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
import torch
from typing import Tuple, List
def get_input_ids(lines: pd.core.series.Series, tokenizer: AutoTokenizer) -> List:
    """Tokenize each sentence and return one 1-D ``torch.long`` tensor of ids per sentence.

    Args:
        lines: iterable of sentences. Annotated as a pandas Series, but it is
            only iterated; the training and evaluation loops pass the batch of
            strings produced by the DataLoader.
        tokenizer: callable whose result for a sentence exposes the token ids
            under the ``'input_ids'`` key.

    Returns:
        List of tensors in input order. Lengths may differ between sentences;
        no padding is applied here.
    """
    return [
        torch.tensor(tokenizer(sentence)['input_ids'], dtype=torch.long)
        for sentence in lines
    ]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [None]:
# Seed every random number generator used in this notebook for reproducibility.
SEED = 9120

# PyTorch generators (CPU and CUDA).
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# NumPy and the Python standard library.
np.random.seed(SEED)
random.seed(SEED)
# Ask cuDNN for deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings, then apply dropout.

    The table is computed once for ``maxlen`` positions and stored as the
    non-trainable buffer ``pos_embedding`` of shape ``(maxlen, 1, emb_size)``.
    Even columns hold sines and odd columns hold cosines.

    Args:
        emb_size: width of the token embeddings. Odd sizes are supported.
        dropout: dropout probability applied to the sum.
        maxlen: number of positions to precompute; inputs must not be longer.
    """
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        # Division term 1/(10000^(2i/emb_size)), one entry per even column.
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one fewer odd (cos) column than even
        # (sin) columns, so trim the angles to the number of cos columns.
        # For even emb_size the slice is a no-op.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis so the table broadcasts over axis 1 of the input.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)``.

        Positions are indexed by axis 0 of ``token_embedding``, so the input is
        expected to be sequence-first.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Look up an embedding vector for every token id and scale it by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        # Lookup table with one row of width emb_size per vocabulary entry.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the scaled embeddings; ids are cast to ``long`` before the lookup."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer with its own embeddings and an output projection.

    Token ids are embedded (``TokenEmbedding``), combined with positional
    encodings (``PositionalEncoding``), passed through ``torch.nn.Transformer``
    and projected to target-vocabulary logits by ``generator``.

    Args:
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        emb_size: embedding width, also used as the Transformer ``d_model``.
        nhead: number of attention heads.
        src_vocab_size: size of the source vocabulary.
        tgt_vocab_size: size of the target vocabulary (width of the logits).
        dim_feedforward: hidden width of the feed-forward sub-layers.
        dropout: dropout probability used by the Transformer and the positional encoding.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 128,
                 dropout: float = 0.1):
        super().__init__()
        # Sub-modules are created in a fixed order so that seeded random
        # initialisation stays reproducible.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Return target-vocabulary logits for every position of ``trg``.

        No memory mask is used; the padding masks are forwarded unchanged to
        the underlying Transformer.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src=src_emb,
            tgt=tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Run only the encoder on ``src`` and return its output (the decoder memory)."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Run only the decoder on ``tgt`` against ``memory``; the result is not yet projected to logits."""
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

In [None]:
def generate_square_subsequent_mask(sz, device=None):
    """Build the additive causal attention mask for a sequence of length ``sz``.

    Args:
        sz: sequence length.
        device: device to allocate the mask on. Defaults to the notebook-wide
            ``DEVICE`` when omitted.

    Returns:
        Float tensor of shape ``(sz, sz)`` holding ``0.0`` on and below the
        diagonal and ``-inf`` above it, so position ``i`` can only attend to
        positions ``<= i``.
    """
    if device is None:
        device = DEVICE
    # Start from an all -inf square and keep only the strictly upper triangle;
    # torch.triu zeroes everything else.
    return torch.triu(torch.full((sz, sz), float('-inf'), device=device), diagonal=1)


def create_mask(src, tgt):
    """Build the attention and padding masks for one (src, tgt) batch.

    Both inputs are indexed sequence-first: axis 0 is taken as the sequence length.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-False boolean square (no restriction),
        ``tgt_mask`` is the causal mask, and the padding masks are True where
        the token equals ``PAD_IDX``, transposed so axis 0 and axis 1 are swapped.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # The encoder may attend everywhere; the decoder only to earlier positions.
    src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [None]:
# Re-seed right before building the model so its initial weights are repeatable.
torch.manual_seed(9120)

# Vocabulary sizes are taken from the pretrained tokenizers.
SRC_VOCAB_SIZE = token_transform[SRC_LANGUAGE].vocab_size
print(SRC_VOCAB_SIZE)
TGT_VOCAB_SIZE = token_transform[TGT_LANGUAGE].vocab_size
print(TGT_VOCAB_SIZE)

# Model and batching hyper-parameters.
EMB_SIZE = 256
NHEAD = 8
FFN_HID_DIM = 256
BATCH_SIZE = 4
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
# Id used to pad batches; it is also the index the loss ignores.
PAD_IDX = 0

transformer = Seq2SeqTransformer(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
)

# Xavier-initialise every parameter with more than one dimension (weight
# matrices); 1-D parameters such as biases keep their default initialisation.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)
print(DEVICE)

In [None]:
# Cross entropy over target tokens; positions whose target is PAD_IDX are ignored.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Adam over all model parameters.
optimizer = torch.optim.Adam(
    params=transformer.parameters(),
    lr=0.0001 * 5,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [None]:
import random
from torch.utils.data import DataLoader

from torch.nn.utils.rnn import pad_sequence

# Positional split of `dataset`: the first 80% is used for training and the
# following 10% for validation. `train_index` is the end of the training slice;
# `val_index` is the length of the validation slice, not a position.
train_index = 8 * len(dataset) // 10
print(train_index)
val_index = len(dataset) // 10
print(val_index)

# Number of batches per gradient-accumulation window.
accumulation_steps = 32
def train_epoch(model, optimizer):
    """Run one pass over the training split using gradient accumulation.

    Each batch is tokenized, padded with ``PAD_IDX``, moved to ``DEVICE`` and
    fed to the model with teacher forcing. Gradients from
    ``accumulation_steps`` consecutive batches are summed (each loss is scaled
    by ``1 / accumulation_steps``) before a single optimizer step is taken.

    Args:
        model: the seq2seq model; called with the token tensors and the four
            masks from ``create_mask`` (the source padding mask is also used
            as the memory padding mask).
        optimizer: optimizer over ``model``'s parameters.

    Returns:
        Mean unscaled per-batch loss over the epoch.
    """
    model.train()
    losses = 0
    train_dataloader = DataLoader(dataset[0:train_index], batch_size=BATCH_SIZE)

    index = 0
    # Start the first accumulation window from clean gradients.
    optimizer.zero_grad()
    for index, (src, tgt) in enumerate(train_dataloader, start=1):
        src_tokens = get_input_ids(src, token_transform[SRC_LANGUAGE])
        tgt_tokens = get_input_ids(tgt, token_transform[TGT_LANGUAGE])

        src_tokens = pad_sequence(src_tokens, padding_value=PAD_IDX)
        tgt_tokens = pad_sequence(tgt_tokens, padding_value=PAD_IDX)

        src_tokens = src_tokens.to(DEVICE)
        tgt_tokens = tgt_tokens.to(DEVICE)

        # Teacher forcing: the decoder sees the target without its last token
        # and is trained to predict the target shifted by one position.
        tgt_input = tgt_tokens[:-1, :]
        tgt_out = tgt_tokens[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src_tokens, tgt_input)

        # Single forward pass; the model requires all mask arguments.
        logits = model(src_tokens, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

        # Flatten logits to (tokens, vocab) and targets to (tokens,) for the loss.
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

        # Accumulate scaled gradients; do NOT reset or step on every batch,
        # otherwise nothing is accumulated.
        (loss / accumulation_steps).backward()

        if index % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if (index % 1000 == 0):
            print("Current index:", index)

    # Apply the gradients of a trailing, incomplete accumulation window.
    if index % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()

    return losses / len(train_dataloader)


def evaluate(model):
    """Compute the mean per-batch loss on the validation split.

    The validation split is ``dataset[train_index:train_index + val_index]``.
    The model is put in eval mode and the whole pass runs under
    ``torch.no_grad()`` so no autograd graph is built or kept in memory.

    Args:
        model: the seq2seq model, called exactly as in training.

    Returns:
        Sum of batch losses divided by the number of validation batches.
    """
    model.eval()
    losses = 0

    val_iter = dataset[train_index:train_index+val_index]
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE)

    with torch.no_grad():
        for src, tgt in val_dataloader:
            src_tokens = get_input_ids(src, token_transform[SRC_LANGUAGE])
            tgt_tokens = get_input_ids(tgt, token_transform[TGT_LANGUAGE])

            src_tokens = pad_sequence(src_tokens, padding_value=PAD_IDX)
            tgt_tokens = pad_sequence(tgt_tokens, padding_value=PAD_IDX)

            src_tokens = src_tokens.to(DEVICE)
            tgt_tokens = tgt_tokens.to(DEVICE)

            # Teacher forcing: decoder input is the target without its last
            # token; the expected output is the target shifted by one.
            tgt_input = tgt_tokens[:-1, :]
            tgt_out = tgt_tokens[1:, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src_tokens, tgt_input)

            logits = model(src_tokens, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()

    return losses / len(val_dataloader)

from timeit import default_timer as timer
NUM_EPOCHS = 2
# Per-epoch loss history.
train_record, val_record = [], []

print("Start training")
for epoch in range(1, NUM_EPOCHS + 1):
    # Only the training pass is timed; validation happens after the clock stops.
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()

    val_loss = evaluate(transformer)

    train_record.append(train_loss)
    val_record.append(val_loss)
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s")

    # Overwrite the checkpoint with the current weights after every epoch.
    torch.save(transformer.state_dict(), '/kaggle/working/zh_en_checkpoints')

In [None]:
# Write the model weights (state_dict) to the same path the training loop uses,
# so this overwrites the last per-epoch checkpoint with the current weights.
torch.save(transformer.state_dict(), '/kaggle/working/zh_en_checkpoints') # save your trained model

In [None]:
# Token id used as the first decoder input (passed as `start_symbol` by translate()).
# NOTE(review): presumably the [CLS] id of the target tokenizer — confirm
# against token_transform[TGT_LANGUAGE].cls_token_id.
BOS_IDX = 101

# Token id that stops decoding as soon as it is generated.
# NOTE(review): presumably the [SEP] id of the target tokenizer — confirm
# against token_transform[TGT_LANGUAGE].sep_token_id.
EOS_IDX = 102

# translate() allows the output to be at most (source length + MAX_LEN_TGT) tokens.
MAX_LEN_TGT = 64

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def bleu_eval(model, tokenizer, test_iter, max_samples=1000):
    """Translate the first ``max_samples`` test pairs and report corpus-level BLEU.

    ``corpus_bleu`` expects token sequences: one list of tokens per hypothesis
    and, for each hypothesis, a list of reference token lists. Passing raw
    strings makes it score individual characters, so both sides are tokenized
    here first.

    Args:
        model: trained model handed to ``translate``.
        tokenizer: object with a ``tokenize(text) -> list[str]`` method, used
            to split both references and hypotheses. When ``None``, sentences
            are split on whitespace instead.
        test_iter: sliceable sequence of ``(source, reference)`` sentence pairs.
        max_samples: number of leading pairs to evaluate.

    Returns:
        The BLEU score, or ``None`` when there is nothing to evaluate.
    """
    samples = test_iter[:max_samples]
    if not samples:
        print("BLEU evaluation skipped: no test samples.")
        return None

    def to_tokens(text):
        # The same tokenization is used for references and hypotheses so that n-grams are comparable.
        return tokenizer.tokenize(text) if tokenizer is not None else text.split()

    output = [translate(model, pair[0]) for pair in samples]
    true_output = [pair[1] for pair in samples]

    hypotheses = [to_tokens(sentence) for sentence in output]
    # Exactly one reference translation per hypothesis.
    references = [[to_tokens(sentence)] for sentence in true_output]
    bleu_score = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

    print("-------------------------------------------------------------")
    print("Input.     :", [samples[0][0]])
    print("True output:", true_output[0])
    print("Output     :", output[0])
    print("BLEU Score = ", bleu_score)
    print("-------------------------------------------------------------")
    return bleu_score

In [None]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate an output sequence by picking the highest-scoring token at every step.

    Decoding runs under ``torch.no_grad()`` so no autograd graph is built while
    the sequence grows.

    Args:
        model: object exposing ``encode``, ``decode`` and ``generator``.
        src: source token ids, used as a single sequence-first column.
        src_mask: attention mask for the encoder.
        max_len: maximum length of the returned sequence, start token included.
        start_symbol: token id placed at position 0.

    Returns:
        Long tensor of shape ``(length, 1)`` on ``DEVICE``. Generation stops
        early once ``EOS_IDX`` has been appended.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    with torch.no_grad():
        # The encoder output does not change between steps, so compute it once.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)
        for _ in range(max_len-1):
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Only the last position predicts the next token.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            next_token = torch.full((1, 1), next_word, dtype=src.dtype, device=src.device)
            ys = torch.cat([ys, next_token], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

def sample_decode(model, src, src_mask, max_len, start_symbol, temperature=1.2):
    """Generate an output sequence by sampling each token from the softmax distribution.

    Args:
        model: object exposing ``encode``, ``decode`` and ``generator``.
        src: source token ids, used as a single sequence-first column.
        src_mask: attention mask for the encoder.
        max_len: maximum length of the returned sequence, start token included.
        start_symbol: token id placed at position 0.
        temperature: positive divisor applied to the logits before the softmax;
            larger values flatten the distribution.

    Returns:
        Long tensor of shape ``(length, 1)`` on ``DEVICE``. Generation stops
        early once ``EOS_IDX`` has been sampled.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    with torch.no_grad():
        # The encoder output does not change between steps, so compute it once.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)
        for _ in range(max_len-1):
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Only the last position predicts the next token.
            prob = model.generator(out[:, -1])
            # Normalise over the vocabulary axis explicitly; relying on the
            # implicit softmax dimension is deprecated.
            p = torch.nn.functional.softmax(prob / temperature, dim=-1)
            next_word = torch.multinomial(p.flatten(), 1).item()

            next_token = torch.full((1, 1), next_word, dtype=src.dtype, device=src.device)
            ys = torch.cat([ys, next_token], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    """Translate one source sentence with greedy decoding and return the decoded text.

    The sentence is tokenized with the source tokenizer, decoded for at most
    ``len(source tokens) + MAX_LEN_TGT`` steps starting from ``BOS_IDX``, and
    turned back into text with the target tokenizer. The literal ``[CLS]`` and
    ``[SEP]`` markers are removed and surrounding whitespace is stripped.
    The model is switched to eval mode and left there.
    """
    model.eval()

    src_ids = token_transform[SRC_LANGUAGE](src_sentence)['input_ids']
    # Column vector: one source token per row.
    src = torch.tensor(src_ids).view(-1, 1)
    num_tokens = src.shape[0]
    # All-False mask: the encoder may attend to every source position.
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)

    tgt_tokens = greedy_decode(
        model,
        src,
        src_mask,
        max_len=num_tokens + MAX_LEN_TGT,
        start_symbol=BOS_IDX,
    ).flatten()

    text = token_transform[TGT_LANGUAGE].decode(tgt_tokens)
    for marker in ("[CLS]", "[SEP]"):
        text = text.replace(marker, "")
    return text.strip()
# Quick check: translate one sentence; the returned string is shown as the cell output.
translate(transformer,'但这并没有对PD起到帮助作用。')

In [None]:
# Everything after the training and validation slices is the held-out test split.
test_iter = dataset[train_index+val_index:]
# Translate the leading test pairs and print a sample together with the BLEU score.
bleu_eval(transformer,token_transform[TGT_LANGUAGE], test_iter)

In [None]:
# Translate a longer example sentence; the returned string is shown as the cell output.
translate(transformer,'开展科学研究主要是为了改善提高我们的生活，但是，这也是一项产业，代表了来自政府和企业界的巨额投资。')

In [None]:
# Translate an example sentence containing numbers; the returned string is shown as the cell output.
translate(transformer,'花费两千亿美元可以防止几十万人死亡，产生的效益高于成本25倍。')