In [1]:
import torch
import torch.nn as nn
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field,BucketIterator

In [2]:
from translation_utils import translate_sentence,bleu,save_checkpoint,load_checkpoint

In [3]:
import de_core_news_sm
nlp_de = de_core_news_sm.load()

In [4]:
import en_core_web_sm
nlp_en = en_core_web_sm.load()

In [5]:
def tokenize_german(text):
    return [tok.text for tok in nlp_de.tokenizer(text)]

def tokenize_german(text):
    return [tok.text for tok in nlp_en.tokenizer(text)]

In [6]:
german=Field(tokenize=tokenize_german,lower=True,init_token="<sos>",eos_token="<eos>")
english=Field(tokenize=tokenize_german,lower=True,init_token="<sos>",eos_token="<eos>")

In [7]:
train_data,valid_data,test_data=Multi30k.splits(
exts=(".de",".en"),fields=(german,english)
)

In [8]:
german.build_vocab(train_data,max_size=10000,min_freq=2)
english.build_vocab(train_data,max_size=10000,min_freq=2)

In [9]:
class Transformer(nn.Module):
    def __init__(
    self,
    embedding_size,
    source_vocab_size,
    target_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device
    ):
        super(Transformer,self).__init__()
        self.src_word_embedding=nn.Embedding(source_vocab_size,embedding_size)
        self.src_position_embedding=nn.Embedding(max_len,embedding_size)
        self.target_word_embedding=nn.Embedding(target_vocab_size,embedding_size)
        self.target_position_embedding=nn.Embedding(max_len,embedding_size)
        
        self.device=device
        
        self.transformer=nn.Transformer(
        embedding_size,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        )
        
        self.fc_out=nn.Linear(embedding_size,target_vocab_size)
        self.dropout=nn.Dropout(dropout)
        self.source_pad_idx=src_pad_idx
        
    def make_src_mask(self,src):
        #src shape = src_len,n  transformer takes opposite fo this n,src_len
        src_mask=src.transpose(0,1)==self.source_pad_idx
        return src_mask
    
    
    def forward(self,src,target):
        src_seq_length,N=src.shape
        target_seq_length,N=target.shape
        
        src_positions=(torch.arange(0,src_seq_length)
                       .unsqueeze(1)
                       .expand(src_seq_length,N)
                       .to(self.device)
                      )
        
        target_positions=(torch.arange(0,target_seq_length)
                          .unsqueeze(1)
                          .expand(target_seq_length,N)
                          .to(self.device)
                         )
        
        
        embed_src=self.dropout((self.src_word_embedding(src)+self.src_position_embedding(src_positions)))
        
        embed_target=self.dropout((self.src_word_embedding(target)+self.target_position_embedding(target_positions)))
        
        src_padding_mask=self.make_src_mask(src)
        target_mask=self.transformer.generate_square_subsequent_mask(target_seq_length).to(self.device)
        
        out=self.transformer(
        embed_src,
        embed_target,
        src_key_padding_mask=src_padding_mask,
        tgt_mask=target_mask
        )
        
        return out

In [10]:
#training phase
if torch.cuda.is_available():
    device="cuda"
else:
    device="cpu"

print(device)

cuda


In [11]:
load_model=False
save_model=True


num_epochs=5
learning_rate=3e-4
batch_size=32
embedding_size = 512
src_vocab_size=len(german.vocab)
target_vocab_size=len(english.vocab)

num_heads=8
num_encoder_layers=3
num_decoder_layers=3

dropout=0.1

max_length=100

forward_expansion=4

src_pad_idx=english.vocab.stoi["<pad>"]


In [12]:
#tensorboard for nice plots

In [13]:
writer=SummaryWriter("runs/loss_plot")
step=8

In [14]:
train_iterator,valid_iterator,test_iterator=BucketIterator.splits(
(train_data,valid_data,test_data),
batch_size=batch_size,
sort_within_batch=True,
sort_key=lambda x:len(x.src),
device=device,)

In [15]:
model=Transformer(embedding_size,
                 src_vocab_size,
                 target_vocab_size,
                 src_pad_idx,
                 num_heads,
                 num_encoder_layers,
                 num_decoder_layers,
                 forward_expansion,
                 dropout,
                 max_length,
                 device,).to(device)

In [17]:
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

pad_idx=english.vocab.stoi["<pad>"]

criterion=nn.CrossEntropyLoss(ignore_index=pad_idx)

if(load_model):
    load_checkpoint(torch.load("my_checkpoint_pth.tar"),model,optimizer)
    
sentence="ein pferd geht unter einer brücke neben einem boot."

In [18]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

In [19]:
for epoch in range(num_epochs):
    print(f'Epoch {epoch}/{num_epochs} ')

    if(save_model):
        checkpoint={
            "state_dict":model.state_dict(),
            "optimizer":optimizer.state_dict(),
        }


    model.eval()
    translated_sentence=translate_sentence(model,sentence,german,english,device,max_length=100)
    
    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()
    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop
        output = model(inp_data, target[:-1, :])
        
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        
        optimizer.zero_grad()

        loss = criterion(output, target)
        losses.append(loss.item())

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1
    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

Epoch 0/5 
here
Translated example sentence: 
 ['construction', 'short', 'colorful', 'head', 'camera', 'parade', 'camera', 'parade', 'camera', 'construction', 'track', 'head', 'camera', 'parade', 'camera', 'head', 'head', 'large', 'head', 'camera', 'parade', 'camera', 'camera', 'parade', 'camera', 'parade', 'camera', 'track', 'head', 'head', 'head', 'camera', 'parade', 'camera', 'parade', 'camera', 'head', 'camera', 'parade', 'camera', 'parade', 'camera', 'parade', 'camera', 'baby', 'sweater', 'rock', 'path', 'camera', 'colorful', 'track', 'head', 'camera', 'head', 'runs', 'sweater', 'head', 'large', 'sweater', 'vest', 'colorful', 'head', 'camera', 'parade', 'camera', 'parade', 'camera', 'parade', 'camera', 'parade', 'camera', 'parade', 'camera', 'parade', 'racing', 'head', 'camera', 'construction', 'track', 'head', 'camera', 'baby', 'construction', 'camera', 'parade', 'camera', 'sweater', 'vest', 'short', 'sweater', 'parade', 'camera', 'parade', 'camera', 'working', 'tent', 'edge', 'c

In [20]:
# running on entire test data takes a while
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score * 100:.2f}")


here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
Bleu score 26.66
