<a href="https://colab.research.google.com/github/supersophieminikittybabycakes/TorchText-Translator-Final-ML-Project-/blob/master/Copy_of_New_Translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Copied from this source:
https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html and Sophie's tutorial.

In [0]:
import random
from typing import Tuple

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

# Encoder is the encoding neural network class. It embeds the source tokens and runs them through a
# bidirectional GRU. It avoids the information compression that plain seq2seq models have by returning
# the GRU output for every source position, so the decoder can examine the entire source sentence at each
# decoding step by using attention. It also returns one hidden state, built from the final forward and
# backward hidden states, that is used as the decoder's initial hidden state.
# The attention vector itself (one weight per source position, each between 0 and 1, summing to 1) is
# computed by the Attention module, and the Decoder uses it to form a weighted source vector.
class Encoder(nn.Module):
    """Bidirectional GRU encoder for the source sentence.

    Token ids are embedded, passed through dropout and fed to a bidirectional
    GRU. The final forward and backward hidden states are concatenated and
    projected through a tanh-activated linear layer to ``dec_hid_dim``.
    """

    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float):
        """Build the embedding, GRU, projection and dropout layers.

        Args:
            input_dim: Number of rows in the embedding table (source vocab size).
            emb_dim: Embedding size.
            enc_hid_dim: Hidden size of each GRU direction.
            dec_hid_dim: Size of the projected hidden state returned by ``forward``.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)

        # Both directions are concatenated, hence the factor of two.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        # ``self.dropout`` is the dropout *module*; the raw probability is not
        # stored separately (it used to be assigned first and then overwritten).
        self.dropout = nn.Dropout(dropout)

    def forward(self,
                src: Tensor) -> Tuple[Tensor, Tensor]:
        """Encode a batch of source sentences.

        Args:
            src: Token ids laid out as (src_len, batch), since the GRU is not
                created with ``batch_first``.

        Returns:
            ``(outputs, hidden)``: ``outputs`` is the GRU output for every
            source position, (src_len, batch, 2 * enc_hid_dim); ``hidden`` is
            the projected final state, (batch, dec_hid_dim).
        """
        embedded = self.dropout(self.embedding(src))

        outputs, hidden = self.rnn(embedded)

        # hidden[-2] / hidden[-1] are the last forward / backward hidden states.
        last_states = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(last_states))

        return outputs, hidden

# Attention calculates an attention vector whose length equals the length of the source sentence.
# Each element of the attention vector is between 0 and 1, and the entire vector sums to 1.
# The Decoder then uses it to calculate a weighted sum of the source sentence hidden states, giving a weighted source vector.

class Attention(nn.Module):
    """Scores every source position against the current decoder state.

    The decoder hidden state is paired with each encoder output, passed
    through a tanh-activated linear layer, summed over the attention
    dimension and normalised with a softmax over the source positions.
    """

    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        """Create the scoring layer.

        Args:
            enc_hid_dim: Hidden size of one encoder direction.
            dec_hid_dim: Hidden size of the decoder.
            attn_dim: Output size of the scoring layer.
        """
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # Width of [decoder hidden ; bidirectional encoder output].
        self.attn_in = 2 * enc_hid_dim + dec_hid_dim

        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tensor:
        """Return attention weights of shape (batch, src_len).

        ``encoder_outputs`` is indexed as (src_len, batch, features) and
        ``decoder_hidden`` as (batch, dec_hid_dim).
        """
        n_positions = encoder_outputs.shape[0]

        # One copy of the decoder state per source position: (batch, src_len, dec_hid_dim).
        tiled_hidden = decoder_hidden.unsqueeze(1).repeat(1, n_positions, 1)

        # Put the batch dimension first so both tensors line up.
        batch_major_outputs = encoder_outputs.permute(1, 0, 2)

        scorer_input = torch.cat((tiled_hidden, batch_major_outputs), dim=2)
        energy = torch.tanh(self.attn(scorer_input))

        # Collapse the attention dimension into one score per source position.
        scores = energy.sum(dim=2)

        return F.softmax(scores, dim=1)

# The model has the architecture of an encoding network, a decoding network, and an attention network.
# Essentially, encoding is reading and comprehending the source sentence, and decoding is producing
# the sentence in the other language one token at a time.
# The Decoder uses the attention network.

class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Each call to ``forward`` consumes one target token per sentence and
    produces scores over the target vocabulary plus the new hidden state.
    """

    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float,
                 attention: nn.Module):
        """Build the embedding, GRU, output projection and dropout layers.

        Args:
            output_dim: Number of rows in the embedding table and size of the
                output scores (target vocab size).
            emb_dim: Embedding size.
            enc_hid_dim: Hidden size of one encoder direction.
            dec_hid_dim: Hidden size of the decoder GRU.
            dropout: Dropout probability applied to the embeddings.
            attention: Module called as ``attention(decoder_hidden,
                encoder_outputs)``; must expose an ``attn_in`` attribute.
        """
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # The GRU sees the token embedding next to the attended encoder vector.
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        # Input is [GRU output ; attended encoder vector ; embedding].
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)

        # ``self.dropout`` is the dropout *module*; the raw probability is not
        # stored separately (it used to be assigned first and then overwritten).
        self.dropout = nn.Dropout(dropout)

    def _weighted_encoder_rep(self,
                              decoder_hidden: Tensor,
                              encoder_outputs: Tensor) -> Tensor:
        """Return the attention-weighted sum of the encoder outputs.

        The result is laid out as (1, batch, features) so it can be
        concatenated with the embedded token along the last dimension.
        """
        a = self.attention(decoder_hidden, encoder_outputs)

        # (batch, src_len) -> (batch, 1, src_len) for the batched matmul.
        a = a.unsqueeze(1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        weighted_encoder_rep = torch.bmm(a, encoder_outputs)

        # Back to sequence-first layout.
        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)

        return weighted_encoder_rep

    def forward(self,
                input: Tensor,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tuple[Tensor, Tensor]:
        """Decode one time step.

        Args:
            input: One token id per sentence, shape (batch,).
            decoder_hidden: Previous hidden state, shape (batch, dec_hid_dim).
            encoder_outputs: Encoder outputs indexed as (src_len, batch, features).

        Returns:
            ``(output, decoder_hidden)``: vocabulary scores of shape
            (batch, output_dim) and the new hidden state (batch, dec_hid_dim).
        """
        # Add a length-1 sequence dimension for the GRU.
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden,
                                                          encoder_outputs)

        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim=2)

        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        # Drop the length-1 sequence dimension again.
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)

        output = self.out(torch.cat((output,
                                     weighted_encoder_rep,
                                     embedded), dim=1))

        return output, decoder_hidden.squeeze(0)

# A sequence-to-sequence network consisting of two recurrent neural networks: the encoder and the decoder.
# The encoder reads an input sequence and outputs its per-position states plus a single summary vector, and the
# decoder reads those to produce an output sequence, one token at a time.

class Seq2Seq(nn.Module):
    """Wraps an encoder and a decoder and unrolls the decoder over the target."""

    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        """Keep references to the two sub-networks and the device for the output buffer."""
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: Tensor,
                trg: Tensor,
                teacher_forcing_ratio: float = 0.5) -> Tensor:
        """Run the encoder once, then the decoder for every target position.

        Args:
            src: Source token ids; ``src.shape[1]`` is taken as the batch size.
            trg: Target token ids; ``trg.shape[0]`` fixes the number of
                decoding steps and ``trg[0]`` is the first decoder input.
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape (trg_len, batch, decoder.output_dim). Row 0 is
            never written and stays all zeros.
        """
        n_steps = trg.shape[0]
        n_sentences = src.shape[1]
        vocab_size = self.decoder.output_dim

        step_scores = torch.zeros(n_steps, n_sentences, vocab_size).to(self.device)

        encoder_outputs, state = self.encoder(src)

        # The first input to the decoder is the <sos> token.
        next_input = trg[0, :]

        for step in range(1, n_steps):
            scores, state = self.decoder(next_input, state, encoder_outputs)
            step_scores[step] = scores

            # Decide per step whether to feed the truth or the greedy prediction.
            use_truth = random.random() < teacher_forcing_ratio
            greedy_choice = scores.max(dim=1).indices
            if use_truth:
                next_input = trg[step]
            else:
                next_input = greedy_choice

        return step_scores


In [0]:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

# Fields for German (source language, SRC) and English (target, TRG).
# A Field also holds how the text is tokenized and converted to a tensor:
# here spaCy tokenization, <sos>/<eos> markers around each sentence, and lowercasing.
# NOTE: tokenizer_language ("de" / "en") is deliberately not passed, so both
# fields use the library's default spaCy language.

SRC = Field(lower=True,
            tokenize="spacy",
            init_token='<sos>',
            eos_token='<eos>')

TRG = Field(lower=True,
            tokenize="spacy",
            init_token='<sos>',
            eos_token='<eos>')


In [0]:
# Load Multi30k as train / validation / test splits. The '.de' files are
# read through SRC (German) and the '.en' files through TRG (English).

train_data, valid_data, test_data = Multi30k.splits(
    fields=(SRC, TRG),
    exts=('.de', '.en'),
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 4.10MB/s]
validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 1.26MB/s]

downloading validation.tar.gz
downloading mmt_task1_test2016.tar.gz



mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 1.29MB/s]


In [0]:
# Construct the Vocab object of each Field, i.e. collect the words of both
# languages from the training split. min_freq=2 is the minimum token
# frequency handed to build_vocab.

for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq=2)

In [0]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

BATCH_SIZE = 128

# Split each dataset into batches of BATCH_SIZE sentences, placed on `device`.
# BucketIterator is meant to batch sentences of similar length together.

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
)

In [0]:
# Vocabulary sizes drive the embedding tables and the output layer.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

# Larger configuration kept for reference (currently disabled):
# ENC_EMB_DIM = 256
# DEC_EMB_DIM = 256
# ENC_HID_DIM = 512
# DEC_HID_DIM = 512
# ATTN_DIM = 64
# ENC_DROPOUT = 0.5
# DEC_DROPOUT = 0.5

# Small configuration actually used.
ENC_EMB_DIM = 32
DEC_EMB_DIM = 32
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ATTN_DIM = 8
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Build the three sub-networks, then wrap them and move everything to `device`.
enc = Encoder(input_dim=INPUT_DIM,
              emb_dim=ENC_EMB_DIM,
              enc_hid_dim=ENC_HID_DIM,
              dec_hid_dim=DEC_HID_DIM,
              dropout=ENC_DROPOUT)

attn = Attention(enc_hid_dim=ENC_HID_DIM,
                 dec_hid_dim=DEC_HID_DIM,
                 attn_dim=ATTN_DIM)

dec = Decoder(output_dim=OUTPUT_DIM,
              emb_dim=DEC_EMB_DIM,
              enc_hid_dim=ENC_HID_DIM,
              dec_hid_dim=DEC_HID_DIM,
              dropout=DEC_DROPOUT,
              attention=attn)

model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)


def init_weights(m: nn.Module):
    """Re-initialise every parameter of ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution (mean 0, std 0.01); all other parameters are set to 0.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)


# Apply the custom initialisation to the model and each of its sub-modules.
model.apply(fn=init_weights)

# Adam with its default hyper-parameters over all model parameters.
optimizer = optim.Adam(params=model.parameters())


def count_parameters(model: nn.Module):
    """Return the total number of elements in the parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total


# Report the model size with thousands separators.
trainable_total = count_parameters(model)
print(f'The model has {trainable_total:,} trainable parameters')

The model has 1,857,261 trainable parameters


In [0]:
# Index of the padding token in the target vocabulary.
PAD_IDX = TRG.vocab.stoi['<pad>']
# print(PAD_IDX)

# Cross-entropy over the target vocabulary; positions whose target is the
# padding index are ignored by the loss.
criterion = nn.CrossEntropyLoss(
    ignore_index=PAD_IDX,
)

*** Start reading new stuff here ***
To speed up the process you can temporarily run only one epoch (set N_EPOCHS = 1 in the training cell below). Change it back to at least 10 when you are done.

In [0]:
import math
import time

# Train the model for one pass over `iterator`.
def train(model: nn.Module,
          iterator: BucketIterator,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    """Run one training epoch and return the mean loss per batch.

    For every batch the gradients are reset, the model is run on
    ``batch.src`` / ``batch.trg``, the loss is back-propagated, the gradient
    norm is clipped to ``clip`` and the optimizer takes one step.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        scores = model(src, trg)

        # Skip position 0 (never predicted) and flatten time and batch
        # together so the criterion sees one row per target token.
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

# Compute the average loss over `iterator` without updating the model.
def evaluate(model: nn.Module,
             iterator: BucketIterator,
             criterion: nn.Module):
    """Return the mean loss per batch with the model in eval mode.

    Gradients are disabled and teacher forcing is turned off (ratio 0), so
    the decoder is always fed its own previous prediction.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src = batch.src
            trg = batch.trg

            scores = model(src, trg, 0)  # turn off teacher forcing

            # Skip position 0 (never predicted) and flatten time and batch.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

# Work out how long an epoch took.
def epoch_time(start_time: float,
               end_time: float) -> Tuple[int, int]:
    """Split the elapsed time between two timestamps into minutes and seconds.

    Args:
        start_time: Start timestamp in seconds (e.g. from ``time.time()``).
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as integers; fractional seconds are truncated.
    """
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [0]:
N_EPOCHS = 25
CLIP = 1

# Lowest validation loss observed so far; refreshed after every epoch.
best_valid_loss = float('inf')

# Run the model, evaluating and printing the losses after every epoch.
for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Previously this value was initialised but never updated.
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

# Final check on the held-out test split.
test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Epoch: 01 | Time: 0m 28s
	Train Loss: 5.680 | Train PPL: 292.828
	 Val. Loss: 5.254 |  Val. PPL: 191.319
Epoch: 02 | Time: 0m 28s
	Train Loss: 5.021 | Train PPL: 151.525
	 Val. Loss: 5.082 |  Val. PPL: 161.035
Epoch: 03 | Time: 0m 28s
	Train Loss: 4.718 | Train PPL: 111.971
	 Val. Loss: 4.978 |  Val. PPL: 145.195
Epoch: 04 | Time: 0m 28s
	Train Loss: 4.537 | Train PPL:  93.449
	 Val. Loss: 4.853 |  Val. PPL: 128.090
Epoch: 05 | Time: 0m 28s
	Train Loss: 4.398 | Train PPL:  81.301
	 Val. Loss: 4.819 |  Val. PPL: 123.890
Epoch: 06 | Time: 0m 28s
	Train Loss: 4.314 | Train PPL:  74.742
	 Val. Loss: 4.750 |  Val. PPL: 115.618
Epoch: 07 | Time: 0m 28s
	Train Loss: 4.222 | Train PPL:  68.142
	 Val. Loss: 4.653 |  Val. PPL: 104.918
Epoch: 08 | Time: 0m 28s
	Train Loss: 4.128 | Train PPL:  62.033
	 Val. Loss: 4.538 |  Val. PPL:  93.543
Epoch: 09 | Time: 0m 28s
	Train Loss: 4.015 | Train PPL:  55.423
	 Val. Loss: 4.472 |  Val. PPL:  87.544
Epoch: 10 | Time: 0m 28s
	Train Loss: 3.915 | Train PPL

How do we convert a tensor back to a sentence? Let's first define this helper function.

In [0]:
#Defining function to convert tensor into sentence
import numpy as np
def word_ids_to_sentence(id_tensor, vocab, join=None):
    """Convert a sequence of word ids to a sentence.

    Args:
        id_tensor: Word ids as a ``torch.Tensor`` (any device, any integer
            dtype) or a ``numpy.ndarray``. A 2-D input is read as
            (seq_len, batch) and transposed, so the words of each sentence
            stay together in the flattened result.
        vocab: Object whose ``itos`` sequence maps an id to its word.
        join: Separator used to join the words; ``None`` returns the list.

    Returns:
        A list of words, or one string when ``join`` is given.

    Raises:
        TypeError: If ``id_tensor`` is neither a tensor nor an ndarray.
            (Such inputs used to fail with an unrelated UnboundLocalError.)
    """
    if torch.is_tensor(id_tensor):
        # Covers CUDA tensors and non-int64 dtypes, which the old
        # isinstance(..., torch.LongTensor) check rejected.
        ids = id_tensor.detach().cpu()
        if ids.dim() >= 2:
            ids = ids.transpose(0, 1)
        ids = ids.reshape(-1).tolist()
    elif isinstance(id_tensor, np.ndarray):
        ids = id_tensor.transpose().reshape(-1).tolist()
    else:
        raise TypeError(
            'id_tensor must be a torch.Tensor or a numpy.ndarray, got '
            f'{type(id_tensor).__name__}')

    batch = [vocab.itos[ind] for ind in ids]  # denumericalize
    if join is None:
        return batch
    return join.join(batch)


We can translate an English sentence to a vector and back to a sentence


In [0]:
# Translating an English sentence into a vector and
# translating the vector back into an English sentence.
# Example and Dataset are imported here so this cell also works when the
# notebook is run from top to bottom.
from torchtext.data import Dataset, Example

print('original: ', 'a man in an orange hat staring at something.')
# Named `example` rather than `train`, so the train() function defined
# earlier is not overwritten.
example = Example.fromlist(
    data=['a man in an orange hat staring at something.'],
    fields=[('trg', TRG)])
custom_train_data = Dataset([example], fields=[('trg', TRG)])
train_iter = BucketIterator(
    dataset=custom_train_data, batch_size=1)
# The dataset holds a single example, so the first batch is that sentence.
s = next(iter(train_iter)).trg.to(device)

word_ids_to_sentence(s.cpu().data.numpy(), TRG.vocab, join=' ')

original:  a man in an orange hat staring at something.


'<sos> a man in an orange hat staring at something . <eos>'

And the same with German

In [0]:
# Translating a German sentence into a vector and
# translating the vector back into a German sentence.
# Example and Dataset are imported here so this cell also works when the
# notebook is run from top to bottom.
from torchtext.data import Dataset, Example

print('original', 'ein Mann in einem orangefarbenen Hut mit einem Blick auf etwas.')
# Named `example` rather than `train`, so the train() function defined
# earlier is not overwritten.
example = Example.fromlist(
    data=['ein Mann in einem orangefarbenen Hut mit einem Blick auf etwas.'],
    fields=[('src', SRC)])
custom_train_data = Dataset([example], fields=[('src', SRC)])
train_iter = BucketIterator(
    dataset=custom_train_data, batch_size=1)
# The dataset holds a single example, so the first batch is that sentence.
s = next(iter(train_iter)).src.to(device)

word_ids_to_sentence(s.cpu().data.numpy(), SRC.vocab, join=' ')

original ein Mann in einem orangefarbenen Hut mit einem Blick auf etwas.


'<sos> ein mann in einem orangefarbenen hut mit einem blick auf etwas . <eos>'

In [0]:
def check_model(src_tensor: Tensor,
                trg_tensor: Tensor):
    """Translate ``src_tensor`` with the global ``model`` and print the result.

    Prints three lines: the source sentence(s), the target sentence(s) and
    the model's greedy prediction, each converted back to words.

    Args:
        src_tensor: Source token ids in the layout the model expects.
        trg_tensor: Target token ids. Its first row is the first decoder
            input and its length fixes how many tokens are generated.

    The model runs in eval mode, without gradients and with teacher forcing
    turned off, so the printed result is the model's own translation and is
    not steered by the words in ``trg_tensor``. The previous training/eval
    mode is restored afterwards.
    """
    # print('src tensor:',src_tensor)
    # print('trg tensor:',trg_tensor)

    # Put the two tensors through the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            res_tensor = model(src_tensor, trg_tensor, 0)  # no teacher forcing
    finally:
        model.train(was_training)

    # print('res tensor:',res_tensor)

    src_arrs = src_tensor.detach().cpu().numpy()
    print('src:', word_ids_to_sentence(src_arrs, SRC.vocab, join=' '))

    trg_arrs = trg_tensor.detach().cpu().numpy()
    print('trg:', word_ids_to_sentence(trg_arrs, TRG.vocab, join=' '))

    # Pick the highest-scoring word at every position. Position 0 is never
    # written by the model (all zeros), so its argmax is always index 0.
    res_arrs = res_tensor.cpu().numpy()
    res_arrs = np.argmax(res_arrs, axis=2)
    print('res:', word_ids_to_sentence(res_arrs, TRG.vocab, join=' '))

    # We were initially unable to successfully translate from one language to another.
    # We determined this error was because our target tensor was just a [0] vector.
    # With a [0] target tensor, the model cannot turn the source tensor input
    # into the expected output, because the target tensor determines how many
    # output positions the model produces.

The key issue that we fixed was the trg tensor. Instead of using a [0] tensor, we needed a target tensor that is at least as long as the source tensor, because the model produces one output position per row of the target tensor. The model then just fills in 0's/padding for the unused space in the translation. Using the target tensor of a batch works for the example sentences because that target tensor is already big enough.

In [0]:
# Pull the first batch of sentences from the training iterator.
batch = next(iter(train_iterator))

In [0]:
# Take the source and target tensors of the batch and run them through the model.
src_tensor, trg_tensor = batch.src, batch.trg

check_model(src_tensor=src_tensor, trg_tensor=trg_tensor)

src: <sos> ein mann mit jeans , langärmeligem kapuzenpulli und rucksack geht durch eine wüste . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <sos> ein roter schlepper legt gleich am hafen an . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <sos> eine dunkelhäutige alte frau mit grauem haar hält sich an einer gelben blume fest . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <sos> ein mädchen lächelt außerhalb eines ladens . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <sos> eine lächelnde frau spielt geige vor einem türkisfarbenen hintergrund . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <sos> menschen stehen am strand , einer davon unter einem grünen schirm . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <sos> ein junge läuft vor wellen hinter ihm mit einem netz du

Let's see if we can create examples from scratch. We need to turn the sentence into a dataset, and use the actual translation both as a comparison and to produce a suitable target tensor. Observation: a target string of (number of source words) - 1 words gives an optimal number of words in the result.


In [0]:
# Creating examples, turning sentence into the dataset

from torchtext.data import Example
from torchtext.data import Dataset

# Raw text for a hand-made example. The target is only a string of
# placeholder words (instead of just a [0]); it supplies a target tensor
# of a suitable length.
src_text = 'zwei junge weiße männer sind i m freien in der nähe vieler büsche '
trg_text = 'the the the the the the the the the the the the the the the the the'

# Wrap the source sentence in an Example and a one-item Dataset ...
src_data = Example.fromlist(data=[src_text], fields=[('src', SRC)])
src_dataset = Dataset([src_data], fields=[('src', SRC)])

# ... and do the same for the target sentence.
trg_data = Example.fromlist(data=[trg_text], fields=[('trg', TRG)])
trg_dataset = Dataset([trg_data], fields=[('trg', TRG)])

And make a bucket. Then we'll be able to view the vector form of this sentence that we created from scratch:

In [0]:
# Making a bucket with BucketIterator to view vector form of sentence


from torchtext.data import BucketIterator

# Each dataset holds one sentence, so the first batch of its iterator is
# that sentence in tensor form; it is then moved to `device`.
src_dataset_iter = BucketIterator(dataset=src_dataset, batch_size=1)
src_batch = next(iter(src_dataset_iter))
src_tensor = src_batch.src.to(device)
# print('src tensor:',src_tensor)

trg_dataset_iter = BucketIterator(dataset=trg_dataset, batch_size=1)
trg_batch = next(iter(trg_dataset_iter))
trg_tensor = trg_batch.trg.to(device)
# print('trg tensor:',trg_tensor)


Then we can apply the model to predict the English translation vector.

In [0]:
# Put both tensors through the model
# to get an actual sentence as output.
check_model(src_tensor=src_tensor, trg_tensor=trg_tensor)

src: <sos> zwei junge weiße männer sind i m freien in der nähe vieler büsche <eos>
trg: <sos> the the the the the the the the the the the the the the the the the <eos>
res: <unk> two young two white men are outside outside outside outside . <eos> . <eos> . <eos> . .
