In [17]:
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from torchtext.vocab import vocab
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch.optim as optim
import random
import time
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
# One seed value shared by every random number generator seeded in this cell.
SEED = 1234

# Seed Python's built-in RNG first, then PyTorch's default generator. The torch
# call stays last so the cell keeps echoing the Generator object it returns.
random.seed(a=SEED)
torch.manual_seed(seed=SEED)

<torch._C.Generator at 0x10454bc10>

In [5]:
# Run-wide settings: compute device, cap on the number of sentence pairs
# loaded into the dataset, and mini-batch size used by the data loaders.
device = torch.device("cpu")
max_lines = 10_000
batch_size = 8

In [6]:
# Language codes of the (source, target) pair; they are also interpolated into
# the corpus file name 'data/<lang1>-<lang2>.txt' further down.
lang1, lang2 = 'en', 'de'

# spaCy-backed tokenizers, one per language model.
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')

def lang1_tokenizer(sentence):
    """
    Split an English sentence with ``en_tokenizer`` and return the tokens
    lower-cased, as a list of strings.
    """
    lowered = []
    for word in en_tokenizer(sentence):
        lowered.append(word.lower())
    return lowered

def lang2_tokenizer(sentence):
    """
    Split a German sentence with ``de_tokenizer`` and return the tokens as a
    list of strings. Unlike the English tokenizer, case is left untouched.
    """
    return list(de_tokenizer(sentence))

def build_vocab(lang1='en', lang2='de'):
    """
    Build one vocabulary per language from the parallel corpus file.

    Reads ``data/<lang1>-<lang2>.txt`` (UTF-8). Every non-blank line must hold
    exactly two tab-separated fields: the ``lang1`` sentence followed by the
    ``lang2`` sentence. Tokens are produced by ``lang1_tokenizer`` and
    ``lang2_tokenizer`` and counted; tokens seen fewer than twice are left out
    of the vocabulary and resolve to ``<unk>`` on lookup.

    Args:
        lang1: language code used for the first column and the file name.
        lang2: language code used for the second column and the file name.

    Returns:
        list: ``[vocab1, vocab2]``, each with the special tokens
        ``<unk>``, ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>`` as default index.

    Raises:
        OSError: if the corpus file cannot be opened.
        ValueError: if a non-blank line does not contain exactly one tab.
    """
    specials = ['<unk>', '<pad>', '<sos>', '<eos>']
    counter1 = Counter()
    counter2 = Counter()

    # Read the file and split into lines; the context manager guarantees the
    # handle is closed even if reading fails.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')

    for line in lines:
        if not line.strip():
            # Blank lines (including the single '' produced by an empty file)
            # carry no sentence pair.
            continue
        l1, l2 = line.split('\t')
        counter1.update(lang1_tokenizer(l1))
        counter2.update(lang2_tokenizer(l2))

    vocab1 = vocab(counter1, min_freq=2, specials=specials)
    vocab2 = vocab(counter2, min_freq=2, specials=specials)
    # Out-of-vocabulary tokens map to <unk> instead of raising.
    vocab1.set_default_index(vocab1["<unk>"])
    vocab2.set_default_index(vocab2["<unk>"])
    return [vocab1, vocab2]

# Build both vocabularies once and report how many entries each one holds.
lang1_vocab, lang2_vocab = build_vocab(lang1, lang2)
lang1_vocab_size, lang2_vocab_size = len(lang1_vocab), len(lang2_vocab)
print('lang1_vocab_vocab_size', lang1_vocab_size)
print('lang2_vocab_size', lang2_vocab_size)

lang1_vocab_vocab_size 12030
lang2_vocab_size 23470


In [7]:
class TranslationDataset(Dataset):
    """
    In-memory dataset of index-encoded sentence pairs.

    Reads ``data/<lang1>-<lang2>.txt`` (UTF-8, one tab-separated sentence pair
    per line), walks the lines from last to first and keeps at most
    ``max_lines`` pairs. Item ``i`` is the tuple ``(l1_tokens, l2_tokens)``
    produced by :meth:`tokenize_sentence` for the i-th kept pair.

    Args:
        lang1: language code of the first column (also part of the file name).
        lang2: language code of the second column (also part of the file name).
        max_lines: maximum number of pairs to keep, or ``None`` for no limit.
        device: device on which the index tensors are created.

    Raises:
        OSError: if the corpus file cannot be opened.
        ValueError: if a processed, non-blank line does not contain exactly one tab.
    """

    def __init__(self, lang1, lang2, max_lines, device=torch.device("cpu")):
        self.lang1 = lang1
        self.lang2 = lang2
        self.device = device
        self.data = []               # encoded pairs: (l1_tokens, l2_tokens)
        self.untokenized_data = []   # raw string pairs, same order as self.data

        # The context manager closes the file handle as soon as it is read.
        with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus:
            lines = corpus.read().strip().split('\n')
        # Original note: "use longer data" - presumably the file is ordered
        # shortest-first, so reading backwards favours long sentences (TODO confirm).
        lines.reverse()

        for line in lines:
            # Stop as soon as the cap is reached instead of splitting every
            # remaining line only to discard it.
            if max_lines is not None and len(self.data) >= max_lines:
                break
            if not line.strip():
                # Blank lines carry no sentence pair.
                continue
            l1, l2 = line.split('\t')
            l1_tokens = self.tokenize_sentence(l1, True)
            l2_tokens = self.tokenize_sentence(l2, False)
            self.data.append((l1_tokens, l2_tokens))
            self.untokenized_data.append((l1, l2))

        self.len = len(self.data)

    def tokenize_sentence(self, sentence, is_lang1):
        """
        Encode one sentence as ``<sos>`` + token indices + ``<eos>``.

        Uses the module-level ``lang1_vocab``/``lang1_tokenizer`` when
        ``is_lang1`` is true, otherwise the ``lang2`` counterparts. Returns a
        1-D ``torch.long`` tensor on ``self.device``.
        """
        # Local is named `lang_vocab` so it does not shadow the imported
        # torchtext `vocab` factory.
        lang_vocab = lang1_vocab if is_lang1 else lang2_vocab
        tokenizer = lang1_tokenizer if is_lang1 else lang2_tokenizer
        indexes = [lang_vocab['<sos>']]
        indexes.extend(lang_vocab[token] for token in tokenizer(sentence))
        indexes.append(lang_vocab['<eos>'])
        return torch.tensor(indexes, dtype=torch.long, device=self.device)

    def __getitem__(self, index):
        """Return the encoded ``(l1_tokens, l2_tokens)`` pair at ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of stored sentence pairs."""
        return self.len

In [8]:
# Load (at most max_lines) sentence pairs into memory on the chosen device.
dataset = TranslationDataset(lang1=lang1, lang2=lang2, max_lines=max_lines, device=device)
print('length of dataset', len(dataset))

length of dataset 10000


In [9]:
# 80/10/10 split into train / validation / test subsets, driven by a dedicated
# generator seeded with SEED so the split is repeatable.
split_generator = torch.Generator().manual_seed(SEED)
train_set, val_set, test_set = torch.utils.data.random_split(
    dataset,
    [0.8, 0.1, 0.1],
    generator=split_generator,
)

def collate_fn(batch):
    """
    Collate a list of ``(source, target)`` index tensors into two padded tensors.

    Sources are padded with the ``<pad>`` index of ``lang1_vocab`` and targets
    with the ``<pad>`` index of ``lang2_vocab``; both results are batch-first.
    Returns the tuple ``(source, target)``.
    """
    # Source side: gather the first element of every pair, pad to the longest.
    padded_source = pad_sequence(
        [pair[0] for pair in batch],
        batch_first=True,
        padding_value=lang1_vocab['<pad>'],
    )

    # Target side: same treatment for the second element of every pair.
    padded_target = pad_sequence(
        [pair[1] for pair in batch],
        batch_first=True,
        padding_value=lang2_vocab['<pad>'],
    )
    return padded_source, padded_target

# Settings shared by all three loaders; only the training loader shuffles.
loader_options = dict(batch_size=batch_size, collate_fn=collate_fn)
train_loader = DataLoader(train_set, shuffle=True, **loader_options)
val_loader = DataLoader(val_set, shuffle=False, **loader_options)
test_loader = DataLoader(test_set, shuffle=False, **loader_options)

# len() of a loader is its number of batches, not its number of samples.
print('length of train_loader', len(train_loader))
print('length of val_loader', len(val_loader))
print('length of test_loader', len(test_loader))


length of train_loader 1000
length of val_loader 125
length of test_loader 125


In [11]:
# Sanity check: print the shapes of the padded source and target tensors of the
# first training batch, then stop after that single batch.
for i, item in enumerate(train_loader):
    print(item[0].shape, item[1].shape)
    break

torch.Size([8, 27]) torch.Size([8, 27])


In [12]:
class Embedder(nn.Module):
    """Lookup table that maps token indices to ``d_model``-sized vectors."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        # One learnable row of width d_model per vocabulary entry.
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding vectors for the index tensor ``x``."""
        embedded = self.embed(x)
        return embedded

In [None]:
class PositionalEncoder(nn.Module):
    """
    Scale embeddings by ``sqrt(d_model)`` and add fixed sinusoidal position codes.

    For position ``pos`` and even feature index ``i`` the table holds

        pe[pos, i]     = sin(pos / 10000 ** (i / d_model))
        pe[pos, i + 1] = cos(pos / 10000 ** (i / d_model))

    i.e. both members of a (sin, cos) pair share one frequency. The table is
    stored as the buffer ``pe`` of shape ``(1, max_seq_len, d_model)``, so it
    follows the module across devices and is not a trainable parameter.

    Args:
        d_model: size of the feature dimension (odd values are supported).
        max_seq_len: number of positions pre-computed in the table.
    """

    def __init__(self, d_model, max_seq_len = 80):
        super().__init__()
        self.d_model = d_model

        # create constant 'pe' matrix with values dependent on pos and i
        position = torch.arange(max_seq_len, dtype=torch.float).unsqueeze(1)
        even_idx = torch.arange(0, d_model, 2, dtype=torch.float)
        # 10000 ** (-i / d_model), computed via exp/log.
        div_term = torch.exp(even_idx * (-math.log(10000.0) / d_model))

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Add position codes to ``x``.

        ``x`` is indexed as ``(batch, seq_len, d_model)``: ``x.size(1)`` selects
        how many positions of the table are used, and the table broadcasts over
        the leading batch axis. ``seq_len`` must not exceed ``max_seq_len``.
        """
        # make embeddings relatively larger
        x = x * math.sqrt(self.d_model)
        # add constant to embedding (tensor slice; calling .item() here would
        # raise because the slice holds more than one element)
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len]
        return x

In [14]:
# Pull one training batch and take its source-side tensor.
batch = next(iter(train_loader))
input_seq = batch[0]
input_pad = lang1_vocab['<pad>']
# Mask is False wherever the input holds padding and True elsewhere; the
# unsqueeze inserts a singleton axis at position 1.
input_msk = (input_seq != input_pad).unsqueeze(1)
print(input_msk.shape)

In [20]:
# Build the target-side mask: first a padding mask (same recipe as for the
# input), then combine it with a "no peeking ahead" mask.

# batch[1] is the target-language tensor returned by collate_fn; batch[0] is
# the source tensor and would yield a mask for the wrong sentences.
target_seq = batch[1]
target_pad = lang2_vocab['<pad>']
target_msk = (target_seq != target_pad).unsqueeze(1)

size = target_seq.size(1) # get seq_len for matrix

# Lower-triangular (diagonal included) boolean matrix of shape (1, size, size):
# row t is True only for columns <= t. Created on the same device as the target.
nopeak_mask = torch.tril(
    torch.ones((1, size, size), dtype=torch.bool, device=target_seq.device)
)

# Broadcasts (batch, 1, size) against (1, size, size) -> (batch, size, size).
target_msk = target_msk & nopeak_mask