In [39]:
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from torchtext.vocab import vocab
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import random

In [13]:
# import re
# def get_translation_texts(input_file='data/deu.txt', output_file='data/de-en.txt'):
#   regex = r"\tCC-BY 2.0.+$"
#   lines = open(input_file, encoding='utf-8').read().strip().split('\n')
#   lines = [re.sub(regex, '', line) for line in lines]
#   file = open(output_file,'w')
#   for line in lines:
#     file.write(line+"\n")
#   file.close()

In [32]:
SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1142bde10>

In [28]:
device = torch.device('cpu')
max_lines = 10000
batch_size = 4

In [20]:
lang1 = 'de'
lang2 = 'en'

lang1_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
lang2_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def build_vocab(lang1='de', lang2='en'):
    counter1 = Counter()
    counter2 = Counter()
    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').read().strip().split('\n')
    for l in lines:
        l1, l2 = l.split('\t')
        counter1.update(lang1_tokenizer(l1))
        counter2.update(lang2_tokenizer(l2))

    vocab1 = vocab(counter1, specials=['<unk>', '<pad>', '<sos>', '<eos>'])
    vocab2 = vocab(counter2, specials=['<unk>', '<pad>', '<sos>', '<eos>'])
    vocab1.set_default_index(vocab1["<unk>"])
    vocab2.set_default_index(vocab2["<unk>"])
    return [vocab1, vocab2]

lang1_vocab, lang2_vocab = build_vocab(lang1, lang2)
lang1_vocab_vocab_size = len(lang1_vocab)
lang2_vocab_size = len(lang2_vocab)
print('lang1_vocab_vocab_size', lang1_vocab_vocab_size)
print('lang2_vocab_size', lang2_vocab_size)

lang1_vocab_vocab_size 20342
lang2_vocab_size 41070


In [25]:
class TranslationDataset(Dataset):
    def __init__(self, lang1, lang2, max_lines, device=torch.device("cpu")):
        self.lang1 = lang1
        self.lang2 = lang2
        self.device = device
        self.data = []
        self.untokenized_data = []
        lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').read().strip().split('\n')
        num_lines = 0
        for l in lines:
            l1, l2 = l.split('\t')
            if (max_lines is None or max_lines > num_lines):
                l1_tokens = self.tokenize_sentence(l1, True)
                l2_tokens = self.tokenize_sentence(l2, False)
                self.data.append((l1_tokens, l2_tokens))
                self.untokenized_data.append((l2, l1))
                num_lines += 1

        self.len = len(self.data)

    def tokenize_sentence(self, sentence, is_lang1):
        vocab = lang2_vocab if is_lang1 else lang1_vocab
        tokenizer = lang2_tokenizer if is_lang1 else lang1_tokenizer
        indexes = [vocab[token] for token in tokenizer(sentence)]
        indexes = [vocab['<sos>']] + indexes + [vocab['<eos>']]
        return torch.tensor(indexes, dtype=torch.long, device=self.device)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len

In [26]:
dataset = TranslationDataset(lang1, lang2, max_lines, device)
print('length of dataset', len(dataset))

length of dataset 10000


In [34]:
train_set, val_set, test_set = torch.utils.data.random_split(dataset, [0.8, 0.1, 0.1], generator=torch.Generator().manual_seed(SEED))

def collate_fn(batch):
    source = [item[0] for item in batch] 
    #pad them using pad_sequence method from pytorch. 
    source = pad_sequence(source, batch_first=False, padding_value=lang1_vocab['<pad>']) 
    
    #get all target indexed sentences of the batch
    target = [item[1] for item in batch] 
    #pad them using pad_sequence method from pytorch. 
    target = pad_sequence(target, batch_first=False, padding_value=lang2_vocab['<pad>'])
    return source, target

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print('length of train_loader', len(train_loader))
print('length of val_loader', len(val_loader))
print('length of test_loader', len(test_loader))


length of train_loader 2000
length of val_loader 250
length of test_loader 250


In [36]:
for i, item in enumerate(train_loader):
    print(item[0].shape, item[1].shape)
    print([idx[0].item() for idx in item[0]])
    print([idx[0].item() for idx in item[1]])
    break

torch.Size([7, 4]) torch.Size([6, 4])
[2, 0, 141, 0, 5, 3, 1]
[2, 0, 71, 0, 5, 3]


In [None]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 emb_dim, 
                 hid_dim, 
                 n_layers, 
                 kernel_size, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        assert kernel_size % 2 == 1, "Kernel size must be odd!"
        
        self.device = device
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim, 
                                              out_channels = 2 * hid_dim, 
                                              kernel_size = kernel_size, 
                                              padding = (kernel_size - 1) // 2)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input):
        #input = (seq_len, batch_size)
        
        batch_size = input.size(1)
        seq_len = input.size(0)
        
        #create position tensor, pos = [0, 1, 2, 3, ..., seq_len - 1], tok_embedded = pos_embedded = (batch_size, seq_len, emb_dim)
        pos = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)

        #embed tokens and positions, tok_embedded = pos_embedded = [seq_len，batch_size, emb_dim]
        tok_embedded = self.tok_embedding(input)
        pos_embedded = self.pos_embedding(pos)
        
        #combine embeddings by elementwise summing, [seq_len，batch_size, emb_dim]
        embedded = self.dropout(tok_embedded + pos_embedded)
        
        #pass embedded through linear layer to convert from emb_dim to hid_dim
        conv_input = self.emb2hid(embedded) # [seq_len, batch_size, hid_dim]
                
        #permute for convolutional layer
        conv_input = conv_input.permute(1, 2, 0) # [batch_size, hid_dim, seq_len]
        
        #begin convolutional blocks...
        for i, conv in enumerate(self.convs):
            #pass through convolutional layer
            conved = conv(self.dropout(conv_input)) # [batch_size, 2 * hid_dim, seq_len]

            #pass through GLU activation function
            conved = F.glu(conved, dim = 1) # [batch_size, hid_dim, seq_len]

            #apply residual connection
            conved = (conved + conv_input) * self.scale #[batch_size, hid_dim, seq_len]
            
            #set conv_input to conved for next loop iteration
            conv_input = conved
        
        #...end convolutional blocks
        
        #permute and convert back to emb_dim
        conved = self.hid2emb(conved.permute(0, 2, 1)) #[batch_size, seq_len, emb_dim]
        
        #elementwise sum output (conved) and input (embedded) to be used for attention # [batch_size, seq_len, emb_dim]
        combined = (conved + embedded) * self.scale
        
        return conved, combined