In [60]:
from datasets import Dataset
import json

import torch
from torch.utils.data import Dataset as torchDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [48]:
# Every file this notebook reads or writes lives under this folder.
DATA_FOLDER_PATH = './data'

# Parallel corpus: two text files that the loader reads line by line and pairs by line index.
EN_SENTENCES_PATH = f"{DATA_FOLDER_PATH}/microlang_20000_eng.txt"
SW_SENTENCES_PATH = f"{DATA_FOLDER_PATH}/microlang_20000_swe.txt"

# Destinations for the word -> id vocabularies (written as JSON).
EN_VOCAB_PATH = f"{DATA_FOLDER_PATH}/eng_vocab.json"
SW_VOCAB_PATH = f"{DATA_FOLDER_PATH}/swe_vocab.json"

## Load data and prepare dataset

In [14]:
def load_data_from_files(en_file_path, sw_file_path):
    """Read two line-aligned text files into one two-column ``Dataset``.

    Args:
        en_file_path: Path to the UTF-8 text file with the English sentences.
        sw_file_path: Path to the UTF-8 text file with the Swedish sentences.

    Returns:
        A ``Dataset`` with the columns ``"en"`` and ``"sw"``. Row ``i`` holds
        the ``i``-th line of each file with surrounding whitespace stripped.
    """
    columns = {}
    for column_name, path in (("en", en_file_path), ("sw", sw_file_path)):
        with open(path, 'r', encoding='utf-8') as handle:
            columns[column_name] = [raw_line.strip() for raw_line in handle]

    return Dataset.from_dict(columns)
    

In [27]:
# Build the parallel corpus: one Dataset with an "en" and a "sw" column.
dataset = load_data_from_files(EN_SENTENCES_PATH, SW_SENTENCES_PATH)

In [28]:
# Quick look at the loaded corpus: the repr lists the columns and the row count.
print(dataset)

Dataset({
    features: ['en', 'sw'],
    num_rows: 20000
})


In [29]:
def count_unique_words(dataset, field):
    """Count the distinct word forms found in one column of ``dataset``.

    Words are obtained by lowercasing each sentence and splitting it on
    whitespace, so the count is case-insensitive.

    Args:
        dataset: Object where ``dataset[field]`` is an iterable of sentence strings.
        field: Name of the column to scan.

    Returns:
        The number of unique lowercased words in that column.
    """
    distinct_words = {
        token
        for text in dataset[field]
        for token in text.lower().split()
    }
    return len(distinct_words)

# Vocabulary size per language, measured on the whole corpus (the split happens later).
en_unique = count_unique_words(dataset, "en")
sw_unique = count_unique_words(dataset, "sw")

print("English unique words:", en_unique)
print("Swedish unique words:", sw_unique)

English unique words: 161
Swedish unique words: 248


This is not surprising, as Swedish is morphologically richer than English, so the same sentences yield more distinct word forms.

In [30]:
# 80/10/10 split: hold out 20 % of the rows, then cut that hold-out in half
# to get the validation and test sets. Both cuts use seed 42.
# NOTE: this cell rebinds `dataset` (first to the split result, then to a plain
# dict), so it cannot be re-run on its own; re-run the loading cell first.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
temp = dataset["test"].train_test_split(test_size=0.5, seed=42)

train_data = dataset["train"]
val_data = temp["train"]
test_data = temp["test"]

# Keep all three splits reachable by name under `dataset` as well.
dataset = {"train": train_data, "val": val_data, "test": test_data}

print(train_data, val_data, test_data, sep="\n")

Dataset({
    features: ['en', 'sw'],
    num_rows: 16000
})
Dataset({
    features: ['en', 'sw'],
    num_rows: 2000
})
Dataset({
    features: ['en', 'sw'],
    num_rows: 2000
})


## Define tokenizer

Define special tokens

In [31]:
# Reserved marker strings shared by both tokenizers.
sos_token, eos_token = "<SOS>", "<EOS>"  # wrapped around every encoded sentence
unk_token = "<UNK>"  # stands in for words missing from the vocabulary
pad_token = "<PAD>"  # filler that brings the sequences of a batch to equal length

# NOTE: the order of this list is NOT the id order; lilTokenizer assigns
# pad=0, sos=1, eos=2, unk=3 itself.
special_tokens = [unk_token, pad_token, sos_token, eos_token]

Since there are so few unique words in the dataset (161 and 248), there is no need for a fancy tokenizer. I will build my own lil tokenizer that simply lowercases the text and splits it on whitespace, which will be just fine for this job. Even spaCy would be overkill, since the dataset contains no punctuation.

In [32]:
class lilTokenizer:
    """Minimal whitespace tokenizer with a word <-> integer-id vocabulary.

    Text is lowercased and split on whitespace. The four special tokens always
    occupy ids 0-3 (pad, sos, eos, unk); regular words receive ids from 4
    upwards in the order in which ``build_vocab`` first sees them.

    Attributes:
        word2idx: Mapping from word (or special token) to integer id.
        idx2word: Inverse mapping from integer id to word.
        vocab_size: Number of entries in the vocabulary; also the next free id.
    """

    def __init__(self):
        self.word2idx = {pad_token: 0, sos_token: 1, eos_token: 2, unk_token: 3}
        # Derive the reverse map and the size from the forward map so the
        # three can never disagree.
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)

    def build_vocab(self, sentences):
        """Add every unseen word of ``sentences`` to the vocabulary.

        Args:
            sentences: Iterable of sentence strings.
        """
        for sentence in sentences:
            for word in sentence.lower().split():
                if word not in self.word2idx:
                    self.word2idx[word] = self.vocab_size
                    self.idx2word[self.vocab_size] = word
                    self.vocab_size += 1

    def encode(self, sentence):
        """Convert a sentence to integer ids, wrapped in the sos and eos ids.

        Words that are not in the vocabulary are mapped to the unk id.

        Args:
            sentence: The text to encode.

        Returns:
            A list of ints: ``[sos, id_1, ..., id_n, eos]``.
        """
        unk_id = self.word2idx[unk_token]
        tokens = [self.word2idx.get(word, unk_id) for word in sentence.lower().split()]
        return [self.word2idx[sos_token]] + tokens + [self.word2idx[eos_token]]

    def decode(self, token_ids):
        """Convert integer ids back to readable text.

        Decoding stops at the first eos id; sos and pad ids are skipped.

        Args:
            token_ids: Iterable of ids. Elements may be plain ints or scalar
                objects exposing ``.item()`` (such as the elements of a 1-D
                torch tensor), so model output can be decoded directly.

        Returns:
            The decoded words joined by single spaces.

        Raises:
            KeyError: If an id is not part of the vocabulary.
        """
        words = []
        for idx in token_ids:
            # A 0-d tensor never matches an int key of the dict, so unwrap
            # tensor/array scalars to the plain Python number they hold.
            if hasattr(idx, "item"):
                idx = idx.item()
            # Deliberately a plain [] lookup rather than .get(idx, unk_token):
            # an id outside the vocabulary means something is misaligned
            # upstream, and that should fail loudly instead of being hidden.
            word = self.idx2word[idx]
            if word == eos_token:
                break
            if word not in (sos_token, pad_token):
                words.append(word)
        return " ".join(words)
        

In [39]:
# Materialise the two training columns as plain Python lists of sentences.
en_sentences_train = list(train_data["en"])
sw_sentences_train = list(train_data["sw"])
print(len(en_sentences_train), len(sw_sentences_train))


16000 16000


In [40]:
# One independent vocabulary per language, built from the training split only.
# Words that occur only in the validation/test data are therefore encoded as <UNK>.
en_tokenizer = lilTokenizer()
en_tokenizer.build_vocab(en_sentences_train)
sw_tokenizer = lilTokenizer()
sw_tokenizer.build_vocab(sw_sentences_train)

Lil sanity check

In [45]:
# Both tokenizers must agree on every special-token id, because the ids
# stored below are shared by the two languages.
assert all(
    en_tokenizer.word2idx[token] == sw_tokenizer.word2idx[token]
    for token in (pad_token, sos_token, eos_token, unk_token)
)

pad_idx, sos_idx, eos_idx, unk_idx = (
    en_tokenizer.word2idx[token]
    for token in (pad_token, sos_token, eos_token, unk_token)
)

# should be 0, 1, 2, 3
print(f"Special token indices: {pad_idx}, {sos_idx}, {eos_idx}, {unk_idx}")

Special token indices: 0, 1, 2, 3


In [54]:
# Expected sizes: 161 + 4 = 165 (English) and 248 + 4 = 252 (Swedish), i.e. the
# unique-word counts plus the four special tokens. Those counts were taken on the
# full corpus while the vocabularies were built from the training split only, so
# matching numbers also mean every word form occurs in the training split.
print(len(en_tokenizer.word2idx), len(sw_tokenizer.word2idx))

165 252


### Save vocabularies

In [55]:
def save_vocab(tokenizer, filepath):
    """Write a tokenizer's word -> id mapping to ``filepath`` as JSON.

    The file is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters as they are instead of escaping them.

    Args:
        tokenizer: Object exposing the mapping as ``word2idx``.
        filepath: Destination path; an existing file is overwritten.
    """
    with open(filepath, 'w', encoding='utf-8') as vocab_file:
        json.dump(
            obj=tokenizer.word2idx,
            fp=vocab_file,
            indent=4,
            ensure_ascii=False,
        )

In [56]:
# Persist both word -> id mappings as JSON files in the data folder.
save_vocab(en_tokenizer, EN_VOCAB_PATH)
save_vocab(sw_tokenizer, SW_VOCAB_PATH)

## DataLoaders

Alright, let's get these sentences into PyTorch!

In [67]:
class TranslationDataset(torchDataset):
    """Parallel English/Swedish sentence pairs served as token-id tensors.

    Sentences are kept as raw text and encoded on access, so item ``idx`` is
    the pair ``(en_sentences[idx], sw_sentences[idx])`` after tokenization.

    Args:
        en_sentences: Sequence of English sentences.
        sw_sentences: Sequence of Swedish sentences, aligned with ``en_sentences``.
        en_tokenizer: Object whose ``encode(text)`` returns a list of int ids.
        sw_tokenizer: Same as ``en_tokenizer``, for the Swedish side.

    Raises:
        ValueError: If the two sentence sequences differ in length.
    """

    def __init__(self, en_sentences, sw_sentences, en_tokenizer, sw_tokenizer):
        # The pairs are matched purely by position, so unequal lengths would
        # either truncate the data silently or raise IndexError mid-epoch.
        if len(en_sentences) != len(sw_sentences):
            raise ValueError(
                "en_sentences and sw_sentences must have the same length, "
                f"got {len(en_sentences)} and {len(sw_sentences)}"
            )
        self.en_sentences = en_sentences
        self.sw_sentences = sw_sentences
        self.en_tokenizer = en_tokenizer
        self.sw_tokenizer = sw_tokenizer

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.en_sentences)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D ``torch.long`` tensors (en, sw)."""
        # raw text for a specific index
        en_text = self.en_sentences[idx]
        sw_text = self.sw_sentences[idx]

        # each tokenizer turns its sentence into a list of integer ids
        en_encoded = self.en_tokenizer.encode(en_text)
        sw_encoded = self.sw_tokenizer.encode(sw_text)

        # token ids are used as indices downstream, so make the integer dtype explicit
        return (
            torch.tensor(en_encoded, dtype=torch.long),
            torch.tensor(sw_encoded, dtype=torch.long),
        )

In [68]:
def collate_fn(batch):
    """Turn a list of (en, sw) tensor pairs into two padded batch tensors.

    Args:
        batch: List of ``(en_tensor, sw_tensor)`` pairs of 1-D token-id tensors.

    Returns:
        A tuple ``(en_padded, sw_padded)``. Each tensor has shape
        (Sequence_Length, Batch_Size), where shorter sequences are filled up
        with ``pad_idx`` to the length of the longest one on that side.
    """
    # split the pairs into one list per language
    en_sequences = [en_tensor for en_tensor, _ in batch]
    sw_sequences = [sw_tensor for _, sw_tensor in batch]

    # batch_first=False puts the sequence dimension first
    padding_options = {"padding_value": pad_idx, "batch_first": False}
    return (
        pad_sequence(en_sequences, **padding_options),
        pad_sequence(sw_sequences, **padding_options),
    )

In [69]:
# Wrap the training sentences; tokenization happens lazily, item by item,
# inside TranslationDataset.__getitem__.
train_dataset = TranslationDataset(
    en_sentences=en_sentences_train, 
    sw_sentences=sw_sentences_train, 
    en_tokenizer=en_tokenizer, 
    sw_tokenizer=sw_tokenizer
)

BATCH_SIZE = 32

# shuffle=True re-draws the sample order; no torch seed is set in the cells
# shown here, so batch contents (and the padded length) can differ between runs.
# drop_last=True discards a trailing batch smaller than BATCH_SIZE; with the
# 16000 training pairs and a batch size of 32 there is no remainder to drop.
train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True
)

# check first batch
en_batch, sw_batch = next(iter(train_loader))

# collate_fn pads with batch_first=False, hence (Seq_Len, Batch_Size)
print(f"English Batch Shape: {en_batch.shape} (Seq_Len, Batch_Size)")
print(f"Swedish Batch Shape: {sw_batch.shape} (Seq_Len, Batch_Size)")

English Batch Shape: torch.Size([8, 32]) (Seq_Len, Batch_Size)
Swedish Batch Shape: torch.Size([8, 32]) (Seq_Len, Batch_Size)


## Define Encoder and Decoder for Seq2Seq

### Encoder

### Decoder