In [4]:
import numpy as np
import time

import torch
from torch import nn

In [5]:
# Import Data helpers
import helper

source_path = 'data/letters_source.txt'
target_path = 'data/letters_target.txt'

source_sentences = helper.load_data(source_path)
target_sentences = helper.load_data(target_path)

In [6]:
source_sentences[:10]

'bsaqq\nnpy\n'

In [7]:
target_sentences[:10]

'abqqs\nnpy\n'

In [24]:
source_texts = source_sentences.split("\n")
target_texts = target_sentences.split("\n")

In [25]:
source_texts[:5]

['bsaqq', 'npy', 'lbwuj', 'bqv', 'kial']

In [26]:
target_texts[:5]

['abqqs', 'npy', 'bjluw', 'bqv', 'aikl']

In [30]:
def tokens_int_lookup(words):
    
    token_int = {
        "<PAD>": 0,
        "<UNK>": 1,
        "<GO>": 2,
        "<EOS>": 3,        
    }
    chars = set([ch for word in words for ch in word])
    vocab_to_int = {vocab: idx for idx, vocab in enumerate(chars, 4)}
    vocab_to_int.update(token_int)
    int_to_vocab = {v: k for k, v in vocab_to_int.items()}
    
    return vocab_to_int, int_to_vocab

In [31]:
source_vocab_to_int, source_int_to_vocab = tokens_int_lookup(source_texts)
target_vocab_to_int, target_int_to_vocab = tokens_int_lookup(target_texts)

In [70]:
def build_token_ints(sequences, tokens_lookup, max_seq_length=20):
    token_ints = np.zeros(shape=(len(sequences), 20), dtype=np.int32)
    for i, seq in enumerate(sequences):
        for j, token in enumerate(seq):
            token_ints[i, j] = tokens_lookup.get(token, "<UNK>")
    return token_ints

In [71]:
source_tokens = [[ch for ch in text] for text in source_texts]
target_tokens = [[ch for ch in text] + ["<EOS>"] for text in target_texts]

In [72]:
source_tokens[:5]

[['b', 's', 'a', 'q', 'q'],
 ['n', 'p', 'y'],
 ['l', 'b', 'w', 'u', 'j'],
 ['b', 'q', 'v'],
 ['k', 'i', 'a', 'l']]

In [73]:
target_tokens[:5]

[['a', 'b', 'q', 'q', 's', '<EOS>'],
 ['n', 'p', 'y', '<EOS>'],
 ['b', 'j', 'l', 'u', 'w', '<EOS>'],
 ['b', 'q', 'v', '<EOS>'],
 ['a', 'i', 'k', 'l', '<EOS>']]

In [74]:
source_token_ints = build_token_ints(source_tokens, source_vocab_to_int, max_seq_length=10)
target_token_ints = build_token_ints(target_tokens, target_vocab_to_int, max_seq_length=10)

In [75]:
source_token_ints[:5]

array([[25, 28, 17, 16, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 8, 27, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [20, 25, 22, 19, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [25, 16, 29,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 7, 10, 17, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0]], dtype=int32)

In [76]:
target_token_ints[:5]

array([[17, 25, 16, 16, 28,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 8, 27, 12,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [25, 18, 20, 19, 22,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [25, 16, 29,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [17, 10,  7, 20,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0]], dtype=int32)

In [None]:
class EncoderNN(nn.Module):
    
    def __init__(self, vocab_size, embed_dims, rnn_size, *args, **kwargs):
        
        super().__init__(*args, **kwargs)
        
        self.embeddings = nn.Embedding(vocab_size, embed_dims)
        
        self.rnn = nn.LSTM(embed_dims, rnn_size, batch_first=True, bidirectional=True)
        
        