# Imports, hyperparameters and other initializations

In [40]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

In [41]:
def one_hot_encode(idx, vocab_size):
    """
    Build the one-hot vector for a single vocabulary index.

    Args:
     `idx`: position in the vocabulary that should be set to one
     `vocab_size`: total number of entries in the vocabulary

    Returns a 1-D numpy array of length `vocab_size` that is zero everywhere
    except at position `idx`, where it is 1.0.
    """
    # Start from an all-zero vector and switch on the requested position.
    encoded = np.zeros(vocab_size)
    encoded[idx] = 1.0
    return encoded


def one_hot_encode_sequence(sequence, vocab_size, mapping=None):
    """
    One-hot encodes a sequence of words given a fixed vocabulary size.

    Args:
     `sequence`: an iterable of words to encode
     `vocab_size`: the size of the vocabulary
     `mapping`: optional word -> index lookup; when omitted, the module-level
                `word_to_idx` is used (a NameError is raised if it is not
                defined)

    Returns a 3-D numpy array of shape (num words, vocab size, 1). An empty
    sequence yields an array of shape (0, vocab size, 1).

    Raises KeyError if a word is missing from the lookup.
    """
    if mapping is None:
        mapping = word_to_idx

    # Materialise once so generators are supported and the length is known.
    words = list(sequence)

    # Allocate the final shape up front; this also keeps the empty-sequence
    # case well-defined instead of failing on a missing second axis.
    encoding = np.zeros((len(words), vocab_size, 1))
    for position, word in enumerate(words):
        encoding[position, mapping[word], 0] = 1.0

    return encoding

In [42]:
# Restore the serialized tokenizer and the three tokenized dataset splits from disk.
tokenizer = Tokenizer.from_file("serialized_tokenizer")
train_ds = load_from_disk("tokenized_train")
val_ds = load_from_disk("tokenized_val")
test_ds = load_from_disk("tokenized_test")
# NOTE: no `set_format(type="pt", columns=["ids", "attention_mask"])` is applied
# to the splits here; they keep whatever format they were saved with.

# Pull the "ids" column out of every split.
train_ids, val_ids, test_ids = train_ds["ids"], val_ds["ids"], test_ds["ids"]

In [43]:
# Model hyperparameters; these are passed to both the Encoder and the Decoder
# constructors when the model is built further down.
# Vocabulary size reported by the loaded tokenizer (used as encoder input_dim
# and decoder output_dim).
VOCAB_SIZE = tokenizer.get_vocab_size()
# Width of the token embedding vectors.
EMBED_DIM = 64
# Size of the LSTM hidden and cell states.
HIDDEN_DIM = 64
# Number of stacked LSTM layers.
N_LAYERS = 2
# Dropout probability (passed as `dropout_rate`).
DROPOUT_RATE = 0.5

In [44]:
class Encoder(nn.Module):
    """Embed a token sequence and run it through a stacked LSTM.

    Only the final hidden and cell states of the LSTM are returned; the
    per-step outputs are discarded.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, n_layers, dropout_rate):
        """
        Args:
         `input_dim`: number of rows in the embedding table (vocabulary size)
         `embed_dim`: size of each embedding vector
         `hidden_dim`: size of the LSTM hidden state
         `n_layers`: number of stacked LSTM layers
         `dropout_rate`: dropout probability, used on the embeddings and
                         between LSTM layers
        """
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embed_dim)
        # Sequence-first, unidirectional LSTM with bias: all of these are the
        # nn.LSTM defaults, so only the non-default options are spelled out.
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=n_layers,
            dropout=dropout_rate,
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Encode `x` ([seq len, batch] token ids) into final LSTM states.

        Returns the tuple `(h, c)`, each of shape [layers, batch, hidden dim].
        """
        # [seq len, batch] -> [seq len, batch, emb]
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        # Per-step outputs are not needed, only the final states.
        _step_outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one token per batch item to vocabulary scores.

    Each call to `forward` consumes one token per sequence together with the
    current LSTM states and returns the scores over the output vocabulary
    plus the updated states.
    """

    def __init__(self, output_dim, embed_dim, hidden_dim, n_layers, dropout_rate):
        """
        Args:
         `output_dim`: size of the output vocabulary (also the number of
                       rows in the embedding table)
         `embed_dim`: size of each embedding vector
         `hidden_dim`: size of the LSTM hidden state
         `n_layers`: number of stacked LSTM layers
         `dropout_rate`: dropout probability, used on the embeddings and
                         between LSTM layers
        """
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bias=True,  # default
            batch_first=False,  # default
            dropout=dropout_rate,
            bidirectional=False,  # default
        )
        self.dropout = nn.Dropout(dropout_rate)
        # Projects the LSTM output onto the vocabulary.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, h, c):
        """Run one decoding step.

        Args:
         `x`: [batch] token ids for the current step
         `h`, `c`: [n layers, batch, hidden dim] LSTM states

        Returns `(p, h, c)` where `p` is [batch, output dim] and `h`, `c`
        are the updated states.
        """
        # Add a leading sequence axis of length 1; reshape (rather than view)
        # also accepts non-contiguous input.
        x = x.reshape(1, -1)
        # x: [1, batch]
        e = self.dropout(self.embedding(x))
        # e: [1, batch, embed dim]
        o, (h, c) = self.lstm(e, (h, c))
        # o: [1, batch, hidden dim]
        # The projection layer is registered as `self.fc`; the previous
        # `self.fc_out` did not exist and raised AttributeError.
        p = self.fc(o.squeeze(0))
        # p: [batch, output dim]
        return p, h, c


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time."""

    def __init__(self, encoder, decoder):
        """
        Args:
         `encoder`: module whose call returns the initial `(h, c)` states
         `decoder`: module exposing `output_dim` and called as
                    `decoder(x, h, c) -> (scores, h, c)`
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, target, teacher_forcing_ratio=0.0):
        """Decode `target.shape[0]` steps conditioned on the encoded source.

        Args:
         `x`: [len, batch] source token ids
         `target`: [target len, batch] target token ids; row 0 is fed to the
                   decoder as the first input token
         `teacher_forcing_ratio`: probability, per step, of feeding the
                   ground-truth token `target[t]` as the next decoder input
                   instead of the decoder's own argmax prediction. The
                   default 0.0 always feeds back the prediction.

        Returns a [target len, batch, output vocab size] tensor of decoder
        scores. Position 0 is left at zero because decoding starts at t=1.
        """
        target_length = target.shape[0]
        batch_size = target.shape[1]
        output_vocab_size = self.decoder.output_dim

        # Allocate on the same device as the inputs so the per-step
        # assignment below does not mix devices.
        outputs = torch.zeros(
            target_length, batch_size, output_vocab_size, device=target.device
        )
        h, c = self.encoder(x)

        # first token
        x = target[0, :]
        for t in range(1, target_length):
            o, h, c = self.decoder(x, h, c)
            outputs[t] = o
            # Skip the random draw entirely when teacher forcing is off so
            # the default path does not consume random numbers.
            use_ground_truth = (
                teacher_forcing_ratio > 0
                and torch.rand(1).item() < teacher_forcing_ratio
            )
            x = target[t] if use_ground_truth else o.argmax(1)

        return outputs



In [45]:
# Source and target share one vocabulary, so the encoder input size and the
# decoder output size are both VOCAB_SIZE.
encoder = Encoder(
    input_dim=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout_rate=DROPOUT_RATE,
)
decoder = Decoder(
    output_dim=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout_rate=DROPOUT_RATE,
)

# Tie both halves together into the full sequence-to-sequence model.
model = Seq2Seq(encoder, decoder)

In [46]:
def init(m):
    """Initialise the parameters owned directly by module `m` from U(-0.1, 0.1).

    Intended to be passed to `nn.Module.apply`, which already calls the
    function on every submodule. Only the parameters registered directly on
    `m` are touched (`recurse=False`), so each parameter is drawn exactly
    once instead of being re-initialised once per ancestor module.
    """
    for param in m.parameters(recurse=False):
        nn.init.uniform_(param, -0.1, 0.1)

# Run `init` over the model and all of its submodules, then show the
# resulting architecture (`apply` returns the module it was called on).
model.apply(init)
print(model)
print(f'Parameters: {sum(p.numel() for p in model.parameters()):,}')

# Plain SGD over every model parameter.
optimizer = optim.SGD(params=model.parameters(), lr=0.7)
# Targets equal to the tokenizer's "[PAD]" id are excluded from the loss.
criterion = nn.CrossEntropyLoss(
    ignore_index=tokenizer.token_to_id("[PAD]"),
)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(12800, 64)
    (lstm): LSTM(64, 64, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(12800, 64)
    (lstm): LSTM(64, 64, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=64, out_features=12800, bias=True)
  )
)
Parameters: 2,603,520


In [47]:
# Training-loop skeleton: iterates over the epochs and toggles the model
# between evaluation and training mode, but performs no forward/backward
# passes yet.
NUM_EPOCHS = 100
# NOTE(review): BATCH_SIZE is defined here but not used in the visible code.
BATCH_SIZE = 128

# Per-epoch loss histories; nothing is appended to them in the loop below yet.
training_loss, validation_loss = [], []

for i in range(NUM_EPOCHS):
    # Reset the running loss totals for this epoch
    epoch_training_loss = 0
    epoch_validation_loss = 0

    # Evaluation mode (disables dropout) for the validation pass
    model.eval()

    # TODO: validation pass goes here

    # Back to training mode (re-enables dropout) for the training pass
    model.train()


In [48]:
# Switch the validation split's output format to "numpy".
val_ds.set_format("numpy")

59