In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

In [4]:
# Restore the serialized tokenizer and the three pre-tokenized dataset splits from disk.
tokenizer = Tokenizer.from_file("serialized_tokenizer")
train_ds = load_from_disk("tokenized_train")
val_ds = load_from_disk("tokenized_val")
test_ds = load_from_disk("tokenized_test")

# Restrict every split to the "ids" and "attention_mask" columns, served in "pt" format.
train_ds.set_format(type="pt", columns=["ids", "attention_mask"])
val_ds.set_format(type="pt", columns=["ids", "attention_mask"])
test_ds.set_format(type="pt", columns=["ids", "attention_mask"])

# Pull the token-id column out of each split.
train_ids, val_ids, test_ids = train_ds["ids"], val_ds["ids"], test_ds["ids"]

In [5]:
# Report the shape of each split, one per line: validation, test, then train.
print(val_ds.shape, test_ds.shape, train_ds.shape, sep="\n")

(3760, 3)
(4358, 3)
(1801350, 3)


In [50]:
# Vocabulary size reported by the loaded tokenizer; passed as the one-hot width further down.
VOCAB_SIZE = tokenizer.get_vocab_size()

def prep_batches(dataset, batch_size, seq_len):
    """Build next-token inputs/targets and group them into batches.

    Trailing rows that do not fill a complete group of ``batch_size`` are
    dropped. Every target row is its input row shifted left by one token; the
    last position of each target row is left as 0.

    Args:
        dataset: Tensor of token ids. Assumed to be 2-D with shape
            ``(num_sequences, seq_len)`` -- TODO confirm against the caller.
        batch_size: Size of the leading dimension of the returned tensors.
        seq_len: Size of the trailing dimension of the returned tensors.

    Returns:
        Tuple ``(inputs, targets)``, each reshaped to
        ``(batch_size, -1, seq_len)``. The middle dimension is inferred; it
        equals ``len(dataset) // batch_size`` when every row holds exactly
        ``seq_len`` tokens.
    """
    num_batches = len(dataset) // batch_size
    inputs = dataset[:num_batches * batch_size]

    # Shift every row in one vectorized slice assignment; a per-row Python
    # loop does the same work but is very slow on large splits.
    targets = torch.zeros_like(inputs)
    targets[:, :-1] = inputs[:, 1:]  # skip first token
    # Per the original author's note, the first token is always [CLS], so it is
    # deliberately not wrapped around into the final target position.

    # reshape behaves like view for contiguous tensors but also accepts
    # non-contiguous input (e.g. a transposed tensor) instead of raising.
    inputs = inputs.reshape((batch_size, -1, seq_len))
    targets = targets.reshape((batch_size, -1, seq_len))
    return inputs, targets

def one_hot_encode(idx, vocab_size):
    """Return a NumPy float vector of length ``vocab_size`` with a 1 at ``idx``.

    Args:
        idx: Position to set to 1.
        vocab_size: Length of the returned vector.

    Returns:
        ``numpy.ndarray`` of zeros except for a single 1 at ``idx``.
    """
    encoded = np.zeros(vocab_size, dtype=float)
    encoded[idx] = 1.0
    return encoded

def one_hot_encode_seq(sequence, vocab_size):
    """One-hot encode a 1-D sequence of token ids.

    Args:
        sequence: 1-D tensor (or list) of integer token ids.
        vocab_size: Width of each one-hot row.

    Returns:
        float64 CPU tensor of shape ``(len(sequence), vocab_size)`` whose row
        ``i`` is 1 at column ``sequence[i]`` and 0 elsewhere. An empty
        sequence yields a ``(0, vocab_size)`` tensor.

    Raises:
        IndexError: If a token id falls outside the vocabulary range.
    """
    # Keep the ids on the CPU so they can index the CPU result tensor.
    ids = torch.as_tensor(sequence, dtype=torch.long, device="cpu")
    num_tokens = ids.shape[0]

    # Allocate the whole matrix once and fill it by indexing. Building a
    # tensor from a Python list of NumPy arrays is a slow path in PyTorch.
    # float64 matches the dtype of the vectors one_hot_encode produces.
    encoding = torch.zeros((num_tokens, vocab_size), dtype=torch.float64)
    encoding[torch.arange(num_tokens), ids] = 1
    return encoding

# Smoke test: batch the validation ids (batch size 64, sequence length 256), then
# one-hot encode the first sequence and print the shape of the result.
x, y = prep_batches(val_ids, 64, 256)
print(one_hot_encode_seq(x[0, 0], VOCAB_SIZE).shape)

torch.Size([256, 12800])


In [49]:
# Model hyperparameters.
# NOTE(review): the names mirror the constructor parameters of the Seq class
# (embed_dim, hidden_dim, n_layers, dropout_rate), but no call that passes them
# is visible in this part of the notebook -- confirm where they are consumed.
EMBED_DIM = 64
HIDDEN_DIM = 64
N_LAYERS = 2
DROPOUT_RATE = 0.5

In [None]:
class Seq(nn.Module):
    """Token embedding followed by a stacked LSTM; returns the final states.

    Args:
        input_dim: Number of rows in the embedding table (the vocabulary size).
        embed_dim: Width of each embedding vector and of the LSTM input.
        hidden_dim: Width of the LSTM hidden and cell states.
        n_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability applied to the embeddings and, when
            ``n_layers > 1``, between LSTM layers.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, n_layers, dropout_rate):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # nn.Embedding looks up integer token ids directly, so input_dim is the
        # vocabulary size; the ids must not be one-hot encoded beforehand.
        self.embedding = nn.Embedding(input_dim, embed_dim)

        # nn.LSTM applies dropout only between stacked layers and emits a
        # UserWarning when given a non-zero rate with a single layer, where the
        # setting has no effect. Pass it through only when layers are stacked.
        lstm_dropout = dropout_rate if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bias=True,  # default
            batch_first=False,  # default
            dropout=lstm_dropout,
            bidirectional=False,  # default
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Encode a batch of token-id sequences.

        Args:
            x: Integer token ids of shape ``[seq len, batch]`` (sequence-first,
                because the LSTM is built with ``batch_first=False``).

        Returns:
            Tuple ``(h, c)`` of the final hidden and cell states, each of shape
            ``[layers, batch, hidden dim]``.
        """
        # e: [seq len, batch, emb]
        e = self.dropout(self.embedding(x))
        # The per-step LSTM outputs are discarded; only the final states are returned.
        _, (h, c) = self.lstm(e)
        return h, c