In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

In [2]:
# Load the pre-trained tokenizer and the three pre-tokenized dataset splits from disk.
tokenizer = Tokenizer.from_file("serialized_tokenizer")
train_ds = load_from_disk("tokenized_train")
val_ds = load_from_disk("tokenized_val")
test_ds = load_from_disk("tokenized_test")

# Restrict each split to the `ids` and `attention_mask` columns, returned as PyTorch tensors.
train_ds.set_format(type="pt", columns=["ids", "attention_mask"])
val_ds.set_format(type="pt", columns=["ids", "attention_mask"])
test_ds.set_format(type="pt", columns=["ids", "attention_mask"])

# Token-id column of each split; these are what get batched for training.
train_ids, val_ids, test_ids = train_ds["ids"], val_ds["ids"], test_ds["ids"]

In [3]:
# Vocabulary size reported by the tokenizer; passed to Seq as both the
# embedding-table size and the width of the output logits.
VOCAB_SIZE = tokenizer.get_vocab_size()

def prep_batches(dataset, batch_size, seq_len):
    """Group token-id rows into fixed-size batches with next-token targets.

    Args:
        dataset: Tensor of token ids with one sequence per row (at least 2-D).
            Rows are expected to hold ``seq_len`` ids each.
        batch_size: Number of sequences per batch. Trailing rows that do not
            fill a complete batch are dropped.
        seq_len: Size of the last dimension of the returned tensors.

    Returns:
        A pair ``(inputs, targets)``. When every row holds ``seq_len`` ids both
        are shaped ``(num_batches, batch_size, seq_len)``. ``targets`` is
        ``inputs`` shifted left by one token; the last position of every row
        stays 0 (the notebook's loss is built with ``ignore_index=0``, so that
        position does not contribute). If ``dataset`` has fewer than
        ``batch_size`` rows, two empty ``(0, batch_size, seq_len)`` tensors are
        returned.
    """
    num_batches = len(dataset) // batch_size
    if num_batches == 0:
        # Reshaping zero elements with an inferred (-1) dimension is ambiguous
        # and raises, so build the empty result explicitly.
        empty = dataset.new_zeros((0, batch_size, seq_len))
        return empty, empty.clone()

    inputs = dataset[:num_batches * batch_size]
    targets = torch.zeros_like(inputs)
    # Shift every row left by one token in a single vectorised assignment
    # (the first token is dropped, the final slot stays 0).
    targets[:, :-1] = inputs[:, 1:]

    # reshape (rather than view) also accepts non-contiguous input.
    inputs = inputs.reshape((num_batches, -1, seq_len))
    targets = targets.reshape((num_batches, -1, seq_len))
    return inputs, targets

def one_hot_encode(idx, vocab_size):
    """Return a float vector of length ``vocab_size`` that is 1 at ``idx`` and 0 elsewhere."""
    vector = np.zeros(vocab_size)
    vector[idx] = 1
    return vector

def one_hot_encode_seq(sequence, vocab_size):
    """One-hot encode every token of ``sequence``.

    Returns a NumPy array with one one-hot row (length ``vocab_size``) per
    token, in the order the tokens appear in ``sequence``.
    """
    rows = []
    for token in sequence:
        rows.append(one_hot_encode(token, vocab_size))
    return np.array(rows)

def one_hot_encode_batch(batch, vocab_size):
    """One-hot encode a batch of token sequences into a single tensor.

    Args:
        batch: Iterable of equal-length token-id sequences.
        vocab_size: Length of each one-hot vector.

    Returns:
        A float64 tensor with one one-hot row per token of each sequence,
        i.e. shaped ``(len(batch), sequence length, vocab_size)``.
    """
    # Stack into one ndarray before handing over to torch: calling
    # torch.tensor() on a Python list of ndarrays converts element by element,
    # which is very slow and makes PyTorch emit a UserWarning.
    stacked = np.array([one_hot_encode_seq(sequence, vocab_size) for sequence in batch])
    return torch.from_numpy(stacked)

In [4]:
# --- Data layout ---
SEQ_LEN = 256        # tokens per sequence
BATCH_SIZE = 64      # sequences per batch

# --- Model size ---
EMBED_DIM = 32       # embedding width (LSTM input size)
HIDDEN_DIM = 32      # LSTM hidden-state width
N_LAYERS = 2         # number of stacked LSTM layers
DROPOUT_RATE = 0.5   # used for both the embedding dropout and the LSTM's inter-layer dropout

# --- Optimisation ---
EPOCHS = 10          # passes over the training batches
GRADIENT_CLIP = 5    # maximum gradient norm given to clip_grad_norm_

In [5]:
class Seq(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> linear layer producing vocabulary logits.

    Args:
        vocab_size: Number of token ids; size of the embedding table and of
            the output logits.
        embed_dim: Width of the token embeddings (LSTM input size).
        hidden_dim: Width of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability applied to the embeddings and
            between LSTM layers.
        device: Accepted for interface compatibility; it is not used here.
            Move the model with ``.to(device)`` instead.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, n_layers, dropout_rate, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bias=True,  # default
            batch_first=True,
            dropout=dropout_rate,
            bidirectional=False,  # default
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, h, c, lengths=None):
        """Compute next-token logits for a batch of token-id sequences.

        Args:
            x: Token ids, ``[batch, seq len]``.
            h: Initial hidden state, ``[n layers, batch, hidden dim]``.
            c: Initial cell state, same shape as ``h``.
            lengths: Optional true (unpadded) length of each sequence, one
                positive integer per batch row. When given, positions past a
                sequence's length are skipped by the LSTM and their LSTM
                output is zero. When omitted, every position is processed.

        Returns:
            ``(logits, h, c)`` where ``logits`` is
            ``[batch * seq len, vocab size]`` and ``h``/``c`` are the final
            LSTM states.
        """
        # embedded: [batch, seq len, emb]
        embedded = self.dropout(self.embedding(x))

        if lengths is None:
            # Every sequence is treated as full length, so packing would be a
            # no-op; run the LSTM on the padded batch directly. This also
            # keeps the module independent of any global batch/sequence size.
            output, (h, c) = self.lstm(embedded, (h, c))
        else:
            # pack_padded_sequence requires the lengths as a CPU int64 tensor.
            lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            packed_output, (h, c) = self.lstm(packed, (h, c))
            # total_length keeps the time dimension equal to the input's, so
            # the flattened logits stay aligned with flattened targets.
            output, _ = nn.utils.rnn.pad_packed_sequence(
                packed_output, batch_first=True, total_length=x.shape[1]
            )
        # output: [batch, seq len, hidden dim]; h, c: [n layers, batch, hidden dim]

        # Flatten to [batch * seq len, hidden dim] before projecting to logits.
        flat = output.reshape(-1, output.shape[2])
        logits = self.fc(flat)
        return logits, h, c



        

In [6]:
# Model, loss and optimiser.
net = Seq(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout_rate=DROPOUT_RATE,
)
# Targets equal to 0 are excluded from the loss
# (assumes token id 0 is padding -- TODO confirm against the tokenizer).
criterion = nn.CrossEntropyLoss(ignore_index=0)
# Plain SGD: no momentum, no weight decay.
optimizer = optim.SGD(
    net.parameters(),
    lr=0.05,
    momentum=0,
    weight_decay=0,
)

In [7]:
# Build (inputs, targets) batch tensors for each split.
train_batches, valid_batches, test_batches = (
    prep_batches(split_ids, BATCH_SIZE, SEQ_LEN)
    for split_ids in (train_ids, val_ids, test_ids)
)

In [8]:
# Smoke test: one forward pass over the first training batch, starting from
# all-zero hidden and cell states.
p, h, c = net(
    train_batches[0][0],
    torch.zeros(N_LAYERS, BATCH_SIZE, HIDDEN_DIM),
    torch.zeros(N_LAYERS, BATCH_SIZE, HIDDEN_DIM),
)

In [9]:
# Train the model with plain SGD, logging the loss of every step.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net.to(device)

LOG_EVERY = 1  # print the loss every LOG_EVERY optimisation steps
train_inputs, train_targets = train_batches

# Every batch starts from the same all-zero LSTM state. The state returned by
# the model is not carried over to the next batch, so each step's autograd
# graph is independent and backward() needs no retain_graph.
h = torch.zeros((N_LAYERS, BATCH_SIZE, HIDDEN_DIM), device=device)
c = torch.zeros_like(h)

iteration = 0
for e in range(EPOCHS):
    net.train()
    for i in range(len(train_inputs)):
        iteration += 1

        # zero gradients
        optimizer.zero_grad()

        # data to device (the batches are already tensors; no copy-construction needed)
        x = train_inputs[i].to(device)
        y = train_targets[i].reshape(-1).to(device)  # Targets: [batch*seq_len]

        lgts, _, _ = net(x, h, c)  # Logits: [batch*seq_len, vocab_size]
        loss = criterion(lgts, y)
        loss_val = loss.item()

        loss.backward()
        # Clip the global gradient norm before the update to limit exploding gradients.
        torch.nn.utils.clip_grad_norm_(net.parameters(), GRADIENT_CLIP)
        optimizer.step()

        if iteration % LOG_EVERY == 0:
            print('Epoch: {}/{}'.format(e, EPOCHS), 'Iteration: {}'.format(iteration), 'Loss: {}'.format(loss_val))

Epoch: 0/10 Iteration: 1 Loss: 8.998676300048828
Epoch: 0/10 Iteration: 2 Loss: 9.0042724609375
Epoch: 0/10 Iteration: 3 Loss: 8.996356964111328
Epoch: 0/10 Iteration: 4 Loss: 9.016846656799316
Epoch: 0/10 Iteration: 5 Loss: 8.997598648071289
Epoch: 0/10 Iteration: 6 Loss: 9.002593994140625
Epoch: 0/10 Iteration: 7 Loss: 8.99396800994873
Epoch: 0/10 Iteration: 8 Loss: 8.998547554016113
Epoch: 0/10 Iteration: 9 Loss: 8.99447250366211
Epoch: 0/10 Iteration: 10 Loss: 9.000444412231445
Epoch: 0/10 Iteration: 11 Loss: 9.005206108093262
Epoch: 0/10 Iteration: 12 Loss: 9.0035982131958
Epoch: 0/10 Iteration: 13 Loss: 9.002791404724121
Epoch: 0/10 Iteration: 14 Loss: 9.002777099609375
Epoch: 0/10 Iteration: 15 Loss: 8.998552322387695
Epoch: 0/10 Iteration: 16 Loss: 9.003440856933594
Epoch: 0/10 Iteration: 17 Loss: 9.0027494430542
Epoch: 0/10 Iteration: 18 Loss: 8.998038291931152
Epoch: 0/10 Iteration: 19 Loss: 8.99258804321289
Epoch: 0/10 Iteration: 20 Loss: 8.992025375366211
Epoch: 0/10 Iterat

KeyboardInterrupt: 