In [1]:
### this is for running in local ###
import os
try:
    os.environ['HTTP_PROXY']='http://185.46.212.90:80'
    os.environ['HTTPS_PROXY']='http://185.46.212.90:80'
    print ("proxy_exported")
except:
    None

proxy_exported


In [2]:
!python -m pip install "kivy[base]" kivy_examples

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

In [3]:
from torch.utils.data import Dataset
import torch.nn.functional as F
import transformer
import re
from os.path import exists
from collections import Counter
import random
import torch
from torchinfo import summary
from tqdm.auto import tqdm
import numpy as np

proxy_exported


In [4]:
# settings
n_vocab = 40000
seq_len = 20
p_random_mask = 0.15
batch_size = 16
n_iterations = 20
epochs = 10
device = "cuda" if torch.cuda.is_available() else "cpu"

### Loading Data 

In [5]:
data_pth = 'sentences_data/training.txt'
sentences = open(data_pth).read().lower().split('\n')
special_chars = '?;.:/*!+-()[]{}"\'&'
sentences = [re.sub(f'[{re.escape(special_chars)}]','\g<0> ',s).split(' ') for s in sentences]
sentences = [[w for w in s if len(w)] for s in sentences]

In [6]:
# create vocab
vocab_pth = 'sentences_data/vocab.txt'
if not exists(vocab_pth):
  words = [w for s in sentences for w in s]
  vocab = Counter(words).most_common(n_vocab)
  vocab = [w[0] for w in vocab]
else:
  vocab = open(vocab_pth).read().split('\n')

In [7]:
class SentenceDataset(Dataset):
  def __init__(self,sentences,vocab,seq_len):
    dataset = self
    dataset.sentences = sentences
    dataset.vocab = vocab + ['<ignore>','<oov>','<mask>']
    dataset.vocab = {e:i for i,e in enumerate(dataset.vocab)}
    dataset.rvocab = {v:k for k,v in dataset.vocab.items()}

    dataset.seq_len = seq_len

    dataset.IGNORE_IDX = dataset.vocab['<ignore>']
    dataset.OUT_OF_VOCAB_IDX = dataset.vocab['<oov>']
    dataset.MASK_IDX = dataset.vocab['<mask>']

  def __getitem__(self,index,p_random_mask=0.15):
    dataset = self

    s = []
    while len(s) < dataset.seq_len:
      s.extend(dataset.get_sentence_idx(index % len(dataset)))
      index += 1

    s = s[:dataset.seq_len]
    [s.append(dataset.IGNORE_IDX) for i in range(dataset.seq_len - len(s))]
    s = [(dataset.MASK_IDX,w) if random.random() < p_random_mask else (w,dataset.IGNORE_IDX) for w in s]

    return {'input': torch.Tensor([w[0] for w in s]).long(),
            'target':torch.Tensor([w[1] for w in s]).long()}

  def __len__(self):
    return len(self.sentences)

  def get_sentence_idx(self,index):
    dataset = self
    s = dataset.sentences[index]
    s = [dataset.vocab[w] if w in dataset.vocab else dataset.OUT_OF_VOCAB_IDX for w in s]
    return s

In [8]:
# create dataset and train/test data
print('creating dataset...')
dataset = SentenceDataset(sentences, vocab, seq_len)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
print(f"size of train {train_size} and size of test {test_size}")

#dataloader = torch.utils.data.DataLoader(dataset,shuffle=True, drop_last=True, pin_memory=False, batch_size=batch_size)
train_data, test_data = torch.utils.data.random_split(dataset, [train_size, test_size])

train_dataloader = torch.utils.data.DataLoader(train_data,batch_size = batch_size,
                              shuffle=True,num_workers=2,
                              pin_memory=True)
test_dataloader = torch.utils.data.DataLoader(test_data,batch_size = batch_size,
                              shuffle=False,num_workers=2,
                              pin_memory=True)


# sample sentence
batch_output = next(iter(train_dataloader))
input_sentence, input_label = batch_output['input'][0], batch_output['target'][0]
print(input_sentence, input_label)

creating dataset...
size of train 99565 and size of test 24892
tensor([    6,     3,   400, 23947, 23947, 23946,     4, 23946,     2,   106,
        23946, 23947,  4130,   238,    39,   631,     4, 23946,    69,   434]) tensor([23945, 23945, 23945,   354,     7, 23945, 23945, 23945, 23945, 23945,
        23945, 23946, 23945, 23945, 23945, 23945, 23945, 23945, 23945, 23945])


### Loading Model

In [9]:
bert_model = transformer.Bert(n_embeddings = len(dataset.vocab))
bert_model.to(device)
optimizer = torch.optim.Adam(params = bert_model.parameters(),lr=1e-4,betas=(0.9,0.999),
                             weight_decay=1e-4)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)
summary(model= bert_model, input_size=(32,20), dtypes = [torch.int32],col_names=["input_size","output_size","num_params","trainable"],
        col_width=20,
        row_settings=["var_names"])

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
Bert (Bert)                                                  [32, 20]             [32, 20, 23948]      2,560                True
├─Embedding (embeddings)                                     [32, 20]             [32, 20, 128]        3,065,344            True
├─Dropout (embedding_dropout)                                [32, 20, 128]        [32, 20, 128]        --                   --
├─Sequential (transformer_encoder)                           [32, 20, 128]        [32, 20, 128]        --                   True
│    └─TransformerEncoderBlock (0)                           [32, 20, 128]        [32, 20, 128]        --                   True
│    │    └─MultiHeadAttentionBlock (msa_block)              [32, 20, 128]        [32, 20, 128]        66,304               True
│    │    └─MLPBlock (mlp_block)                             [32, 20, 128]        [32, 20, 128

### Training the Model (Epochs) 

In [10]:
for epoch in tqdm(range(epochs)):
    # train model
    bert_model.train()
    train_loss = 0

    for batch_idx, batch_data in enumerate(train_dataloader):
        # infer
        masked_input = batch_data['input']
        masked_target = batch_data['target']

        masked_input = masked_input.to(device)
        masked_target = masked_target.to(device)
        output_pred = bert_model(masked_input)

        # compute the cross-entropy loss
        output_v = output_pred.view(-1, output_pred.shape[-1])
        target_v = masked_target.view(-1, 1).squeeze()
        loss = loss_fn(output_v, target_v)

        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(train_dataloader)

    # test model
    bert_model.eval()
    test_loss = 0

    with torch.inference_mode():
        for batch_idx, batch_data in enumerate(test_dataloader):
            # infer
            masked_input = batch_data['input']
            masked_target = batch_data['target']

            masked_input = masked_input.to(device)
            masked_target = masked_target.to(device)
            output_pred = bert_model(masked_input)

            # compute the cross-entropy loss
            output_v = output_pred.view(-1, output_pred.shape[-1])
            target_v = masked_target.view(-1, 1).squeeze()
            loss = loss_fn(output_v, target_v)

            test_loss += loss.item()

        test_loss = test_loss / len(test_dataloader)

    print(
        f"Epoch: {epoch + 1} | "
        f"train_loss: {train_loss:.4f} | "
        f"test_loss: {test_loss:.4f} | "
        f"| Δw:, {round(bert_model.embeddings.weight.grad.abs().sum().item(), 3)}"
    )


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 5.3634 | test_loss: 5.1780 | | Δw:, 3.908
Epoch: 2 | train_loss: 5.1266 | test_loss: 5.0701 | | Δw:, 6.083


### Training the Model (Iterations) 

In [11]:
def get_batch(loader,loader_iter):
    try:
        batch = next(loader_iter)
    except StopIteration:
        loader_iter = iter(loader)
        batch = next(loader_iter)
    return batch, loader_iter

In [12]:
batch_iter = iter(train_dataloader)
bert_model.train()
print_each = 10

for it in range(n_iterations):
    # get batch
    batch_data, batch_iter = get_batch(train_dataloader, batch_iter)

    masked_input = batch_data['input']
    masked_target = batch_data['target']

    masked_input = masked_input.to(device)
    masked_target = masked_target.to(device)
    output_pred = bert_model(masked_input)

    # compute the cross-entropy loss
    output_v = output_pred.view(-1, output_pred.shape[-1])
    target_v = masked_target.view(-1, 1).squeeze()
    loss = loss_fn(output_v, target_v)

    train_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()

    optimizer.step()

    # print step
    if it % print_each == 0:
        print('it:', it,
              ' | loss', np.round(loss.item(), 2),
              ' | Δw:', round(bert_model.embeddings.weight.grad.abs().sum().item(), 3))


it: 0  | loss 5.19  | Δw: 6.936
it: 10  | loss 5.18  | Δw: 5.817
