In [1]:
import re
from torch.utils.data import Dataset
import torch.nn.functional as F
from collections import Counter
from os.path import exists
import torch.optim as optim
import torch.nn as nn
import numpy as np
import random
import torch
import math
import re


In [2]:
# Input
# =============================================================================
# Loads the raw corpus, splits every line into word/punctuation tokens and
# builds (or reloads) the vocabulary file. Produces the globals `sentences`
# (list of token lists) and `vocab` (list of words).

# Maximum vocabulary size. It is defined here because this cell runs before the
# hyper-parameter cell; without it a fresh "Restart & Run All" raises NameError
# whenever vocab.txt does not exist yet. Same value as the hyper-parameter cell.
n_vocab = 40000

#1) load text
print('loading text...')
pth = 'training.txt'
with open(pth) as f:  # context manager so the handle is closed deterministically
    sentences = f.read().lower().split('\n')

#2) tokenize sentences (can be done during training, you can also use spacy udpipe)
print('tokenizing sentences...')
special_chars = ',?;.:/*!+-()[]{}"\'&'
# Compile once; the character class matches any single special character.
special_chars_re = re.compile(f'[{re.escape(special_chars)}]')
# Raw string: '\g<0>' in a normal string literal is an invalid escape sequence.
# The replacement pads each special character with spaces so that the
# subsequent split(' ') isolates it as its own token.
sentences = [special_chars_re.sub(r' \g<0> ', s).split(' ') for s in sentences]
# Drop the empty strings produced by consecutive spaces.
sentences = [[w for w in s if len(w)] for s in sentences]

#3) create vocab if not already created
# NOTE: an existing vocab.txt is reused as-is; delete it to rebuild the
# vocabulary after changing the corpus or n_vocab.
print('creating/loading vocab...')
pth = 'vocab.txt'
if not exists(pth):
    words = [w for s in sentences for w in s]
    vocab = Counter(words).most_common(n_vocab) #keep the N most frequent words
    vocab = [w[0] for w in vocab]
    with open(pth, 'w+') as f:
        f.write('\n'.join(vocab))
else:
    with open(pth) as f:
        vocab = f.read().split('\n')

loading text...
tokenizing sentences...
creating/loading vocab...


In [3]:
# --- data / batching --------------------------------------------------------
batch_size = 32        # handed to the DataLoader as its batch size
seq_len = 20           # sequence length handed to SentencesDataset
n_vocab = 40000        # cap used when keeping the most frequent words

# --- model ------------------------------------------------------------------
# NOTE(review): the meaning of the values below is inferred from their names;
# confirm against transformer.Config.
embed_size = 128
inner_ff_size = 4 * embed_size
n_heads = 8
n_code = 8             # presumably the number of encoder blocks — TODO confirm
dropout = 0.1
# n_workers = 12

In [4]:
from dataset import SentencesDataset
# Build the training dataset from the tokenized sentences, the vocabulary list
# and the configured sequence length.
train_ds = SentencesDataset(sentences,vocab,seq_len)

In [5]:
# DataLoader options: reshuffle on every pass, drop the final incomplete batch,
# and use pinned host memory for the batches.
kwargs = dict(
    shuffle=True,
    drop_last=True,
    pin_memory=True,
    batch_size=batch_size,
)
data_loader = torch.utils.data.DataLoader(train_ds, **kwargs)

In [6]:
from transformer import Config

In [7]:
# Size of the dataset's vocabulary; this is the value later passed to the model
# configuration as n_embeddings.
len(train_ds.vocab)

23948

In [8]:
# Model hyper-parameters, taken from the constants defined in the
# hyper-parameter cell rather than repeated as literals, so that changing e.g.
# seq_len or embed_size there cannot leave the dataset and the model out of sync.
# n_embeddings uses the dataset's actual vocabulary size.
model_args = dict(
    name="BERT",
    n_code=n_code,
    n_heads=n_heads,
    embed_size=embed_size,
    inner_ff_size=inner_ff_size,
    n_embeddings=len(train_ds.vocab),
    seq_len=seq_len,
    dropout=dropout,
)

In [9]:
# Turn the hyper-parameter dict into the Config object consumed by Transformer.
config = Config(**model_args)

In [10]:
# Echo the resolved configuration as the cell output for a quick sanity check.
config

Config(name='BERT', n_code=8, n_heads=8, embed_size=128, inner_ff_size=512, n_embeddings=23948, seq_len=20, dropout=0.1)

In [11]:
from transformer import Transformer
from dataset import get_batch

In [12]:
# Instantiate the model from the configuration built above.
model = Transformer(config)

In [15]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optim_kwargs = {'lr':1e-4, 'weight_decay':1e-4, 'betas':(.9,.999)}
# =============================================================================
# Optimizer
# =============================================================================
print('initializing optimizer and loss...')
optimizer = optim.Adam(model.parameters(), **optim_kwargs)
# Target positions equal to IGNORE_IDX do not contribute to the loss.
loss_model = nn.CrossEntropyLoss(ignore_index=train_ds.IGNORE_IDX)

# =============================================================================
# Train
# =============================================================================
print('training...')
print_each = 10
model.train()
batch_iter = iter(data_loader)
n_iteration = 100
for it in range(n_iteration):

    #reset gradients first, so leftovers from an interrupted earlier run of
    #this cell can never leak into the current step
    optimizer.zero_grad()

    #get batch
    batch, batch_iter = get_batch(data_loader, batch_iter)

    #infer
    masked_input = batch['input'].to(device, non_blocking=True)
    masked_target = batch['target'].to(device, non_blocking=True)
    #the second value returned by the model is not used; `loss` is recomputed below
    output,loss = model(masked_input)

    #compute the cross entropy loss
    #flatten to (n_positions, n_classes) and (n_positions,); view(-1) keeps the
    #target 1-D even when it holds a single element
    output_v = output.view(-1,output.shape[-1])
    target_v = masked_target.view(-1)
    loss = loss_model(output_v, target_v)

    #compute gradients
    loss.backward()

    #apply gradients
    optimizer.step()

    #print step
    #NOTE: the value labelled 'Δw' is the summed absolute gradient of the wte
    #weight for this step, not the change applied to the weights
    if it % print_each == 0:
        print('it:', it,
              ' | loss', np.round(loss.item(),2),
              ' | Δw:', round(model.transformer.wte.weight.grad.abs().sum().item(),3))


# =============================================================================
# Results analysis
# =============================================================================
print('saving embeddings...')
# Export at most 3000 rows, clamped so a smaller vocabulary does not overrun
# the lookup below.
N = min(3000, len(train_ds.vocab))
embeddings = model.transformer.wte.weight.detach().cpu().numpy()[0:N]
np.savetxt('values.tsv', np.round(embeddings, 2), delimiter='\t', fmt='%1.2f')
# Names for the exported rows, looked up by row index in rvocab.
s = [train_ds.rvocab[i] for i in range(N)]
with open('names.tsv', 'w+') as f:
    f.write('\n'.join(s))

initializing optimizer and loss...
training...
it: 0  | loss 10.36  | Δw: 2.46
it: 10  | loss 9.82  | Δw: 1.737
it: 20  | loss 9.63  | Δw: 1.279
it: 30  | loss 9.43  | Δw: 1.197
it: 40  | loss 9.35  | Δw: 1.086
it: 50  | loss 8.98  | Δw: 0.923
it: 60  | loss 9.03  | Δw: 0.748
it: 70  | loss 8.78  | Δw: 0.804
it: 80  | loss 8.75  | Δw: 0.901
it: 90  | loss 8.81  | Δw: 0.754
saving embeddings...


19913