In [1]:
from lstm import LSTM_Model
from data import init_datasets
import numpy as np
import time
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [2]:
topic = 'nyt_covid'
path = 'data/small_test_corpora/'

In [3]:
batch_size = 3
time_steps = 5
freq_threshold = 1
epochs = 5

In [4]:
datasets = init_datasets(topic=topic, 
                         freq_threshold=freq_threshold, 
                         time_steps=time_steps, 
                         batch_size=batch_size, 
                         path=path)

In [5]:
vocab_size = datasets['vocab_size']
word2index = datasets['word2index']
data_loaders = datasets['data_loaders']

In [6]:
vocab_size

50

In [7]:
train, valid, test = data_loaders

In [8]:
x = next(iter(train))
x[0].size()

torch.Size([3, 4, 1])

In [9]:
test.dataset.x.size()

torch.Size([24, 4, 1])

In [10]:
x[0][0][:3]

tensor([[38],
        [13],
        [26]])

In [11]:
x[1][0][:3]

tensor([[13],
        [26],
        [ 6]])

In [12]:
hyperparams = {
    'embed_dims': None,
    'freq_threshold': 3,
    'dropout_prob': 0.5,
    'init_range': 0.05,
    'epochs': 40,
    'learning_rate': 1,
    'learning_rate_decay': 1.2,
    'num_layers': 2,
    'batch_size': 20,
    'time_steps': 35,
    'max_grad': 5,
    'embed_tying': False,
    'bias': False,
    'save_model': True,
    'load_model': True,
    'model_path': 'lstm_model',
    'topic': 'wiki', # enter 'wiki' or 'nyt_covid'
    'path': 'data/small_test_corpora'
}

In [13]:
def OLD_LUKE_train(model, data, epochs, learning_rate, learning_rate_decay, max_grad):
    train_loader, valid_loader, test_loader = data
    start_time = time.time()

    print("Starting training.\n")

    for epoch in range(epochs):
        model.train()
        batch_size = train_loader.batch_size
        states = model.init_state(batch_size)

        if epoch > 5:
            learning_rate = learning_rate / learning_rate_decay

        for i, (x, y) in enumerate(train_loader):
            batch_size = len(x)
            model.zero_grad()
            states = model.detach_states(states)
            scores, states = model(x, states)
            loss = neg_log_likelihood_loss(scores, y)
            loss.backward()
            with torch.no_grad():
                norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad)
                for param in model.parameters():
                    param -= learning_rate * param.grad
            if i % (len(train_data) // 10) == 0:
                end_time = time.time()
                print("batch no = {:d} / {:d}, ".format(i, len(train_loader)) +
                      "train loss = {:.3f}, ".format(loss.item() / batch_size) +
                      "dw.norm() = {:.3f}, ".format(norm) +
                      "lr = {:.3f}, ".format(learning_rate) +
                      "since beginning = {:d} mins, ".format(round((end_time-start_time)/60)) +
                      "cuda memory = {:.3f} GBs".format(torch.cuda.max_memory_allocated()/1024/1024/1024))
        model.eval()
        valid_perplexity = get_perplexity(model, valid_loader, batch_size)
        print("Epoch : {:d} || Validation set perplexity : {:.3f}".format(epoch+1, valid_perplexity))
        print("*************************************************\n")
    test_perp = get_perplexity(model, test_loader, batch_size)
    print("Test set perplexity : {:.3f}".format(test_perp))
    print("Training is over.")
    return model

In [14]:
#The loss function.
def _neg_log_likelihood_loss(scores, y):
    batch_size = y.size(1)
    print("y shape: ", y.shape)
    print("scores shape: ", scores.shape)
    expscores = scores.exp()
    print("expscores shape: ", expscores.shape)
    probabilities = expscores / expscores.sum(1, keepdim = True)
    print("prob shape: ", probabilities.shape)
    print("dim 1 : ", len(y.reshape(-1)))
    print("dim 2 : ", y.reshape(-1).shape)
    answerprobs = probabilities[range(len(y.reshape(-1))), y.reshape(-1)]
    #I multiply by batch_size as in the original paper
    #Zaremba et al. sum the loss over batches but average these over time.
    return torch.mean(-torch.log(answerprobs) * batch_size)

In [15]:
def neg_log_likelihood_loss(scores, targets):
    # substituting with cross entropy loss
    batch_size = targets.size(1)
    return F.cross_entropy(scores.reshape(-1, scores.size(2)), targets.reshape(-1)) * batch_size

In [16]:
def get_perplexity(model, data, batch_size):
    with torch.no_grad():
        losses = []
        states = model.init_state(batch_size)
        for x, y in data:
            # x = torch.transpose(x, 0, 1)
            # y = torch.transpose(y, 0, 1)
            scores, states = model(x, states)
            loss = neg_log_likelihood_loss(scores, y)
            #Again with the sum/average implementation described in 'nll_loss'.
            losses.append(loss.data.item() / batch_size)
    return np.exp(np.mean(losses))

In [17]:
import timeit

def train(device, data, model, epochs, lr, max_grad, batch_size):
    train_loader, valid_loader, test_loader = data
    tic = timeit.default_timer()
    total_words = 0
    print("Starting training.\n")
    for epoch in range(epochs):
        states = model.init_state(batch_size)
        for i, (x, y) in enumerate(train_loader):
            if x.size(0) < batch_size:
                continue
            # x = torch.transpose(x, 0, 1)
            # y = torch.transpose(y, 0, 1)
            total_words += x.numel()
            model.zero_grad()
            states = model.detach_states(states)
            scores, states = model(x, states)
            loss = neg_log_likelihood_loss(scores, y)
            loss.backward()
            with torch.no_grad():
                norm = nn.utils.clip_grad_norm_(model.parameters(), max_grad)
                for param in model.parameters():
                    param -= lr * param.grad
            if i % (len(train_loader)//10) == 0:
                toc = timeit.default_timer()
                print("batch no = {:d} / {:d}, ".format(i, len(train_loader)) +
                      "train loss = {:.3f}, ".format(loss.item()/batch_size) +
                      "wps = {:d}, ".format(round(total_words/(toc-tic))) +
                      "dw.norm() = {:.3f}, ".format(norm) +
                      "lr = {:.3f}, ".format(lr) +
                      "since beginning = {:d} mins, ".format(round((toc-tic)/60))) 
                    #   "cuda memory = {:.3f} GBs".format(torch.cuda.max_memory_allocated()/1024/1024/1024))
    
    return model
    #     model.eval()
    #     val_perp = perplexity(vld, model)
    #     print("Epoch : {:d} || Validation set perplexity : {:.3f}".format(epoch+1, val_perp))
    #     print("*************************************************\n")
    # tst_perp = perplexity(tst, model)
    # print("Test set perplexity : {:.3f}".format(tst_perp))
    # print("Training is over.")

In [30]:
def train_l(model, data, epochs, learning_rate, learning_rate_decay, max_grad):
    
    train_loader, valid_loader, test_loader = data
    start_time = time.time()
    
    total_words = 0
    print("Starting training.\n")
    
    for epoch in range(epochs):
        batch_size = train_loader.batch_size
        states = model.init_state(batch_size)

        if epoch > 5:
            learning_rate = learning_rate / learning_rate_decay
        
        for i, (x, y) in enumerate(train_loader):
            # if x.size(0) < batch_size:
            #    continue
            print(f"x size before : {x.size()}")
            print(x)
            print(y)
            # x = torch.transpose(x, 0, 1)
            print(f"x size after : {x.size()}")
            # y = torch.transpose(y, 0, 1)
            
            total_words += x.numel()
            model.zero_grad()
            
            # batch_size = len(x))
            states = model.detach_states(states)
            scores, states = model(x, states)
            loss = neg_log_likelihood_loss(scores, y)
            loss.backward()
            
            with torch.no_grad():
                norm = nn.utils.clip_grad_norm_(model.parameters(), max_grad)
                for param in model.parameters():
                    param -= learning_rate * param.grad
            
            if i % (len(train_loader) // 10) == 0:
                end_time = time.time()
                print("batch no = {:d} / {:d}, ".format(i, len(train_loader)) +
                      "train loss = {:.3f}, ".format(loss.item() / batch_size) +
                      "wps = {:d}, ".format(round(total_words/(end_time-start_time))) +
                      "dw.norm() = {:.3f}, ".format(norm) +
                      "lr = {:.3f}, ".format(learning_rate) +
                      "since beginning = {:d} mins, ".format(round((end_time-start_time)/60))) # +
                      # "cuda memory = {:.3f} GBs".format(torch.cuda.max_memory_allocated()/1024/1024/1024))
        
        model.eval()
        valid_perplexity = get_perplexity(model, valid_loader, batch_size)
        print("Epoch : {:d} || Validation set perplexity : {:.3f}".format(epoch+1, valid_perplexity))
        print("*************************************************\n")
    test_perp = get_perplexity(model, test_loader, batch_size)
    print("Test set perplexity : {:.3f}".format(test_perp))
    print("Training is over.")
    return model

In [31]:
embed_dims = int(np.ceil(np.sqrt(np.sqrt(vocab_size))))
embed_dims

3

In [32]:
embed_dims = int(np.ceil(np.sqrt(np.sqrt(vocab_size))))
model = LSTM_Model(vocab_size=vocab_size, max_grad=5, embed_dims=embed_dims, num_layers=2,
                   dropout_prob=0.5, init_param=0.05, bias=False, embed_tying=False)

In [33]:
trn, vld, tst = data_loaders

In [34]:
torch.transpose

<function _VariableFunctionsClass.transpose>

In [35]:
train_l(model=model, data=data_loaders, epochs=5, learning_rate=1., learning_rate_decay=1.2, max_grad=5)

Starting training.

x size before : torch.Size([3, 4, 1])
tensor([[[ 5],
         [21],
         [29],
         [13]],

        [[18],
         [ 5],
         [ 5],
         [25]],

        [[ 5],
         [25],
         [ 9],
         [ 2]]])
tensor([[[21],
         [29],
         [13],
         [26]],

        [[ 5],
         [ 5],
         [25],
         [ 5]],

        [[25],
         [ 9],
         [ 2],
         [ 4]]])
x size after : torch.Size([3, 4, 1])


RuntimeError: input must have 3 dimensions, got 4

In [68]:
train(device='cpu', data=data_loaders, model=model, epochs=5, lr=1., max_grad=5, batch_size=batch_size)

Starting training.

batch no = 0 / 3371, train loss = 10.272, wps = 2065, dw.norm() = 0.001, lr = 1.000, since beginning = 0 mins, 
batch no = 337 / 3371, train loss = 10.272, wps = 3510, dw.norm() = 0.001, lr = 1.000, since beginning = 1 mins, 
batch no = 674 / 3371, train loss = 10.272, wps = 3188, dw.norm() = 0.003, lr = 1.000, since beginning = 2 mins, 
batch no = 1011 / 3371, train loss = 7.877, wps = 3123, dw.norm() = 3.944, lr = 1.000, since beginning = 3 mins, 
batch no = 1348 / 3371, train loss = 7.830, wps = 3061, dw.norm() = 12.840, lr = 1.000, since beginning = 4 mins, 
batch no = 1685 / 3371, train loss = 7.678, wps = 3053, dw.norm() = 3.644, lr = 1.000, since beginning = 5 mins, 
batch no = 2022 / 3371, train loss = 7.355, wps = 3007, dw.norm() = 8.166, lr = 1.000, since beginning = 7 mins, 
batch no = 2359 / 3371, train loss = 7.253, wps = 2963, dw.norm() = 3.315, lr = 1.000, since beginning = 8 mins, 
batch no = 2696 / 3371, train loss = 7.279, wps = 2895, dw.norm() = 4

KeyboardInterrupt: 

## SHAPE OF ARRAYS AND TENSORS

In [58]:
from data import _init_corpora
train, valid, test, vocab = _init_corpora(path=path, topic=topic, freq_threshold=freq_threshold)

In [59]:
train_t = train.reshape(-1, 1)
valid_t = valid.reshape(-1, 1)
test_t = test.reshape(-1, 1)

In [60]:
test.shape

(126,)

In [61]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [66]:
TEST = torch.LongTensor(test)
TEST_T = torch.LongTensor(test_t)

In [68]:
TEST = torch.split(tensor=TEST, split_size_or_sections=time_steps)
TEST_T = torch.split(tensor=TEST_T, split_size_or_sections=time_steps)

In [69]:
TEST[:3]

(tensor([31, 25,  5,  5,  5, 44,  5, 33,  5,  5]),
 tensor([40, 41,  5, 16,  5,  5,  5,  5,  5,  5]),
 tensor([ 2,  4,  5,  7,  5,  5, 33,  5,  5, 36]))

In [70]:
TEST_T[:3]

(tensor([[31],
         [25],
         [ 5],
         [ 5],
         [ 5],
         [44],
         [ 5],
         [33],
         [ 5],
         [ 5]]),
 tensor([[40],
         [41],
         [ 5],
         [16],
         [ 5],
         [ 5],
         [ 5],
         [ 5],
         [ 5],
         [ 5]]),
 tensor([[ 2],
         [ 4],
         [ 5],
         [ 7],
         [ 5],
         [ 5],
         [33],
         [ 5],
         [ 5],
         [36]]))

In [71]:
TEST_seq = pad_sequence(TEST, batch_first=True, padding_value=0)

In [72]:
TEST_T_seq = pad_sequence(TEST_T, batch_first=True, padding_value=0)

In [73]:
TEST_seq[-3:]

tensor([[ 5, 40,  5,  5,  5,  5,  5, 33,  5,  5],
        [ 8,  5, 10,  5,  5,  1,  5,  5,  5,  5],
        [ 5,  5,  5,  5,  2,  4,  0,  0,  0,  0]])

In [74]:
TEST_T_seq[-3:]

tensor([[[ 5],
         [40],
         [ 5],
         [ 5],
         [ 5],
         [ 5],
         [ 5],
         [33],
         [ 5],
         [ 5]],

        [[ 8],
         [ 5],
         [10],
         [ 5],
         [ 5],
         [ 1],
         [ 5],
         [ 5],
         [ 5],
         [ 5]],

        [[ 5],
         [ 5],
         [ 5],
         [ 5],
         [ 2],
         [ 4],
         [ 0],
         [ 0],
         [ 0],
         [ 0]]])

In [86]:
TEST_seq_i = TEST_seq.narrow_copy(1, 0, TEST_seq.shape[1])
TEST_seq_o = TEST_seq.narrow_copy(1, 1, TEST_seq.shape[1])

RuntimeError: start (1) + length (10) exceeds dimension size (10).

In [84]:
TEST_seq_i[-3:]

tensor([[ 5, 40,  5,  5,  5,  5,  5, 33,  5,  5],
        [ 8,  5, 10,  5,  5,  1,  5,  5,  5,  5],
        [ 5,  5,  5,  5,  2,  4,  0,  0,  0,  0]])

In [85]:
TEST_seq_o[-3:]

tensor([[40,  5,  5,  5,  5,  5, 33,  5,  5],
        [ 5, 10,  5,  5,  1,  5,  5,  5,  5],
        [ 5,  5,  5,  2,  4,  0,  0,  0,  0]])

In [30]:
TEST_T_seq_i = TEST_T_seq.narrow_copy(1, 0, TEST_T_seq.shape[1] - 1)
TEST_T_seq_o = TEST_T_seq.narrow_copy(1, 1, TEST_T_seq.shape[1] - 1)

In [31]:
TEST_T_seq_i[-3:]

tensor([[[15502],
         [ 7717],
         [18542],
         [ 7105],
         [   77],
         [26313],
         [12434],
         [23496],
         [ 2866]],

        [[10880],
         [10291],
         [ 9056],
         [   77],
         [13514],
         [22559],
         [ 2150],
         [   77],
         [ 9054]],

        [[   77],
         [ 9054],
         [19242],
         [   77],
         [ 2378],
         [ 9056],
         [ 1419],
         [   80],
         [    0]]])

In [32]:
TEST_T_seq_o[-3:]

tensor([[[ 7717],
         [18542],
         [ 7105],
         [   77],
         [26313],
         [12434],
         [23496],
         [ 2866],
         [17241]],

        [[10291],
         [ 9056],
         [   77],
         [13514],
         [22559],
         [ 2150],
         [   77],
         [ 9054],
         [25836]],

        [[ 9054],
         [19242],
         [   77],
         [ 2378],
         [ 9056],
         [ 1419],
         [   80],
         [    0],
         [    0]]])

In [39]:
TEST_seq_i.size()

torch.Size([23773, 9])

In [40]:
TEST_T_seq_i.size()

torch.Size([23773, 9, 1])

In [None]:
data = torch.LongTensor(data)
# split tensor into tensors of of size time_steps
data = torch.split(tensor=data, split_size_or_sections=time_steps)

# note: word2index['<pad>'] = 0
sequences = pad_sequence(data, batch_first=True, padding_value=0)

# from seq we generate 2 copies.
# inputs=seq[:-1], targets=seq[1:]
sequences_inputs = sequences.narrow_copy(1, 0, sequences.shape[1] - 1)
sequences_targets = sequences.narrow_copy(1, 1, sequences.shape[1] - 1)