In [1]:
from lstm import LSTM_Model
from data import init_datasets
import numpy as np
import time
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [8]:
def neg_log_likelihood_loss(scores, targets):
    # substituting with cross entropy loss
    # get batch size
    batch_size = targets.size(1)

    # print(f"scores size : {scores.size()}")
    # print(f"scores reshaped to : {scores.reshape(-1, scores.size(2)).size()}")

    # print(f"targets size : {targets.size()}")
    # print(f"targets reshaped to : {targets.reshape(-1).size()}")

    # scores are shape (batch_size, time_steps, vocab_size)
    # scores are reshaped to (batch_size * time_steps, vocab_size)

    # targets are shape (batch_size, time_steps)
    # targets are reshapes to (batch_size*time_steps)
    return F.cross_entropy(scores.reshape(-1, scores.size(2)), targets.reshape(-1)) * batch_size

In [9]:
def get_perplexity(data, model, batch_size):
    model.eval()
    with torch.no_grad():
        losses = []
        states = model.init_state(batch_size)
        for x, y in data:
            # print(f"x size : {x.size()}")
            # print(f"y size : {y.size()}")
            scores, states = model(x, states)
            # print(f"scores size : {scores.size()}")
            loss = neg_log_likelihood_loss(scores, y)
            # print(f"loss : {loss}")
            #Again with the sum/average implementation described in 'nll_loss'.
            losses.append(loss.data.item() / batch_size)
    return np.exp(np.mean(losses))

In [10]:
def train(data, model, epochs, learning_rate, learning_rate_decay, max_grad):
    
    train_loader, valid_loader, test_loader = data
    start_time = time.time()
    
    total_words = 0
    print("Starting training.\n")
    batch_size = train_loader.batch_size
    
    for epoch in range(epochs):
        # batch_size = train_loader.batch_size
        states = model.init_state(batch_size)

        if epoch > 5:
            learning_rate = learning_rate / learning_rate_decay
        
        for i, (x, y) in enumerate(train_loader):
            total_words += x.numel()
            model.zero_grad()
            
            # batch_size = len(x))
            states = model.detach_states(states)
            scores, states = model(x, states)
            loss = neg_log_likelihood_loss(scores=scores, targets=y)
            loss.backward()
            
            with torch.no_grad():
                norm = nn.utils.clip_grad_norm_(model.parameters(), max_grad)
                for param in model.parameters():
                    param -= learning_rate * param.grad
            
            if i % (len(train_loader) // 10) == 0:
                end_time = time.time()
                print("batch no = {:d} / {:d}, ".format(i, len(train_loader)) +
                      "train loss = {:.3f}, ".format(loss.item() / batch_size) +
                      "wps = {:d}, ".format(round(total_words/(end_time-start_time))) +
                      "dw.norm() = {:.3f}, ".format(norm) +
                      "lr = {:.3f}, ".format(learning_rate) +
                      "since beginning = {:d} mins, ".format(round((end_time-start_time)/60))) # +
                      # "cuda memory = {:.3f} GBs".format(torch.cuda.max_memory_allocated()/1024/1024/1024))
        
        model.eval()
        valid_perplexity = get_perplexity(data=valid_loader, model=model, batch_size=batch_size)
        print("Epoch : {:d} || Validation set perplexity : {:.3f}".format(epoch+1, valid_perplexity))
        print("*************************************************\n")
    test_perp = get_perplexity(data=test_loader, model=model, batch_size=batch_size)
    print("Test set perplexity : {:.3f}".format(test_perp))
    print("Training is over.")
    return model

In [11]:
hyperparams = {
        'embed_dims': 25,
        'device': 'cpu', # 'gpu'
        'freq_threshold': 1,
        'dropout_prob': 0.5,
        'init_range': 0.05,
        'epochs': 10,
        'learning_rate': 1,
        'learning_rate_decay': 1.2,
        'num_layers': 2,
        'batch_size': 5, #TODO: on nyt small dataset, we get issues with batch size and timesteps. Not sure why
        'time_steps': 10,
        'max_grad': 5,
        'embed_tying': False,
        'bias': False,
        'save_model': True,
        'load_model': False,
        'model_path': 'lstm_model',
        'topic': 'nyt_covid', # enter 'wiki' or 'nyt_covid'
        'path': 'data/test_corpora'
    }

In [12]:
# set params for init_datasets
data_params = {k:hyperparams[k] for k in ['topic','freq_threshold', 'time_steps', 'batch_size',  'path']}
datasets = init_datasets(**data_params)

In [13]:
# we store the vcab size and word2index dict
vocab_size = datasets['vocab_size']
word2index = datasets['word2index']

# we get the data_loaders: train, valid, test
data_loaders = datasets['data_loaders']

In [14]:
# set params for model training
model_params = ['device', 'embed_dims', 'dropout_prob', 'init_range',
                'num_layers', 'max_grad', 'embed_tying', 'bias']
model_params = {k:hyperparams[k] for k in model_params}
model_params['vocab_size'] = vocab_size
model = LSTM_Model(**model_params)

In [15]:
train_loader, valid_loader, test_loader = data_loaders
start_time = time.time()

total_words = 0
print("Starting training.\n")
batch_size = train_loader.batch_size

Starting training.



In [16]:
model.__dict__

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('dropout', Dropout(p=0.5, inplace=False)),
              ('embed', Embedding(1085, 25)),
              ('lstms',
               ModuleList(
                 (0): LSTM(25, 25, bias=False, batch_first=True)
                 (1): LSTM(25, 25, bias=False, batch_first=True)
               )),
              ('fc', Linear(in_features=25, out_features=1085, bias=False))]),
 'max_grad': 5,
 'init_param': 0.05,
 'bias': False,
 'embed_tying': False,
 'vocab_size': 1085,
 'device': device(type='cpu')}

In [17]:
states = model.init_state(batch_size)

In [27]:
states[0][0].size()

torch.Size([1, 3, 25])

In [28]:
perplexity = get_perplexity(data=train_loader, model=model, batch_size=train_loader.batch_size)

In [29]:
perplexity

114563.8136259254

In [34]:
model.eval()
states = model.init_state(batch_size)

In [37]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7ff66c34adc0>

In [39]:
sample = next(iter(train_loader))

In [41]:
x, y = sample

In [43]:
print(x)
print(y)

tensor([[ 1, 10, 41,  5,  5],
        [ 5,  1,  7,  5,  5],
        [21, 29, 38,  1, 41]])
tensor([[10, 41,  5,  5,  5],
        [ 1,  7,  5,  5,  5],
        [29, 38,  1, 41, 24]])


In [44]:
scores, states = model(x, states)

In [47]:
print(x.size())
print(scores.size())

torch.Size([3, 5])
torch.Size([3, 5, 1085])


In [51]:
loss = neg_log_likelihood_loss(scores, y)

In [52]:
loss

tensor(34.9466, grad_fn=<MulBackward0>)

In [53]:
losses = []

In [54]:
losses.append(loss.data.item() / batch_size)

In [55]:
losses

[11.648880004882812]

In [60]:
np.mean(losses)

11.648880004882812

In [59]:
np.exp(np.mean(losses))

114562.98119796545

In [61]:
b = y.size(1)
b

5

In [63]:
y.size()

torch.Size([3, 5])

In [None]:
def neg_log_likelihood_loss(scores, targets):
    # substituting with cross entropy loss
    # get batch size
    (batch_size, seq length)
    batch_size = targets.size(0)

    # print(f"scores size : {scores.size()}")
    # print(f"scores reshaped to : {scores.reshape(-1, scores.size(2)).size()}")

    # print(f"targets size : {targets.size()}")
    # print(f"targets reshaped to : {targets.reshape(-1).size()}")

    # scores are shape (batch_size, time_steps, vocab_size)
    # scores are reshaped to (batch_size * time_steps, vocab_size)

    # targets are shape (batch_size, time_steps)
    # targets are reshapes to (batch_size*time_steps)
    return F.cross_entropy(scores.reshape(-1, scores.size(2)), targets.reshape(-1)) * batch_size

In [None]:
with torch.no_grad():
        losses = []
        states = model.init_state(batch_size)
        for x, y in data:
            # print(f"x size : {x.size()}")
            # print(f"y size : {y.size()}")
            scores, states = model(x, states)
            # print(f"scores size : {scores.size()}")
            loss = neg_log_likelihood_loss(scores, y)
            # print(f"loss : {loss}")
            #Again with the sum/average implementation described in 'nll_loss'.
            losses.append(loss.data.item() / batch_size)
    return np.exp(np.mean(losses))

In [None]:

for epoch in range(epochs):
    # batch_size = train_loader.batch_size
    states = model.init_state(batch_size)

In [8]:
def main():
    hyperparams = {
        'embed_dims': 25,
        'device': 'cpu', # 'gpu'
        'freq_threshold': 1,
        'dropout_prob': 0.5,
        'init_range': 0.05,
        'epochs': 10,
        'learning_rate': 1,
        'learning_rate_decay': 1.2,
        'num_layers': 2,
        'batch_size': 5, #TODO: on nyt small dataset, we get issues with batch size and timesteps. Not sure why
        'time_steps': 10,
        'max_grad': 5,
        'embed_tying': False,
        'bias': False,
        'save_model': True,
        'load_model': False,
        'model_path': 'lstm_model',
        'topic': 'nyt_covid', # enter 'wiki' or 'nyt_covid'
        'path': 'data/test_corpora'
    }

    # set params for init_datasets
    data_params = {k:hyperparams[k] for k in ['topic','freq_threshold', 'time_steps', 'batch_size',  'path']}
    datasets = init_datasets(**data_params)

    # we store the vcab size and word2index dict
    vocab_size = datasets['vocab_size']
    word2index = datasets['word2index']

    # we get the data_loaders: train, valid, test
    data_loaders = datasets['data_loaders']

    # set params for model training
    model_params = ['device', 'embed_dims', 'dropout_prob', 'init_range',
                    'num_layers', 'max_grad', 'embed_tying', 'bias']
    model_params = {k:hyperparams[k] for k in model_params}
    model_params['vocab_size'] = vocab_size
    # Masum recommended this as embed dims
    # TODO: Make this more easily modifiable.
    #   Want to do embed dims = user input if input provied, else embed dims = line below
    # model_params['embed_dims'] = int(np.ceil(np.sqrt(np.sqrt(vocab_size))))
    model = LSTM_Model(**model_params)
    print(f"vocab size : {vocab_size}")
    perplexity = get_perplexity(data=data_loaders[1], model=model, batch_size=data_loaders[1].batch_size)
    print("perplexity on %s dataset before training: %.3f, " % ('valid', perplexity))
    """
    for d, l in zip(data_loaders, ['train', 'valid', 'test']):
        perplexity = get_perplexity(data=d, model=model, batch_size=d.batch_size)
        print("perplexity on %s dataset before training: %.3f, " % (l, perplexity))
    """


    if hyperparams['load_model']:
        model.load_state_dict(torch.load(hyperparams['model_path']))

    else:
        training_params = ['epochs', 'learning_rate', 'learning_rate_decay', 'max_grad']
        training_params = {k:hyperparams[k] for k in training_params}
        model = train(data=data_loaders, model=model, **training_params)

    # now calculate perplexities for train, valid, test
    for d, l in zip(data_loaders, ['train', 'valid', 'test']):
        perplexity = get_perplexity(data=d, model=model, batch_size=d.batch_size)
        print("perplexity on %s dataset after training : %.3f, " % (l, perplexity))

    if hyperparams['save_model']:
        torch.save(model.state_dict(), hyperparams['model_path'])

torch.Size([3, 4, 1])

In [9]:
test.dataset.x.size()

torch.Size([24, 4, 1])

In [10]:
x[0][0][:3]

tensor([[38],
        [13],
        [26]])

In [11]:
x[1][0][:3]

tensor([[13],
        [26],
        [ 6]])

In [12]:
hyperparams = {
    'embed_dims': None,
    'freq_threshold': 3,
    'dropout_prob': 0.5,
    'init_range': 0.05,
    'epochs': 40,
    'learning_rate': 1,
    'learning_rate_decay': 1.2,
    'num_layers': 2,
    'batch_size': 20,
    'time_steps': 35,
    'max_grad': 5,
    'embed_tying': False,
    'bias': False,
    'save_model': True,
    'load_model': True,
    'model_path': 'lstm_model',
    'topic': 'wiki', # enter 'wiki' or 'nyt_covid'
    'path': 'data/small_test_corpora'
}

## SHAPE OF ARRAYS AND TENSORS

In [58]:
from data import _init_corpora
train, valid, test, vocab = _init_corpora(path=path, topic=topic, freq_threshold=freq_threshold)

In [59]:
train_t = train.reshape(-1, 1)
valid_t = valid.reshape(-1, 1)
test_t = test.reshape(-1, 1)

In [60]:
test.shape

(126,)

In [61]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [66]:
TEST = torch.LongTensor(test)
TEST_T = torch.LongTensor(test_t)

In [68]:
TEST = torch.split(tensor=TEST, split_size_or_sections=time_steps)
TEST_T = torch.split(tensor=TEST_T, split_size_or_sections=time_steps)

In [69]:
TEST[:3]

(tensor([31, 25,  5,  5,  5, 44,  5, 33,  5,  5]),
 tensor([40, 41,  5, 16,  5,  5,  5,  5,  5,  5]),
 tensor([ 2,  4,  5,  7,  5,  5, 33,  5,  5, 36]))

In [70]:
TEST_T[:3]

(tensor([[31],
         [25],
         [ 5],
         [ 5],
         [ 5],
         [44],
         [ 5],
         [33],
         [ 5],
         [ 5]]),
 tensor([[40],
         [41],
         [ 5],
         [16],
         [ 5],
         [ 5],
         [ 5],
         [ 5],
         [ 5],
         [ 5]]),
 tensor([[ 2],
         [ 4],
         [ 5],
         [ 7],
         [ 5],
         [ 5],
         [33],
         [ 5],
         [ 5],
         [36]]))

In [71]:
TEST_seq = pad_sequence(TEST, batch_first=True, padding_value=0)

In [72]:
TEST_T_seq = pad_sequence(TEST_T, batch_first=True, padding_value=0)

In [73]:
TEST_seq[-3:]

tensor([[ 5, 40,  5,  5,  5,  5,  5, 33,  5,  5],
        [ 8,  5, 10,  5,  5,  1,  5,  5,  5,  5],
        [ 5,  5,  5,  5,  2,  4,  0,  0,  0,  0]])

In [74]:
TEST_T_seq[-3:]

tensor([[[ 5],
         [40],
         [ 5],
         [ 5],
         [ 5],
         [ 5],
         [ 5],
         [33],
         [ 5],
         [ 5]],

        [[ 8],
         [ 5],
         [10],
         [ 5],
         [ 5],
         [ 1],
         [ 5],
         [ 5],
         [ 5],
         [ 5]],

        [[ 5],
         [ 5],
         [ 5],
         [ 5],
         [ 2],
         [ 4],
         [ 0],
         [ 0],
         [ 0],
         [ 0]]])

In [86]:
TEST_seq_i = TEST_seq.narrow_copy(1, 0, TEST_seq.shape[1])
TEST_seq_o = TEST_seq.narrow_copy(1, 1, TEST_seq.shape[1])

RuntimeError: start (1) + length (10) exceeds dimension size (10).

In [84]:
TEST_seq_i[-3:]

tensor([[ 5, 40,  5,  5,  5,  5,  5, 33,  5,  5],
        [ 8,  5, 10,  5,  5,  1,  5,  5,  5,  5],
        [ 5,  5,  5,  5,  2,  4,  0,  0,  0,  0]])

In [85]:
TEST_seq_o[-3:]

tensor([[40,  5,  5,  5,  5,  5, 33,  5,  5],
        [ 5, 10,  5,  5,  1,  5,  5,  5,  5],
        [ 5,  5,  5,  2,  4,  0,  0,  0,  0]])

In [30]:
TEST_T_seq_i = TEST_T_seq.narrow_copy(1, 0, TEST_T_seq.shape[1] - 1)
TEST_T_seq_o = TEST_T_seq.narrow_copy(1, 1, TEST_T_seq.shape[1] - 1)

In [31]:
TEST_T_seq_i[-3:]

tensor([[[15502],
         [ 7717],
         [18542],
         [ 7105],
         [   77],
         [26313],
         [12434],
         [23496],
         [ 2866]],

        [[10880],
         [10291],
         [ 9056],
         [   77],
         [13514],
         [22559],
         [ 2150],
         [   77],
         [ 9054]],

        [[   77],
         [ 9054],
         [19242],
         [   77],
         [ 2378],
         [ 9056],
         [ 1419],
         [   80],
         [    0]]])

In [32]:
TEST_T_seq_o[-3:]

tensor([[[ 7717],
         [18542],
         [ 7105],
         [   77],
         [26313],
         [12434],
         [23496],
         [ 2866],
         [17241]],

        [[10291],
         [ 9056],
         [   77],
         [13514],
         [22559],
         [ 2150],
         [   77],
         [ 9054],
         [25836]],

        [[ 9054],
         [19242],
         [   77],
         [ 2378],
         [ 9056],
         [ 1419],
         [   80],
         [    0],
         [    0]]])

In [39]:
TEST_seq_i.size()

torch.Size([23773, 9])

In [40]:
TEST_T_seq_i.size()

torch.Size([23773, 9, 1])

In [None]:
data = torch.LongTensor(data)
# split tensor into tensors of of size time_steps
data = torch.split(tensor=data, split_size_or_sections=time_steps)

# note: word2index['<pad>'] = 0
sequences = pad_sequence(data, batch_first=True, padding_value=0)

# from seq we generate 2 copies.
# inputs=seq[:-1], targets=seq[1:]
sequences_inputs = sequences.narrow_copy(1, 0, sequences.shape[1] - 1)
sequences_targets = sequences.narrow_copy(1, 1, sequences.shape[1] - 1)