In [80]:
from collections import defaultdict
import math
import time
import random
import os, sys

import torch
import torch.nn as nn
from torch.autograd import Variable

torch.manual_seed(100)

<torch._C.Generator at 0x7feb50116c10>

In [111]:
# Define FFN language model
class FFN_LM(nn.Module):
    def __init__(self, nwords, emb_size, hid_size, num_hist, dropout):
        super(FFN_LM, self).__init__()
        self.embedding = nn.Embedding(nwords, emb_size)
        # Use nn.Sequential to stack layers together
        self.fnn = nn.Sequential(
            nn.Linear(
                in_features=num_hist*emb_size,
                out_features=hid_size,
                bias=True),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(
                in_features=hid_size, 
                out_features=nwords,
                bias=True))
    
    def forward(self, words):
        emb = self.embedding(words) # 3D tensor [batch_size x num_hist x emb_size]
        # Note: size -1 is inferred from other dimensions
        feat = emb.view(emb.size(0), -1)  # 2D tensor of size [batch_size x (num_hist*emb_size)]
        logit = self.fnn(feat) # 2D tensor of size [batch_size x nwords]
        return logit

In [112]:
N = 2
EMB_SIZE = 128 # size of embedding
HID_SIZE = 128 # size of hidden layer
USE_CUDA = torch.cuda.is_available()

# Initialize the model and the optimizer
model = FFN_LM(nwords=nwords, emb_size=EMB_SIZE, hid_size=HID_SIZE, num_hist=N, dropout=0.2)
if USE_CUDA:
  model = model.cuda()
optimizer = torch.optim.Adam(
    model.parameters(), 
    lr=0.001)  # Learning rate

In [97]:
# Read in the data
w2i = defaultdict(lambda: len(w2i))
S = w2i["<s>"]
UNK = w2i["<unk>"]
def read_dataset(filename):
  with open(filename, "r") as f:
    for line in f:
      yield [w2i[x] for x in line.strip().split(" ")]
    
train = list(read_dataset("../data/ptb/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))
i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)

In [98]:
# Peek at what the training set looks like
train[0]

[2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25]

### [Connection between `loss.backward()` and `optimizer.step()`](https://stackoverflow.com/questions/53975717/pytorch-connection-between-loss-backward-and-optimizer-step)

* When initializing optimizer, we told it to update `model.parameters()`. Gradients are stored by the tensors themselves. After computing gradients for all tensors in the models, `optimizer.step()` does the parameter update.

* `loss.backward()` computes gradient of loss w.r.t all the parameters in loss and stores them in `parameter.grad` attribute for every parameter.


In [108]:
def to_variable(words):
    data_type = torch.LongTensor
    if USE_CUDA:
        data_type = torch.cuda.LongTensor
    var = torch.tensor(words).type(data_type)
    return var
    

def calc_sent_loss(sent, ffn_model):
    # Initial history = list of EOS tokens
    hist = [S] * N
    all_histories = []
    all_targets = []
    
    # Step through sentence, incl. EOS token
    for word in sent + [S]:
        all_histories.append(list(hist))
        all_targets.append(word)
        hist = hist[1:] + [word]
        
    target = to_variable(all_targets)
    logits = ffn_model(to_variable(all_histories))
    loss = nn.functional.cross_entropy(
        input=logits,
        target=target)
    return loss  # Scalar; same size as target

In [117]:
from tqdm import tqdm 


last_dev = 1e20
best_dev = 1e20

for ITER in range(1):
    random.shuffle(train)
    
    # Train the model
    model.train()
    train_words, train_loss = 0, 0.0
    start = time.time()
    
    for sent_id, sent in tqdm(enumerate(train), total=len(train)):
        my_loss = calc_sent_loss(sent, model)
        train_loss += my_loss.item()
        train_words += len(sent)
        optimizer.zero_grad()  # Zero the gradients
        my_loss.backward()  # Backprop
        optimizer.step()    # Gradient descent
        
        sent_num = sent_id+1
        if sent_num % 5000 == 0:
            # Show output every 5000 sentences
            print("-- finished {} sentences (word/sec={})".format(
                sent_num,
                train_words/(time.time()-start)))
            
    print("iter {}: train loss/word={}, ppl={} (word/sec={})".format(
        ITER,
        train_loss/train_words,
        math.exp(train_loss/train_words),
        train_words/(time.time()-start)))

    # Evaluate the model 
    model.eval()
    dev_words, dev_loss = 0, 0.0
    start = time.time()
    for sent_id, sent in enumerate(dev):
        my_loss = calc_sent_loss(sent, model)
        dev_loss += my_loss.item()
        dev_words += len(sent)
        
    # Keep track of dev accuracy; 
    # reduce learning rate if it got worse 
    if last_dev < dev_loss:
        optimizer.learning_rate /= 2 
    last_dev = dev_loss 
        
    # Keep track of best dev accuracy;
    # save the model only if it's the best one 
    if best_dev > dev_loss:
        torch.save(model, "model.pt")
            
    print("iter {}: train loss/word={}, ppl={} (word/sec={})".format(
        ITER,
        dev_loss/dev_words,
        math.exp(dev_loss/dev_words),
        dev_words/(time.time()-start)))

 12%|█▏        | 5039/42068 [00:14<01:48, 340.09it/s]

-- finished 5000 sentences (word/sec=7129.843704671385)


 24%|██▍       | 10089/42068 [00:25<00:47, 669.76it/s]

-- finished 10000 sentences (word/sec=8312.075084494212)


 36%|███▌      | 15115/42068 [00:33<00:42, 636.93it/s]

-- finished 15000 sentences (word/sec=9510.632538733607)


 48%|████▊     | 20058/42068 [00:46<01:01, 356.88it/s]

-- finished 20000 sentences (word/sec=9101.844729998478)


 60%|█████▉    | 25086/42068 [01:00<00:39, 430.42it/s]

-- finished 25000 sentences (word/sec=8789.046025066133)


 71%|███████▏  | 30060/42068 [01:12<00:32, 366.59it/s]

-- finished 30000 sentences (word/sec=8780.62272732762)


 83%|████████▎ | 35049/42068 [01:25<00:19, 368.93it/s]

-- finished 35000 sentences (word/sec=8648.183674085956)


 95%|█████████▌| 40082/42068 [01:39<00:04, 450.54it/s]

-- finished 40000 sentences (word/sec=8505.166196963566)


100%|██████████| 42068/42068 [01:43<00:00, 404.78it/s]


iter 0: train loss/word=0.26074971731609586, ppl=1.297902782187288 (word/sec=8539.548232144462)
iter 0: train loss/word=0.26370150697961214, ppl=1.30173957812544 (word/sec=57248.69215374463)


### [`torch.multinomial`](https://pytorch.org/docs/stable/generated/torch.multinomial.html)

* Returns a tensor where each row contains `num_samples` indices sampled from the multinomial probability distribution located in the corresponding row of tensor `input`.

In [130]:
MAX_LEN = 100

def generate_sent(ffn_model):
    hist = [S] * N  # List of EOS symbols to start
    sent = []
    while True:
        logits = ffn_model(to_variable([hist]))
        # Get a probability distribution
        m = nn.Softmax(dim=1)  # Dimension along which softmax will be computed
        prob = m(logits)
        # prob = nn.functional.softmax(logits)  
        word = prob.multinomial(1).item()
        if word == S or len(sent) == MAX_LEN:
            break
        sent.append(word)
        hist = hist[1:] + [word]
    return sent

In [131]:
for _ in range(5):
    sent = generate_sent(model)
    print(" ".join([i2w[x] for x in sent]))

the nation talks over the stores who was outcry handling that that he left <unk> ensure
japanese investors
the bill holds the the proposal
chicago a nine to kill boost poverty had half <unk> if you are rising will be a quick u.s. sounds step both the sun said mr. slow plenty of change
for his night would have found banco bush college sports total
