## Data

In [None]:
# Directory that holds the corpus files (train.txt, valid.txt, test.txt) and
# where the best model checkpoint (model.pt) is written.
DATA_DIR = "/content/drive/MyDrive/a3_data/lm_data"

In [None]:
import os
from io import open
import torch
import math
import torch.nn as nn
import time
import random
from torch.nn import functional as F

In [None]:
# Hyper-parameters and run configuration shared by the cells below.
SEED = 0  # passed to torch.manual_seed for reproducibility
TRAIN_BATCH_SIZE = 100  # number of columns (parallel token streams) in the training tensor
TEST_BATCH_SIZE = 100  # number of columns in the validation and test tensors
WORD_EMBED_DIM = 200  # word embedding size
HID_EMBED_DIM = 200  # LSTM hidden state size
N_LAYERS = 2  # number of stacked LSTM layers
DROPOUT = 0.5  # dropout probability handed to the model
LOG_INTERVAL = 100  # print training statistics every this many batches
EPOCHS = 20  # number of passes over the training data
BPTT = 50 # sequence length
CLIP = 0.25  # maximum gradient norm given to clip_grad_norm_
TIED = False  # forwarded to LSTMModel as tied_weights
SAVE_BEST = os.path.join(DATA_DIR, 'model.pt')  # path of the best-validation checkpoint

## Build the vocabulary and convert the corpus text to lists of word indices

In [None]:
class WordDict(object):
    """Two-way vocabulary between word types and integer indices.

    Indices are assigned consecutively, starting at 0, in the order in which
    new words are first added.
    """

    def __init__(self):
        self.word2idx = {}  # word type -> index
        self.idx2word = {}  # index -> word type

    def add_word(self, word):
        """Return the index of ``word``, registering it first if it is new."""
        try:
            return self.word2idx[word]
        except KeyError:
            new_index = len(self.word2idx)
            self.word2idx[word] = new_index
            self.idx2word[new_index] = word
            return new_index

    def __len__(self):
        """Return the number of distinct word types registered so far."""
        return len(self.idx2word)

class Corpus(object):
    """Train/validation/test splits of a text corpus encoded as word indices.

    All three splits share one ``WordDict``; words are indexed in the order
    they are first met while reading train.txt, then valid.txt, then test.txt.

    Attributes set by ``__init__``:
        train_file, valid_file, test_file: paths of the three text files.
        dictionary: the shared ``WordDict``.
        train, valid, test: flat lists of word indices, one per split.
    """

    def __init__(self, path):
        """Read ``train.txt``, ``valid.txt`` and ``test.txt`` from ``path``."""
        self.train_file = os.path.join(path, 'train.txt')
        self.valid_file = os.path.join(path, 'valid.txt')
        self.test_file = os.path.join(path, 'test.txt')

        self.dictionary = WordDict()

        self.train = self.tokenize(self.train_file)
        self.valid = self.tokenize(self.valid_file)
        self.test = self.tokenize(self.test_file)

    def tokenize(self, filename):
        """Convert a text file into a flat list of word indices.

        Every non-blank line is lower-cased, split on whitespace and wrapped
        as ``<sos> tokens... <eos>``. Blank and whitespace-only lines are
        skipped. Unseen tokens are added to ``self.dictionary``.

        Parameters:
            filename: path of a text file, read as UTF-8.

        Returns:
            list of int: the indices of all tokens, in file order.
        """
        ids = []
        add_word = self.dictionary.add_word
        # The context manager closes the file; an explicit encoding keeps the
        # vocabulary independent of the platform's default locale.
        with open(filename, encoding='utf-8') as handle:
            for line in handle:
                tokens = line.lower().split()
                if not tokens:
                    # split() returns [] for empty / whitespace-only lines
                    continue
                ids.append(add_word('<sos>'))
                ids.extend(add_word(token) for token in tokens)
                ids.append(add_word('<eos>'))
        return ids

# Encode the three splits, then report each split's token count followed by
# the vocabulary size.
corpus = Corpus(DATA_DIR)
for size in (len(corpus.train), len(corpus.valid), len(corpus.test), len(corpus.dictionary)):
    print(size)

2099444
218808
246993
28913


In [None]:
def batchify(ids, batch_size):
    """Arrange a flat id sequence into ``batch_size`` parallel columns.

    The sequence is cut into ``batch_size`` contiguous chunks of
    ``len(ids) // batch_size`` ids each; chunk ``j`` becomes column ``j``, so
    ``result[i, j] == ids[i + j * num_batches]``. Ids that do not fit evenly
    at the end of the sequence are dropped.

    Parameters:
        ids: sequence of integer word indices.
        batch_size: number of columns.

    Returns:
        LongTensor of shape ``(len(ids) // batch_size, batch_size)``. When
        ``ids`` is shorter than ``batch_size`` the result has zero rows.
    """
    num_batches = len(ids) // batch_size
    usable = ids[:num_batches * batch_size]
    # Row j of `columns` is the j-th contiguous chunk; transposing puts each
    # chunk in its own column without a Python-level loop over every id.
    columns = torch.as_tensor(usable, dtype=torch.long).view(batch_size, num_batches)
    return columns.t().contiguous()

# Lay out every split as a 2-D LongTensor whose columns are independent
# token streams, then show the resulting shapes.
train_data, val_data, test_data = (
    batchify(split_ids, split_batch_size)
    for split_ids, split_batch_size in (
        (corpus.train, TRAIN_BATCH_SIZE),
        (corpus.valid, TEST_BATCH_SIZE),
        (corpus.test, TEST_BATCH_SIZE),
    )
)

for split_tensor in (train_data, val_data, test_data):
    print(split_tensor.shape)

torch.Size([20994, 100])
torch.Size([2188, 100])
torch.Size([2469, 100])


In [None]:
def get_batch(source, i, bptt=None):
    """Slice one training window and its next-token targets out of ``source``.

    Parameters:
        source: 2-D tensor as produced by ``batchify``; rows are time steps.
        i: index of the first row of the window.
        bptt: maximum number of rows in the window; defaults to the
            module-level ``BPTT`` when omitted.

    Returns:
        (data, target): ``data`` is ``source[i:i+seq_len]``; ``target`` is the
        same window shifted one row down and flattened to 1-D, so that
        ``target`` lines up with model output reshaped to
        ``(seq_len * batch, vocab)``. ``seq_len`` is shortened near the end of
        ``source`` so that every input row still has a target row.
    """
    if bptt is None:
        bptt = BPTT
    # The last row can only serve as a target, hence the "- 1".
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len, :]
    target = torch.flatten(source[i + 1:i + 1 + seq_len, :])
    return data, target

# Sanity check: display the first training window and its flattened targets.
data, targets = get_batch(train_data, 0)
print(data, targets, sep="\n")

tensor([[    0,   701,    10,  ...,    18, 28809,   272],
        [    1,  1791,    14,  ...,   438,  8623, 20553],
        [    2,   130,   119,  ...,   984,    18,   300],
        ...,
        [   35,    17,  5419,  ...,  5099,    16,    14],
        [   36,   346,    62,  ...,    14,     5,  1625],
        [   37,  3544,    38,  ...,  7773,     0,  1654]])
tensor([    1,  1791,    14,  ..., 17113,     1,  5407])


In [None]:
class LSTMModel(nn.Module):
    """Word-level LSTM language model.

    Pipeline: embedding -> dropout -> multi-layer LSTM -> dropout -> linear
    projection onto the vocabulary. The LSTM is built with
    ``batch_first=False``, so inputs are laid out as ``(seq_len, batch)``.
    """

    def __init__(self, vocab_size, word_embedding_size, nhid, nlayers, dropout=0.5, tied_weights=False):
        """Build the layers and initialise their weights.

        Parameters:
            vocab_size: number of word types (rows of the embedding table and
                width of the output layer).
            word_embedding_size: dimension of the word embeddings.
            nhid: hidden dimension of the LSTM.
            nlayers: number of stacked LSTM layers.
            dropout: dropout probability applied to the embeddings and to the
                LSTM outputs.
            tied_weights: when True, the output projection shares its weight
                matrix with the embedding table.

        Raises:
            ValueError: if ``tied_weights`` is True and
                ``nhid != word_embedding_size`` (the two matrices would have
                different shapes and cannot be shared).
        """
        super(LSTMModel, self).__init__()
        self.nhid = nhid # hidden dimension of LSTM
        self.nlayers = nlayers # number of LSTM layers
        self.vocab_size = vocab_size
        self.encoder = nn.Embedding(vocab_size, word_embedding_size)
        self.lstm = nn.LSTM(input_size=word_embedding_size, hidden_size=self.nhid,
                            num_layers=self.nlayers, batch_first=False)
        self.output = nn.Linear(self.nhid, self.vocab_size)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        if tied_weights:
            if nhid != word_embedding_size:
                raise ValueError(
                    'When using tied weights, nhid must be equal to word_embedding_size')
            # Both matrices are (vocab_size, nhid); share a single Parameter.
            self.output.weight = self.encoder.weight
        self.init_weights()

    def init_weights(self):
        """Draw all weights from U(-0.1, 0.1).

        Covers the embedding table, every LSTM weight and bias, and the output
        projection weight. The output bias keeps ``nn.Linear``'s default
        initialisation.
        """
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        # nn.Module keeps parameters in `_parameters`, not in `__dict__`, so
        # they have to be reached through parameters(); a lookup by name in
        # `self.lstm.__dict__` never matches and would skip the LSTM entirely.
        for param in self.lstm.parameters():
            nn.init.uniform_(param, -initrange, initrange)
        nn.init.uniform_(self.output.weight, -initrange, initrange)

    def forward(self, input, hidden):
        """Run one window of tokens through the model.

        # Parameters
        input: LongTensor of word indices, shape (seq_len, batch)
        hidden: tuple (h, c) of LSTM states, each (nlayers, batch, nhid)
        # Returns
        decoded: unnormalised scores over the vocabulary, reshaped to
            (seq_len * batch, vocab_size); no softmax is applied
        hidden: the LSTM states after the last time step
        """
        embedded = self.dropout1(self.encoder(input))
        z, hidden = self.lstm(embedded, hidden)
        z = self.dropout2(z)
        decoded = self.output(z).view(-1, self.vocab_size)
        return decoded, hidden

    # initialize parameters in LSTM
    def init_hidden(self, bsz):
        """Return zeroed (h, c) states for a batch of ``bsz`` sequences.

        The tensors are created from an existing parameter, so they share the
        model's dtype and device.
        """
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))

In [None]:
# Seed PyTorch's random number generator so that runs are repeatable.
torch.manual_seed(SEED)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def repackage_hidden(h):
    """Cut hidden states loose from the graph that produced them.

    A tensor is returned detached; any other iterable (e.g. the LSTM's
    ``(h, c)`` pair) is processed element by element, recursively, and
    returned as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

def train():
    """Train the global ``model`` for one pass over ``train_data``.

    Reads the module-level ``model``, ``optimizer``, ``criterion``,
    ``train_data``, ``device`` and the current ``epoch`` (used only in the log
    line). The hidden state is carried from one window to the next but
    detached each time, so back-propagation is truncated at ``BPTT`` steps.
    Every ``LOG_INTERVAL`` batches the mean loss and perplexity since the
    previous log line are printed. Returns nothing.
    """
    model.train()
    total_loss = 0.
    batches_since_log = 0
    start_time = time.time()
    # train_data is (num_batches, batch_size): size(1) is the batch dimension.
    hidden = model.init_hidden(train_data.size(1))
    for batch, i in enumerate(range(0, train_data.size(0) - 1, BPTT)):
        data, targets = get_batch(train_data, i)
        data = data.to(device)
        targets = targets.to(device)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        # The state's value is still carried over from the previous window.
        hidden = repackage_hidden(hidden)
        # The optimizer owns all of model.parameters(), so this clears every gradient.
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()

        total_loss += loss.item()
        batches_since_log += 1

        if batch % LOG_INTERVAL == 0 and batch > 0:
            # Divide by the number of batches actually accumulated: the first
            # window spans batches 0..LOG_INTERVAL, i.e. LOG_INTERVAL + 1 of them.
            cur_loss = total_loss / batches_since_log
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // BPTT,
                elapsed * 1000 / batches_since_log, cur_loss, math.exp(cur_loss)))
            total_loss = 0.
            batches_since_log = 0
            start_time = time.time()

In [None]:
def evaluate(data_source):
    """Compute the average per-token loss of the global ``model``.

    Parameters:
        data_source: 2-D LongTensor laid out like the output of ``batchify``
            (rows are time steps, columns are independent streams).

    Returns:
        0-dim float tensor holding the mean loss per predicted token, or NaN
        when ``data_source`` has fewer than two rows. ``math.exp`` of the
        result is the perplexity.

    Uses the module-level ``model``, ``criterion`` and ``device``; the model
    is switched to eval mode and no gradients are tracked.
    """
    model.eval()
    # Size the state from the data itself rather than from a global constant.
    hidden = model.init_hidden(data_source.size(1))

    total_loss = 0.
    total_steps = 0
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, BPTT):
            data, targets = get_batch(data_source, i)
            data = data.to(device)
            targets = targets.to(device)

            output, hidden = model(data, hidden)
            loss = criterion(output, targets)
            # Assumes `criterion` returns the mean over the window's tokens (the
            # default reduction of the nn.CrossEntropyLoss built in this file).
            # Weighting by the window length keeps the short final window from
            # counting as much as a full one.
            steps = data.size(0)
            total_loss += steps * loss.item()
            total_steps += steps

    if total_steps == 0:
        return torch.tensor(float('nan'))
    return torch.tensor(total_loss / total_steps)

In [None]:
# prepare the model, loss, and optimizer
ntokens = len(corpus.dictionary)
model = LSTMModel(ntokens, WORD_EMBED_DIM, HID_EMBED_DIM, N_LAYERS, DROPOUT, TIED).to(device)
criterion = nn.CrossEntropyLoss() # use crossentropy loss
optimizer = torch.optim.Adam(model.parameters()) # use adam optimizer with default setting
best_val_loss = None  # None until the first epoch has been evaluated

# Training framework: one training pass and one validation pass per epoch.
for epoch in range(1, EPOCHS+1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
        val_loss, math.exp(val_loss)))
    print('-' * 89)

    # Save the model if the validation loss is the best we've seen so far.
    # Compare against None explicitly: a truthiness test would also treat a
    # best loss of exactly 0.0 as "nothing saved yet".
    if best_val_loss is None or val_loss < best_val_loss:
        # The whole module is pickled (not just its state_dict); the loading
        # cell relies on that format.
        with open(SAVE_BEST, 'wb') as f:
            torch.save(model, f)
            print("save new best model!")
        best_val_loss = val_loss

| epoch   1 |   100/  419 batches | ms/batch 192.61 | loss  7.62 | ppl  2047.39
| epoch   1 |   200/  419 batches | ms/batch 188.93 | loss  6.82 | ppl   912.85
| epoch   1 |   300/  419 batches | ms/batch 190.36 | loss  6.55 | ppl   701.86
| epoch   1 |   400/  419 batches | ms/batch 191.90 | loss  6.41 | ppl   610.28
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 82.92s | valid loss  6.04 | valid ppl   421.80
-----------------------------------------------------------------------------------------
save new best model!
| epoch   2 |   100/  419 batches | ms/batch 194.48 | loss  6.35 | ppl   574.70
| epoch   2 |   200/  419 batches | ms/batch 192.28 | loss  6.20 | ppl   490.98
| epoch   2 |   300/  419 batches | ms/batch 192.53 | loss  6.13 | ppl   459.11
| epoch   2 |   400/  419 batches | ms/batch 192.41 | loss  6.07 | ppl   434.41
-------------------------------------------------------------------------------------

In [None]:
# Load the best saved model.
# NOTE: the checkpoint is a fully pickled nn.Module, and unpickling can execute
# arbitrary code -- only load checkpoint files you created yourself.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects pickled modules; pass weights_only=False there (confirm against
# the installed version).
with open(SAVE_BEST, 'rb') as f:
    # map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one.
    model = torch.load(f, map_location=device)
    # After loading the RNN params, they are not a continuous chunk of memory.
    # flatten_parameters() makes them a continuous chunk, and will speed up the forward pass.
    # Currently, only RNN model supports flatten_parameters function.
    model.lstm.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss  4.94 | test ppl   139.71


In [None]:
# GPT-2 Text Generation
# NOTE(review): this cell rebinds the global name `model` (until now the LSTM)
# to the GPT-2 model. generate_text() further down calls `model.init_hidden(...)`
# on that same global, so the LSTM has to be loaded into `model` again before
# the LSTM generation cells are run.
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# Pretrained "gpt2" tokenizer and TensorFlow LM-head model; the tokenizer's
# EOS token id is passed as pad_token_id.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
# The prompt shared by all decoding demos, encoded as TensorFlow tensors.
input_ids = tokenizer.encode('I went to', return_tensors='tf')
# Fix TensorFlow's random seed before the sampling-based demos run.
tf.random.set_seed(0)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Each call below continues the same prompt (`input_ids`) with max_length=50;
# only the decoding arguments differ. The first sequence of every result is
# decoded with special tokens stripped and printed under a header.

# Top K
# Sampling (do_sample=True) with top_k=50.
sample_output = model.generate(input_ids, do_sample=True, max_length=50, top_k=50)
print("Top K Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True), "\n")

# Top P
# Sampling with top_p=0.92; top_k=0 is passed alongside it -- presumably to
# switch the top-k filter off so only the nucleus filter applies (confirm
# against the transformers generation docs for the installed version).
sample_output = model.generate(input_ids, do_sample=True, max_length=50, top_p=0.92, top_k=0)
print("Top P Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True), "\n")

# Random Sampling with Temperature
# Sampling with temperature=0.7 and top_k=0.
sample_output = model.generate(input_ids, do_sample=True, max_length=50, top_k=0, temperature=0.7)
print("Temperature Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True), "\n")

# Beam Search
# No sampling flag; num_beams=5 with early_stopping=True.
beam_output = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
print("Beam Search Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True), "\n")

# Greedy
# Only max_length is given, so the library's default decoding is used.
greedy_output = model.generate(input_ids, max_length=50)
print("Greedy Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True), "\n")

Top K Output:
----------------------------------------------------------------------------------------------------
I went to a store one weekend and someone asked me about this," said Ryan. "They never went back and told me, 'Glad you asked'." To add insult to injury, there is a line in her life which says: "I 

Top P Output:
----------------------------------------------------------------------------------------------------
I went to the store yesterday and bought quite a few things. I'm not sure about the price as of yet, so I'm sure one day it will be nice if it's an extra 60 dollars. Might as well check out what I've 

Temperature Output:
----------------------------------------------------------------------------------------------------
I went to bed, and as soon as I woke up had given the room to my sons, and I was quite glad to hear they came to see me.

"I had a few desires to go to my parents; but when I 

Beam Search Output:
---------------------------------------------------

In [None]:
def generate_text(prompt, sampling_func, max_length=30):
    """Continue ``prompt`` with the global LSTM language model.

    Parameters:
        prompt: whitespace-separated words. Words missing from the vocabulary
            are replaced by ``<unk>`` when the vocabulary contains it.
        sampling_func: callable taking a 1-D tensor of next-word probabilities
            (on the CPU) and returning the chosen word index as an int.
        max_length: maximum number of words to generate (default 30).

    Returns:
        list of str: the generated words, ending with ``<eos>`` if the model
        produced it before ``max_length`` was reached.

    Raises:
        ValueError: if ``prompt`` contains no words.
        KeyError: if a prompt word is unknown and the vocabulary has no ``<unk>``.

    Uses the module-level ``model``, ``corpus`` and ``device``, and switches
    ``model`` to eval mode so that dropout does not perturb generation.
    """
    word2idx = corpus.dictionary.word2idx
    idx2word = corpus.dictionary.idx2word
    unk_idx = word2idx.get('<unk>')

    ids = []
    for word in prompt.split():
        word_id = word2idx.get(word, unk_idx)
        if word_id is None:
            raise KeyError(word)
        ids.append(word_id)
    if not ids:
        raise ValueError("prompt must contain at least one word")

    model.eval()
    hidden = model.init_hidden(1)
    generations = []
    with torch.no_grad():  # no tracking history
        # Feed the whole prompt as a (seq_len, 1) batch to warm up the state.
        prompt_tensor = torch.LongTensor(ids).unsqueeze(1).to(device)
        output, hidden = model(prompt_tensor, hidden)
        for _ in range(max_length):
            # The last row of `output` scores the word following the input.
            word_prob = torch.nn.functional.softmax(output[-1, :], dim=0).cpu()
            word_idx = sampling_func(word_prob)
            word = idx2word[word_idx]
            generations.append(word)
            if word == "<eos>":
                break
            new_word = torch.LongTensor([[word_idx]]).to(device)
            output, hidden = model(new_word, hidden)
    return generations

In [None]:
def greedy_sampling(word_prob):
    """Return the index of the most probable word as a Python int."""
    return word_prob.argmax().item()

def random_sampling(word_prob):
    """Draw one word index at random, weighted by ``word_prob``; returns an int."""
    drawn = torch.multinomial(word_prob, 1)
    return drawn.item()

def topk_sampling(word_prob, k=10):
    """Sample a word index from the ``k`` most probable words.

    Parameters:
        word_prob: 1-D tensor of word probabilities.
        k: number of candidates to keep (default 10). It is capped at the
            length of ``word_prob`` so small vocabularies do not fail.

    Returns:
        int: index into ``word_prob`` of the sampled word.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(k, word_prob.numel())
    top_probs, top_ids = torch.topk(word_prob, k)
    # multinomial treats its input as unnormalised weights, so the truncated
    # probabilities can be passed as they are -- no extra normalisation needed.
    choice = torch.multinomial(top_probs, num_samples=1).item()
    return top_ids[choice].item()


In [None]:
# Greedy Sampling
# Run every demo prompt through the LSTM, printing a blank line between results.
demo_prompts = ["i went to", "we played with", "I absolutely love", "I absolutely hate"]
for position, raw_prompt in enumerate(demo_prompts):
    if position > 0:
        print("")
    prompt = raw_prompt.lower()
    generations = generate_text(prompt, greedy_sampling)
    print('prompt: ' + prompt)
    print(' '.join(generations))

prompt: i went to
the first time of the game . <eos>

prompt: we played with
the <unk> <unk> ) , the <unk> of the <unk> is a <unk> , and the <unk> of the <unk> . <eos>

prompt: i absolutely love
<unk> <unk> ! " <eos>

prompt: i absolutely hate
the <unk> , and the <unk> of the <unk> , and the <unk> of the <unk> . <eos>


In [None]:
# Random Sampling
# Run every demo prompt through the LSTM, printing a blank line between results.
demo_prompts = ["i went to", "we played with", "I absolutely love", "I absolutely hate"]
for position, raw_prompt in enumerate(demo_prompts):
    if position > 0:
        print("")
    prompt = raw_prompt.lower()
    generations = generate_text(prompt, random_sampling)
    print('prompt: ' + prompt)
    print(' '.join(generations))

prompt: i went to
runway 5 , the time in the manager among rescue problems , which has a tiny , such as the scientific chin and highest @-@ sensitive stations with a small

prompt: we played with
. in the set of 1896 wheeler was darden 's best son , which under financial genesis . <eos>

prompt: i absolutely love
beyond <unk> , <unk> <unk> ( 60 : 5 inc . long ) . <eos>

prompt: i absolutely hate
la rosa <unk> . <eos>


In [None]:
# Top K Sampling
# Run every demo prompt through the LSTM, printing a blank line between results.
demo_prompts = ["i went to", "we played with", "I absolutely love", "I absolutely hate"]
for position, raw_prompt in enumerate(demo_prompts):
    if position > 0:
        print("")
    prompt = raw_prompt.lower()
    generations = generate_text(prompt, topk_sampling)
    print('prompt: ' + prompt)
    print(' '.join(generations))

prompt: i went to
their first time on the end of the season . in january 2013 , the team was the most famous for the first one in his first week . <eos>

prompt: we played with
the <unk> of the <unk> and the most important songs of the series , <unk> <unk> , and <unk> <unk> in which the player 's first work , which is

prompt: i absolutely love
a a <unk> <unk> in this way to the <unk> . <eos>

prompt: i absolutely hate
her <unk> . <eos>
