# Imports

In [1]:
import math
import time

import numpy as np
import sentencepiece as spm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable

# NOTE(review): the star import presumably supplies Corpus, RNNModel, batchify and
# repackage_hidden, which later cells use without defining — confirm and import by name.
from classes import *

## Config

In [19]:
# --- Data / tokenisation -------------------------------------------------
text_path = 'texts/hp_chap1.txt'
SENTENCEPIECE = False  # True: train and use a SentencePiece BPE model; False: use Corpus

# --- Optimisation --------------------------------------------------------
seed = 42
epochs = 10
batch_size = 10  # NOTE: reassigned to 20 in the batching cell before training
lr = 20  # learning rate (divided by 4 whenever validation loss fails to improve)
clip = 0.25  # max gradient norm handed to clip_grad_norm_
dropout = 0.5  # dropout probability (model hyperparameter)

# --- Logging / checkpointing ---------------------------------------------
log_interval = 200  # steps between progress lines (training and word generation)
save = 'pretrained_models/hp_model'  # path of model name you want to save

# --- Generation ----------------------------------------------------------
temperature = 2  # if this is higher, more variation in output space
words = 1000  # number of words to generate in the word generation dump
n_summaries = 50  # number of summaries to generate

## Load Data, Tokenise

In [20]:
def form_sentence(tweet):
    """Return `tweet` rebuilt as the space-joined sequence of its TextBlob `.words`.

    NOTE(review): `TextBlob` is not imported in any visible cell; presumably it
    arrives via `from classes import *` — confirm, otherwise calling this raises
    NameError. No visible cell calls this function.
    """
    tokens = TextBlob(tweet).words
    return ' '.join(tokens)

In [21]:
# Read the raw text up front; within this cell only the SentencePiece branch
# consumes `data` (the Corpus branch re-reads the file from `text_path`).
with open(text_path, 'r') as f:
    data = f.read()

if SENTENCEPIECE:
    # Single source of truth for the BPE vocabulary size, so the trainer command
    # and `ntokens` (the model's output dimension) cannot drift apart.
    vocab_size = 3000
    spm.SentencePieceTrainer.Train(
        f'--input={text_path} --model_prefix=m --vocab_size={vocab_size} --model_type=bpe')

    # makes segmenter instance and loads the model file (m.model)
    sp = spm.SentencePieceProcessor()
    sp.load('m.model')
    ids = sp.encode_as_ids(data)
    ntokens = vocab_size
    bptt = 256  # sequence length

    # Sanity check: show how an arbitrary sample sentence is segmented.
    print(sp.encode_as_pieces("is upset that he can't update his Facebook by texting it... " +
                              "and might cry as a result  School today also. Blah!"))
else:
    # Dictionary-based tokenisation via the project's Corpus class.
    corpus = Corpus(path=text_path)
    ids = corpus.tokenize()
    ntokens = len(corpus.dictionary)
    bptt = 35  # sequence length

## Train Val Split

In [22]:
# Fixed-position split: entries [0, 900) train, [900, 1200) validate.
# Anything in `ids` past index 1200 is not used by either split.
train_data, val_data = ids[:900], ids[900:1200]

In [23]:
# Flatten each split with np.hstack, then wrap the result in a torch tensor.
train_data, val_data = (
    torch.tensor(np.hstack(split)) for split in (train_data, val_data)
)

## Train Model

In [24]:
# Two-layer LSTM language model over `ntokens` symbols. The dropout rate comes
# from the config cell instead of a second hard-coded literal.
model = RNNModel(
    rnn_type='LSTM',
    ntoken=ntokens,
    ninp=256,
    nhid=512,
    nlayers=2,
    dropout=dropout,
    tie_weights=False,
)
criterion = nn.CrossEntropyLoss()

In [25]:
# NOTE: this overrides the `batch_size = 10` set in the config cell.
batch_size = 20
eval_batch_size = 20
train_data = batchify(train_data, batch_size)
# evaluate() sizes its hidden state with `eval_batch_size`, so the validation
# split must be batched with that same value (not the training batch size).
val_data = batchify(val_data, eval_batch_size)

In [26]:
def evaluate(data_source):
    """Return the average per-position loss of the global `model` on `data_source`.

    The data is consumed in chunks of at most `bptt` rows. Each chunk's criterion
    value is weighted by the chunk length, and the sum is divided by
    `len(data_source) - 1`. The hidden state is created with `eval_batch_size`
    and carried, detached, from chunk to chunk.
    """
    # Evaluation mode disables dropout.
    model.eval()
    n_positions = data_source.size(0) - 1
    state = model.init_hidden(eval_batch_size)
    weighted_loss = 0.
    with torch.no_grad():
        for offset in range(0, n_positions, bptt):
            inputs, targets = get_batch(data_source, offset)
            logits, state = model(inputs, state)
            state = repackage_hidden(state)
            chunk_loss = criterion(logits.view(-1, ntokens), targets).item()
            weighted_loss += len(inputs) * chunk_loss
    return weighted_loss / n_positions

def get_batch(source, i):
    """Slice one input chunk and its next-step targets out of `source`.

    The chunk starts at row `i` and spans `bptt` rows, or fewer when that would
    run past the second-to-last row. Targets are the same rows shifted by one
    position and flattened with `view(-1)`.
    """
    remaining = len(source) - 1 - i
    seq_len = bptt if bptt < remaining else remaining
    stop = i + seq_len
    return source[i:stop], source[i + 1:stop + 1].view(-1)

def train():
    """Run one pass over the global `train_data`, updating `model` with plain SGD.

    Reads the module-level `model`, `criterion`, `train_data`, `batch_size`,
    `bptt`, `ntokens`, `clip`, `lr`, `log_interval` and `epoch` (the latter is
    only used in the progress line and is set by the surrounding epoch loop).
    Every `log_interval` batches it prints the running mean loss and perplexity.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Manual SGD step: p <- p - lr * grad. Uses the keyword `alpha` form; the
        # positional (scalar, tensor) overload of add_ is deprecated.
        for p in model.parameters():
            if p.grad is not None:  # parameters that received no gradient are skipped
                p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            # The settings are module-level globals in this notebook; there is no
            # `args` namespace here.
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

# Loop over epochs.
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    # Inclusive upper bound so that exactly `epochs` passes are run
    # (range(1, epochs) would stop one epoch short).
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time()-epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly so a loss of exactly 0.0 is not
        # mistaken for "no best value yet".
        if best_val_loss is None or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  0.60s | valid loss 11.38 | valid ppl 87665.17
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  0.54s | valid loss  6.42 | valid ppl   613.27
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  0.51s | valid loss  6.74 | valid ppl   849.46
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   4 | time:  0.54s | valid loss  6.31 | valid ppl   550.39
--------------------------------------------------------------------------

# Use model 
## (1) Generate text in the style of the trained model 
## (2) Generate a summary of text you pass through the trained model

In [32]:
# NOTE: this rebinds the global `eval_batch_size` that evaluate() also reads, so
# calling evaluate(val_data) after this cell would size its hidden state with 50.
eval_batch_size = 50  # reflective of how long you want your summary to be
# Use all data (in this case) for summary.
test_preprocessed = ids
# NOTE(review): the stray output under this cell looks like a truncated warning,
# possibly from torch.tensor(...) being handed an existing tensor — confirm.
test_data = batchify(torch.tensor(test_preprocessed), eval_batch_size)

  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
def get_encoder_outputs(data_source):  # for summary
    """Run `data_source` through the model's embedding and RNN, collecting results.

    The data is processed in chunks of at most `bptt` rows with the global
    `model` in evaluation mode and gradients disabled. Only `model.encoder` and
    `model.rnn` are applied; the rest of the model's forward pass is not.

    Returns:
        outputs: list with one RNN output tensor per chunk, in order.
        hiddens: dict mapping each index of the recurrent state (as returned by
            `model.init_hidden`) to the list of that component after every
            chunk. For the LSTM configured in this notebook the state is
            presumably an (h, c) pair, so the keys would be 0 and 1 — confirm.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    hidden = model.init_hidden(eval_batch_size)
    outputs = []
    hiddens = {layer: [] for layer in range(len(hidden))}
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            # Targets are not needed here; only the input chunk is encoded.
            data, _ = get_batch(data_source, i)
            output, hidden = model.rnn(model.encoder(data), hidden)
            outputs.append(output)
            for layer, state in enumerate(hidden):
                hiddens[layer].append(state)
            hidden = repackage_hidden(hidden)
    return outputs, hiddens

In [34]:
# Encode the whole batched text: per-chunk RNN outputs plus the recurrent state
# collected after each chunk.
outputs, hiddens = get_encoder_outputs(test_data)

In [35]:
# Set the random seed manually for reproducibility.
torch.manual_seed(seed)
device = torch.device("cpu")
model.eval()  # turn off dropout

# Start from a fresh hidden state for a single sequence and a random first token.
hidden = model.init_hidden(1)
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

outf = 'hp_generated_dictionary.txt'
# The handle gets its own name so `outf` keeps referring to the path.
with open(outf, 'w') as generated_file:
    with torch.no_grad():  # no tracking history
        for i in range(words):
            output, hidden = model(input, hidden)
            # Temperature-scaled, exponentiated scores act as unnormalised
            # sampling weights for torch.multinomial.
            word_weights = output.squeeze().div(temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            # Feed the sampled token back in as the next input.
            input.fill_(word_idx)
            token_id = word_idx.item()  # plain int for decoding / indexing
            if SENTENCEPIECE:
                word = sp.decode_ids([token_id])
            else:
                word = corpus.dictionary.idx2word[token_id]

            # 20 words per line.
            generated_file.write(word + ('\n' if i % 20 == 19 else ' '))

            # Report the number of words actually written so far.
            if (i + 1) % log_interval == 0:
                print('| Generated {}/{} words'.format(i + 1, words))

| Generated 200/1000 words
| Generated 400/1000 words
| Generated 600/1000 words
| Generated 800/1000 words
| Generated 1000/1000 words


In [36]:
outf = 'hp_summaries_dictionaries.txt'

# The encoder pass runs in eval mode without gradients and does not depend on the
# summary index, so it is computed once instead of once per summary.
outputs, hiddens = get_encoder_outputs(test_data)
# Average over the first axis of the concatenated per-chunk outputs.
output_mean = torch.cat(outputs).mean(axis=0)
# Same averaging for every component of the recurrent state (not used for
# sampling below).
hidden_means = tuple(
    torch.cat(hiddens[layer]).mean(axis=0) for layer in range(len(hiddens))
)

# NOTE(review): the rows of `output_mean` come from `model.rnn`, not from the
# model's full forward pass, so the sampled index ranges over the last dimension
# of the RNN output rather than over the vocabulary — confirm whether a
# projection to `ntokens` scores is intended before sampling.
with open(outf, 'w') as summary_file:
    for n in range(n_summaries):
        for output in output_mean:
            word_weights = output.squeeze().div(temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            token_id = word_idx.item()  # plain int for decoding / indexing
            if SENTENCEPIECE:
                word = sp.decode_ids([token_id])
            else:
                word = corpus.dictionary.idx2word[token_id]
            summary_file.write(word + ' ')
        summary_file.write('\n\n')
        # Report the number of summaries actually written so far.
        if (n + 1) % 10 == 0:
            print('| Generated {}/{} summaries'.format(n + 1, n_summaries))
          

| Generated 10/50 summaries
| Generated 20/50 summaries
| Generated 30/50 summaries
| Generated 40/50 summaries
| Generated 50/50 summaries
