In [1]:
%load_ext autoreload

In [2]:
from BrattEssay import load_bratt_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings

#target_folder = "Training"
settings = Settings()
#root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
#training_folder = root_folder + target_folder + "/"

#essays = load_bratt_essays(training_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [3]:
#from https://github.com/pytorch/examples/blob/master/word_language_model/model.py
import torch.nn as nn
from torch.autograd import Variable

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ntarget, ninp, nhid, nlayers):
        super(RNNModel, self).__init__()
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, bias=False)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, bias=False)
        self.decoder = nn.Linear(nhid, ntarget) # changed from ntoken

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.encoder(input)
        output, hidden = self.rnn(emb, hidden)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
                    Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
        else:
            return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())

In [4]:
class Args(object):
    def __init__(self):
        self.model = "GRU"
        self.emsize = 100
        self.nhid = 100
        self.nlayers = 1 # default 2
        self.lr = 20
        self.clip = 0.5
        self.epochs = 200
        self.batch_size = 20
        # sequence length, what does this mean ???
        self.bptt = 30
        self.seed = 1111
        self.cuda = False  # Throws error during data load
        self.log_interval=50
        self.save = "/Users/simon.hughes/data/pytorch/models/rq1_rnn.pt"
        
args = Args()

In [5]:
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        print("CUDA Enabled")
        torch.cuda.manual_seed(args.seed)
else:
    print("CUDA is not available")



In [17]:
import os
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

class Corpus(object):
    def __init__(self):
        self.dictionary = Dictionary()
        self.labels_dictionary = Dictionary()
        self.train = self.tokenize("/Users/simon.hughes/data/tensorflow/translate/cb/Training/inputs.txt", self.dictionary)
        self.train_lbls = self.tokenize("/Users/simon.hughes/data/tensorflow/translate/cb/Training/output_most_freq.txt", self.labels_dictionary)
        self.valid = self.tokenize("/Users/simon.hughes/data/tensorflow/translate/cb/Test/inputs.txt", self.dictionary)
        self.valid_lbls = self.tokenize("/Users/simon.hughes/data/tensorflow/translate/cb/Test/output_most_freq.txt", self.labels_dictionary)
       
    def tokenize(self, path, dictionary):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = ['<sos>'] + line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = ['<sos>'] + line.split() + ['<eos>']
                for word in words:
                    ids[token] = dictionary.word2idx[word]
                    token += 1

        return ids

In [18]:
corpus = Corpus()

def batchify(data, bsz):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data

eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
train_data_lbls = batchify(corpus.train_lbls, args.batch_size)

val_data = batchify(corpus.valid, eval_batch_size)
val_data_lbls = batchify(corpus.valid_lbls, eval_batch_size)

## Build The Model

In [19]:
ntokens = len(corpus.dictionary)
nlabels = len(corpus.labels_dictionary)
model = RNNModel(args.model, ntokens, nlabels, args.emsize, args.nhid, args.nlayers)
if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()

## Train The Model

In [20]:
def clip_gradient(model, clip):
    """Computes a gradient clipping coefficient based on gradient norm."""
    totalnorm = 0
    for p in model.parameters():
        modulenorm = p.grad.data.norm()
        totalnorm += modulenorm ** 2
    totalnorm = math.sqrt(totalnorm)
    return min(1, clip / (totalnorm + 1e-6))


def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)


def get_batch(source, label_source, i, evaluation=False):
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile=evaluation)
    target = Variable(label_source[i:i+seq_len].view(-1))
    return data, target

def evaluate(data_source, label_source):
    total_loss = 0
    ntokens = len(corpus.dictionary)
    nlabels = len(corpus.labels_dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, label_source, i, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, nlabels)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)


def train():
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    nlabels = len(corpus.labels_dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, train_data_lbls, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, nlabels), targets)
        loss.backward()

        clipped_lr = lr * clip_gradient(model, args.clip)
        for p in model.parameters():
            p.data.add_(-clipped_lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [21]:
# Loop over epochs.
lr = args.lr
prev_val_loss = None
for epoch in range(1, args.epochs+1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data, val_data_lbls)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, math.exp(val_loss)))
    print('-' * 89)
    # Anneal the learning rate.
    if prev_val_loss and val_loss > prev_val_loss:
        lr /= 4
    prev_val_loss = val_loss

# Run on test data and save the model.
#test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training |')
print('=' * 89)
if args.save != '':
    with open(args.save, 'wb') as f:
        torch.save(model, f)

| epoch   1 |    50/  256 batches | lr 20.00 | ms/batch 20.70 | loss  2.85 | ppl    17.37
| epoch   1 |   100/  256 batches | lr 20.00 | ms/batch 21.95 | loss  1.79 | ppl     5.98
| epoch   1 |   150/  256 batches | lr 20.00 | ms/batch 20.80 | loss  1.10 | ppl     3.01
| epoch   1 |   200/  256 batches | lr 20.00 | ms/batch 20.11 | loss  1.36 | ppl     3.91
| epoch   1 |   250/  256 batches | lr 20.00 | ms/batch 20.74 | loss  1.36 | ppl     3.88
-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  6.05s | valid loss  4.58 | valid ppl    97.42
-----------------------------------------------------------------------------------------
| epoch   2 |    50/  256 batches | lr 20.00 | ms/batch 22.41 | loss  1.98 | ppl     7.24
| epoch   2 |   100/  256 batches | lr 20.00 | ms/batch 21.42 | loss  1.28 | ppl     3.61
| epoch   2 |   150/  256 batches | lr 20.00 | ms/batch 24.52 | loss  1.27 | ppl     3.57
| epoch   2 |   200/  256 

KeyboardInterrupt: 

In [38]:
def words_to_ids(sentence):
    wds = sentence.strip().split(" ")
    if(len(wds)) < args.bptt:
        diff = args.bptt - len(wds)
        sentence = sentence.strip() + (" <eos>" * (diff+1))
        wds = sentence.strip().split(" ")
    ids = torch.LongTensor(len(wds))
    #print(len(wds), "words")
    token = 0
    for word in wds:
        ids[token] = corpus.dictionary.word2idx[word]
        token += 1
    return ids

sentence = "<sos> What leads to differences in the rates of coral bleaching . <eos>"
sentence = sentence.split(" ")
#hidden = model.init_hidden(eval_batch_size)
for i in range(len(sentence)):
    #model.zero_grad()
    hidden = model.init_hidden(eval_batch_size)
    up_to_this = " ".join(sentence[:i])
    ids = words_to_ids(up_to_this)# pad to length
    batch = batchify(ids, eval_batch_size)
    inputs, _ = get_batch(batch, batch, 0, True)
    output, hidden = model(inputs, hidden)
    output_flat = output.view(-1, nlabels)
    word_weights = output.squeeze().data.exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[-1]
    word_idxs = list(word_idx)[0]
    tag = corpus.labels_dictionary.idx2word[word_idxs]
    print(sentence[i].strip().ljust(15), tag)

<sos>           3
What            <sos>
leads           <sos>
to              <sos>
differences     <sos>
in              <eos>
the             <sos>
rates           <sos>
of              4
coral           <sos>
bleaching       <sos>
.               <sos>
<eos>           <sos>


In [115]:
#TODO make sure eos markers not counted in accuracy metrics