In [25]:
# Import Packages
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast, GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable
import random
import time, math, copy
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torchtext
from tokenizers import CharBPETokenizer
import tokenizers
import operator
from queue import PriorityQueue

from sklearn.model_selection import train_test_split
import pandas as pd

# specify GPU
device = torch.device("cuda")

# Import from external file
X_train = pd.read_csv("output/x_train.csv")
X_test = pd.read_csv("output/x_test.csv")

In [26]:
SOS_TOKEN = '<sos>'
EOS_TOKEN = '<eos>'

MAX_LEN = 5000

def len_filter(example):
    return len(example.src) <= MAX_LEN and len(example.tgt) <= MAX_LEN

In [27]:
# Set paths
train_path = 'output/x_train_nn.csv'
test_path = 'output/x_test_nn.csv'

# Save files
X_train[['question', 'prep_answer', 'cluster']].to_csv(train_path, index=False, header=False)
X_test[['question', 'prep_answer', 'cluster']].to_csv(test_path, index=False, header=False)

# Create pytorch variables
src = torchtext.data.Field(
    include_lengths=True,
    lower=True
    )
tgt = torchtext.data.Field(
    preprocessing = lambda seq: [SOS_TOKEN] + seq + [EOS_TOKEN],
    lower=True,
    is_target=True
    )
cluster = torchtext.data.Field(
    )

data_train = torchtext.data.TabularDataset(
        path=train_path, format='csv',
        fields=[('src', src), ('tgt', tgt), ('cluster', cluster)],
        filter_pred=len_filter
    )

data_test = torchtext.data.TabularDataset(
        path=test_path, format='csv',
        fields=[('src', src), ('tgt', tgt), ('cluster', cluster)],
        filter_pred=len_filter
    )

In [28]:
src.build_vocab(data_train, max_size=50000)
tgt.build_vocab(data_train, max_size=50000)
cluster.build_vocab(data_train, max_size=50000)
input_vocab = src.vocab
output_vocab = tgt.vocab

print('20 tokens from input vocab:\n', list(input_vocab.stoi.keys())[:20])
print('\n20 tokens from output vocab:\n', list(output_vocab.stoi.keys())[:20])

print('\nnum training examples:', len(data_train.examples))

item = random.choice(data_train.examples)
print('\nexample train data:')
print('src:\n', item.src)
print('tgt:\n', item.tgt)

20 tokens from input vocab:
 ['<unk>', '<pad>', 'the', 'to', 'of', 'a', 'i', 'in', 'and', 'is', 'that', 'for', 'this', 'be', 'it', 'are', 'have', 'my', 'on', 'with']

20 tokens from output vocab:
 ['<unk>', '<pad>', 'the', 'to', 'of', 'and', 'a', 'is', 'in', 'that', 'for', 'you', 'not', 'are', 'be', 'it', 'or', 'with', 'have', 'as']

num training examples: 1034

example train data:
src:
 ['we', 'often', 'get', 'sore', 'throats', 'once', 'or', 'twice', 'a', 'year,', 'and', 'it', 'clears', 'in', 'a', 'few', 'days', 'sometimes', 'without', 'any', 'antibiotics.', 'i', 'was', 'wondering', 'why', "doesn't", 'our', 'body', 'become', 'immune', 'after', 'clearing', 'a', 'sore', 'throat?']
tgt:
 ['<sos>', 'short', 'answer', 'it', 'should', 'be', 'noted', 'that', 'there', 'are', 'many', 'non-pathogenic', 'causes', 'of', 'sore', 'throat,', 'and', 'i', 'would', 'suspect', 'that', 'you', 'are', 'not', 'always', 'distinguishing', 'these', 'causes', 'from', 'actual', 'illness.', 'in', 'most', 'cases,'

In [29]:
class EncoderRNN(nn.Module):

    def __init__(self, vocab_size, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        self.emb_layer = nn.Embedding(vocab_size, 200)

        ## define the LSTM
        self.lstm = nn.LSTM(200, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)

        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)

    def forward(self, x, hidden):
        ''' Forward pass through the network.
            These inputs are x, and the hidden/cell state `hidden`. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)

        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)

        ## pass through a dropout layer
        out = self.dropout(lstm_output)

        #out = out.contiguous().view(-1, self.n_hidden)
        out = out.reshape(-1, self.n_hidden)

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden


    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        # if GPU is available
        if (torch.cuda.is_available()):
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())

        # if GPU is not available
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())

        return hidden

class DecoderRNN(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, dropout=0.1):
        '''
        Illustrative decoder
        '''
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=output_size,
                                      embedding_dim=embedding_size,
                                      )

        self.rnn = nn.GRU(embedding_size, hidden_size, bidirectional=True, dropout=dropout, batch_first=False)
        self.dropout_rate = dropout
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).transpose(0, 1)  # [B,1] -> [ 1, B, D]
        embedded = F.dropout(embedded, self.dropout_rate)

        output = embedded

        output, hidden = self.rnn(output, hidden)

        out = self.out(output.squeeze(0))
        output = F.log_softmax(out, dim=1)
        return output, hidden

class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward

In [38]:
def train(input_tensor, target_tensor, cluster, batch_size, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LEN, teacher_forcing_ratio=0.5):

    # get an initial hidden state for the encoder
    encoder_hidden = encoder.init_hidden(batch_size)

    # zero accumulated gradients
    encoder.zero_grad()
    decoder.zero_grad()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # get the seq lengths, used for iterating through encoder/decoder
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # create empty tensor to fill with encoder outputs
    encoder_outputs = torch.zeros(max_length, encoder.n_hidden, device=device)

    # create a variable for loss
    loss = 0

    # pass the inputs through the encoder
    #for ei in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
        #encoder_outputs[ei] = encoder_output[0, 0]

    # create a start-of-sequence tensor for the decoder
    decoder_input = torch.tensor([[output_vocab.stoi[SOS_TOKEN]]], device=device)

    # set the decoder hidden state to the final encoder hidden state
    decoder_hidden = encoder_hidden

    # decide if we will use teacher forcing
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        topv, topi = decoder_output.topk(2)
        decoder_input = topi.squeeze().detach()  # detach from history as input

        loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))

        if use_teacher_forcing:
            decoder_input = target_tensor[di]

        if decoder_input.item() == output_vocab.stoi[EOS_TOKEN]:
                break

    loss.backward()

    # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
    nn.utils.clip_grad_norm_(encoder.parameters(), 1)
    nn.utils.clip_grad_norm_(decoder.parameters(), 1)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [31]:
def trainIters(encoder, decoder, n_iters, batch_size=32, print_every=10000, learning_rate=0.04, teacher_forcing_ratio=0.2):
    print(f'Running {n_iters} epochs...')
    print_loss_total = 0
    print_loss_epoch = 0

    encoder_optim = AdamW(encoder.parameters(), lr=learning_rate)
    decoder_optim = AdamW(decoder.parameters(), lr=learning_rate)

    # note batch size of 1, just for simplicity
    # DO NOT INCREASE THE BATCH SIZE
    batch_iterator = torchtext.data.Iterator(
        dataset=data_train, batch_size=batch_size,
        sort=False, sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device, repeat=False)


    criterion = nn.CrossEntropyLoss()

    for e in range(n_iters):
        batch_generator = batch_iterator.__iter__()
        step = 0
        start = time.time()
        for batch in batch_generator:
            step += 1

            # get the input and target from the batch iterator
            input_tensor, input_lengths = getattr(batch, 'src')
            target_tensor = getattr(batch, 'tgt')
            cluster = getattr(batch, 'cluster')

            # this is because we're not actually using the batches.
            # batch size is 1 and this just selects that first one
            #input_tensor = input_tensor[0]
            #target_tensor = target_tensor[0]

            loss = train(input_tensor, target_tensor, cluster, batch_size, encoder, decoder, encoder_optim, decoder_optim, criterion, teacher_forcing_ratio=teacher_forcing_ratio)
            print_loss_total += loss
            print_loss_epoch += loss


            if step % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                t = (time.time() - start) / 60
                print(f'step: {step}\t avg loss: {print_loss_avg:.2f}\t time for {print_every} steps: {t:.2f} min')
                start = time.time()

        print_loss_avg = print_loss_epoch / step
        print_loss_epoch = 0
        print(f'End of epoch {e}, avg loss {print_loss_avg:.2f}')

In [32]:
hidden_size = 256
encoder1 = EncoderRNN(len(input_vocab), hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, hidden_size, len(output_vocab)).to(device)



In [39]:
trainIters(encoder1, decoder1, 10, print_every=10000)

Running 10 epochs...


RuntimeError: Expected hidden[0] size (4, 613, 256), got [4, 32, 256]