In [1]:
# Import Packages
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast, GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable
import random
import time, math, copy
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torchtext
from tokenizers import CharBPETokenizer
import tokenizers

from sklearn.model_selection import train_test_split
import pandas as pd


# specify GPU, falling back to the CPU when CUDA is not available so the
# notebook still runs on machines without a GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the train and test frames from their CSV files (train first, then test)
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("output/x_train.csv", "output/x_test.csv")
)

In [2]:
# Start / end-of-sequence marker strings
SOS_TOKEN, EOS_TOKEN = '<sos>', '<eos>'

# Longest src / tgt length that len_filter still accepts
MAX_LEN = 5000


def len_filter(example):
    """Return True when neither side of `example` is longer than MAX_LEN.

    `example` must expose `src` and `tgt` attributes that support `len()`.
    """
    if len(example.src) > MAX_LEN:
        return False
    return len(example.tgt) <= MAX_LEN

In [3]:
# Set paths
train_path = 'output/x_train_nn.csv'
test_path = 'output/x_test_nn.csv'

# Save files: header-less, index-less CSVs holding only the columns that are
# read back below (in the same order as the dataset fields)
nn_columns = ['question', 'prep_answer', 'cluster']
X_train[nn_columns].to_csv(train_path, index=False, header=False)
X_test[nn_columns].to_csv(test_path, index=False, header=False)

# Create pytorch variables
src = torchtext.data.Field(include_lengths=True, lower=True)
tgt = torchtext.data.Field(
    lower=True,
    # wrap every target sequence in the start / end markers
    preprocessing=lambda seq: [SOS_TOKEN] + seq + [EOS_TOKEN],
)
cluster = torchtext.data.Field()


def _read_split(csv_path):
    """Build the TabularDataset for one CSV file, filtered through len_filter."""
    return torchtext.data.TabularDataset(
        path=csv_path, format='csv',
        fields=[('src', src), ('tgt', tgt), ('cluster', cluster)],
        filter_pred=len_filter,
    )


data_train = _read_split(train_path)
data_test = _read_split(test_path)

In [4]:
# Fit both vocabularies on the training split only (max_size=50000 each)
src.build_vocab(data_train, max_size=50000)
tgt.build_vocab(data_train, max_size=50000)
input_vocab, output_vocab = src.vocab, tgt.vocab

# Peek at the first 20 keys of each token -> index mapping
print('20 tokens from input vocab:\n', list(input_vocab.stoi)[:20])
print('\n20 tokens from output vocab:\n', list(output_vocab.stoi)[:20])

print('\nnum training examples:', len(data_train.examples))

# Show one randomly drawn training example
item = random.choice(data_train.examples)
print('\nexample train data:')
print('src:\n', item.src)
print('tgt:\n', item.tgt)

20 tokens from input vocab:
 ['<unk>', '<pad>', 'the', 'to', 'of', 'a', 'i', 'in', 'and', 'is', 'that', 'for', 'this', 'be', 'it', 'are', 'have', 'my', 'on', 'with']

20 tokens from output vocab:
 ['<unk>', '<pad>', 'the', 'to', 'of', 'and', 'a', 'in', 'is', 'that', 'for', 'you', 'are', 'be', 'with', 'or', 'it', 'have', 'not', 'as']

num training examples: 1034

example train data:
src:
 ['can', 'coronavirus', 'go', 'through', 'skin', 'and', 'into', 'the', 'body?']
tgt:
 ['<sos>', '“it', 'may', 'be', 'possible', 'that', 'a', 'person', 'can', 'get', 'covid-19', 'by', 'touching', 'a', 'surface', 'or', 'object', 'that', 'has', 'the', 'virus', 'on', 'it', 'and', 'then', 'touching', 'their', 'own', 'mouth,', 'nose,', 'or', 'possibly', 'their', 'eyes,', 'but', 'this', 'is', 'not', 'thought', 'to', 'be', 'the', 'main', 'way', 'the', 'virus', 'spreads,”', 'the', 'cdc', 'says.', 'more', 'often', 'than', 'not,', 'people', 'get', 'coronavirus', 'through', 'respiratory', 'droplets', 'produced', 'w

In [None]:
def encode_qa(questions, answers, tokenizer, maxlen=1024):
    """
    Encode aligned question/answer pairs into one array of token ids.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        questions: sequence of question strings.
        answers: sequence of answer strings, aligned with `questions`.
        tokenizer: tokenizer object exposing `enable_truncation`,
            `enable_padding` and `encode_batch`. It is reconfigured in place:
            truncation to `maxlen` and padding with id 1 / "<pad>" are enabled.
        maxlen: maximum number of ids kept per encoded pair.

    Returns:
        np.ndarray with one row of ids per (question, answer) pair. The pairs
        are encoded as a single batch so that the tokenizer's padding brings
        every row to the same length. An empty input yields an empty array.

    Raises:
        ValueError: if `questions` and `answers` differ in length.
    """
    if len(questions) != len(answers):
        raise ValueError(
            "questions and answers must have the same length "
            f"(got {len(questions)} and {len(answers)})"
        )

    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(pad_id=1, pad_token="<pad>")

    # zip iterates by position, so this also works for containers whose
    # integer indexing is label-based (e.g. a filtered pandas Series).
    pairs = list(zip(questions, answers))
    if not pairs:
        return np.array([])

    # Encoding one pair at a time leaves rows of different lengths, which
    # cannot be stacked into a rectangular array; batch encoding pads them.
    encodings = tokenizer.encode_batch(pairs)
    return np.array([enc.ids for enc in encodings])

In [None]:
# Character-level BPE tokenizer, created with bert_normalizer=False
tokenizer = CharBPETokenizer(bert_normalizer=False)

# Generate the vocabulary from the text file, registering the special tokens
tokenizer.train(
    ["data/elon.txt"],
    special_tokens=["<s>", "<pad>", "</s>", "<mask>"],
)

# Post-process every encoding so that it starts with the start-of-sequence
# token <s> and ends with the end-of-sequence token </s>
_sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
_cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = tokenizers.processors.BertProcessing(
    _sep_pair,
    _cls_pair,
)

# Truncate to the maximum length
tokenizer.enable_truncation(max_length=1024)

# Enable padding
tokenizer.enable_padding(pad_id=1, pad_token="<pad>")

# Encode the corpus into lists of token ids
# NOTE(review): `data_clean` is not defined in the visible part of the
# notebook - confirm it exists before this cell runs
token_data = [encoding.ids for encoding in tokenizer.encode_batch(data_clean)]

In [5]:
import math
def attention(query, key, value, mask=None, dropout=0.0):
  """Compute scaled dot-product attention.

  Args:
    query, key, value: tensors whose last dimension is the feature dimension
      and whose second-to-last dimension is the sequence position.
    mask: optional tensor broadcastable to the score matrix; positions where
      `mask == 0` are excluded from attention.
    dropout: dropout probability applied to the attention weights. It is
      applied on every call, so pass 0.0 at evaluation time.

  Returns:
    Tuple `(output, p_attn)`: the attention-weighted sum of `value` and the
    attention weights themselves.
  """
  d_k = query.size(-1)
  scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
  if mask is not None:
    # Use a large negative score so softmax drives masked positions to ~0.
    # (The previous fill value of -1e-9 is effectively zero and masked nothing.)
    scores = scores.masked_fill(mask == 0, -1e9)
  p_attn = F.softmax(scores, dim=-1)
  p_attn = F.dropout(p_attn, p=dropout)
  return torch.matmul(p_attn, value), p_attn

In [6]:
import copy
def clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    replicas = []
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return nn.ModuleList(replicas)

In [7]:
class MultiHeadedAttention(nn.Module):
  """Multi-head attention built from `h` scaled dot-product attention heads.

  Args:
    h: number of heads; must divide `d_model`.
    d_model: model (feature) dimension of the inputs and of the output.
    dropout: dropout probability applied to the attention weights while the
      module is in training mode.

  Attributes:
    attn: attention weights produced by the most recent forward pass
      (None before the first call).
  """

  def __init__(self, h, d_model, dropout=0.1):
    super(MultiHeadedAttention, self).__init__()
    assert d_model % h == 0

    self.d_k = d_model // h
    self.h = h
    self.p = dropout
    # Three input projections (query, key, value) plus the output projection.
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.attn = None

  def forward(self, query, key, value, mask=None):
    """Project the inputs, attend per head, then merge the heads.

    Returns a tensor of shape (nbatches, seq, h * d_k).
    """
    if mask is not None:
      # Add a head dimension so the same mask is broadcast over all heads.
      mask = mask.unsqueeze(1)

    nbatches = query.size(0)

    # (nbatches, seq, d_model) -> (nbatches, h, seq, d_k). zip stops after the
    # first three linears; the fourth is the output projection used below.
    query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1,2)
                        for l,x in zip(self.linears, (query, key, value))]

    # `attention` applies dropout unconditionally, so switch it off explicitly
    # when the module is in eval mode.
    p_drop = self.p if self.training else 0.0
    x, self.attn = attention(query, key, value, mask=mask, dropout=p_drop)

    # Concatenate the heads back into a single feature dimension.
    x = x.transpose(1,2).contiguous().view(nbatches, -1, self.h * self.d_k)

    return self.linears[-1](x)

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: w_2(dropout(relu(w_1(x))))."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # d_model -> d_ff expansion followed by d_ff -> d_model projection
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Args:
        d_model: feature dimension of the input (even or odd).
        dropout: dropout probability applied after adding the encodings.
        max_len: number of positions for which encodings are precomputed.
    """
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        # Explicit float dtype so the product below never relies on
        # integer/float type promotion.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        # Registered as a buffer: follows the module across devices and is
        # never a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first `x.size(1)` positions to `x`."""
        # Buffers do not require grad, so no Variable wrapper is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
class GPT(nn.Module):
  """Decoder-only model: embed the input, then run it through the decoder.

  `generator` is stored on the module but is not applied by `forward`.
  """

  def __init__(self, decoder, tgt_embed, generator):
    super().__init__()
    self.decoder, self.embed, self.generator = decoder, tgt_embed, generator

  def forward(self, x, x_mask):
    """Return `decoder(embed(x), x_mask)`."""
    embedded = self.embed(x)
    return self.decoder(embedded, x_mask)

In [None]:
class LayerNorm(nn.Module):
    """Normalize over the last dimension, with a learnable gain and bias."""

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation to avoid dividing by zero
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sublayer: x + dropout(sublayer(norm(x))).
    The layer norm is applied to the sublayer's input, not to its output.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Run `sublayer` on the normalized input and add the result to `x`."""
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch

In [None]:
class Decoder(nn.Module):
  """Stack of N copies of `layer`, followed by a final LayerNorm."""

  def __init__(self, layer, N):
    super().__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self,x, tgt_mask):
    """Feed `x` through each layer in order (same mask every time), then normalize."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, tgt_mask)
    return self.norm(hidden)


class DecoderLayer(nn.Module):
  """One decoder block: self-attention, then feed-forward, each in a SublayerConnection."""

  def __init__(self, size, self_attn, feed_forward, dropout):
    super().__init__()
    self.size = size
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(size, dropout), 2)

  def forward(self, x, tgt_mask):
    """Apply masked self-attention followed by the feed-forward network."""

    def attend(h):
      # query, key and value are all the same tensor
      return self.self_attn(h, h, h, tgt_mask)

    attended = self.sublayer[0](x, attend)
    return self.sublayer[1](attended, self.feed_forward)

In [None]:
class Generator(nn.Module):
    """Linear projection to `vocab` scores followed by log-softmax."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        logits = self.proj(x)
        # log-probabilities over the last dimension
        return F.log_softmax(logits, dim=-1)

class Embeddings(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(d_model)."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        scale = math.sqrt(self.d_model)
        looked_up = self.lut(x)
        return looked_up * scale

In [8]:
def make_model(eng_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
  """Build a decoder-only GPT model and Xavier-initialize its weight matrices.

  Args:
    eng_vocab: vocabulary size, used for both the embedding table and the
      generator's output dimension.
    N: number of decoder layers.
    d_model: model (feature) dimension.
    d_ff: inner dimension of the position-wise feed-forward network.
    h: number of attention heads.
    dropout: dropout probability used by the attention, feed-forward,
      positional-encoding and sublayer modules.

  Returns:
    The constructed GPT module.
  """
  c = copy.deepcopy
  # Forward `dropout` so the attention module honours the caller's setting
  # instead of silently using its own default.
  attn = MultiHeadedAttention(h, d_model, dropout)
  ff   = PositionwiseFeedForward(d_model, d_ff, dropout)
  position = PositionalEncoding(d_model, dropout)

  model = GPT(
      Decoder(DecoderLayer(d_model, c(attn), c(ff), dropout), N),
      nn.Sequential(Embeddings(d_model, eng_vocab), c(position)),
      Generator(d_model, eng_vocab)
  )

  # Only matrices (dim > 1) are re-initialized; biases and the LayerNorm
  # gain/bias vectors keep their defaults.
  for p in model.parameters():
    if p.dim() > 1:
      # In-place initializer; the un-suffixed `xavier_uniform` is deprecated.
      nn.init.xavier_uniform_(p)
  return model

In [9]:

class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: size of both the embeddings and the GRU hidden state.
    """
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, myinput, hidden):
        """Embed one token and advance the GRU by one step.

        Returns the GRU output and the new hidden state.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) as the GRU expects.
        embedded = self.embedding(myinput).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on this module's device."""
        # Follow the module's own parameters rather than a notebook-level
        # global, so the state always sits on the same device as the weights.
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


class DecoderRNN(nn.Module):
    """GRU decoder that emits log-probabilities for one output token per call.

    Args:
        hidden_size: size of both the embeddings and the GRU hidden state.
        output_size: number of rows in the embedding table and number of
            output classes.
    """
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Embed the previous token, advance the GRU one step and score the vocabulary.

        Returns a (1, output_size) tensor of log-probabilities and the new
        hidden state.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) as the GRU expects.
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        # output[0] drops the length-1 sequence dimension before projecting.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on this module's device."""
        # Follow the module's own parameters rather than a notebook-level
        # global, so the state always sits on the same device as the weights.
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

In [10]:

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LEN, teacher_forcing_ratio=0.5):
    """Run one optimization step on a single (input, target) sequence pair.

    Args:
        input_tensor: source token indices; iterated along dimension 0.
        target_tensor: target token indices; iterated along dimension 0.
        encoder, decoder: modules following the EncoderRNN / DecoderRNN call
            convention (`initHidden()`, one token per forward call).
        encoder_optimizer, decoder_optimizer: optimizers for the two modules.
        criterion: loss called as `criterion(decoder_output, target)`.
        max_length: unused; kept only so existing callers keep working.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            (instead of the decoder's own prediction) for the whole sequence.

    Returns:
        The summed loss divided by the target length, as a Python float
        (0.0 for an empty target, in which case no update is made).
    """
    # get the seq lengths, used for iterating through encoder/decoder
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # nothing to learn from an empty target; `loss` would stay a plain int
    # and `.backward()` would fail
    if target_length == 0:
        return 0.0

    # get an initial hidden state for the encoder
    encoder_hidden = encoder.initHidden()

    # zero the gradients of the optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # pass the inputs through the encoder; only the final hidden state is
    # used, so the per-step outputs are not stored
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    # create a start-of-sequence tensor for the decoder, on the same device
    # as the data
    decoder_input = torch.tensor([[output_vocab.stoi[SOS_TOKEN]]], device=target_tensor.device)
    eos_index = output_vocab.stoi[EOS_TOKEN]

    # set the decoder hidden state to the final encoder hidden state
    decoder_hidden = encoder_hidden

    # decide once per sequence whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    # NOTE(review): if the target sequence starts with SOS_TOKEN, the first
    # step trains the decoder to predict SOS from SOS - confirm this is intended.
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))

        if use_teacher_forcing:
            decoder_input = target_tensor[di]
        else:
            # greedy prediction, detached from history as the next input
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()

        if decoder_input.item() == eos_index:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # NOTE(review): normalized by the full target length even when decoding
    # stopped early at EOS.
    return loss.item() / target_length

In [11]:
def trainIters(encoder, decoder, n_iters, print_every=10000, learning_rate=0.04, teacher_forcing_ratio=0.2):
    """Train the encoder/decoder pair on `data_train` for `n_iters` epochs.

    Args:
        encoder, decoder: modules passed through to `train`.
        n_iters: number of passes (epochs) over `data_train`.
        print_every: number of steps between running-average loss reports.
        learning_rate: learning rate of the two SGD optimizers.
        teacher_forcing_ratio: forwarded to `train`.

    Prints a running average every `print_every` steps and the average loss
    at the end of every epoch. Returns nothing.
    """
    print(f'Running {n_iters} epochs...')

    encoder_optim = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optim = optim.SGD(decoder.parameters(), lr=learning_rate)

    # note batch size of 1, just for simplicity
    # DO NOT INCREASE THE BATCH SIZE (the flattening below relies on it)
    batch_iterator = torchtext.data.Iterator(
        dataset=data_train, batch_size=1,
        sort=False, sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device, repeat=False)

    criterion = nn.NLLLoss()

    for e in range(n_iters):
        # reset both accumulators so averages never mix steps from two epochs
        print_loss_total = 0
        print_loss_epoch = 0
        step = 0
        start = time.time()
        for batch in batch_iterator:
            step += 1

            # get the input and target from the batch iterator
            input_tensor, input_lengths = batch.src
            target_tensor = batch.tgt

            # The batch holds exactly one example. Flatten it to a 1-D token
            # sequence instead of indexing [0]: with the Field default
            # batch_first=False the tensors are (seq_len, batch), so [0]
            # would keep only the first token of the example.
            input_tensor = input_tensor.reshape(-1)
            target_tensor = target_tensor.reshape(-1)

            loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optim, decoder_optim, criterion, teacher_forcing_ratio=teacher_forcing_ratio)
            print_loss_total += loss
            print_loss_epoch += loss

            if step % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                t = (time.time() - start) / 60
                print(f'step: {step}\t avg loss: {print_loss_avg:.2f}\t time for {print_every} steps: {t:.2f} min')
                start = time.time()

        # max(step, 1) avoids a ZeroDivisionError when the dataset is empty
        print_loss_avg = print_loss_epoch / max(step, 1)
        print(f'End of epoch {e}, avg loss {print_loss_avg:.2f}')