In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/END

/content/drive/MyDrive/END


In [None]:
%%bash
python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from tqdm.notebook import tqdm
import torchtext
import spacy
from torchtext.data import Field
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import BucketIterator
import torch.optim as optim
import time
import random
import math
from torchtext import data

In [None]:
nlp = spacy.load("en")

In [None]:
def tokenize(text):
  """Return the token strings of `text` as a list, using the tokenizer of the notebook-level `nlp` pipeline."""
  tokens = []
  for token in nlp.tokenizer(text):
    tokens.append(token.text)
  return tokens

In [None]:
# train_df.to_csv("COQA_train.csv", index = False)
train_df = pd.read_csv("COQA_train.csv")

In [None]:
text_field = Field(
    sequential=True,
    tokenize=tokenize, 
    lower=True,
    init_token='<sos>', eos_token='<eos>'
)

In [None]:
fields = {'ques_context' : ('ques_context', text_field),
          'answer':('answer', text_field)}

In [None]:
text_dataset = torchtext.data.TabularDataset(path='COQA_train.csv', format='CSV', fields=fields)

In [None]:
# Seed Python's RNG and pass its state to `split` explicitly so the 70/30
# partition is the same on every Restart & Run All instead of depending on
# whatever state the global RNG happens to be in.
# NOTE: the name `train` is rebound to the training function further down this
# notebook, so anything that needs this dataset must run before that cell.
SPLIT_SEED = 1234
random.seed(SPLIT_SEED)
train, valid = text_dataset.split(split_ratio=[0.7, 0.3], random_state=random.getstate())

In [None]:
len(train), len(valid)

(76053, 32594)

In [None]:
MAX_VOCAB_SIZE = 20000
text_field.build_vocab(train, min_freq=2, max_size=MAX_VOCAB_SIZE)

In [None]:
def get_example(data, example_number):
    """Print one example of `data` as plain text.

    Looks up `data.examples[example_number]` and prints its `ques_context`
    tokens followed by its `answer` tokens, each joined with single spaces.
    """
    question_context = ' '.join(data.examples[example_number].ques_context)
    print(f"Question and Context : \n\n{question_context}")

    answer = ' '.join(data.examples[example_number].answer)
    print(f"\nAnswer : \n\n{answer}")

In [None]:
get_example(train, 0)

Question and Context : 

what made him a wartime leader ? bush 's margin of victory in the popular vote was the smallest ever for a reelected incumbent president , but marked the first time since his father 's victory 16 years prior that a candidate won a majority of the popular vote . the electoral map closely resembled that of 2000 , with only three states changing sides : new mexico and iowa voted republican in 2004 after having voted democratic in 2000 , while new hampshire voted democratic in 2004 after previously voting republican . in the electoral college , bush received 286 votes to kerry 's 252 . 

 just eight months into his presidency , the terrorist attacks of september 11 , 2001 suddenly transformed bush into a wartime president . bush 's approval ratings surged to near 90 % . within a month , the forces of a coalition led by the united states entered afghanistan , which had been sheltering osama bin laden , suspected mastermind of the september 11 attacks . by december ,

In [None]:
get_example(valid, 0)

Question and Context : 

is this the final ? ( cnn ) -- portsmouth will play chelsea in the fa cup final after an upset 2 - 0 extra - time victory over tottenham hotspur in the second semifinal at wembley on sunday . 

 french striker frederic piquionne opened the scoring for avram grant 's men nine minutes into extra - time . 

 former tottenham midfielder kevin - prince boateng scored the second with three minutes remaining from the penalty spot after referee alan wiley awarded a spot kick as wilson palacios fouled aruna dindane . 

 it was a humiliating defeat for tottenham and their manager harry redknapp , who steered portsmouth to fa cup triumph in 2008 before leaving the cash - strapped club for white hart lane . 

 his team went into the match as overwhelming favorites against a pompey team who had been relegated from the premier league the day before without playing , having been deducted nine points after going into administration . 

 but all that was forgotten as their fana

In [None]:
# Pick the GPU only when CUDA is actually usable. `is_available` has to be
# *called*: the bare function object is always truthy, which would select
# 'cuda' even on a CPU-only runtime and fail later when tensors are moved.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Number of examples per batch for both iterators.
BATCH_SIZE = 8

# Build the train/validation batch iterators, placing batches on `device`.
# `sort_key` is the token count of each example's question+context field
# (presumably used by BucketIterator to batch examples of similar length —
# confirm against the torchtext docs); examples are not sorted inside a batch.
train_iterator, valid_iterator = BucketIterator.splits((train, valid), batch_size = BATCH_SIZE, sort_key = lambda x: len(x.ques_context), sort_within_batch = False, device = device)

In [None]:
batch_ex = next(iter(train_iterator))
batch_ex.ques_context

tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [  50,  158,  106,  ...,   49,   55,   16],
        [  16,   50,   14,  ...,   55,   15,   61],
        ...,
        [2158,    1,    1,  ...,    1,    1,    1],
        [   6,    1,    1,  ...,    1,    1,    1],
        [   3,    1,    1,  ...,    1,    1,    1]], device='cuda:0')

In [None]:
batch_ex.ques_context.shape

torch.Size([440, 8])

In [None]:
[text_field.vocab.itos[x] for x in batch_ex.ques_context[:, 0]]

['<sos>',
 'what',
 'is',
 'the',
 'theory',
 'of',
 'why',
 'children',
 'like',
 'santa',
 'claus',
 'so',
 'much',
 '?',
 'one',
 'of',
 'the',
 'traditions',
 'which',
 'is',
 'now',
 'a',
 'necessary',
 'part',
 'of',
 'christmas',
 'is',
 'a',
 'that',
 'of',
 'father',
 'christmas',
 ',',
 'or',
 'santa',
 'claus',
 '.',
 'according',
 'to',
 'the',
 'modern',
 'legend',
 ',',
 'he',
 'is',
 'a',
 'magical',
 'figure',
 'who',
 'visits',
 'all',
 'the',
 'children',
 'of',
 'the',
 'world',
 'during',
 'the',
 'night',
 'before',
 'christmas',
 'day',
 ',',
 'leaving',
 'presents',
 'which',
 'they',
 'find',
 'the',
 'next',
 'morning',
 '.',
 'he',
 'flies',
 'through',
 'the',
 'night',
 'sky',
 'in',
 'a',
 'sledge',
 'pulled',
 'by',
 'reindeer',
 ',',
 'and',
 'enters',
 'houses',
 'by',
 'climbing',
 'down',
 '<unk>',
 '.',
 'this',
 'strange',
 'legend',
 'is',
 'based',
 'on',
 'the',
 'life',
 'of',
 'a',
 'man',
 'called',
 'nicholas',
 ',',
 'but',
 'in',
 'fact',
 '

In [None]:
[text_field.vocab.itos[x] for x in batch_ex.answer[:, 0]]

['<sos>', 'he', 'is', 'like', 'an', 'ideal', 'father', '<eos>', '<pad>']

In [None]:
class Encoder(nn.Module):
    """Bidirectional single-layer GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU and
    projects the two final directional states into the decoder's hidden size.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        """Create the embedding, the bidirectional GRU, the projection and dropout.

        input_dim:   vocabulary size of the source tokens
        emb_dim:     embedding size
        enc_hid_dim: hidden size of each GRU direction
        dec_hid_dim: size of the initial decoder hidden state produced here
        dropout:     dropout probability applied to the embeddings
        """
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        # The forward and backward final states are concatenated, hence * 2.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode `src` ([src len, batch size]).

        Returns a pair:
          outputs: [src len, batch size, enc hid dim * 2] - per-position GRU outputs
          hidden:  [batch size, dec hid dim] - initial hidden state for the decoder
        """
        # [src len, batch size, emb dim]
        embedded = self.dropout(self.embedding(src))

        # final_states = [n layers * num directions, batch size, enc hid dim]
        outputs, final_states = self.rnn(embedded)

        # The last two rows are the final forward and final backward states.
        last_forward = final_states[-2, :, :]
        last_backward = final_states[-1, :, :]

        # Merge both directions and squash them into the decoder's hidden size.
        merged = torch.cat((last_forward, last_backward), dim = 1)
        hidden = torch.tanh(self.fc(merged))

        return outputs, hidden

class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    Scores every source position against the current decoder hidden state and
    returns the scores normalised with a softmax over the source length.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        """Create the energy layer and the scoring vector `v`."""
        super().__init__()

        # Input is [decoder hidden ; bidirectional encoder output].
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape [batch size, src len].

        hidden:          [batch size, dec hid dim]
        encoder_outputs: [src len, batch size, enc hid dim * 2]
        """
        src_len = encoder_outputs.shape[0]

        # Copy the decoder state once per source position:
        # [batch size, src len, dec hid dim]
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        # Put the batch dimension first: [batch size, src len, enc hid dim * 2]
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        # [batch size, src len, dec hid dim]
        combined = torch.cat((tiled_hidden, batch_first_outputs), dim = 2)
        energy = torch.tanh(self.attn(combined))

        # One scalar score per source position: [batch size, src len]
        scores = self.v(energy).squeeze(2)

        return F.softmax(scores, dim=1)

In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Each call consumes one token per batch element and produces the scores for
    the next token together with the updated hidden state.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        """Create the embedding, GRU, output projection and dropout.

        output_dim:  vocabulary size of the target tokens
        emb_dim:     embedding size
        enc_hid_dim: hidden size of each encoder direction
        dec_hid_dim: hidden size of the decoder GRU
        dropout:     dropout probability applied to the embeddings
        attention:   module called as attention(hidden, encoder_outputs)
        """
        super().__init__()

        # Width of the attention-weighted encoder summary (both directions).
        context_dim = enc_hid_dim * 2

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(context_dim + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear(context_dim + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        input:           [batch size] - current token ids
        hidden:          [batch size, dec hid dim]
        encoder_outputs: [src len, batch size, enc hid dim * 2]

        Returns (prediction, hidden):
          prediction: [batch size, output dim]
          hidden:     [batch size, dec hid dim]
        """
        # Add a length-1 sequence axis and embed: [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))

        # Attention weights with a middle axis for bmm: [batch size, 1, src len]
        attn_weights = self.attention(hidden, encoder_outputs).unsqueeze(1)

        # [batch size, src len, enc hid dim * 2]
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        # Weighted sum of encoder outputs, back to sequence-first layout:
        # [1, batch size, enc hid dim * 2]
        weighted = torch.bmm(attn_weights, batch_first_outputs).permute(1, 0, 2)

        # [1, batch size, (enc hid dim * 2) + emb dim]
        rnn_input = torch.cat((embedded, weighted), dim = 2)

        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        # With seq len, layers and directions all equal to 1 the GRU output
        # and its final hidden state are the same tensor of values.
        assert (output == hidden).all()

        # Drop the length-1 axis and score the next token from the GRU output,
        # the attention summary and the input embedding together.
        step_features = torch.cat(
            (output.squeeze(0), weighted.squeeze(0), embedded.squeeze(0)), dim = 1
        )
        prediction = self.fc_out(step_features)

        return prediction, hidden.squeeze(0)

In [None]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder into one sequence-to-sequence model."""

    def __init__(self, encoder, decoder, device):
        """Store the encoder, the decoder and the device used for the output buffer."""
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode `trg.shape[0] - 1` steps and return the per-step scores.

        src: [src len, batch size]
        trg: [trg len, batch size]; row 0 supplies the first decoder input
        teacher_forcing_ratio: probability, drawn independently at every step,
            of feeding the reference token instead of the model's own
            highest-scoring token as the next decoder input

        Returns a tensor of shape [trg len, batch size, decoder.output_dim].
        Row 0 is never written and stays all zeros.
        """
        trg_len = trg.shape[0]
        batch_size = src.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer for the scores of every step; filled from index 1 onwards.
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # All encoder states plus the projected final state that seeds the decoder.
        encoder_outputs, hidden = self.encoder(src)

        # Decoding starts from the first row of the target.
        decoder_input = trg[0, :]

        for step in range(1, trg_len):
            step_scores, hidden = self.decoder(decoder_input, hidden, encoder_outputs)
            outputs[step] = step_scores

            # One random draw per step decides whether to teacher-force.
            use_reference = random.random() < teacher_forcing_ratio

            # Highest-scoring token for each batch element.
            greedy_tokens = step_scores.argmax(1)

            decoder_input = trg[step] if use_reference else greedy_tokens

        return outputs

In [None]:
# Source and target share the single `text_field` vocabulary, so both sizes are equal.
INPUT_DIM = len(text_field.vocab)
OUTPUT_DIM = len(text_field.vocab)
# Embedding sizes and GRU hidden sizes for the encoder and the decoder.
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
ENC_HID_DIM = 128
DEC_HID_DIM = 128
# Dropout probabilities; each module applies dropout to its embeddings.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# The attention module is built first because the decoder takes it as a constructor argument.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

# Assemble the full model and move its parameters to `device`.
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
def init_weights(m):
    """Re-initialise every parameter reachable from module `m` in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    all remaining parameters (the biases) are set to zero.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
            
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(20004, 128)
    (rnn): GRU(128, 128, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=384, out_features=128, bias=True)
      (v): Linear(in_features=128, out_features=1, bias=False)
    )
    (embedding): Embedding(20004, 128)
    (rnn): GRU(384, 128)
    (fc_out): Linear(in_features=512, out_features=20004, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 15,860,900 trainable parameters


In [None]:
# AdamW over all model parameters, using the optimizer's default hyperparameters.
optimizer = optim.AdamW(model.parameters())
# Index of the padding token in the shared vocabulary.
PAD_IDX = text_field.vocab.stoi[text_field.pad_token]
# Cross-entropy that skips target positions equal to PAD_IDX, so padding does not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    model:     called as model(src, trg); expected to return scores shaped
               [trg len, batch size, output dim]
    iterator:  sized iterable of batches exposing `.ques_context` and `.answer`
    optimizer: optimizer stepped once per batch
    criterion: loss applied to the flattened scores and targets
    clip:      maximum gradient norm passed to clip_grad_norm_
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        src, trg = batch.ques_context, batch.answer

        optimizer.zero_grad()

        scores = model(src, trg)

        # Position 0 is dropped from both scores and targets, then the
        # remaining steps are flattened so the loss sees one row per token:
        #   flat_scores  = [(trg len - 1) * batch size, output dim]
        #   flat_targets = [(trg len - 1) * batch size]
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        # Rescale gradients whose total norm exceeds `clip` before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Return the mean batch loss of `model` over `iterator` without updating it.

    The model is switched to eval mode, gradients are disabled, and the model
    is called with a teacher-forcing ratio of 0 so it decodes from its own
    predictions.

    iterator:  sized iterable of batches exposing `.ques_context` and `.answer`
    criterion: loss applied to the flattened scores and targets
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.ques_context, batch.answer

            # Third positional argument 0 turns teacher forcing off.
            scores = model(src, trg, 0)

            # Drop position 0 and flatten to one row per target token:
            #   flat_scores  = [(trg len - 1) * batch size, output dim]
            #   flat_targets = [(trg len - 1) * batch size]
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds.

    Returns a tuple (minutes, seconds); fractional seconds are truncated.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# Number of full passes over the training iterator.
N_EPOCHS = 10
# Gradient-norm ceiling handed to `train` (used there for clip_grad_norm_).
CLIP = 1

# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    # The reported time covers both the training pass and the validation pass.
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'COQA_model2.pt')
    
    # PPL (perplexity) is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 16m 14s
	Train Loss: 5.123 | Train PPL: 167.800
	 Val. Loss: 4.975 |  Val. PPL: 144.808
Epoch: 02 | Time: 16m 21s
	Train Loss: 4.584 | Train PPL:  97.891
	 Val. Loss: 4.866 |  Val. PPL: 129.780
Epoch: 03 | Time: 16m 32s
	Train Loss: 4.324 | Train PPL:  75.457
	 Val. Loss: 4.819 |  Val. PPL: 123.795
Epoch: 04 | Time: 16m 23s
	Train Loss: 4.098 | Train PPL:  60.220
	 Val. Loss: 4.816 |  Val. PPL: 123.483
Epoch: 05 | Time: 16m 32s
	Train Loss: 3.909 | Train PPL:  49.847
	 Val. Loss: 4.807 |  Val. PPL: 122.373
Epoch: 06 | Time: 16m 25s
	Train Loss: 3.741 | Train PPL:  42.150
	 Val. Loss: 4.840 |  Val. PPL: 126.519
Epoch: 07 | Time: 16m 16s
	Train Loss: 3.602 | Train PPL:  36.656
	 Val. Loss: 4.870 |  Val. PPL: 130.376
Epoch: 08 | Time: 16m 17s
	Train Loss: 3.488 | Train PPL:  32.722
	 Val. Loss: 4.874 |  Val. PPL: 130.780
Epoch: 09 | Time: 16m 13s
	Train Loss: 3.385 | Train PPL:  29.504
	 Val. Loss: 4.910 |  Val. PPL: 135.699
Epoch: 10 | Time: 16m 19s
	Train Loss: 3.303 |

In [None]:
# Restore the best checkpoint written by the training loop. `map_location`
# remaps the saved tensors onto the current `device`, so a checkpoint that was
# saved from a GPU run can also be loaded on a CPU-only runtime.
model.load_state_dict(torch.load('COQA_model2.pt', map_location=device))

<All keys matched successfully>

In [None]:
# Take one validation batch and decode it without teacher forcing.
batch = next(iter(valid_iterator))
src = batch.ques_context
trg = batch.answer

# Inference only: make sure dropout is disabled regardless of which cell ran
# last, and skip autograd bookkeeping so no graph is kept alive for the
# [trg len, batch size, vocab] score tensor.
model.eval()
with torch.no_grad():
    prediction = model(src, trg, 0)

In [None]:
prediction[1:].view(-1, len(text_field.vocab)).cpu().shape

torch.Size([56, 20004])

In [None]:
prediction.shape

torch.Size([8, 8, 20004])

In [None]:
import textwrap

In [None]:
def show_predictions(src, trg, example_id, predictions=None):
  """Print the source text, the reference answer and the predicted answer for one batch column.

  Args:
    src: token-id tensor indexed as [position, example].
    trg: token-id tensor indexed as [position, example].
    example_id: column of the batch to display.
    predictions: model scores indexed as [position, example, vocab]. When
      omitted, the notebook-level `prediction` tensor is used, which keeps
      existing three-argument calls working unchanged.

  Token ids are mapped back to strings through `text_field.vocab.itos`.
  """
  if predictions is None:
    # Backward-compatible fallback to the tensor produced by the inference cell.
    predictions = prediction

  def decode(token_ids):
    # Turn a 1-D tensor of token ids into a space-separated string.
    return " ".join([text_field.vocab.itos[x] for x in token_ids.cpu()])

  print("Question and Context:\n")
  for e in textwrap.wrap(decode(src[:, example_id]), 100):
    print(e)

  print("\nActual Answer: \n")
  print(decode(trg[:, example_id]))

  print("\nPredicted Answer: \n")
  # Position 0 of the score tensor is never written by Seq2Seq.forward
  # (decoding starts at step 1), so its argmax is index 0 and the first printed
  # token is a placeholder (shown as <unk> in this notebook's saved outputs),
  # not a real prediction.
  print(decode(torch.argmax(predictions[:, example_id, :], dim = 1)))

In [None]:
show_predictions(src, trg, 0)

Question and Context:

<sos> what did he like to collect the most ? timmy liked to play games and play sports but more than
anything he liked to collect things . he collected bottle caps . he collected sea shells . he
collected baseball cards . he has collected baseball cards the longest . he likes to collect the
thing that he has collected the longest the most . he once thought about collecting stamps but never
did . his most expensive collection was not his favorite collection . timmy spent the most money on
his bottle cap collection . <eos> <pad> <pad> <pad> <pad>

Actual Answer: 

<sos> baseball cards <eos> <pad> <pad> <pad> <pad>

Predicted Answer: 

<unk> his own <eos> <eos> <eos> <eos> <eos>


In [None]:
show_predictions(src, trg, 1)

Question and Context:

<sos> was his baseball card collection the most expensive ? timmy liked to play games and play
sports but more than anything he liked to collect things . he collected bottle caps . he collected
sea shells . he collected baseball cards . he has collected baseball cards the longest . he likes to
collect the thing that he has collected the longest the most . he once thought about collecting
stamps but never did . his most expensive collection was not his favorite collection . timmy spent
the most money on his bottle cap collection . <eos> <pad> <pad> <pad> <pad>

Actual Answer: 

<sos> no <eos> <pad> <pad> <pad> <pad> <pad>

Predicted Answer: 

<unk> no <eos> <eos> <eos> <eos> <eos> <eos>


In [None]:
show_predictions(src, trg, 5)

Question and Context:

<sos> does she have any pets with feathers ? molly likes animals . she has a cat . she has a dog .
she has a bird . she has a hamster . she has a bunny . her cat 's name is kitty . her dog 's name is
spike . her bird 's name is polly . her hamster 's name is barry . her bunny 's name is snowball .
kitty plays with yarn . spike plays with a ball . polly plays in her cage . barry runs on his wheel
. snowball eats carrots . <eos> <pad>

Actual Answer: 

<sos> yes <eos> <pad> <pad> <pad> <pad> <pad>

Predicted Answer: 

<unk> yes <eos> <eos> <eos> <eos> <eos> <eos>


In [None]:
show_predictions(src, trg, 2)

Question and Context:

<sos> what does snowball like ? molly likes animals . she has a cat . she has a dog . she has a bird
. she has a hamster . she has a bunny . her cat 's name is kitty . her dog 's name is spike . her
bird 's name is polly . her hamster 's name is barry . her bunny 's name is snowball . kitty plays
with yarn . spike plays with a ball . polly plays in her cage . barry runs on his wheel . snowball
eats carrots . <eos> <pad> <pad> <pad> <pad>

Actual Answer: 

<sos> a bunny <eos> <pad> <pad> <pad> <pad>

Predicted Answer: 

<unk> a ball <eos> ball <eos> <eos> <eos>


#### The model seems to do well on yes/no questions but fails on questions that require a descriptive answer.