In [None]:
import pandas as pd
import json
import matplotlib as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data.sampler import BatchSampler
from torch.optim import lr_scheduler
from PIL import Image
import timeit
from sklearn.pipeline import Pipeline
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy import data
import random
## For reproducibility
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)
random.seed(0)
torch.cuda.manual_seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create Preprocessing pipeline for summaries

In [None]:
tokenize =  lambda s: s.split()
import re  
def cleanup_text(texts):
    cleaned_text = []
    for text in texts:
        # remove punctuation
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        # remove multiple spaces
        text = re.sub(r' +', ' ', text)
        # remove newline
        text = re.sub(r'\n', ' ', text)
        #replace digits with '# symbol
        text = re.sub('[0-9]', '#', text)
        cleaned_text.append(text)
    return cleaned_text

In [None]:
text = "Hi\n ko, \t hs,  8998,  66jshs. hshs"
print(text.split())
cleanup_text(text.split())

['Hi', 'ko,', 'hs,', '8998,', '66jshs.', 'hshs']


['Hi', 'ko ', 'hs ', '#### ', '##jshs ', 'hshs']

## Create torchtext fields

In [None]:
#Field for summaries
SUM = data.Field(tokenize = tokenize,init_token='<sos>',eos_token='<eos>',pad_first=True,lower = True,preprocessing=cleanup_text)
#Field for title
TITLE = data.Field(tokenize = tokenize,init_token='<sos>',eos_token='<eos>',lower = True,preprocessing=cleanup_text)
#Field for Id
#ID = data.Field(use_vocab=False,sequential=False,dtype=torch.LongTensor,postprocessing=data.Pipeline(lambda x: int(x)))

In [None]:
fields = [('Id',None),('Abstract',None),('Title',TITLE),('sum1',SUM),('sum2',SUM),('sum3',SUM),('sum4',SUM),('sum5',SUM),('sum6',SUM),('sum7',SUM)]

## Read data into tabular dataset

In [None]:
dataset = data.TabularDataset(path='./drive/MyDrive/data_summaries.csv',format='csv', fields=fields,skip_header=True)

In [None]:
print(vars(dataset[0]))

{'Title': ['dual', 'recurrent', 'attention', 'units', 'for', 'visual', 'question', 'answering'], 'sum1': ['we', 'propose', 'an', 'architecture', 'for', 'vqa', 'which', 'utilizes', 'recurrent', 'layers', 'to', 'generate', 'visual', 'and', 'textual', 'attention the', 'memory', 'characteristic', 'of', 'the', 'proposed', 'recurrent', 'attention', 'units', 'offers', 'a', 'rich', 'joint', 'embedding', 'of', 'visual', 'and', 'textual', 'features', 'and', 'enables', 'the', 'model', 'to', 'reason', 'relations', 'between', 'several', 'parts', 'of', 'the', 'image', 'and', 'question '], 'sum2': ['we', 'propose', 'an', 'architecture', 'for', 'vqa', 'which', 'utilizes', 'recurrent', 'layers', 'to', 'generate', 'visual', 'and', 'textual', 'attention in', 'both', 'cases ', 'our', 'recurrent', 'attention', 'mechanism', 'improves', 'performance', 'in', 'tasks', 'requiring', 'sequential', 'or', 'relational', 'reasoning', 'on', 'the', 'vqa', 'dataset '], 'sum3': ['the', 'memory', 'characteristic', 'of', '

## Create training data and test data

In [None]:
import random
train_data, valid_data = dataset.split(split_ratio=0.9, random_state=random.seed(0))

In [None]:
print(len(train_data))
print(len(valid_data))

36900
4100


In [None]:
print(vars(train_data[5]))

{'Title': ['adaptively', 'learning', 'the', 'crowd', 'kernel'], 'sum1': ['we', 'introduce', 'an', 'algorithm', 'that ', 'given', 'n', 'objects ', 'learns', 'a', 'similarity', 'matrix', 'over', 'all', 'n #', 'pairs ', 'from', 'crowdsourced', 'data', 'alone the', 'algorithm', 'samples', 'responses', 'to', 'adaptively', 'chosen', 'triplet based', 'relative similarity', 'queries '], 'sum2': ['we', 'introduce', 'an', 'algorithm', 'that ', 'given', 'n', 'objects ', 'learns', 'a', 'similarity', 'matrix', 'over', 'all', 'n #', 'pairs ', 'from', 'crowdsourced', 'data', 'alone svms', 'reveal', 'that', 'the', 'crowd', 'kernel', 'captures', 'prominent', 'and', 'subtle', 'features', 'across', 'a', 'number', 'of', 'domains ', 'such', 'as', ' is', 'striped ', 'among', 'neckties', 'and', ' vowel', 'vs ', 'consonant ', 'among', 'letters '], 'sum3': ['we', 'introduce', 'an', 'algorithm', 'that ', 'given', 'n', 'objects ', 'learns', 'a', 'similarity', 'matrix', 'over', 'all', 'n #', 'pairs ', 'from', 'cr

In [None]:
SUM.build_vocab(train_data.sum1,train_data.sum2,train_data.sum3,train_data.sum4,train_data.sum5,\
                train_data.sum6,train_data.sum7,train_data.Title,max_size=40000,vectors='glove.6B.100d')
TITLE.vocab= SUM.vocab


In [None]:
print(len(SUM.vocab))

40004


### Checking

In [None]:
from torchtext import data, datasets, vocab
glove = vocab.GloVe(name="6B", dim=100)

In [None]:
glove.vectors.size() # => torch.Size([400000, 100])

torch.Size([400000, 100])

In [None]:
SUM.vocab.vectors.size()

torch.Size([40004, 100])

In [None]:
SUM.vocab.stoi['<unk>'] #<unk> is not present in glove vocab

0

In [None]:
SUM.vocab.vectors[0]#check the initialisation of oov words for glove. 

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])

In [None]:
assert(TITLE.vocab.stoi ==  SUM.vocab.stoi) #check if both share the same vocab or not

In [None]:
type(SUM.vocab.vectors)

torch.Tensor

In [None]:
type(SUM.vocab.stoi)

collections.defaultdict

In [None]:
torch.save(SUM.vocab.vectors, './drive/MyDrive/vocab_embeddings.pt')
torch.save(SUM.vocab.stoi,'./drive/MyDrive/vocab_stoi.pt')
torch.save(SUM.vocab.itos,'./drive/MyDrive/vocab_itos.pt')

In [None]:
torch.save(train_data.examples,'train_data.pt')

In [None]:
train_data.examples[0].Title

['unsupervised', 'semantic', 'parsing', 'of', 'video', 'collections']

## Create Bucket iterator

In [None]:
def cal_length(x):
  return len(x.sum1)+len(x.sum2)+len(x.sum3)+len(x.sum4)+len(x.sum5)+len(x.sum6)+len(x.sum7)+len(x.Title)
from torchtext.legacy import data
BATCH_SIZE =64
train_iterator, valid_iterator =data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE, sort_key = lambda x: cal_length(x), sort_within_batch = True,shuffle=True,sort=False,
    device = device)

###Checking

In [None]:
bt = next(iter(train_iterator))
bt.sum2

tensor([[    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        ...,
        [  160,    67,     0,  ...,     7,   303,     0],
        [10665,   881,  2246,  ..., 20638,  3045,   261],
        [    3,     3,     3,  ...,     3,     3,     3]], device='cuda:0')

In [None]:
bt.name

AttributeError: ignored

# Model Architecture

## Encoder layer

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, num_layers, dropout): 
        super().__init__()   
        self.hid_dim = hid_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)  
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers, dropout = dropout)    
        self.dropout = nn.Dropout(dropout)       
    def forward(self, input_idx):
        #print(input_idx)
        input_idx=input_idx.to(device)
        embedded = self.dropout(self.embedding(input_idx))
        outputs, (hidden, cell) = self.lstm(embedded)
        #embedded = seq_len,batch_size,embed_dim
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        #outputs are always from the top hidden layer
        return outputs,hidden

## Checking

In [None]:
INPUT_DIM = len(SUM.vocab)
OUTPUT_DIM = len(TITLE.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0
DEC_DROPOUT = 0 
pretrained_embeddings = SUM.vocab.vectors
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
def init_weights(m):
    for name, param in m.named_parameters():
      print(name)
      nn.init.uniform_(param.data, -0.1, 0.1)  
enc.apply(init_weights)
enc.embedding.weight.data.copy_(pretrained_embeddings)     

weight
weight_ih_l0
weight_hh_l0
bias_ih_l0
bias_hh_l0
weight_ih_l1
weight_hh_l1
bias_ih_l1
bias_hh_l1
weight_ih_l2
weight_hh_l2
bias_ih_l2
bias_hh_l2
embedding.weight
lstm.weight_ih_l0
lstm.weight_hh_l0
lstm.bias_ih_l0
lstm.bias_hh_l0
lstm.weight_ih_l1
lstm.weight_hh_l1
lstm.bias_ih_l1
lstm.bias_hh_l1
lstm.weight_ih_l2
lstm.weight_hh_l2
lstm.bias_ih_l2
lstm.bias_hh_l2


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
enc.embedding.weight[456]

tensor([ 0.1445, -0.0528, -0.0545,  1.1310,  0.4758, -1.0285,  0.2467, -0.0737,
        -0.0886,  0.3821,  0.5470, -0.2648,  0.3177, -0.1480, -0.2948, -0.7081,
        -0.1925,  0.2585, -0.2926,  0.1868,  0.3139, -0.0207, -0.1016, -0.2646,
        -0.0816, -0.1146,  0.0933, -0.5261,  0.3618, -0.8518, -0.3467,  0.4525,
        -0.2537,  0.2612,  0.7651, -0.2433, -0.0700,  0.3619, -1.2979, -0.0151,
        -0.1497,  0.3810,  0.3105,  0.1845,  0.2202, -0.4393,  1.0478, -0.3870,
        -0.1594, -1.2090, -0.2850, -0.4175, -0.1810,  1.0614,  0.3273, -1.8847,
         0.0398,  0.1253,  0.4937,  0.1528, -0.1738,  0.1407, -1.0588,  0.1461,
         0.3045, -0.0489, -0.2328, -0.3068,  0.6500, -0.1928,  0.4154,  0.3495,
         0.1158,  0.2605,  0.5483,  0.0817, -0.4587, -0.7336, -0.5350, -0.3111,
         0.1220,  0.4506, -0.0988, -0.0154, -0.7591,  0.2336,  0.5476, -0.8831,
        -0.3143,  0.1056, -0.4707,  0.2288, -0.4611,  0.7963, -0.5427, -0.0820,
        -0.3374, -0.0419,  0.1561,  0.44

In [None]:
SUM.vocab.vectors[456]

tensor([ 0.1445, -0.0528, -0.0545,  1.1310,  0.4758, -1.0285,  0.2467, -0.0737,
        -0.0886,  0.3821,  0.5470, -0.2648,  0.3177, -0.1480, -0.2948, -0.7081,
        -0.1925,  0.2585, -0.2926,  0.1868,  0.3139, -0.0207, -0.1016, -0.2646,
        -0.0816, -0.1146,  0.0933, -0.5261,  0.3618, -0.8518, -0.3467,  0.4525,
        -0.2537,  0.2612,  0.7651, -0.2433, -0.0700,  0.3619, -1.2979, -0.0151,
        -0.1497,  0.3810,  0.3105,  0.1845,  0.2202, -0.4393,  1.0478, -0.3870,
        -0.1594, -1.2090, -0.2850, -0.4175, -0.1810,  1.0614,  0.3273, -1.8847,
         0.0398,  0.1253,  0.4937,  0.1528, -0.1738,  0.1407, -1.0588,  0.1461,
         0.3045, -0.0489, -0.2328, -0.3068,  0.6500, -0.1928,  0.4154,  0.3495,
         0.1158,  0.2605,  0.5483,  0.0817, -0.4587, -0.7336, -0.5350, -0.3111,
         0.1220,  0.4506, -0.0988, -0.0154, -0.7591,  0.2336,  0.5476, -0.8831,
        -0.3143,  0.1056, -0.4707,  0.2288, -0.4611,  0.7963, -0.5427, -0.0820,
        -0.3374, -0.0419,  0.1561,  0.44

In [None]:
#To be used to pass inputs to control layer
enc.eval()
sum = train_data.fields
for i,batch in enumerate(train_iterator):
  sum1=batch.sum1
  sum2=batch.sum2
  sum3=batch.sum3
  sum4=batch.sum4
  sum5=batch.sum5
  sum6=batch.sum6
  sum7=batch.sum7
  sum=[sum1,sum2,sum3,sum4,sum5,sum6,sum7]
  control_input=torch.zeros((7,64,512))
  for s in range(7):
    o,h,c=enc(sum[s])
    print(h[-1].size())
    control_input[s]=h[-1]
  break;

torch.Size([64, 512])
torch.Size([64, 512])
torch.Size([64, 512])
torch.Size([64, 512])
torch.Size([64, 512])
torch.Size([64, 512])
torch.Size([64, 512])


In [None]:
print(control_input[5])

tensor([[-0.0834,  0.0189,  0.0765,  ..., -0.0174,  0.0009, -0.0084],
        [-0.0895,  0.0514,  0.0244,  ..., -0.0415,  0.0057,  0.0272],
        [-0.0050,  0.1141,  0.0455,  ..., -0.0509,  0.0232, -0.0133],
        ...,
        [-0.0277,  0.0447,  0.0456,  ..., -0.0098,  0.0120, -0.0170],
        [-0.1194,  0.0710,  0.0903,  ..., -0.0478,  0.0130, -0.0159],
        [-0.0394,  0.0713,  0.0786,  ..., -0.0637,  0.0046,  0.0357]],
       grad_fn=<SelectBackward>)


## Control layer

In [None]:
class ControlLayer(nn.Module):
    def __init__(self, input_dim,hid_dim): 
        super().__init__()   
        self.hid_dim = hid_dim
        
        #self.embedding = nn.Embedding(input_dim, emb_dim)  
        self.lstm = nn.LSTM(input_dim, hid_dim)    
             
    def forward(self, sum_hidden):
        #print(input_idx)
        #sum_hidden = seq_len(=7),batch_size,embed_dim(=encoder_hidden_dimension)
        outputs, (hidden, cell) = self.lstm(sum_hidden)
        #embedded = seq_len,batch_size,embed_dim
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        #outputs are always from the top hidden layer
        return outputs,hidden, cell

##checking

In [None]:
cl = ControlLayer(512,512)
i=torch.randn(7,64,512)
o,h,c = cl(i)

In [None]:
h.size()

torch.Size([1, 64, 512])

##Attention layer

In [None]:
class ComplexAttention(nn.Module):
   def __init__(self,dec_hid_dim,cnt_hid_dim,enc_hid_dim):
    super().__init__() 
    self.cnt_hid_dim=cnt_hid_dim
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
   def forward(self,cnt_hid_states,enc_hid_states,dec_hid_states):
     #cnt_hid_states = [7,batch_size,cnt_hid_dim]
     #enc_hid_states = [7,seq_len*,batch_size,enc_hid_dim], list of hidden states for every summary
     #dec_hid_states = [1,batch_size,dec_hid_dim]
     '''Calculate summary level attention'''
     cnt_hid_states=cnt_hid_states.permute(1,0,2)
     dec_hid_states=dec_hid_states.permute(1,2,0)
     #dec_hid_states=[batch_size,dec_hid_dim,1]
     alpha = torch.bmm(cnt_hid_states,dec_hid_states)
     #alpha=[batch_size,7,1]
     #alpha=alpha.squeeze(2)
     alpha=F.softmax(alpha,dim=1)
     #alpha=[batch_size,7,1]
     '''Calculate word level attention'''
     batch_size = alpha.size()[0]
     context_vec=torch.zeros(batch_size,1,self.enc_hid_dim).to(device)
     context_vec_k=torch.zeros(7,batch_size,self.enc_hid_dim).to(device)
     for k,sum_hid_states in enumerate(enc_hid_states):
       #sum_hid_states = [seq_len_k,batch_size,enc_hid_dim]
       sum_hid_states=sum_hid_states.permute(1,0,2)
       beta = torch.bmm(sum_hid_states,dec_hid_states)
       #beta = [batch_size,seq_len_1,1]
       #beta=beta.squeeze(2)
       beta=F.softmax(beta,dim=1)
       beta=beta.permute(0,2,1)
       #beta = [batch_size,1,seq_len]
       #sum_hid_states = [batch_size,seq_len_size,enc_hid_dim]
       context_vec_k[k] = torch.bmm(beta,sum_hid_states).squeeze(1)
       #context_vec_k = [batch_size,1,enc_hid_dim].squeeze(1)
     '''Combining both and returning context_vector'''
     context_vec_k=context_vec_k.permute(1,0,2)
     alpha=alpha.permute(0,2,1)
     context_vec = torch.bmm(alpha,context_vec_k)
     del context_vec_k
     torch.cuda.empty_cache()
     return alpha,beta,context_vec

       

In [None]:
class SimpleAttention(nn.Module):
   def __init__(self,dec_hid_dim,cnt_hid_dim,enc_hid_dim,split):
    super().__init__() 
    self.cnt_hid_dim=cnt_hid_dim
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    self.split = split
   def forward(self,cnt_hid_states,enc_hid_states,dec_hid_states):
     #cnt_hid_states = [7,batch_size,cnt_hid_dim]
     #enc_hid_states = [7,seq_len*,batch_size,enc_hid_dim], list of hidden states for every summary
     #dec_hid_states = [num_layers(=1),batch_size,dec_hid_dim]
     '''Calculate summary level attention'''
     cnt_hid_states_context=cnt_hid_states.permute(1,0,2)[:,:,:self.split]
     dec_hid_states_context=dec_hid_states.permute(1,2,0)[:,:self.split,:]
     cnt_hid_states_wgt=cnt_hid_states.permute(1,0,2)[:,:,self.split:]
     dec_hid_states_wgt=dec_hid_states.permute(1,2,0)[:,self.split:,:]
     #dec_hid_states_wgt=[batch_size,dec_hid_dim,1]
     alpha = torch.bmm(cnt_hid_states_wgt,dec_hid_states_wgt)
     #alpha=[batch_size,7,1]
     #alpha=alpha.squeeze(2)
     alpha=F.softmax(alpha,dim=1)
     #alpha=[batch_size,7,1]
     '''Calculate word level attention'''
     batch_size = alpha.size()[0]
     context_vec=torch.zeros(batch_size,1,self.split).to(device)
     context_vec_k=torch.zeros(7,batch_size,self.split).to(device)
     for k,sum_hid_states in enumerate(enc_hid_states):
       #sum_hid_states = [seq_len_k,batch_size,enc_hid_dim]
       sum_hid_states_wgt=sum_hid_states.permute(1,0,2)[:,:,self.split:]
       sum_hid_states_context=sum_hid_states.permute(1,0,2)[:,:,:self.split]
       beta = torch.bmm(sum_hid_states_wgt,dec_hid_states_wgt)
       #beta = [batch_size,seq_len_1,1]
       #beta=beta.squeeze(2)
       beta=F.softmax(beta,dim=1)
       beta=beta.permute(0,2,1)
       #beta = [batch_size,1,seq_len]
       #sum_hid_states = [batch_size,seq_len_size,enc_hid_dim]
       context_vec_k[k] = torch.bmm(beta,sum_hid_states_context).squeeze(1)
       #context_vec_k = [batch_size,1,enc_hid_dim].squeeze(1)
     '''Combining both and returning context_vector'''
     context_vec_k=context_vec_k.permute(1,0,2)
     alpha=alpha.permute(0,2,1)
     context_vec = torch.bmm(alpha,context_vec_k)
     del context_vec_k
     torch.cuda.empty_cache()
     return alpha,beta,context_vec

       

#### Checking

In [None]:
at = ComplexAttention(512,512,512)
enc_hid_states = []
for i in range(7):
  enc_hid_states.append(torch.randn(33,64,512))
cnt_hid_states=torch.randn(7,64,512)      
dec_hid_states=torch.randn(1,64,512) 
alpha,beta,con=at(cnt_hid_states,enc_hid_states,dec_hid_states)                     

In [None]:
assert(con.size()==torch.Size([64, 1, 512])) #checking if correct size is returned

In [None]:
assert(beta[45][0].sum().item()==1) ##Checking if attention wgts have calculated succesfully

In [None]:
at = SimpleAttention(512,512,512,472)
enc_hid_states = []
for i in range(7):
  enc_hid_states.append(torch.randn(33,64,512))
cnt_hid_states=torch.randn(7,64,512)      
dec_hid_states=torch.randn(1,64,512) 
alpha,beta,con=at(cnt_hid_states,enc_hid_states,dec_hid_states)                     

In [None]:
assert(con.size()==torch.Size([64, 1,472])) #checking if correct size is returned

In [None]:
import torch
import torch.nn.functional as F             # importing functions -Functional interface
a = torch.randn(5,2,3)
b=torch.randn(5,2,3)
c= a+b
print(a)
print(b)
print(c)

In [None]:
con.size()

torch.Size([64, 1, 472])

In [None]:
con=con.permute(1,0,2)
print(con.size())
input = torch.randn(1,64,100)
c = torch.cat((input, con), dim = 2)

torch.Size([1, 64, 472])


In [None]:
c.size()

torch.Size([1, 64, 572])

## Decoder

In [None]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim,con_hid_dim,attention,attention_type):
        super().__init__()
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.con_hid_dim = con_hid_dim
        self.attention = attention      
        self.embedding = nn.Embedding(output_dim, emb_dim)
        if attention_type=='complex':      
          self.lstm = nn.LSTM(input_size=(enc_hid_dim + emb_dim),hidden_size= dec_hid_dim)
          self.fc_out = nn.Linear( enc_hid_dim + dec_hid_dim + emb_dim, output_dim)
        else:
          self.lstm = nn.LSTM((attention.split+emb_dim), dec_hid_dim)
          self.fc_out = nn.Linear( (2*attention.split)  + emb_dim, output_dim)    
        #self.dropout = nn.Dropout(dropout)
        self.attention_type=attention_type
    def forward(self, input_idx,cnt_hid_states,enc_hid_states,dec_hid_states,cell_state):
      #input = [batch_size]
      input_idx = input_idx.unsqueeze(0)#Adding a dimenstion at the the first = 1 = seq_len as we are sending word by word
      #input = [1,batch_size] 
      embedded = self.embedding(input_idx)
      #embedded = [1,batch_size,embed_size]
      '''Getting the context vector'''
      _,_,context_vector=self.attention(cnt_hid_states,enc_hid_states,dec_hid_states)
      #context_vector=[batch_size,1,hid_state]
      context_vector=context_vector.permute(1,0,2)
      #context_vector=[1,batch_size,hid_state]
      lstm_in = torch.cat((embedded,context_vector),dim=2)
      #lstm_in = [1,batch_size,context_vector_size+embed_size]
      #print(lstm_in.size())
      outputs, (hidden, cell) = self.lstm(lstm_in,(dec_hid_states,cell_state))
      #output=[seq_len(=1),batch_size,hid_dim]
      #hidden=[num_layers(=1),batch_size,hid_dim]
      assert(outputs==hidden).all()

      embedded=embedded.squeeze(0)
      context_vector=context_vector.squeeze(0)
      outputs = outputs.squeeze(0)
      if self.attention_type=='complex':
        prediction = self.fc_out(torch.cat((outputs,context_vector,embedded),dim=1))
      else:
        prediction = self.fc_out(torch.cat((outputs[:,:self.attention.split],context_vector,embedded),dim=1))
      #prediction_size = (batch_size,out_dim)
      return prediction,hidden,cell


### checking

In [None]:
at = SimpleAttention(512,512,512,472)

dec =  Decoder(40000,100,512,512,512,at,'simple')
input = random.sample(range(10, 1000), 64)
enc_hid_states = []
for i in range(7):
  enc_hid_states.append(torch.randn(33,64,512))
cnt_hid_states=torch.randn(7,64,512)      
dec_hid_states=torch.randn(1,64,512)
cell_state=torch.randn(1,64,512)
pred,hid,cell = dec(torch.LongTensor(input),cnt_hid_states,enc_hid_states,dec_hid_states,cell_state)

torch.Size([1, 64, 572])


In [None]:
pred.size()

torch.Size([64, 40000])

##Seq2Seq

In [None]:
class Seq2Seq(nn.Module): #Combining the encoder,control_layer & decoder
  def __init__(self,encoder,control_layer,decoder,device):
    super().__init__()
    self.encoder=encoder
    self.control_layer = control_layer
    self.decoder=decoder
    self.device =  device
  def forward(self,input_batches,output_batches,tfr=0.5):
    #input_batches dimension - NOT A TENSOR. ENTIRE BATCH OBJECT IS SENT. 
    #output_batches_dimension - (seq_len,batch_size)
    
    batch_size = output_batches.shape[1]
    title_len = output_batches.shape[0]
    title_vocab_size = self.decoder.output_dim
    predictions = torch.zeros(title_len, batch_size, title_vocab_size).to(device)
    #print(input_batches.size())
    '''Pass each summary through the encoder'''
    sum1=input_batches.sum1
    sum2=input_batches.sum2
    sum3=input_batches.sum3
    sum4=input_batches.sum4
    sum5=input_batches.sum5
    sum6=input_batches.sum6
    sum7=input_batches.sum7
    sum=[sum1,sum2,sum3,sum4,sum5,sum6,sum7]
    control_input=torch.zeros((7,batch_size,self.control_layer.hid_dim)).to(device)
    encoder_hidden_states = []
    for s in range(7):
      output,hidden=self.encoder(sum[s])
      #output = [s.length,batch_size,hid_dim]
      #hidden=[num_layers,batch_size,hid_dim]
      #print("enc_output device",output.device)
      encoder_hidden_states.append(output)
      control_input[s]=hidden[-1]
    
    '''Pass the last hidden state to control layer for each summary'''
    output,hidden_state,cell_state = self.control_layer(control_input)
    control_hidden_states = output
    #prprint("S_c")
    '''Pass the merged representation to decoder along with encoder and control layer hidden states for implementing attention'''
    
    
    x = output_batches[0,:] # Trigger token <SOS>

    for i in range(1, title_len):
      pred, hidden_state, cell_state = self.decoder(x,control_hidden_states,encoder_hidden_states,hidden_state, cell_state)
      #pred = [batch_size,output_dim(vocab_size)]
      predictions[i] = pred
      best_guess = pred.argmax(1) 
      x = output_batches[i,:] if random.random() < tfr else best_guess
    return predictions  

## Set Model hyperparameters

## Initialise weights (Embeddings are initialised with glove)

## Train

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i,batch in enumerate(iterator):
        
        #abstract = batch.Abstract
        title = batch.Title
        #abstract,title = [seq_len,batch_size]
        optimizer.zero_grad()
        #print("batch device ",batch.device)
        predictions = model(batch, title,0.5)
        
        #predictions = [seq_len_title,batch_size,title_vocab]
        output_dim = predictions.shape[-1]
        
        predictions = predictions[1:].view(-1, output_dim)#ignoring the first value is the <sos> token
        title = title[1:].view(-1)
        
        loss = criterion(predictions, title)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

## Test

In [None]:
def test(model, iterator, criterion):    
    model.eval() 
    epoch_loss = 0 
    with torch.no_grad():   
        for i, batch in enumerate(iterator):
          #abstract = batch.Abstract
          title = batch.Title
          #abstract,title = [seq_len,batch_size]
          predictions = model(batch, title,0)
          #predictions = [seq_len_title,batch_size,title_vocab]
          output_dim = predictions.shape[-1]
          predictions = predictions[1:].view(-1, output_dim)#ignoring the first value is the <sos> token
          title = title[1:].view(-1)
          loss = criterion(predictions, title)  
          epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

##Translate

##Start training and testing!
### Experiement 0 - Complex Attention, ADAM optimiser

In [None]:
INPUT_DIM = len(SUM.vocab)
OUTPUT_DIM = len(TITLE.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0
DEC_DROPOUT = 0

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
con = ControlLayer(HID_DIM,HID_DIM)
'''trying with complex attention first'''
attention = ComplexAttention(HID_DIM,HID_DIM,HID_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,HID_DIM,HID_DIM,attention,'complex')

model = Seq2Seq(enc,con, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
      #print(name)
      nn.init.uniform_(param.data, -0.1, 0.1)  
model.apply(init_weights)
pretrained_embeddings = SUM.vocab.vectors
model.encoder.embedding.weight.data.copy_(pretrained_embeddings)
model.decoder.embedding.weight.data.copy_(pretrained_embeddings)
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [None]:
def checkpoint_and_save(model, min_loss, epoch, optimizer):
    print()
    state = {'model': model,'min_loss': min_loss,'epoch': epoch,'model_state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),}
    path =  './drive/MyDrive/Colab Notebooks/final_net.pt'
    torch.save(state, path)

In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
import time
N_EPOCHS = 20
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 6.402
	Test Loss: 6.523
Time taken : 7.524mins
Saving the new checkpoint....

	Train Loss: 5.931
	Test Loss: 6.506
Time taken : 7.692mins
Saving the new checkpoint....

	Train Loss: 5.748
	Test Loss: 6.520
Time taken : 7.695mins
Saving the new checkpoint....

	Train Loss: 5.573
	Test Loss: 6.534
Time taken : 7.702mins
Saving the new checkpoint....

	Train Loss: 5.455
	Test Loss: 6.547
Time taken : 7.697mins
Saving the new checkpoint....

	Train Loss: 5.370
	Test Loss: 6.550
Time taken : 7.683mins
Saving the new checkpoint....

	Train Loss: 5.274
	Test Loss: 6.594
Time taken : 7.713mins
Saving the new checkpoint....

	Train Loss: 5.164
	Test Loss: 6.586
Time taken : 7.722mins
Saving the new checkpoint....

	Train Loss: 5.100
	Test Loss: 6.594
Time taken : 7.707mins
Saving the new checkpoint....

	Train Loss: 5.023
	Test Loss: 6.603
Time taken : 7.705mins
Saving the new checkpoint....

	Train Loss: 4.964
	Test Loss: 6.613
Time taken : 7.759mins
Saving the new checkpoint....


## Train & test loss graph

##Start training and testing!
### Experiement 1 - Complex Attention, RMSprop optimiser

In [None]:
INPUT_DIM = len(SUM.vocab)
OUTPUT_DIM = len(TITLE.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0
DEC_DROPOUT = 0

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
con = ControlLayer(HID_DIM,HID_DIM)
'''trying with complex attention first'''
attention = ComplexAttention(HID_DIM,HID_DIM,HID_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,HID_DIM,HID_DIM,attention,'complex')

model = Seq2Seq(enc,con, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
      #print(name)
      nn.init.uniform_(param.data, -0.1, 0.1)  
model.apply(init_weights)
pretrained_embeddings = SUM.vocab.vectors
model.encoder.embedding.weight.data.copy_(pretrained_embeddings)
model.decoder.embedding.weight.data.copy_(pretrained_embeddings)
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [None]:
def checkpoint_and_save(model, min_loss, epoch, optimizer):
    print()
    state = {'model': model,'min_loss': min_loss,'epoch': epoch,'model_state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),}
    path =  './drive/MyDrive/Colab Notebooks/final_net_1.pt'
    torch.save(state, path)

In [None]:
optimizer = optim.RMSprop(model.parameters(),lr=0.01,momentum=0.9,weight_decay=0.9)
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
import time
N_EPOCHS = 50
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 10.580
	Test Loss: 10.581
Time taken : 7.652mins
Saving the new checkpoint....

	Train Loss: 10.580
	Test Loss: 10.579
Time taken : 7.694mins
	Train Loss: 10.580
	Test Loss: 10.579
Time taken : 7.691mins


KeyboardInterrupt: ignored

### Start training and testing!
### Experiement 2 - Simple Attention,Adam optimiser

In [None]:
INPUT_DIM = len(SUM.vocab)
OUTPUT_DIM = len(TITLE.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0
DEC_DROPOUT = 0
SPLIT = 472
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
con = ControlLayer(HID_DIM,HID_DIM)
'''trying with complex attention first'''
attention = SimpleAttention(HID_DIM,HID_DIM,HID_DIM,SPLIT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,HID_DIM,HID_DIM,attention,'simple')

model = Seq2Seq(enc,con, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
      #print(name)
      nn.init.uniform_(param.data, -0.1, 0.1)  
model.apply(init_weights)
pretrained_embeddings = SUM.vocab.vectors
model.encoder.embedding.weight.data.copy_(pretrained_embeddings)
model.decoder.embedding.weight.data.copy_(pretrained_embeddings)
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [None]:
def checkpoint_and_save(model, min_loss, epoch, optimizer):
    print()
    state = {'model': model,'min_loss': min_loss,'epoch': epoch,'model_state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),}
    path =  './drive/MyDrive/Colab Notebooks/final_net_2.pt'
    torch.save(state, path)

In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
import time
N_EPOCHS = 50
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 6.407
	Test Loss: 6.517
Time taken : 16.629mins
Saving the new checkpoint....

	Train Loss: 5.938
	Test Loss: 6.500
Time taken : 16.604mins
Saving the new checkpoint....

	Train Loss: 5.753
	Test Loss: 6.520
Time taken : 16.590mins
Saving the new checkpoint....

	Train Loss: 5.578
	Test Loss: 6.545
Time taken : 16.602mins
Saving the new checkpoint....

	Train Loss: 5.470
	Test Loss: 6.540
Time taken : 16.604mins
Saving the new checkpoint....

	Train Loss: 5.382
	Test Loss: 6.551
Time taken : 16.561mins
Saving the new checkpoint....

	Train Loss: 5.284
	Test Loss: 6.566
Time taken : 16.559mins
Saving the new checkpoint....

	Train Loss: 5.102
	Test Loss: 6.394
Time taken : 16.630mins
Saving the new checkpoint....

	Train Loss: 4.877
	Test Loss: 6.251
Time taken : 16.620mins
Saving the new checkpoint....

	Train Loss: 4.641
	Test Loss: 6.172
Time taken : 16.610mins
Saving the new checkpoint....

	Train Loss: 4.395
	Test Loss: 6.190
Time taken : 16.839mins
Saving the new chec

## Translate
<Br> Randomly select some abstracts from the dataset and generate the title using the model.

In [None]:
df = pd.read_csv('./drive/MyDrive/data_summaries.csv')
idx = np.random.randint(0,df.shape[0],100)
df1 = df.loc[idx]
df1.to_csv('./drive/MyDrive/test_data.csv')

In [None]:
ran_dataset = data.TabularDataset(path='./drive/MyDrive/test_data.csv',format='csv', fields=fields,skip_header=True)

In [None]:
def cal_length(x):
  return len(x.sum1)+len(x.sum2)+len(x.sum3)+len(x.sum4)+len(x.sum5)+len(x.sum6)+len(x.sum7)+len(x.Title)
from torchtext.legacy import data
BATCH_SIZE =1
iterator=data.BucketIterator.splits(
    ran_dataset,
    batch_size = BATCH_SIZE, sort_key = lambda x: cal_length(x), sort_within_batch = True,shuffle=True,sort=False,
    device = device)

In [None]:
path =  './drive/MyDrive/Colab Notebooks/net.pt'
checkpoint = torch.load(path)
#print(checkpoint)
model1 = checkpoint['model']
model1.load_state_dict( checkpoint['model_state_dict'])
min_loss = checkpoint['min_loss']
epoch = checkpoint['epoch']


In [None]:
#to generate title for one abstract
def translate(model,iterator,max_length):
  pred=[]
  with torch.no_grad():
    for i,batch in enumerate(iterator):
      
        predictions = model(batch,batch.Title,0)
        predictions = predictions.squeeze(0)
        best_guess = predictions.argmax(0).item()

        pred.append(best_guess)

        # Model predicts it's the end of the sentence
        if best_guess == SUM.vocab.stoi["<eos>"]:
          break

        translated_sentence = [SUM.vocab.itos[idx] for idx in pred]
  return translated_sentence[1:]

In [None]:
print(translate(model,iterator,10))