In [None]:
import pandas as pd
import json
import matplotlib as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data.sampler import BatchSampler
from torch.optim import lr_scheduler
from PIL import Image
import timeit
from sklearn.pipeline import Pipeline
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy import data
import random
## For reproducibility
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)
random.seed(0)
torch.cuda.manual_seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Create Preprocessing pipeline for summaries

In [None]:
tokenize =  lambda s: s.split()
import re  
def cleanup_text(texts):
    cleaned_text = []
    for text in texts:
        # remove punctuation
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        # remove multiple spaces
        text = re.sub(r' +', ' ', text)
        # remove newline
        text = re.sub(r'\n', ' ', text)
        #replace digits with '# symbol
        text = re.sub('[0-9]', '#', text)
        cleaned_text.append(text)
    return cleaned_text

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Create torchtext fields

In [None]:
#Field for summaries (removed stop words)
SUM = data.Field(tokenize = tokenize,init_token='<sos>',eos_token='<eos>',pad_first=True,stop_words=stop,lower = True,preprocessing=cleanup_text)
#Field for title
TITLE = data.Field(tokenize = tokenize,init_token='<sos>',eos_token='<eos>',lower = True,preprocessing=cleanup_text)
#Field for Id
#ID = data.Field(use_vocab=False,sequential=False,dtype=torch.LongTensor,postprocessing=data.Pipeline(lambda x: int(x)))

In [None]:
fields = [('Id',None),('Abstract',None),('Title',TITLE),('sum1',SUM),('sum2',SUM),('sum3',SUM),('sum4',SUM),('sum5',SUM),('sum6',SUM),('sum7',SUM)]

## Read data into tabular dataset

In [None]:
dataset = data.TabularDataset(path='./drive/MyDrive/data_summaries.csv',format='csv', fields=fields,skip_header=True)

In [None]:
print(vars(dataset[0]))

{'Title': ['dual', 'recurrent', 'attention', 'units', 'for', 'visual', 'question', 'answering'], 'sum1': ['propose', 'architecture', 'vqa', 'utilizes', 'recurrent', 'layers', 'generate', 'visual', 'textual', 'attention the', 'memory', 'characteristic', 'proposed', 'recurrent', 'attention', 'units', 'offers', 'rich', 'joint', 'embedding', 'visual', 'textual', 'features', 'enables', 'model', 'reason', 'relations', 'several', 'parts', 'image', 'question '], 'sum2': ['propose', 'architecture', 'vqa', 'utilizes', 'recurrent', 'layers', 'generate', 'visual', 'textual', 'attention in', 'cases ', 'recurrent', 'attention', 'mechanism', 'improves', 'performance', 'tasks', 'requiring', 'sequential', 'relational', 'reasoning', 'vqa', 'dataset '], 'sum3': ['memory', 'characteristic', 'proposed', 'recurrent', 'attention', 'units', 'offers', 'rich', 'joint', 'embedding', 'visual', 'textual', 'features', 'enables', 'model', 'reason', 'relations', 'several', 'parts', 'image', 'question our', 'single', 

## Create training data and test data

In [None]:
import random
train_data, valid_data = dataset.split(split_ratio=0.9, random_state=random.seed(0))

In [None]:
print(len(train_data))
print(len(valid_data))

36900
4100


In [None]:
print(vars(train_data[5]))

{'Title': ['adaptively', 'learning', 'the', 'crowd', 'kernel'], 'sum1': ['introduce', 'algorithm', 'that ', 'given', 'n', 'objects ', 'learns', 'similarity', 'matrix', 'n #', 'pairs ', 'crowdsourced', 'data', 'alone the', 'algorithm', 'samples', 'responses', 'adaptively', 'chosen', 'triplet based', 'relative similarity', 'queries '], 'sum2': ['introduce', 'algorithm', 'that ', 'given', 'n', 'objects ', 'learns', 'similarity', 'matrix', 'n #', 'pairs ', 'crowdsourced', 'data', 'alone svms', 'reveal', 'crowd', 'kernel', 'captures', 'prominent', 'subtle', 'features', 'across', 'number', 'domains ', ' is', 'striped ', 'among', 'neckties', ' vowel', 'vs ', 'consonant ', 'among', 'letters '], 'sum3': ['introduce', 'algorithm', 'that ', 'given', 'n', 'objects ', 'learns', 'similarity', 'matrix', 'n #', 'pairs ', 'crowdsourced', 'data', 'alone svms', 'reveal', 'crowd', 'kernel', 'captures', 'prominent', 'subtle', 'features', 'across', 'number', 'domains ', ' is', 'striped ', 'among', 'neckties

In [None]:
SUM.build_vocab(train_data.sum1,train_data.sum2,train_data.sum3,train_data.sum4,train_data.sum5,\
                train_data.sum6,train_data.sum7,train_data.Title,max_size=40000,vectors='glove.6B.100d')
TITLE.vocab= SUM.vocab


In [None]:
print(len(SUM.vocab))

40004


## Create Bucket iterator

In [None]:
def cal_length(x):
  return len(x.sum1)+len(x.sum2)+len(x.sum3)+len(x.sum4)+len(x.sum5)+len(x.sum6)+len(x.sum7)+len(x.Title)
from torchtext.legacy import data
BATCH_SIZE =64
train_iterator, valid_iterator =data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE, sort_key = lambda x: cal_length(x), sort_within_batch = True,shuffle=True,sort=False,
    device = device)

# Model Architecture

## Encoder layer

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, num_layers, dropout): 
        super().__init__()   
        self.hid_dim = hid_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)  
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers, dropout = dropout)    
        self.dropout = nn.Dropout(dropout)       
    def forward(self, input_idx):
        #print(input_idx)
        input_idx=input_idx.to(device)
        embedded = self.dropout(self.embedding(input_idx))
        outputs, (hidden, cell) = self.lstm(embedded)
        #embedded = seq_len,batch_size,embed_dim
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        #outputs are always from the top hidden layer
        return outputs,hidden

## Control layer

In [None]:
class ControlLayer(nn.Module):
    def __init__(self, input_dim,hid_dim): 
        super().__init__()   
        self.hid_dim = hid_dim
        
        #self.embedding = nn.Embedding(input_dim, emb_dim)  
        self.lstm = nn.LSTM(input_dim, hid_dim)    
             
    def forward(self, sum_hidden):
        #print(input_idx)
        #sum_hidden = seq_len(=7),batch_size,embed_dim(=encoder_hidden_dimension)
        outputs, (hidden, cell) = self.lstm(sum_hidden)
        #embedded = seq_len,batch_size,embed_dim
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        #outputs are always from the top hidden layer
        return outputs,hidden, cell

##Attention layer

In [None]:
class ComplexAttention(nn.Module):
   def __init__(self,dec_hid_dim,cnt_hid_dim,enc_hid_dim):
    super().__init__() 
    self.cnt_hid_dim=cnt_hid_dim
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
   def forward(self,cnt_hid_states,enc_hid_states,dec_hid_states):
     #cnt_hid_states = [7,batch_size,cnt_hid_dim]
     #enc_hid_states = [7,seq_len*,batch_size,enc_hid_dim], list of hidden states for every summary
     #dec_hid_states = [1,batch_size,dec_hid_dim]
     '''Calculate summary level attention'''
     cnt_hid_states=cnt_hid_states.permute(1,0,2)
     dec_hid_states=dec_hid_states.permute(1,2,0)
     #dec_hid_states=[batch_size,dec_hid_dim,1]
     alpha = torch.bmm(cnt_hid_states,dec_hid_states)
     #alpha=[batch_size,7,1]
     #alpha=alpha.squeeze(2)
     alpha=F.softmax(alpha,dim=1)
     #alpha=[batch_size,7,1]
     '''Calculate word level attention'''
     batch_size = alpha.size()[0]
     context_vec=torch.zeros(batch_size,1,self.enc_hid_dim).to(device)
     context_vec_k=torch.zeros(7,batch_size,self.enc_hid_dim).to(device)
     for k,sum_hid_states in enumerate(enc_hid_states):
       #sum_hid_states = [seq_len_k,batch_size,enc_hid_dim]
       sum_hid_states=sum_hid_states.permute(1,0,2)
       beta = torch.bmm(sum_hid_states,dec_hid_states)
       #beta = [batch_size,seq_len_1,1]
       #beta=beta.squeeze(2)
       beta=F.softmax(beta,dim=1)
       beta=beta.permute(0,2,1)
       #beta = [batch_size,1,seq_len]
       #sum_hid_states = [batch_size,seq_len_size,enc_hid_dim]
       context_vec_k[k] = torch.bmm(beta,sum_hid_states).squeeze(1)
       #context_vec_k = [batch_size,1,enc_hid_dim].squeeze(1)
     '''Combining both and returning context_vector'''
     context_vec_k=context_vec_k.permute(1,0,2)
     alpha=alpha.permute(0,2,1)
     context_vec = torch.bmm(alpha,context_vec_k)
     del context_vec_k
     torch.cuda.empty_cache()
     return alpha,beta,context_vec

       

In [None]:
class SimpleAttention(nn.Module):
   def __init__(self,dec_hid_dim,cnt_hid_dim,enc_hid_dim,split):
    super().__init__() 
    self.cnt_hid_dim=cnt_hid_dim
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    self.split = split
   def forward(self,cnt_hid_states,enc_hid_states,dec_hid_states):
     #cnt_hid_states = [7,batch_size,cnt_hid_dim]
     #enc_hid_states = [7,seq_len*,batch_size,enc_hid_dim], list of hidden states for every summary
     #dec_hid_states = [num_layers(=1),batch_size,dec_hid_dim]
     '''Calculate summary level attention'''
     cnt_hid_states_context=cnt_hid_states.permute(1,0,2)[:,:,:self.split]
     dec_hid_states_context=dec_hid_states.permute(1,2,0)[:,:self.split,:]
     cnt_hid_states_wgt=cnt_hid_states.permute(1,0,2)[:,:,self.split:]
     dec_hid_states_wgt=dec_hid_states.permute(1,2,0)[:,self.split:,:]
     #dec_hid_states_wgt=[batch_size,dec_hid_dim,1]
     alpha = torch.bmm(cnt_hid_states_wgt,dec_hid_states_wgt)
     #alpha=[batch_size,7,1]
     #alpha=alpha.squeeze(2)
     alpha=F.softmax(alpha,dim=1)
     #alpha=[batch_size,7,1]
     '''Calculate word level attention'''
     batch_size = alpha.size()[0]
     context_vec=torch.zeros(batch_size,1,self.split).to(device)
     context_vec_k=torch.zeros(7,batch_size,self.split).to(device)
     for k,sum_hid_states in enumerate(enc_hid_states):
       #sum_hid_states = [seq_len_k,batch_size,enc_hid_dim]
       sum_hid_states_wgt=sum_hid_states.permute(1,0,2)[:,:,self.split:]
       sum_hid_states_context=sum_hid_states.permute(1,0,2)[:,:,:self.split]
       beta = torch.bmm(sum_hid_states_wgt,dec_hid_states_wgt)
       #beta = [batch_size,seq_len_1,1]
       #beta=beta.squeeze(2)
       beta=F.softmax(beta,dim=1)
       beta=beta.permute(0,2,1)
       #beta = [batch_size,1,seq_len]
       #sum_hid_states = [batch_size,seq_len_size,enc_hid_dim]
       context_vec_k[k] = torch.bmm(beta,sum_hid_states_context).squeeze(1)
       #context_vec_k = [batch_size,1,enc_hid_dim].squeeze(1)
     '''Combining both and returning context_vector'''
     context_vec_k=context_vec_k.permute(1,0,2)
     alpha=alpha.permute(0,2,1)
     context_vec = torch.bmm(alpha,context_vec_k)
     del context_vec_k
     torch.cuda.empty_cache()
     return alpha,beta,context_vec

       

## Decoder

In [None]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim,con_hid_dim,attention,attention_type):
        super().__init__()
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.con_hid_dim = con_hid_dim
        self.attention = attention      
        self.embedding = nn.Embedding(output_dim, emb_dim)
        if attention_type=='complex':      
          self.lstm = nn.LSTM(input_size=(enc_hid_dim + emb_dim),hidden_size= dec_hid_dim)
          self.fc_out = nn.Linear( enc_hid_dim + dec_hid_dim + emb_dim, output_dim)
        else:
          self.lstm = nn.LSTM((attention.split+emb_dim), dec_hid_dim)
          self.fc_out = nn.Linear( (2*attention.split)  + emb_dim, output_dim)    
        #self.dropout = nn.Dropout(dropout)
        self.attention_type=attention_type
    def forward(self, input_idx,cnt_hid_states,enc_hid_states,dec_hid_states,cell_state):
      #input = [batch_size]
      input_idx = input_idx.unsqueeze(0)#Adding a dimenstion at the the first = 1 = seq_len as we are sending word by word
      #input = [1,batch_size] 
      embedded = self.embedding(input_idx)
      #embedded = [1,batch_size,embed_size]
      '''Getting the context vector'''
      _,_,context_vector=self.attention(cnt_hid_states,enc_hid_states,dec_hid_states)
      #context_vector=[batch_size,1,hid_state]
      context_vector=context_vector.permute(1,0,2)
      #context_vector=[1,batch_size,hid_state]
      lstm_in = torch.cat((embedded,context_vector),dim=2)
      #lstm_in = [1,batch_size,context_vector_size+embed_size]
      #print(lstm_in.size())
      outputs, (hidden, cell) = self.lstm(lstm_in,(dec_hid_states,cell_state))
      #output=[seq_len(=1),batch_size,hid_dim]
      #hidden=[num_layers(=1),batch_size,hid_dim]
      assert(outputs==hidden).all()

      embedded=embedded.squeeze(0)
      context_vector=context_vector.squeeze(0)
      outputs = outputs.squeeze(0)
      if self.attention_type=='complex':
        prediction = self.fc_out(torch.cat((outputs,context_vector,embedded),dim=1))
      else:
        prediction = self.fc_out(torch.cat((outputs[:,:self.attention.split],context_vector,embedded),dim=1))
      #prediction_size = (batch_size,out_dim)
      return prediction,hidden,cell


##Seq2Seq

In [None]:
class Seq2Seq(nn.Module): #Combining the encoder,control_layer & decoder
  def __init__(self,encoder,control_layer,decoder,device):
    super().__init__()
    self.encoder=encoder
    self.control_layer = control_layer
    self.decoder=decoder
    self.device =  device
  def forward(self,input_batches,output_batches,tfr=0.5):
    #input_batches dimension - NOT A TENSOR. ENTIRE BATCH OBJECT IS SENT. 
    #output_batches_dimension - (seq_len,batch_size)
    
    batch_size = output_batches.shape[1]
    title_len = output_batches.shape[0]
    title_vocab_size = self.decoder.output_dim
    predictions = torch.zeros(title_len, batch_size, title_vocab_size).to(device)
    #print(input_batches.size())
    '''Pass each summary through the encoder'''
    sum1=input_batches.sum1
    sum2=input_batches.sum2
    sum3=input_batches.sum3
    sum4=input_batches.sum4
    sum5=input_batches.sum5
    sum6=input_batches.sum6
    sum7=input_batches.sum7
    sum=[sum1,sum2,sum3,sum4,sum5,sum6,sum7]
    control_input=torch.zeros((7,batch_size,self.control_layer.hid_dim)).to(device)
    encoder_hidden_states = []
    for s in range(7):
      output,hidden=self.encoder(sum[s])
      #output = [s.length,batch_size,hid_dim]
      #hidden=[num_layers,batch_size,hid_dim]
      #print("enc_output device",output.device)
      encoder_hidden_states.append(output)
      control_input[s]=hidden[-1]
    
    '''Pass the last hidden state to control layer for each summary'''
    output,hidden_state,cell_state = self.control_layer(control_input)
    control_hidden_states = output
    #prprint("S_c")
    '''Pass the merged representation to decoder along with encoder and control layer hidden states for implementing attention'''
    
    
    x = output_batches[0,:] # Trigger token <SOS>

    for i in range(1, title_len):
      pred, hidden_state, cell_state = self.decoder(x,control_hidden_states,encoder_hidden_states,hidden_state, cell_state)
      #pred = [batch_size,output_dim(vocab_size)]
      predictions[i] = pred
      best_guess = pred.argmax(1) 
      x = output_batches[i,:] if random.random() < tfr else best_guess
    return predictions  

## Train

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i,batch in enumerate(iterator):
        
        #abstract = batch.Abstract
        title = batch.Title
        #abstract,title = [seq_len,batch_size]
        optimizer.zero_grad()
        #print("batch device ",batch.device)
        predictions = model(batch, title,0.5)
        
        #predictions = [seq_len_title,batch_size,title_vocab]
        output_dim = predictions.shape[-1]
        
        predictions = predictions[1:].view(-1, output_dim)#ignoring the first value is the <sos> token
        title = title[1:].view(-1)
        
        loss = criterion(predictions, title)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

## Test

In [None]:
def test(model, iterator, criterion):    
    model.eval() 
    epoch_loss = 0 
    with torch.no_grad():   
        for i, batch in enumerate(iterator):
          #abstract = batch.Abstract
          title = batch.Title
          #abstract,title = [seq_len,batch_size]
          predictions = model(batch, title,0)
          #predictions = [seq_len_title,batch_size,title_vocab]
          output_dim = predictions.shape[-1]
          predictions = predictions[1:].view(-1, output_dim)#ignoring the first value is the <sos> token
          title = title[1:].view(-1)
          loss = criterion(predictions, title)  
          epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

##Translate

In [None]:
#to generate title for one abstract
def translate(model,batch,max_len):
  predictions=[]
  model.eval()
  with torch.no_grad():   
    batch_size = 1
    title_vocab_size = model.decoder.output_dim
   # predictions = torch.zeros(title_len, batch_size, title_vocab_size).to(device)
    #print(input_batches.size())
    '''Pass each summary through the encoder'''
    sum1=batch.sum1
    sum2=batch.sum2
    sum3=batch.sum3
    sum4=batch.sum4
    sum5=batch.sum5
    sum6=batch.sum6
    sum7=batch.sum7
    sum=[sum1,sum2,sum3,sum4,sum5,sum6,sum7]
    control_input=torch.zeros((7,batch_size,model.control_layer.hid_dim)).to(device)
    encoder_hidden_states = []
    for s in range(7):
      output,hidden=model.encoder(sum[s])
      #output = [s.length,batch_size,hid_dim]
      #hidden=[num_layers,batch_size,hid_dim]
      #print("enc_output device",output.device)
      encoder_hidden_states.append(output)
      control_input[s]=hidden[-1]
    
    '''Pass the last hidden state to control layer for each summary'''
    output,hidden_state,cell_state = model.control_layer(control_input)
    control_hidden_states = output
    #prprint("S_c")
    '''Pass the merged representation to decoder along with encoder and control layer hidden states for implementing attention'''
    
    
    x =  torch.LongTensor([SUM.vocab.stoi['<sos>']]).to(device)

    for i in range(1, max_len):
      pred, hidden_state, cell_state = model.decoder(x,control_hidden_states,encoder_hidden_states,hidden_state, cell_state)
      #pred = [1,output_dim(vocab_size)]
      best_guess = pred.argmax(1)
      predictions.append(best_guess.item())
      x = best_guess
      # Model predicts it's the end of the sentence
      if predictions[-1] == SUM.vocab.stoi["<eos>"]:
        break

      translated_sentence = [SUM.vocab.itos[idx] for idx in predictions]
  return translated_sentence[1:]

### Randomly pick 1000 abstracts from the dataset
<br> This will be used later for generating titles.

In [None]:
df = pd.read_csv('./drive/MyDrive/data_summaries.csv')
idx = np.random.randint(0,df.shape[0],1000)
df1 = df.loc[idx]
df1.to_csv('./drive/MyDrive/test_data.csv')

##Start training and testing!


### Experiement 1 - Complex Attention

In [None]:
INPUT_DIM = len(SUM.vocab)
OUTPUT_DIM = len(TITLE.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0
DEC_DROPOUT = 0

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
con = ControlLayer(HID_DIM,HID_DIM)
'''trying with complex attention first'''
attention = ComplexAttention(HID_DIM,HID_DIM,HID_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,HID_DIM,HID_DIM,attention,'complex')

model = Seq2Seq(enc,con, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
      #print(name)
      nn.init.uniform_(param.data, -0.1, 0.1)  
model.apply(init_weights)
pretrained_embeddings = SUM.vocab.vectors
model.encoder.embedding.weight.data.copy_(pretrained_embeddings)
model.decoder.embedding.weight.data.copy_(pretrained_embeddings)
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [None]:
def checkpoint_and_save(model, min_loss, epoch, optimizer):
    print()
    state = {'model': model,'min_loss': min_loss,'epoch': epoch,'model_state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),}
    path =  './drive/MyDrive/Colab Notebooks/final_net.pt'
    torch.save(state, path)

In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
import time
N_EPOCHS = 20
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 6.402
	Test Loss: 6.523
Time taken : 7.524mins
Saving the new checkpoint....

	Train Loss: 5.931
	Test Loss: 6.506
Time taken : 7.692mins
Saving the new checkpoint....

	Train Loss: 5.748
	Test Loss: 6.520
Time taken : 7.695mins
Saving the new checkpoint....

	Train Loss: 5.573
	Test Loss: 6.534
Time taken : 7.702mins
Saving the new checkpoint....

	Train Loss: 5.455
	Test Loss: 6.547
Time taken : 7.697mins
Saving the new checkpoint....

	Train Loss: 5.370
	Test Loss: 6.550
Time taken : 7.683mins
Saving the new checkpoint....

	Train Loss: 5.274
	Test Loss: 6.594
Time taken : 7.713mins
Saving the new checkpoint....

	Train Loss: 5.164
	Test Loss: 6.586
Time taken : 7.722mins
Saving the new checkpoint....

	Train Loss: 5.100
	Test Loss: 6.594
Time taken : 7.707mins
Saving the new checkpoint....

	Train Loss: 5.023
	Test Loss: 6.603
Time taken : 7.705mins
Saving the new checkpoint....

	Train Loss: 4.964
	Test Loss: 6.613
Time taken : 7.759mins
Saving the new checkpoint....


### Experiement 2 - Simple Attention

In [None]:
INPUT_DIM = len(SUM.vocab)
OUTPUT_DIM = len(TITLE.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0
DEC_DROPOUT = 0
SPLIT = 472
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
con = ControlLayer(HID_DIM,HID_DIM)
'''trying with complex attention first'''
attention = SimpleAttention(HID_DIM,HID_DIM,HID_DIM,SPLIT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,HID_DIM,HID_DIM,attention,'simple')

model = Seq2Seq(enc,con, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
      #print(name)
      nn.init.uniform_(param.data, -0.1, 0.1)  
model.apply(init_weights)
pretrained_embeddings = SUM.vocab.vectors
model.encoder.embedding.weight.data.copy_(pretrained_embeddings)
model.decoder.embedding.weight.data.copy_(pretrained_embeddings)
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [None]:
def checkpoint_and_save(model, min_loss, epoch, optimizer):
    print()
    state = {'model': model,'min_loss': min_loss,'epoch': epoch,'model_state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),'train_iterator':train_iterator.state_dict(),'valid_iterator':valid_iterator.state_dict()}
    path =  './drive/MyDrive/Colab Notebooks/final_net_2.1.pt'
    torch.save(state, path)

In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
import time
N_EPOCHS = 50
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Epoch {epoch} - Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 6.239
	Test Loss: 6.492
Epoch 0 - Time taken : 8.695mins
Saving the new checkpoint....

	Train Loss: 5.899
	Test Loss: 6.497
Epoch 1 - Time taken : 8.679mins
Saving the new checkpoint....

	Train Loss: 5.714
	Test Loss: 6.523
Epoch 2 - Time taken : 8.692mins
Saving the new checkpoint....

	Train Loss: 5.565
	Test Loss: 6.612
Epoch 3 - Time taken : 8.696mins
Saving the new checkpoint....

	Train Loss: 5.453
	Test Loss: 6.531
Epoch 4 - Time taken : 8.678mins
Saving the new checkpoint....

	Train Loss: 5.357
	Test Loss: 6.564
Epoch 5 - Time taken : 8.702mins
Saving the new checkpoint....

	Train Loss: 5.261
	Test Loss: 6.532
Epoch 6 - Time taken : 8.724mins
Saving the new checkpoint....

	Train Loss: 5.115
	Test Loss: 6.464
Epoch 7 - Time taken : 8.725mins
Saving the new checkpoint....

	Train Loss: 4.961
	Test Loss: 6.336
Epoch 8 - Time taken : 8.755mins
Saving the new checkpoint....

	Train Loss: 4.799
	Test Loss: 6.277
Epoch 9 - Time taken : 8.762mins
Saving the new checkp

KeyboardInterrupt: ignored

#### Call translate

In [None]:
df1={'Abstract':[],'Title':[],'Generated Title':[]}

In [None]:
df1

{'Abstract': [], 'Generated Title': [], 'Title': []}

In [None]:
ID = data.Field(use_vocab=False,sequential=False,preprocessing=int)
fields = [('Id',ID),('Abstract',None),('Title',TITLE),('sum1',SUM),('sum2',SUM),('sum3',SUM),('sum4',SUM),('sum5',SUM),('sum6',SUM),('sum7',SUM)]
ran_dataset = data.TabularDataset(path='./drive/MyDrive/test_data.csv',format='csv', fields=fields,skip_header=True)

In [None]:
from torchtext.legacy import data
BATCH_SIZE =1
iterator=data.Iterator(
    ran_dataset,
    batch_size = BATCH_SIZE,shuffle=False,device=device)

In [None]:
len(iterator)

1000

In [None]:
path =  './drive/MyDrive/Colab Notebooks/final_net_2.1.pt'
checkpoint = torch.load(path)
#print(checkpoint)
model1 = checkpoint['model']
model1.load_state_dict( checkpoint['model_state_dict'])
min_loss = checkpoint['min_loss']
epoch = checkpoint['epoch']


In [None]:
for i,batch in enumerate(iterator):
  id1 = batch.Id.item()
  #print(batch)
  print("Abstract : ")
  print("".join(list(df.loc[df['Id'] == id1]['Abstract'])))
  print("Actual Title : ")
  print("".join(list(df.loc[df['Id'] == id1]['Title'])))
  print("Generated Title : ")
  print(" ".join(translate(model1,batch,10)))
  abs= "".join(list(df.loc[df['Id'] == id1]['Abstract']))
  title= "".join(list(df.loc[df['Id'] == id1]['Title']))
  df1['Abstract'].append(abs)
  df1['Title'].append(title)
  df1['Generated Title'].append(" ".join(translate(model1,batch,10)))
  #df1['Generated_Title'][id1]=translate(model1,batch,15))

  

In [None]:
len(df1['Abstract'])

1000

In [None]:
df2 = pd.DataFrame(df1)

In [None]:
df2.to_csv('./drive/MyDrive/generated_titles_2.csv')

### Experiment 3 - Simple Attention,tfr =0.5, Dropout = 0.5

---



In [None]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i,batch in enumerate(iterator):
        
        #abstract = batch.Abstract
        title = batch.Title
        #abstract,title = [seq_len,batch_size]
        optimizer.zero_grad()
        #print("batch device ",batch.device)
        predictions = model(batch, title,0.5)
        
        #predictions = [seq_len_title,batch_size,title_vocab]
        output_dim = predictions.shape[-1]
        
        predictions = predictions[1:].view(-1, output_dim)#ignoring the first value is the <sos> token
        title = title[1:].view(-1)
        
        loss = criterion(predictions, title)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [None]:
INPUT_DIM = len(SUM.vocab)
OUTPUT_DIM = len(TITLE.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
SPLIT = 472
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
con = ControlLayer(HID_DIM,HID_DIM)
'''trying with complex attention first'''
attention = SimpleAttention(HID_DIM,HID_DIM,HID_DIM,SPLIT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,HID_DIM,HID_DIM,attention,'simple')

model = Seq2Seq(enc,con, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
      #print(name)
      nn.init.uniform_(param.data, -0.1, 0.1)  
model.apply(init_weights)
pretrained_embeddings = SUM.vocab.vectors
model.encoder.embedding.weight.data.copy_(pretrained_embeddings)
model.decoder.embedding.weight.data.copy_(pretrained_embeddings)
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [None]:
def checkpoint_and_save(model, min_loss, epoch, optimizer):
    print()
    state = {'model': model,'min_loss': min_loss,'epoch': epoch,'model_state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),'train_iterator':train_iterator.state_dict(),'valid_iterator':valid_iterator.state_dict()}
    path =  './drive/MyDrive/Colab Notebooks/final_net_3.pt'
    torch.save(state, path)

In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
import time
N_EPOCHS = 25
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Epoch {epoch} - Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 6.406
	Test Loss: 6.501
Epoch 0 - Time taken : 14.785mins
Saving the new checkpoint....

	Train Loss: 5.928
	Test Loss: 6.579
Epoch 1 - Time taken : 14.835mins
Saving the new checkpoint....

	Train Loss: 5.726
	Test Loss: 6.502
Epoch 2 - Time taken : 14.822mins
Saving the new checkpoint....

	Train Loss: 5.595
	Test Loss: 6.516
Epoch 3 - Time taken : 14.853mins
Saving the new checkpoint....

	Train Loss: 5.458
	Test Loss: 6.512
Epoch 4 - Time taken : 14.843mins
Saving the new checkpoint....

	Train Loss: 5.360
	Test Loss: 6.490
Epoch 5 - Time taken : 14.827mins
Saving the new checkpoint....

	Train Loss: 5.163
	Test Loss: 6.478
Epoch 6 - Time taken : 14.831mins
Saving the new checkpoint....

	Train Loss: 5.022
	Test Loss: 6.300
Epoch 7 - Time taken : 14.851mins
Saving the new checkpoint....

	Train Loss: 4.854
	Test Loss: 6.318
Epoch 8 - Time taken : 14.841mins
Saving the new checkpoint....

	Train Loss: 4.664
	Test Loss: 6.282
Epoch 9 - Time taken : 14.860mins
Saving the 

#### Call translate


In [None]:
df1={'Abstract':[],'Title':[],'Generated Title':[]}

In [None]:
ID = data.Field(use_vocab=False,sequential=False,preprocessing=int)
fields = [('Id',ID),('Abstract',None),('Title',TITLE),('sum1',SUM),('sum2',SUM),('sum3',SUM),('sum4',SUM),('sum5',SUM),('sum6',SUM),('sum7',SUM)]
ran_dataset = data.TabularDataset(path='./drive/MyDrive/test_data.csv',format='csv', fields=fields,skip_header=True)

In [None]:
from torchtext.legacy import data
BATCH_SIZE =1
iterator=data.Iterator(
    ran_dataset,
    batch_size = BATCH_SIZE,shuffle=False,device=device)

In [None]:
len(iterator)

In [None]:
path =  './drive/MyDrive/Colab Notebooks/final_net_3.pt'
checkpoint = torch.load(path)
#print(checkpoint)
model1 = checkpoint['model']
model1.load_state_dict( checkpoint['model_state_dict'])
min_loss = checkpoint['min_loss']
epoch = checkpoint['epoch']


In [None]:

for i,batch in enumerate(iterator):
  id1 = batch.Id.item()
  #print(batch)
  #print("Abstract : ")
  #print("".join(list(df.loc[df['Id'] == id1]['Abstract'])))
  #print("Actual Title : ")
  #print("".join(list(df.loc[df['Id'] == id1]['Title'])))
  #print("Generated Title : ")
  #print(" ".join(translate(model1,batch,10)))
  abs= "".join(list(df.loc[df['Id'] == id1]['Abstract']))
  title= "".join(list(df.loc[df['Id'] == id1]['Title']))
  df1['Abstract'].append(abs)
  df1['Title'].append(title)
  df1['Generated Title'].append(" ".join(translate(model1,batch,10)))
  #df1['Generated_Title'][id1]=translate(model1,batch,15))

  

In [None]:
df2 = pd.DataFrame(df1)

In [None]:
df2.to_csv('./drive/MyDrive/generated_titles_3.csv')

### Experiment 4 ; Simple Attention, encoder num layers = 1

In [None]:
INPUT_DIM = len(SUM.vocab)
OUTPUT_DIM = len(TITLE.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 1
ENC_DROPOUT = 0
DEC_DROPOUT = 0
SPLIT = 472
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
con = ControlLayer(HID_DIM,HID_DIM)
attention = SimpleAttention(HID_DIM,HID_DIM,HID_DIM,SPLIT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,HID_DIM,HID_DIM,attention,'simple')

model = Seq2Seq(enc,con, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
      #print(name)
      nn.init.uniform_(param.data, -0.1, 0.1)  
model.apply(init_weights)
pretrained_embeddings = SUM.vocab.vectors
model.encoder.embedding.weight.data.copy_(pretrained_embeddings)
model.decoder.embedding.weight.data.copy_(pretrained_embeddings)
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [None]:
def checkpoint_and_save(model, min_loss, epoch, optimizer):
    print()
    state = {'model': model,'min_loss': min_loss,'epoch': epoch,'model_state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),'train_iterator':train_iterator.state_dict(),'valid_iterator':valid_iterator.state_dict()}
    path =  './drive/MyDrive/Colab Notebooks/final_net_4.pt'
    torch.save(state, path)

In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
import time
N_EPOCHS = 20
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Epoch {epoch} - Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 6.323
	Test Loss: 6.135
Epoch 0 - Time taken : 5.285mins
Saving the new checkpoint....

	Train Loss: 5.392
	Test Loss: 5.861
Epoch 1 - Time taken : 5.340mins
Saving the new checkpoint....

	Train Loss: 4.816
	Test Loss: 5.798
Epoch 2 - Time taken : 5.329mins
Saving the new checkpoint....

	Train Loss: 4.308
	Test Loss: 5.779
Epoch 3 - Time taken : 5.347mins
Saving the new checkpoint....

	Train Loss: 3.804
	Test Loss: 5.884
Epoch 4 - Time taken : 5.355mins
Saving the new checkpoint....

	Train Loss: 3.371
	Test Loss: 5.995
Epoch 5 - Time taken : 5.338mins
Saving the new checkpoint....

	Train Loss: 2.981
	Test Loss: 6.151
Epoch 6 - Time taken : 5.328mins
Saving the new checkpoint....

	Train Loss: 2.615
	Test Loss: 6.297
Epoch 7 - Time taken : 5.348mins
Saving the new checkpoint....

	Train Loss: 2.300
	Test Loss: 6.453
Epoch 8 - Time taken : 5.355mins
Saving the new checkpoint....

	Train Loss: 1.988
	Test Loss: 6.672
Epoch 9 - Time taken : 5.337mins
Saving the new checkp

KeyboardInterrupt: ignored

#### Call translate (final_model_4)

In [None]:
df = pd.read_csv('./drive/MyDrive/data_summaries.csv')
idx = np.random.randint(0,df.shape[0],1000)
df1 = df.loc[idx]
df1.to_csv('./drive/MyDrive/test_data.csv')

In [None]:
df1={'Abstract':[],'Title':[],'Generated Title':[]}

In [None]:
df1

{'Abstract': [], 'Generated Title': [], 'Title': []}

In [None]:
ID = data.Field(use_vocab=False,sequential=False,preprocessing=int)
fields = [('Id',ID),('Abstract',None),('Title',TITLE),('sum1',SUM),('sum2',SUM),('sum3',SUM),('sum4',SUM),('sum5',SUM),('sum6',SUM),('sum7',SUM)]
ran_dataset = data.TabularDataset(path='./drive/MyDrive/test_data.csv',format='csv', fields=fields,skip_header=True)

In [None]:
from torchtext.legacy import data
BATCH_SIZE =1
iterator=data.Iterator(
    ran_dataset,
    batch_size = BATCH_SIZE,shuffle=False,device=device)

In [None]:
len(iterator)

1000

In [None]:
path =  './drive/MyDrive/Colab Notebooks/final_net_4.pt'
checkpoint = torch.load(path)
#print(checkpoint)
model1 = checkpoint['model']
model1.load_state_dict( checkpoint['model_state_dict'])
min_loss = checkpoint['min_loss']
epoch = checkpoint['epoch']

In [None]:
epoch

11

In [None]:
for i,batch in enumerate(iterator):
  id1 = batch.Id.item()
  #print(batch)
  print("Abstract : ")
  print("".join(list(df.loc[df['Id'] == id1]['Abstract'])))
  print("Actual Title : ")
  print("".join(list(df.loc[df['Id'] == id1]['Title'])))
  print("Generated Title : ")
  print(" ".join(translate(model1,batch,10)))
  abs= "".join(list(df.loc[df['Id'] == id1]['Abstract']))
  title= "".join(list(df.loc[df['Id'] == id1]['Title']))
  df1['Abstract'].append(abs)
  df1['Title'].append(title)
  df1['Generated Title'].append(" ".join(translate(model1,batch,10)))
  #df1['Generated_Title'][id1]=translate(model1,batch,15))

  

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
information, we developed a classification scheme based on neuro-fuzzy modeling
of the AU intensity, which is robust to intensity variations, 2) using both
geometric and appearance-based features, and applying efficient dimension
reduction techniques, our system is robust to illumination changes and it can
represent the subtle changes as well as temporal information involved in
formation of the facial expressions, and 3) by continuous values of intensity
and employing top-down hierarchical rule-based classifiers, we can develop
accurate human-interpretable AU-to-expression converters. Extensive experiments
on Cohn-Kanade database show the superiority of the proposed method, in
comparison with support vector machines, hidden Markov models, and neural
network classifiers. Keywords: biased discriminant analysis (BDA), classifier
design and evaluation, facial action units (AUs), hybrid learning, neuro-fuzzy
modeling.
Actual T

In [None]:
len(df1['Abstract'])

1000

In [None]:
df2 = pd.DataFrame(df1)

In [None]:
df2.to_csv('./drive/MyDrive/generated_titles_4.csv')

### Experiment 5 - Simple attention, encoder layer = 1, Hidden_dim = 128

In [None]:
INPUT_DIM = len(SUM.vocab)
OUTPUT_DIM = len(TITLE.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 128
N_LAYERS = 1
ENC_DROPOUT = 0
DEC_DROPOUT = 0
SPLIT = 100
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
con = ControlLayer(HID_DIM,HID_DIM)
'''trying with complex attention first'''
attention = SimpleAttention(HID_DIM,HID_DIM,HID_DIM,SPLIT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,HID_DIM,HID_DIM,attention,'simple')

model = Seq2Seq(enc,con, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
      #print(name)
      nn.init.uniform_(param.data, -0.1, 0.1)  
model.apply(init_weights)
pretrained_embeddings = SUM.vocab.vectors
model.encoder.embedding.weight.data.copy_(pretrained_embeddings)
model.decoder.embedding.weight.data.copy_(pretrained_embeddings)
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [None]:
def checkpoint_and_save(model, min_loss, epoch, optimizer):
    print()
    state = {'model': model,'min_loss': min_loss,'epoch': epoch,'model_state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),'train_iterator':train_iterator.state_dict(),'valid_iterator':valid_iterator.state_dict()}
    path =  './drive/MyDrive/Colab Notebooks/final_net_5.pt'
    torch.save(state, path)

In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
import time
N_EPOCHS = 10
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Epoch {epoch} - Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 6.501
	Test Loss: 6.345
Epoch 0 - Time taken : 2.972mins
Saving the new checkpoint....

	Train Loss: 5.800
	Test Loss: 6.123
Epoch 1 - Time taken : 2.960mins
Saving the new checkpoint....

	Train Loss: 5.406
	Test Loss: 5.982
Epoch 2 - Time taken : 2.960mins
Saving the new checkpoint....

	Train Loss: 5.114
	Test Loss: 5.951
Epoch 3 - Time taken : 2.966mins
Saving the new checkpoint....

	Train Loss: 4.878
	Test Loss: 5.868
Epoch 4 - Time taken : 2.954mins
Saving the new checkpoint....

	Train Loss: 4.638
	Test Loss: 5.849
Epoch 5 - Time taken : 2.962mins
Saving the new checkpoint....

	Train Loss: 4.431
	Test Loss: 5.868
Epoch 6 - Time taken : 2.969mins
Saving the new checkpoint....

	Train Loss: 4.223
	Test Loss: 5.908
Epoch 7 - Time taken : 2.970mins
Saving the new checkpoint....

	Train Loss: 4.053
	Test Loss: 5.912
Epoch 8 - Time taken : 2.976mins
Saving the new checkpoint....

	Train Loss: 3.871
	Test Loss: 5.967
Epoch 9 - Time taken : 2.970mins
Saving the new checkp

#### Resume Training

In [None]:
path =  './drive/MyDrive/Colab Notebooks/final_net_5.pt'
checkpoint = torch.load(path)
#print(checkpoint)
model = checkpoint['model']
model.load_state_dict( checkpoint['model_state_dict'])
min_loss = checkpoint['min_loss']
epoch = checkpoint['epoch']
optimizer1 = checkpoint['optimizer']

In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
optimizer.load_state_dict(optimizer1)
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
import time
N_EPOCHS = 20
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(10,N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Epoch {epoch} - Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 3.716
	Test Loss: 6.044
Epoch 10 - Time taken : 2.947mins
Saving the new checkpoint....

	Train Loss: 3.554
	Test Loss: 6.077
Epoch 11 - Time taken : 2.968mins
Saving the new checkpoint....

	Train Loss: 3.398
	Test Loss: 6.143
Epoch 12 - Time taken : 2.952mins
Saving the new checkpoint....

	Train Loss: 3.264
	Test Loss: 6.199
Epoch 13 - Time taken : 2.965mins
Saving the new checkpoint....

	Train Loss: 3.134
	Test Loss: 6.276
Epoch 14 - Time taken : 2.969mins
Saving the new checkpoint....

	Train Loss: 3.002
	Test Loss: 6.372
Epoch 15 - Time taken : 2.972mins
Saving the new checkpoint....

	Train Loss: 2.883
	Test Loss: 6.448
Epoch 16 - Time taken : 2.960mins
Saving the new checkpoint....

	Train Loss: 2.772
	Test Loss: 6.518
Epoch 17 - Time taken : 2.962mins
Saving the new checkpoint....

	Train Loss: 2.657
	Test Loss: 6.604
Epoch 18 - Time taken : 2.971mins
Saving the new checkpoint....

	Train Loss: 2.541
	Test Loss: 6.674
Epoch 19 - Time taken : 2.972mins
Saving the 

#### Run the model for 10 more epochs. 

In [None]:
import time
N_EPOCHS = 20
CLIP = 1
min_loss = 1000000
min_epoch = -1
train_loss_list = []
test_loss_list = []
for epoch in range(10,N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    #print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print(f'Epoch {epoch} - Time taken : {(end_time-start_time)/60:.3f}mins')
    if(train_loss < min_loss):
      min_loss=train_loss
      min_epoch = epoch
      print("Saving the new checkpoint....")
      checkpoint_and_save(model,min_loss,epoch,optimizer)
    if(epoch-min_epoch >= 10):
      print("NO further improvement over 10 epochs. Terminating...")
      break
    
   

	Train Loss: 2.460
	Test Loss: 6.773
Epoch 10 - Time taken : 2.893mins
Saving the new checkpoint....

	Train Loss: 2.366
	Test Loss: 6.848
Epoch 11 - Time taken : 2.911mins
Saving the new checkpoint....

	Train Loss: 2.272
	Test Loss: 6.957
Epoch 12 - Time taken : 2.899mins
Saving the new checkpoint....

	Train Loss: 2.178
	Test Loss: 7.015
Epoch 13 - Time taken : 2.901mins
Saving the new checkpoint....

	Train Loss: 2.087
	Test Loss: 7.127
Epoch 14 - Time taken : 2.909mins
Saving the new checkpoint....

	Train Loss: 2.035
	Test Loss: 7.195
Epoch 15 - Time taken : 2.900mins
Saving the new checkpoint....

	Train Loss: 1.947
	Test Loss: 7.300
Epoch 16 - Time taken : 2.891mins
Saving the new checkpoint....

	Train Loss: 1.861
	Test Loss: 7.396
Epoch 17 - Time taken : 2.903mins
Saving the new checkpoint....

	Train Loss: 1.804
	Test Loss: 7.463
Epoch 18 - Time taken : 2.898mins
Saving the new checkpoint....

	Train Loss: 1.735
	Test Loss: 7.582
Epoch 19 - Time taken : 2.895mins
Saving the 

#### Call Translate(final_model_5)

In [None]:
df1={'Abstract':[],'Title':[],'Generated Title':[]}

In [None]:
df1

{'Abstract': [], 'Generated Title': [], 'Title': []}

In [None]:
ID = data.Field(use_vocab=False,sequential=False,preprocessing=int)
fields = [('Id',ID),('Abstract',None),('Title',TITLE),('sum1',SUM),('sum2',SUM),('sum3',SUM),('sum4',SUM),('sum5',SUM),('sum6',SUM),('sum7',SUM)]
ran_dataset = data.TabularDataset(path='./drive/MyDrive/test_data.csv',format='csv', fields=fields,skip_header=True)

In [None]:
from torchtext.legacy import data
BATCH_SIZE =1
iterator=data.Iterator(
    ran_dataset,
    batch_size = BATCH_SIZE,shuffle=False,device=device)

In [None]:
path =  './drive/MyDrive/Colab Notebooks/final_net_5.pt'
checkpoint = torch.load(path)
#print(checkpoint)
model1 = checkpoint['model']
model1.load_state_dict( checkpoint['model_state_dict'])
min_loss = checkpoint['min_loss']
epoch = checkpoint['epoch']

In [None]:
for i,batch in enumerate(iterator):
  id1 = batch.Id.item()
  #print(batch)
  print("Abstract : ")
  print("".join(list(df.loc[df['Id'] == id1]['Abstract'])))
  print("Actual Title : ")
  print("".join(list(df.loc[df['Id'] == id1]['Title'])))
  print("Generated Title : ")
  print(" ".join(translate(model1,batch,10)))
  abs= "".join(list(df.loc[df['Id'] == id1]['Abstract']))
  title= "".join(list(df.loc[df['Id'] == id1]['Title']))
  df1['Abstract'].append(abs)
  df1['Title'].append(title)
  df1['Generated Title'].append(" ".join(translate(model1,batch,10)))
  #df1['Generated_Title'][id1]=translate(model1,batch,15))

  

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
information, we developed a classification scheme based on neuro-fuzzy modeling
of the AU intensity, which is robust to intensity variations, 2) using both
geometric and appearance-based features, and applying efficient dimension
reduction techniques, our system is robust to illumination changes and it can
represent the subtle changes as well as temporal information involved in
formation of the facial expressions, and 3) by continuous values of intensity
and employing top-down hierarchical rule-based classifiers, we can develop
accurate human-interpretable AU-to-expression converters. Extensive experiments
on Cohn-Kanade database show the superiority of the proposed method, in
comparison with support vector machines, hidden Markov models, and neural
network classifiers. Keywords: biased discriminant analysis (BDA), classifier
design and evaluation, facial action units (AUs), hybrid learning, neuro-fuzzy
modeling.
Actual T

In [None]:
df2 = pd.DataFrame(df1)
df2.to_csv('./drive/MyDrive/generated_titles_5.csv')

In [None]:
#Calculate BLEU SCORE

In [None]:
ref = []
pred= []
ref=[[t.split()] for t in df1['Title']]
pred=[t.split() for t in df1['Generated Title']]

In [None]:
ref=  [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']], [['No', 'Match']]]

pred = [['My', 'full', 'pytorch', 'test'], ['Another', 'Sentence']]

In [None]:
ref

In [None]:
from torchtext.data.metrics import bleu_score
bleu_score(pred,ref,weights=[1,0,0,0])

0.12435863708392587

# Final model selected = final_model_5 ( from experiment 5)
# Cherry Picking some examples from generated titles and comparing with Baseline

In [None]:
final_titles = pd.read_csv('./drive/MyDrive/generated_titles_5.csv')
base_titles = pd.read_csv('./drive/MyDrive/generated_titles_baseline.csv')

In [None]:
idx = [41,42,58,61,121,143,160,162,188,192,231,232,240,291,403,452,947,992,941,883] ##Cheery picking
compare_titles = base_titles.loc[idx]

In [None]:
compare_titles['Generated Title(Final)']=final_titles.loc[idx]['Generated Title']

In [None]:
compare_titles.columns = ['No.','Abstract', 'Actual title','Generated Title(Baseline)','Generated Title(Final)']

In [None]:
pd.set_option("display.max_colwidth",1000)
compare_titles.iloc[:,1:]

Unnamed: 0,Abstract,Actual title,Generated Title(Baseline),Generated Title(Final)
41,This paper reviews Kunchenko's polynomials using as template matching method\nto recognize template in one-dimensional input signal. Kunchenko's polynomials\nmethod is compared with classical methods - cross-correlation and sum of\nsquared differences according to numerical statistical example.,Kunchenko's Polynomials for Template Matching,a new framework for learning work quality of phylogenetic qualitative,polynomials for template matching
42,"Although many successful ensemble clustering approaches have been developed\nin recent years, there are still two limitations to most of the existing\napproaches. First, they mostly overlook the issue of uncertain links, which may\nmislead the overall consensus process. Second, they generally lack the ability\nto incorporate global information to refine the local links. To address these\ntwo limitations, in this paper, we propose a novel ensemble clustering approach\nbased on sparse graph representation and probability trajectory analysis. In\nparticular, we present the elite neighbor selection strategy to identify the\nuncertain links by locally adaptive thresholds and build a sparse graph with a\nsmall number of probably reliable links. We argue that a small number of\nprobably reliable links can lead to significantly better consensus results than\nusing all graph links regardless of their reliability. The random walk process\ndriven by a new transition probability matrix is util...",Robust Ensemble Clustering Using Probability Trajectories,a new framework for work single linkage optimization <eos>,ensemble clustering with sparse ensemble
58,"We study black-box attacks on machine learning classifiers where each query\nto the model incurs some cost or risk of detection to the adversary. We focus\nexplicitly on minimizing the number of queries as a major objective.\nSpecifically, we consider the problem of attacking machine learning classifiers\nsubject to a budget of feature modification cost while minimizing the number of\nqueries, where each query returns only a class and confidence score. We\ndescribe an approach that uses Bayesian optimization to minimize the number of\nqueries, and find that the number of queries can be reduced to approximately\none tenth of the number needed through a random strategy for scenarios where\nthe feature modification cost budget is low.",Query-limited Black-box Attacks to Classifiers,a new approach to work <unk> of <unk> ai based <eos>,black box attacks
61,"We consider probabilistic topic models and more recent word embedding\ntechniques from a perspective of learning hidden semantic representations.\nInspired by a striking similarity of the two approaches, we merge them and\nlearn probabilistic embeddings with online EM-algorithm on word co-occurrence\ndata. The resulting embeddings perform on par with Skip-Gram Negative Sampling\n(SGNS) on word similarity tasks and benefit in the interpretability of the\ncomponents. Next, we learn probabilistic document embeddings that outperform\nparagraph2vec on a document similarity task and require less memory and time\nfor training. Finally, we employ multimodal Additive Regularization of Topic\nModels (ARTM) to obtain a high sparsity and learn embeddings for other\nmodalities, such as timestamps and categories. We observe further improvement\nof word similarity performance and meaningful inter-modality similarities.",Interpretable probabilistic embeddings: bridging the gap between topic\n models and neural networks,a filters of work <unk> roughly authorship pseudo likelihood and and,probabilistic embeddings bridging the topic models
121,"The production of color language is essential for grounded language\ngeneration. Color descriptions have many challenging properties: they can be\nvague, compositionally complex, and denotationally rich. We present an\neffective approach to generating color descriptions using recurrent neural\nnetworks and a Fourier-transformed color representation. Our model outperforms\nprevious work on a conditional language modeling task over a large corpus of\nnaturalistic color descriptions. In addition, probing the model's output\nreveals that it can accurately produce not only basic color terms but also\ndescriptors with non-convex denotations (""greenish""), bare modifiers (""bright"",\n""dull""), and compositional phrases (""faded teal"") not seen in training.",Learning to Generate Compositional Color Descriptions,<unk> a <unk> for hmm flaws structure <eos>,to generate compositional color descriptions
143,"Short-term tracking is an open and challenging problem for which\ndiscriminative correlation filters (DCF) have shown excellent performance. We\nintroduce the channel and spatial reliability concepts to DCF tracking and\nprovide a novel learning algorithm for its efficient and seamless integration\nin the filter update and the tracking process. The spatial reliability map\nadjusts the filter support to the part of the object suitable for tracking.\nThis both allows to enlarge the search region and improves tracking of\nnon-rectangular objects. Reliability scores reflect channel-wise quality of the\nlearned filters and are used as feature weighting coefficients in localization.\nExperimentally, with only two simple standard features, HoGs and Colornames,\nthe novel CSR-DCF method -- DCF with Channel and Spatial Reliability --\nachieves state-of-the-art results on VOT 2016, VOT 2015 and OTB100. The CSR-DCF\nruns in real-time on a CPU.",Discriminative Correlation Filter with Channel and Spatial Reliability,a new approach to work <unk> of audience ai based using,correlation filter for for multi object tracking
160,"We consider the problem of using sentence compression techniques to\nfacilitate query-focused multi-document summarization. We present a\nsentence-compression-based framework for the task, and design a series of\nlearning-based compression models built on parse trees. An innovative beam\nsearch decoder is proposed to efficiently find highly probable compressions.\nUnder this framework, we show how to integrate various indicative metrics such\nas linguistic motivation and query relevance into the compression process by\nderiving a novel formulation of a compression scoring function. Our best model\nachieves statistically significant improvement over the state-of-the-art\nsystems on several metrics (e.g. 8.0% and 5.4% improvements in ROUGE-2\nrespectively) for the DUC 2006 and 2007 summarization task.",A Sentence Compression Based Framework to Query-Focused Multi-Document\n Summarization,a bayesian model for leverages developments digital in a squared loss,sentence compression for automatic text summarization
162,"Neural sequence-to-sequence models have provided a viable new approach for\nabstractive text summarization (meaning they are not restricted to simply\nselecting and rearranging passages from the original text). However, these\nmodels have two shortcomings: they are liable to reproduce factual details\ninaccurately, and they tend to repeat themselves. In this work we propose a\nnovel architecture that augments the standard sequence-to-sequence attentional\nmodel in two orthogonal ways. First, we use a hybrid pointer-generator network\nthat can copy words from the source text via pointing, which aids accurate\nreproduction of information, while retaining the ability to produce novel words\nthrough the generator. Second, we use coverage to keep track of what has been\nsummarized, which discourages repetition. We apply our model to the CNN / Daily\nMail summarization task, outperforming the current abstractive state-of-the-art\nby at least 2 ROUGE points.",Get To The Point: Summarization with Pointer-Generator Networks,a new framework for work single linkage learning of deep neural,abstractive summarization with pointer generator networks
188,"Ezhil is a Tamil language based interpreted procedural programming language.\nTamil keywords and grammar are chosen to make the native Tamil speaker write\nprograms in the Ezhil system. Ezhil allows easy representation of computer\nprogram closer to the Tamil language logical constructs equivalent to the\nconditional, branch and loop statements in modern English based programming\nlanguages. Ezhil is a compact programming language aimed towards Tamil speaking\nnovice computer users. Grammar for Ezhil and a few example programs are\nreported here, from the initial proof-of-concept implementation using the\nPython programming language1. To the best of our knowledge, Ezhil language is\nthe first freely available Tamil programming language.",Ezhil: A Tamil Programming Language,<unk> a deep neural network for standardized learning <eos>,a tamil programming language
192,"Modeling emotional-cognition is in a nascent stage and therefore wide-open\nfor new ideas and discussions. In this paper the author looks at the modeling\nproblem by bringing in ideas from axiomatic mathematics, information theory,\ncomputer science, molecular biology, non-linear dynamical systems and quantum\ncomputing and explains how ideas from these disciplines may have applications\nin modeling emotional-cognition.",A novice looks at emotional cognition,a new approach to work autoencoding problem problem <eos>,novice looks at emotional cognition
