In [None]:
# Mount Google Drive at /content/drive; the notebook reads its CSV data and
# writes its checkpoint under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import json
import matplotlib as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data.sampler import BatchSampler
from torch.optim import lr_scheduler
from PIL import Image
import timeit
from sklearn.pipeline import Pipeline
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy import data
import random
## For reproducibility
# Seed PyTorch, NumPy and Python's `random` with the same fixed value.
torch.manual_seed(0)
# Request deterministic cuDNN behaviour and turn cuDNN benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)
random.seed(0)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Create Preprocessing Pipeline

In [None]:
def tokenize(s):
    """Split a string into tokens on runs of whitespace."""
    return s.split()

In [None]:
import re  
def cleanup_text(texts):
    """Normalise a list of tokens before they enter the vocabulary.

    For every token:
      * each character outside ``[a-zA-Z0-9]`` (punctuation, newlines, ...)
        is replaced by a space,
      * runs of spaces are collapsed and surrounding spaces are stripped,
      * every digit is replaced by ``#``.

    Tokens that are empty after cleaning (e.g. a lone ``-``) are dropped, so
    the result may be shorter than the input.

    Args:
        texts: iterable of token strings.

    Returns:
        list of cleaned, non-empty token strings.
    """
    cleaned_text = []
    for text in texts:
        # Replace punctuation (and any newline) with a space.
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        # Collapse repeated spaces, then strip the edges so that "videos."
        # and "videos" map to the same vocabulary entry instead of
        # "videos " and "videos".
        text = re.sub(r' +', ' ', text).strip()
        # Replace digits with the '#' symbol.
        text = re.sub('[0-9]', '#', text)
        if text:
            cleaned_text.append(text)
    return cleaned_text

In [None]:
# Download NLTK's stop-word corpus and keep the English list in `stop`.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Create two torch text fields

In [None]:
# Abstract (source) field: whitespace-tokenised, lower-cased, English stop
# words removed, wrapped in <sos>/<eos>, padded at the front, then cleaned.
ABS = data.Field(
    tokenize=tokenize,
    init_token='<sos>',
    eos_token='<eos>',
    stop_words=stop,
    pad_first=True,
    lower=True,
    preprocessing=cleanup_text,
)
# Title (target) field: same pipeline, but stop words are kept and the
# default padding side is used.
TITLE = data.Field(
    tokenize=tokenize,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    preprocessing=cleanup_text,
)

#### Read tabular dataset

In [None]:
# (column name, field) pairs handed to TabularDataset for the abstract/title CSV.
fields = [('Abstract',ABS),('Title',TITLE)]

In [None]:
# Load the abstract/title pairs from the CSV on the mounted drive, skipping its header row.
dataset = data.TabularDataset(path='./drive/MyDrive/data.csv',format='csv', fields=fields,skip_header=True)

In [None]:
import random
# `random.seed` returns None, so it must not be passed as `random_state`.
# Seed the generator first, then hand its state to `split` so the 90/10
# train/validation partition is explicitly reproducible.
random.seed(0)
train_data, valid_data = dataset.split(split_ratio=0.9, random_state=random.getstate())

In [None]:
print(len(train_data.examples))
print(len(valid_data.examples))

36900
4100


In [None]:
train_data[0].Abstract

['human',
 'communication',
 'typically',
 'underlying',
 'structure ',
 'reflected',
 'fact',
 'many',
 'user',
 'generated',
 'videos ',
 'starting',
 'point ',
 'ending ',
 'certain',
 'objective',
 'steps',
 'two',
 'identified ',
 'paper ',
 'propose',
 'method',
 'parsing',
 'video',
 'semantic',
 'steps',
 'unsupervised',
 'way ',
 'proposed',
 'method',
 'capable',
 'providing',
 'semantic',
 ' storyline ',
 'video',
 'composed',
 'objective',
 'steps ',
 'accomplish',
 'using',
 'visual',
 'language',
 'cues',
 'joint',
 'generative',
 'model ',
 'proposed',
 'method',
 'also',
 'provide',
 'textual',
 'description',
 'identified',
 'semantic',
 'steps',
 'video',
 'segments ',
 'evaluate',
 'method',
 'large',
 'number',
 'complex',
 'youtube',
 'videos',
 'show',
 'results',
 'unprecedented',
 'quality',
 'intricate',
 'impactful',
 'problem ']

In [None]:
# Build a single vocabulary from the training abstracts and titles
# (max_size=40000, min_freq=2).
ABS.build_vocab(train_data.Abstract,train_data.Title,max_size=40000,min_freq=2)

# Point the title field at the same vocabulary object so both fields share indices.
TITLE.vocab= ABS.vocab

In [None]:
len(TITLE.vocab)

40004

In [None]:
ABS.vocab.freqs['of']

9616

In [None]:
assert(TITLE.vocab.stoi ==  ABS.vocab.stoi) #check if both share the same vocab or not

#### Create Iterator

In [None]:
BATCH_SIZE = 64
# Bucketed iterators over the train/validation splits; examples are keyed by
# abstract length and each batch is sorted by that key.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.Abstract),
    sort_within_batch=True,
    shuffle=True,
    sort=False,
    device=device,
)

In [None]:
len(valid_iterator.dataset)

4100

## Baseline Model


In [None]:
class Encoder(nn.Module):
    """Embed a source sequence and summarise it with a multi-layer LSTM.

    Args:
        input_dim: size of the source vocabulary (rows of the embedding table).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and between
            LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, num_layers, dropout):
        super().__init__()
        self.hid_dim, self.num_layers = hid_dim, num_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_idx):
        """Encode ``input_idx`` ([src_len, batch_size] token indices).

        Returns:
            (hidden, cell): the LSTM's final states, each of shape
            [num_layers, batch_size, hid_dim]. The per-step outputs of the
            top layer are discarded.
        """
        # [src_len, batch_size, emb_dim]
        token_vectors = self.embedding(input_idx)
        lstm_input = self.dropout(token_vectors)
        _, final_state = self.lstm(lstm_input)
        hidden, cell = final_state
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next token.

    Args:
        input_dim: size of the vocabulary fed to the embedding table.
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and between
            LSTM layers.
        output_dim: number of output classes of the final linear layer.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, num_layers, dropout, output_dim):
        super().__init__()
        self.hid_dim = hid_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.output_dim = output_dim

    def forward(self, input_idx, context_vector, cell_state):
        """Decode one time step.

        Args:
            input_idx: [batch_size] indices of the previous token.
            context_vector: LSTM hidden state, [num_layers, batch_size, hid_dim].
            cell_state: LSTM cell state, same shape as ``context_vector``.

        Returns:
            (prediction, hidden, cell) where ``prediction`` holds the
            unnormalised scores of shape [batch_size, output_dim] and
            ``hidden``/``cell`` are the updated LSTM states.
        """
        # Add a leading sequence dimension of length 1: tokens are fed one at a time.
        step_input = input_idx.unsqueeze(0)
        # [1, batch_size, emb_dim]
        embedded = self.dropout(self.embedding(step_input))
        state_in = (context_vector, cell_state)
        outputs, (hidden, cell) = self.lstm(embedded, state_in)
        # [1, batch_size, hid_dim] -> [batch_size, hid_dim]
        flat_outputs = outputs.reshape(-1, self.hid_dim)
        prediction = self.fc(flat_outputs)
        return prediction, hidden, cell


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: module mapping source indices [src_len, batch_size] to a
            ``(hidden, cell)`` pair.
        decoder: module called as ``decoder(tokens, hidden, cell)`` that
            returns ``(scores, hidden, cell)`` and exposes ``output_dim``.
        device: device on which the prediction buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input_batches, output_batches, tfr=0.5):
        """Produce per-step vocabulary scores for ``output_batches``.

        Args:
            input_batches: source indices, [src_len, batch_size].
            output_batches: target indices, [title_len, batch_size]; row 0 is
                used as the first decoder input (the <sos> token).
            tfr: teacher-forcing ratio, i.e. the probability of feeding the
                ground-truth token rather than the model's own best guess at
                each step.

        Returns:
            Tensor [title_len, batch_size, decoder.output_dim]. Row 0 is left
            as zeros because no prediction is made for the start token.
        """
        batch_size = input_batches.shape[1]
        title_len = output_batches.shape[0]
        title_vocab_size = self.decoder.output_dim
        # Allocate on the device this module was configured with, not on a
        # notebook-level global, so the class works wherever it is used.
        predictions = torch.zeros(
            title_len, batch_size, title_vocab_size, device=self.device
        )

        # hidden_state / cell_state: [num_layers, batch_size, hid_dim]
        hidden_state, cell_state = self.encoder(input_batches)

        x = output_batches[0, :]  # trigger token <sos>

        for i in range(1, title_len):
            pred, hidden_state, cell_state = self.decoder(x, hidden_state, cell_state)
            # pred: [batch_size, output_dim]
            predictions[i] = pred
            best_guess = pred.argmax(1)
            # One coin flip per step decides between teacher forcing and
            # feeding back the model's own prediction.
            x = output_batches[i, :] if random.random() < tfr else best_guess
        return predictions

In [None]:
# Vocabulary sizes of the abstract and title fields.
INPUT_DIM = len(ABS.vocab)
OUTPUT_DIM = len(TITLE.vocab)
# Embedding sizes, LSTM hidden size / depth and dropout rates for the two halves.
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# The decoder's embedding table is sized with INPUT_DIM and its output layer with OUTPUT_DIM.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(INPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT,OUTPUT_DIM)

# Combine both halves and move the whole model to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
def init_weights(m):
    """Overwrite every parameter of ``m`` in place with draws from U(-0.1, 0.1)."""
    for param in m.parameters():
        nn.init.uniform_(param.data, -0.1, 0.1)
        
# Module.apply runs init_weights on every submodule and on the model itself.
model.apply(init_weights)


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(40004, 100)
    (lstm): LSTM(100, 512, num_layers=3, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(40004, 100)
    (lstm): LSTM(100, 512, num_layers=3, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=512, out_features=40004, bias=True)
  )
)

In [None]:
# Adam over all model parameters with a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(),lr=0.001)
# Index of the title field's padding token; the loss skips targets equal to it.
PAD_IDX = TITLE.vocab.stoi[TITLE.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip, teacher_forcing_ratio=0.5):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as ``model(abstract, title, teacher_forcing_ratio)``;
            must return scores of shape [title_len, batch_size, vocab].
        iterator: sized iterable of batches exposing ``.Abstract`` and
            ``.Title`` tensors of shape [seq_len, batch_size].
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking ([N, vocab] scores, [N] targets).
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        teacher_forcing_ratio: probability forwarded to the model for feeding
            ground-truth tokens to the decoder. Defaults to 0.5, the value
            that used to be hard-coded here.

    Returns:
        float: sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()

    epoch_loss = 0

    for batch in iterator:
        abstract = batch.Abstract
        title = batch.Title

        optimizer.zero_grad()
        predictions = model(abstract, title, teacher_forcing_ratio)
        output_dim = predictions.shape[-1]

        # Position 0 corresponds to the <sos> token and is never predicted,
        # so it is dropped from both the scores and the targets.
        predictions = predictions[1:].view(-1, output_dim)
        title = title[1:].view(-1)

        loss = criterion(predictions, title)
        loss.backward()

        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
def test(model, iterator, criterion):    
    model.eval() 
    epoch_loss = 0 
    with torch.no_grad():   
        for i, batch in enumerate(iterator):
          abstract = batch.Abstract
          title = batch.Title
          #abstract,title = [seq_len,batch_size]
          predictions = model(abstract, title,0)
          #predictions = [seq_len_title,batch_size,title_vocab]
          output_dim = predictions.shape[-1]
          predictions = predictions[1:].view(-1, output_dim)#ignoring the first value is the <sos> token
          title = title[1:].view(-1)
          loss = criterion(predictions, title)  
          epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [None]:
#to generate title for one abstract
def translate(model,abs,max_length):
    """Greedily generate a title for a single raw abstract.

    Args:
        model: trained model exposing ``.encoder`` and ``.decoder``.
        abs: the abstract as one string. (The name shadows the builtin inside
            this function; it is kept so existing callers keep working.)
        max_length: maximum number of decoding steps.

    Returns:
        list of str: the generated tokens without the leading ``<sos>``. The
        list ends with ``'<eos>'`` when the model emitted it within
        ``max_length`` steps.
    """
    vocab = ABS.vocab
    # Run the field's own pipeline (tokenise, lower-case, stop-word removal,
    # cleanup_text) so inference sees exactly what training saw.
    tokens = ABS.preprocess(abs)

    sos_idx = vocab.stoi['<sos>']
    eos_idx = vocab.stoi['<eos>']
    unk_idx = vocab.stoi['<unk>']
    # `.get` with an explicit fallback maps out-of-vocabulary words to <unk>
    # without relying on the mapping's behaviour for missing keys.
    num_abs = [sos_idx] + [vocab.stoi.get(w, unk_idx) for w in tokens] + [eos_idx]

    # Build the [src_len, 1] input on the device the model lives on.
    model_device = next(model.parameters()).device
    num_abs = torch.LongTensor(num_abs).unsqueeze(1).to(model_device)

    # Dropout must be off while generating; restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            hidden_state, cell_state = model.encoder(num_abs)

            # Start from <sos> and feed back the best guess at each step.
            pred = [sos_idx]
            for _ in range(max_length):
                previous_word = torch.LongTensor([pred[-1]]).to(model_device)
                output, hidden_state, cell_state = model.decoder(
                    previous_word, hidden_state, cell_state
                )
                best_guess = output.argmax(1).item()
                pred.append(best_guess)

                # Model predicts it's the end of the sentence.
                if best_guess == eos_idx:
                    break
    finally:
        model.train(was_training)

    return [vocab.itos[idx] for idx in pred[1:]]

In [None]:
demo_sentence = "We propose an architecture for VQA which utilizes recurrent layers to\ngenerate visual and textual attention. The memory characteristic of the\nproposed recurrent attention units offers a rich joint embedding of visual and\ntextual features and enables the model to reason relations between several\nparts of the image and question. Our single model outperforms the first place\nwinner on the VQA 1.0 dataset, performs within margin to the current\nstate-of-the-art ensemble model. We also experiment with replacing attention\nmechanisms in other state-of-the-art models with our implementation and show\nincreased accuracy. In both cases, our recurrent attention mechanism improves\nperformance in tasks requiring sequential or relational reasoning on the VQA\ndataset"


In [None]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(40004, 100)
    (lstm): LSTM(100, 512, num_layers=3, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(40004, 100)
    (lstm): LSTM(100, 512, num_layers=3, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=512, out_features=40004, bias=True)
  )
)

In [None]:
def checkpoint_and_save(model, min_loss, epoch, optimizer,
                        path='./drive/MyDrive/Colab Notebooks/net.pt'):
    """Write a training checkpoint to ``path``.

    The checkpoint is a dict with the keys ``model`` (the module object
    itself), ``min_loss``, ``epoch``, ``model_state_dict`` and ``optimizer``
    (the optimizer's state dict).

    Args:
        model: module to checkpoint.
        min_loss: best loss seen so far; stored as-is.
        epoch: epoch index at which the checkpoint is taken.
        optimizer: optimizer whose state dict is stored.
        path: destination file. Defaults to the location that used to be
            hard-coded here.
    """
    print()
    state = {
        'model': model,
        'min_loss': min_loss,
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state, path)

In [None]:
import time

N_EPOCHS = 20
CLIP = 1
PATIENCE = 10  # epochs to wait for a better held-out loss before stopping

min_loss = float('inf')
min_epoch = -1
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    test_loss = test(model,valid_iterator,criterion)

    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    print("After epoch {} , generated title is {}".format(epoch,translate(model,demo_sentence,10)))
    end_time = time.time()
    print("Time taken : ",((end_time-start_time)/60),'min')
    # Select the checkpoint on the held-out loss. The training loss keeps
    # falling even while the model overfits, so using it would overwrite the
    # checkpoint every epoch and make the patience check below meaningless.
    if test_loss < min_loss:
        min_loss = test_loss
        min_epoch = epoch
        print("Saving the new checkpoint....")
        checkpoint_and_save(model,min_loss,epoch,optimizer)
    if epoch - min_epoch >= PATIENCE:
        print(f"NO further improvement over {PATIENCE} epochs. Terminating...")
        break
    
   

	Train Loss: 6.676
	Test Loss: 6.513
After epoch 0 , generated title is ['a', 'the', 'of', 'of', 'for', '<eos>']
Time taken :  3.70690389474233 min
Saving the new checkpoint....

	Train Loss: 6.297
	Test Loss: 6.521
After epoch 1 , generated title is ['<unk>', 'of', 'of', 'for', 'for', '<eos>']
Time taken :  3.745437733332316 min
Saving the new checkpoint....

	Train Loss: 6.168
	Test Loss: 6.698
After epoch 2 , generated title is ['a', '<unk>', 'approach', 'for', 'for', 'for', '<eos>']
Time taken :  3.7392032027244566 min
Saving the new checkpoint....

	Train Loss: 6.065
	Test Loss: 6.548
After epoch 3 , generated title is ['<unk>', 'a', 'of', 'for', 'for', '<eos>']
Time taken :  3.733981561660767 min
Saving the new checkpoint....

	Train Loss: 5.962
	Test Loss: 6.500
After epoch 4 , generated title is ['<unk>', 'deep', 'neural', 'networks', 'for', 'image', 'recognition', '<eos>']
Time taken :  3.7391348679860434 min
Saving the new checkpoint....

	Train Loss: 5.830
	Test Loss: 6.438


### Some more examples 


In [None]:
path =  './drive/MyDrive/Colab Notebooks/net.pt'
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only
# one. NOTE: the checkpoint stores a pickled module, so only load files that
# this notebook produced.
checkpoint = torch.load(path, map_location=device)
model1 = checkpoint['model']
model1.load_state_dict( checkpoint['model_state_dict'])
# Keep the restored model and its recorded device in sync with this runtime,
# and switch dropout off for generation.
model1 = model1.to(device)
model1.device = device
model1.eval()
min_loss = checkpoint['min_loss']
epoch = checkpoint['epoch']

In [None]:
# Raw table used further down to look up the original abstract and title text by Id.
df = pd.read_csv('./drive/MyDrive/data_summaries.csv')

<br>(test_data.csv already contains 1000 randomly selected abstracts)


In [None]:
# Column-wise accumulator for the results table (turned into a DataFrame later).
df1={'Abstract':[],'Title':[],'Generated Title':[]}
# Numeric Id column: not tokenised, no vocabulary, each value cast to int.
ID = data.Field(use_vocab=False,sequential=False,preprocessing=int)
# Columns paired with None (sum1..sum7) get no field — presumably torchtext skips them; TODO confirm.
fields = [('Id',ID),('Abstract',ABS),('Title',TITLE),('sum1',None),('sum2',None),('sum3',None),('sum4',None),('sum5',None),('sum6',None),('sum7',None)]
ran_dataset = data.TabularDataset(path='./drive/MyDrive/test_data.csv',format='csv', fields=fields,skip_header=True)

In [None]:
from torchtext.legacy import data
# One example per batch, in dataset order. Note this rebinds the BATCH_SIZE
# that was set to 64 for the training iterators.
BATCH_SIZE =1
iterator=data.Iterator(
    ran_dataset,
    batch_size = BATCH_SIZE,shuffle=False,device=device)

Generate titles and store them in a file called "generated_titles_baseline.csv"

In [None]:
# For every test example, look up the original (uncleaned) abstract and title
# by Id and record them next to the title generated by the restored model.
for batch in iterator:
    id1 = batch.Id.item()
    # Filter the frame once and reuse the matching rows for both columns.
    rows = df.loc[df['Id'] == id1]
    # Named *_text so the builtin `abs` is not shadowed for the rest of the notebook.
    abstract_text = "".join(rows['Abstract'])
    title_text = "".join(rows['Title'])

    df1['Abstract'].append(abstract_text)
    df1['Title'].append(title_text)
    df1['Generated Title'].append(" ".join(translate(model1, abstract_text, 10)))

  

In [None]:
# Turn the collected columns into a DataFrame and write it to the mounted drive.
df2 = pd.DataFrame(df1)
df2.to_csv('./drive/MyDrive/generated_titles_baseline.csv')