# Packages

In [None]:
# Install the third-party packages imported in the next cell.
# `bleu` provides `list_bleu`, which the import cell pulls in; without it a fresh
# runtime fails on `from bleu import list_bleu`.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install rouge --quiet
%pip install bleu --quiet
%pip install transformers --quiet

  Building wheel for bleu (setup.py) ... [?25l[?25hdone
  Building wheel for efficiency (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 2.1MB 15.5MB/s 
[K     |████████████████████████████████| 3.3MB 55.6MB/s 
[K     |████████████████████████████████| 901kB 48.1MB/s 
[?25h

# Imports

In [None]:
import re
import json
import numpy as np
import pandas as pd

from rouge import Rouge 
from bleu import list_bleu
from tqdm import tqdm_notebook

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import EncoderDecoderModel, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup

from sklearn.model_selection import train_test_split

# Report whether CUDA is usable and which torch build is installed
cuda_available = torch.cuda.is_available()
torch_version = torch.__version__
print("GPU Torch Available = {}".format(cuda_available))
print("Torch Version = {}".format(torch_version))

GPU Torch Available = True
Torch Version = 1.8.1+cu101


# Encoder-Decoder Model

In [None]:
# Model Selection

# Regular Models
bert_base_cased = 'bert-base-cased'
roberta_base = 'roberta-base'
gpt2 = 'gpt2'
electra = 'google/electra-small-discriminator'
t5_base = 't5-base'
bart = 'facebook/bart-base'

# Heavy Memory Dependent Models (For High RAM and High GPU Systems)
bert_large_cased = 'bert-large-cased'
roberta_large = 'roberta-large'
gpt2_medium = 'gpt2-medium'
t5_large = 't5-large'
bart_large = 'facebook/bart-large'

# Select Pretrained Weights
Pretrained_Weight = bert_base_cased                  # Select Pretrained Weights

# Encoder-Decoder (the same checkpoint initialises both the encoder and the decoder)
seq2seq = EncoderDecoderModel.from_encoder_decoder_pretrained(Pretrained_Weight, Pretrained_Weight)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(Pretrained_Weight)

# Set Special Tokens
# BERT-style tokenizers define no BOS/EOS tokens, so `bos_token_id` / `eos_token_id`
# are None for the default checkpoint. Fall back to [CLS] / [SEP], which is what the
# tokenizer actually places at the start / end of every encoded title.
decoder_start_id = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.cls_token_id
end_of_sequence_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.sep_token_id
seq2seq.config.decoder_start_token_id = decoder_start_id
seq2seq.config.eos_token_id = end_of_sequence_id
seq2seq.config.pad_token_id = tokenizer.pad_token_id

# Parameters for Beam Search
seq2seq.config.vocab_size = seq2seq.config.decoder.vocab_size
seq2seq.config.max_length = 142
# The previous minimum (56) was larger than the max_length handed to generate() later
# in this notebook (20 and 24) and than the 32-token title length, so with a defined
# EOS token no hypothesis could ever end on EOS. Keep only a small floor (tunable)
# that stops EOS from being chosen in the very first decoding steps.
seq2seq.config.min_length = 3
seq2seq.config.no_repeat_ngram_size = 3
seq2seq.config.early_stopping = True
seq2seq.config.length_penalty = 2.0
seq2seq.config.num_beams = 6

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.c

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




# Dataset Loading and Preprocessing

In [None]:
# Mount Google Drive at /content/drive; the dataset and model-weight paths used
# in later cells all live under this mount point (Colab-only step).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Loading Dataset
# Read the spreadsheet with explicit column names, then discard the ID column.
file = '/content/drive/MyDrive/Title Generation/Dataset/Dataset_Title_Summarization_of_Various_Summaries.xlsx'
column_names = ['ID', 'Combined Abstract', 'Title']
df = pd.read_excel(file, names = column_names).drop(columns = ['ID'])
df

In [None]:
# Train Test Split of Dataset
# 25% of the rows are held out; the fixed random_state keeps the partition repeatable.
train_df, test_df = train_test_split(df, random_state = 42, test_size = 0.25)
n_train_rows, n_test_rows = len(train_df), len(test_df)
print('Train Dataset Length = {}'.format(n_train_rows))
print('Test Dataset Length  = {}'.format(n_test_rows))

In [None]:
# Data Preparation into Pandas Dataframe for Model Input
def get_data(dataframe):
  """Return a new frame with columns 'Abstract' and 'Title' built from a split.

  The 'Combined Abstract' and 'Title' columns are copied out as plain lists, so the
  returned frame gets a fresh 0..n-1 index instead of the index of `dataframe`.
  """
  column_values = {
      'Abstract': list(dataframe['Combined Abstract']),
      'Title': list(dataframe['Title']),
  }
  return pd.DataFrame(column_values, columns = ['Abstract', 'Title'])

# Convert both splits into the two-column layout used by the dataset class
train_data = get_data(train_df)
test_data = get_data(test_df)

# Preview the first three rows of each split
print('Training Data:')
print(train_data.head(3))
print('\nTesting Data:')
print(test_data.head(3))

Training Data:
                                            Abstract                                              Title
0  to this end we instantiate two policy gradient...                      Variational Intrinsic Control
1  ReCTnet reaches a detection sensitivity of 90....   Recurrent Convolutional Networks for Pulmonar...
2  the proposed method has a set of parameters th...   Discovering the Graph Structure in the Cluste...

Testing Data:
                                            Abstract                                              Title
0  independence tests using the optimized feature...   An Adaptive Test of Independence with Analyti...
1  in this paper we will provide a method in whic...   A Mixed Observability Markov Decision Process...
2  our results show several interesting findings ...   Learning Multi Relational Semantics Using Neu...


In [None]:
# Data Preparation for Seq2Seq Model Input
class CustomDataset(Dataset):
    """Tokenized (abstract, title) pairs for the encoder-decoder model.

    Args:
        dataframe: frame with an 'Abstract' and a 'Title' column.
        tokenizer: tokenizer exposing `encode_plus`; its output must contain
            'input_ids' and 'attention_mask'.
        max_len_enc: length every abstract is truncated / padded to.
        max_len_dec: length every title is truncated / padded to.
        mask_label_padding: when True, label positions whose decoder attention mask
            is 0 are replaced by -100 (the default `ignore_index` of
            `torch.nn.CrossEntropyLoss`) so padding does not contribute to the loss.
            Defaults to False, which returns the padded title ids unchanged as labels.
    """

    def __init__(self, dataframe, tokenizer, max_len_enc, max_len_dec, mask_label_padding = False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.abstract = dataframe['Abstract']
        self.title = dataframe['Title']
        self.encoder_max_len = max_len_enc
        self.decoder_max_len = max_len_dec
        self.mask_label_padding = mask_label_padding

    def __len__(self):
        return len(self.abstract)

    def _encode(self, text, max_length):
        """Tokenize `text` to exactly `max_length` ids; return (ids, attention mask)."""
        encoded = self.tokenizer.encode_plus(text,
                                             truncation=True,
                                             add_special_tokens=True,
                                             max_length = max_length,
                                             padding = 'max_length',
                                             return_token_type_ids = False)
        return encoded['input_ids'], encoded['attention_mask']

    def __getitem__(self, index):
        """Return the tensors for the example at *position* `index`.

        Samplers hand out positions 0..len-1, so rows are fetched with `.iloc`;
        a label-based lookup would fail (or pick the wrong row) for any frame whose
        index is not already 0..n-1, e.g. a shuffled train/test split.
        """
        # Abstract Tokenization
        input_ids, input_mask = self._encode(str(self.abstract.iloc[index]), self.encoder_max_len)

        # Title Tokenization
        output_ids, output_mask = self._encode(str(self.title.iloc[index]), self.decoder_max_len)

        # Labels mirror the title ids; optionally hide padded positions from the loss
        labels = output_ids
        if self.mask_label_padding:
            labels = [token if keep else -100 for token, keep in zip(output_ids, output_mask)]

        return {'input_ids': torch.tensor(input_ids, dtype=torch.long),
                'attention_mask': torch.tensor(input_mask, dtype=torch.long),
                'decoder_input_ids': torch.tensor(output_ids, dtype=torch.long),
                'decoder_attention_mask' : torch.tensor(output_mask, dtype=torch.long),
                'labels': torch.tensor(labels, dtype=torch.long)}

ENCODER_MAX_LEN = 256   # Encoder Max Sequence Length (Change)
DECODER_MAX_LEN = 32    # Decoder Max Sequence Length

# Wrap the training and testing frames with the same length limits
training_set, testing_set = (
    CustomDataset(split_frame, tokenizer, ENCODER_MAX_LEN, DECODER_MAX_LEN)
    for split_frame in (train_data, test_data)
)

# Training

In [None]:
# Device Mapping Select (GPU or CPU)
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")
if cuda_ready:
  print("Model Mapped CUDA::GPU")
  seq2seq = seq2seq.cuda()

# Early Stopping Callback Setup
early_stop = EarlyStoppingCallback(early_stopping_patience = 3, early_stopping_threshold = 0.01)

# Learning Rate Schedulers (names accepted by `lr_scheduler_type`; kept for reference)
scheduler_options = ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup']

# Training Parameters (Should be Tuned)
training_config = dict(
    seed = 42,
    output_dir = "./models/model_name",
    overwrite_output_dir = True,
    evaluation_strategy = "epoch",
    do_train = True,
    do_eval = True,
    learning_rate = 5e-5,
    lr_scheduler_type = 'polynomial',
    weight_decay = 0.01,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    predict_with_generate = False,
    num_train_epochs = 20,
    logging_steps = 2,
    save_steps = 0,
    warmup_steps = 16,
    load_best_model_at_end = True,
)
training_args = Seq2SeqTrainingArguments(**training_config)

# Instantiate Seq2Seq Trainer
# NOTE(review): the test split is also the evaluation set that drives early stopping
# and best-model selection here.
trainer = Seq2SeqTrainer(
    model = seq2seq,
    args = training_args,
    train_dataset = training_set,
    eval_dataset = testing_set,
    tokenizer = tokenizer,
    callbacks = [early_stop],
)

# Train Model
trainer.train()

# Testing

In [None]:
# Evaluate on the testing set; the returned value is shown as the cell output
trainer.evaluate(eval_dataset = testing_set)

# Model Save

In [None]:
# Model Save
# Only the weights (state dict) are written; '.pth' is appended to the base path.
model_save_path = '/content/drive/MyDrive/Title Generation/Model Weights/Seq2Seq_state_dict_Bert_Base_10000_Various_Summaries_Beam_6'
weights_file = model_save_path + '.pth'
torch.save(seq2seq.state_dict(), weights_file)

# Model Load

In [None]:
# Model Load (Load Already Finetuned Model)
# map_location lets weights saved on a GPU be restored on a CPU-only runtime.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_load_path = '/content/drive/MyDrive/Title Generation/Model Weights/Seq2Seq_state_dict_Bert_Base_10000_Various_Summaries_Beam_6.pth'
seq2seq.load_state_dict(
    torch.load(model_load_path, map_location = device)
)

<All keys matched successfully>

# Rouge Score Calculation

In [None]:
# Rouge Score Calculation
test_size = 2500
rouge = Rouge()
# Test Size for GPU Constraints; never iterate past the end of the test split
test_index_limit = min(test_size, len(test_data))

# Placeholders for Rouge Scores
rouge_1_f, rouge_1_p, rouge_1_r = [], [], []
rouge_2_f, rouge_2_p, rouge_2_r = [], [], []
rouge_l_f, rouge_l_p, rouge_l_r = [], [], []

# Maps the keys of a rouge result onto the placeholder lists above
score_lists = {'rouge-1': {'f': rouge_1_f, 'p': rouge_1_p, 'r': rouge_1_r},
               'rouge-2': {'f': rouge_2_f, 'p': rouge_2_p, 'r': rouge_2_r},
               'rouge-l': {'f': rouge_l_f, 'p': rouge_l_p, 'r': rouge_l_r}}

# Score recorded when no overlap can be computed for a sample
empty_score = {metric: {'f': 0.0, 'p': 0.0, 'r': 0.0} for metric in score_lists}

# Calculation
for i in range(test_index_limit):

  # Inference
  data = test_data['Abstract'][i]
  # truncation=True caps the input at the tokenizer's maximum model length, so an
  # over-long abstract cannot overflow the encoder's position embeddings
  token_ids = tokenizer.encode(data, add_special_tokens = True, truncation = True)
  # Place the input on whatever device holds the model (works on CPU-only runtimes too)
  input_sentence_ids = torch.tensor(token_ids).unsqueeze(0).to(seq2seq.device)
  generated = seq2seq.generate(input_sentence_ids,
                               max_length = 20,
                               decoder_start_token_id = seq2seq.config.decoder.pad_token_id)
  if (i % 50 == 0):
    print("Inferene Done for Test ID = {}".format(i))
  # Reference and Hypothesis for Rouge Score Calculation
  hypothesis = tokenizer.batch_decode(generated, skip_special_tokens = True)[0]      # Predicted Title
  reference = test_data['Title'][i]                                                  # Reference Title

  # Calculating Rouge Scores
  try:
    score = rouge.get_scores(hypothesis, reference)
  except ValueError:
    # NOTE(review): the rouge package is expected to raise ValueError for an empty
    # hypothesis/reference - confirm against the installed version. Such a sample
    # is counted as zero overlap instead of aborting the whole evaluation.
    score = [empty_score]

  for metric, stat_lists in score_lists.items():
    for stat, values in stat_lists.items():
      values.append(score[0][metric][stat])

# Final Average Rouge Score Calculation
# Average over the samples actually scored (guarding against an empty test split)
n_scored = max(test_index_limit, 1)

rouge_1_f_val = sum(rouge_1_f)/n_scored
rouge_1_p_val = sum(rouge_1_p)/n_scored
rouge_1_r_val = sum(rouge_1_r)/n_scored

rouge_2_f_val = sum(rouge_2_f)/n_scored
rouge_2_p_val = sum(rouge_2_p)/n_scored
rouge_2_r_val = sum(rouge_2_r)/n_scored

rouge_l_f_val = sum(rouge_l_f)/n_scored
rouge_l_p_val = sum(rouge_l_p)/n_scored
rouge_l_r_val = sum(rouge_l_r)/n_scored
print('\n Scores:')
print('Avergae Rouge 1 F Score   : {}'.format(rouge_1_f_val))
print('Avergae Rouge 1 Precision : {}'.format(rouge_1_p_val))
print('Avergae Rouge 1 Recall    : {}'.format(rouge_1_r_val))
print('\n')
print('Avergae Rouge 2 F Score   : {}'.format(rouge_2_f_val))
print('Avergae Rouge 2 Precision : {}'.format(rouge_2_p_val))
print('Avergae Rouge 2 Recall    : {}'.format(rouge_2_r_val))
print('\n')
print('Avergae Rouge L F Score   : {}'.format(rouge_l_f_val))
print('Avergae Rouge L Precision : {}'.format(rouge_l_p_val))
print('Avergae Rouge L Recall    : {}'.format(rouge_l_r_val))

# Inference (Generation)

In [None]:
def inference(model, tokenizer, test_list, test_dataset, maximum_length, show_abstracts = True):
  """Generate a title for each selected test example and print it next to the reference.

  Args:
    model: model exposing `generate`, `device` and `config.decoder.pad_token_id`.
    tokenizer: tokenizer exposing `encode` and `batch_decode`.
    test_list: iterable of index labels to look up in `test_dataset`.
    test_dataset: frame with an 'Abstract' and a 'Title' column.
    maximum_length: `max_length` forwarded to `model.generate`.
    show_abstracts: when truthy, the source abstract is printed as well.

  Returns:
    None; everything is written to stdout.
  """
  for test_index in test_list:
    # Fetch Input and Reference from Test Dataset
    data = test_dataset['Abstract'][test_index]
    reference = test_dataset['Title'][test_index]

    # Inference
    # truncation=True caps the input at the tokenizer's maximum model length, so an
    # over-long abstract cannot overflow the encoder's position embeddings
    token_ids = tokenizer.encode(data, add_special_tokens=True, truncation=True)
    # Follow the model's device instead of assuming CUDA is present
    input_sentence_ids = torch.tensor(token_ids).unsqueeze(0).to(model.device)
    generated = model.generate(input_sentence_ids,
                               max_length = maximum_length,
                               decoder_start_token_id = model.config.decoder.pad_token_id)
    hypothesis = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    print('\nTest ID = {}'.format(test_index))
    if show_abstracts:
      print('\nAbstract:')
      print(data)
    print('\nActual Title:')
    print(reference)
    print('\nPredicted Title:')
    print(hypothesis)

# Test Indices for Inference
test_list = [1, 4, 5, 7, 9, 10,
             12, 20, 22, 100, 134, 200]

# Print reference vs. predicted titles (abstracts hidden), generating at most 24 tokens
inference(model = seq2seq,
          tokenizer = tokenizer,
          test_list = test_list,
          test_dataset = test_data,
          maximum_length = 24,
          show_abstracts = False)