### Generate summaries using chunking based BERT_BART method

Set the `dataset` and `output_path` variables in the next cell according to your requirements.  


In [None]:
# Dataset identifier; passed to get_summary_data and get_req_len_dict below.
dataset = "IN" 
# Folder the generated summaries are written to (one file per document name).
output_path = "./IN_BERT_BART/"

In [None]:
import sys
from train_bart import *
sys.path.insert(0, '../')
import transformers
import pandas as pd
import numpy as np
import glob
import nltk
import torch
from utilities import *
import math
import random
import re
import argparse
import os
from summarizer import Summarizer
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig

In [None]:
# Load the "test" split of the selected dataset. The helper's three return
# values are unpacked as document names, source texts and reference summaries.
names, data_source, data_summary = get_summary_data(dataset, "test")

# Sanity check: print the size of each loaded collection, in the order above.
for loaded in (names, data_source, data_summary):
    print(len(loaded))

In [None]:
# Lookup table for the "test" split; it is indexed by document name further
# down to obtain the required summary length of each document.
dict_names = get_req_len_dict(dataset, "test")   

In [None]:
# Load the pretrained facebook/bart-large tokenizer and model. Both objects
# are handed to LitModel.load_from_checkpoint when the fine-tuned checkpoint
# is restored.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

In [None]:
# Build the fine-tuned summariser from a saved checkpoint, passing in the
# tokenizer and base BART model loaded above, and move it to the GPU.
# LitModel is not defined in this notebook (presumably it comes from the star
# imports at the top -- TODO confirm).
# NOTE(review): the checkpoint path is hard-coded and does not follow the
# `dataset` variable; update it when switching datasets.
bart_model = LitModel.load_from_checkpoint("models/BART_large_IN_MCS.ckpt",
                                       learning_rate = 2e-5, tokenizer = tokenizer, model = model).to("cuda")

In [None]:
def generate_summary_gpu(nested_sentences, p=0.2):
    '''
    Function to generate one summary per chunk of a document.

    Relies on the module-level `tokenizer` and `bart_model` objects.

    input:  nested_sentences - iterable of text chunks (strings) of the document
            p - length budget: requested summary length per space-separated
                word of the chunk
    output: list of decoded summaries, in the same order as the chunks
    '''
    device = 'cuda'
    # Moving the model to the device does not depend on the chunk, so do it
    # once here instead of on every loop iteration.
    generator = bart_model.model.to(device)

    summaries = []
    for nested in nested_sentences:
        # Target length for this chunk, derived from its word count.
        l = int(p * len(nested.split(" ")))

        input_tokenized = tokenizer.encode(nested, truncation=True, return_tensors='pt')
        input_tokenized = input_tokenized.to(device)

        summary_ids = generator.generate(input_tokenized,
                                         length_penalty=0.01,
                                         # For short chunks l - 5 is negative;
                                         # clamp so min_length stays valid.
                                         min_length=max(l - 5, 0),
                                         max_length=l + 5)

        # Decode every returned sequence and append directly to the flat
        # result list (no intermediate nested list to flatten afterwards).
        summaries.extend(
            tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for g in summary_ids
        )
    return summaries

In [None]:
# NOTE: this rebinds the global name `model`. Earlier in the notebook it
# referred to the BART model that was passed to LitModel; from here on `model`
# is the Summarizer instance that summ_doc calls.
model = Summarizer()

In [None]:
def summ_doc(doc, req_len):
    '''
    Produce the BERT_BART summary of a single document.

    The document is first shortened with the module-level `model` (called with
    a `ratio` argument) when it has more than 5*1024 space-separated words and
    the requested length is below 5*1024; otherwise it is used unchanged. The
    resulting text is split with `nest_sentences(..., 1024)` and each piece is
    summarised by `generate_summary_gpu`.

    input:  doc - document text
            req_len - required summary length
    output: the chunk summaries joined by single spaces
    '''
    word_budget = 5 * 1024
    ratio = word_budget / len(doc.split(" "))

    # Only pre-shrink documents that exceed the budget, and only when the
    # requested summary is itself shorter than the budget.
    needs_shrinking = ratio < 1 and req_len < word_budget
    condensed = model(doc, ratio=ratio) if needs_shrinking else doc

    chunks = nest_sentences(condensed, 1024)
    # Requested summary length per word of the (possibly shortened) text.
    per_word = float(req_len / len(condensed.split(" ")))

    return " ".join(generate_summary_gpu(chunks, per_word))

In [None]:
import os
# Make sure the output folder exists. exist_ok=True makes this a no-op when
# the folder is already there and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Never appended to in this cell; kept (with the final print) so the cell's
# visible output is unchanged.
output = []

# Names of the summaries already written, so an interrupted run can resume.
# The directory's entries must be listed: globbing the directory path itself
# only returns that path, whose basename is empty, so nothing was ever skipped.
done = set(os.listdir(output_path)) if os.path.isdir(output_path) else set()
print(sorted(done))

for i, doc in enumerate(data_source):
    name = names[i]
    if name in done:
        continue

    input_len = len(doc.split(" "))
    req_len = dict_names[name]
    print(str(i) + ": " + name + " - " + str(input_len) + " : " + str(req_len))

    abs_summ = summ_doc(doc, req_len)

    # One output file per document, named after the document. The context
    # manager closes the file even if the write fails.
    path = os.path.join(output_path, name)
    with open(path, 'w') as out_file:
        out_file.write(abs_summ)

print(output)