Inspired by the article "Summarizing Lengthy Articles" by Mitesh Dewda on Medium.com

In [49]:
import heapq
import math
import nltk
import numpy
from pdfminer.high_level import extract_text

In [50]:
def extract_text_from_pdf(pdf_path, start_page=None, end_page=None):
    """Extract the text of a PDF document, optionally limited to a page range.

    Args:
        pdf_path: Path of the PDF file; passed straight to pdfminer's
            ``extract_text``.
        start_page: First page to extract (inclusive). It is forwarded as a
            pdfminer page number, which is zero-indexed, so ``0`` selects the
            first page.
        end_page: Last page to extract (inclusive), same numbering.

    Returns:
        Whatever ``extract_text`` returns for the selected pages. The whole
        document is read unless both ``start_page`` and ``end_page`` are given.
    """
    # Compare against None instead of relying on truthiness: 0 is a valid
    # page number and must not silently disable the page range.
    if start_page is None or end_page is None:
        page_numbers = None
    else:
        page_numbers = range(start_page, end_page + 1)
    return extract_text(pdf_path, page_numbers=page_numbers)

In [51]:
def remove_empty_lines(text):
    """Return ``text`` without its blank or whitespace-only lines.

    The surviving lines keep their own leading/trailing whitespace and are
    re-joined with a single newline character.
    """
    kept_lines = []
    for row in text.splitlines():
        if not row.strip():
            # Nothing but whitespace on this line: drop it.
            continue
        kept_lines.append(row)
    return '\n'.join(kept_lines)

In [None]:
# Step 1: Breaking the text into sentences.
# The PDF is read without a page range, blank lines are dropped, and NLTK's
# sentence tokenizer splits the remaining text.
input_content = remove_empty_lines(extract_text_from_pdf('../data/CACM\'18_Search-based_Program_Synthesis.pdf'))
tokenized_sentences = nltk.sent_tokenize(input_content)

# Print every sentence with its index so the split can be inspected.
for i, sentence in enumerate(tokenized_sentences):
    print(i, sentence)

In [None]:
from nltk import word_tokenize

# Strip punctuation: keep only the runs of word characters found in each
# sentence and glue them back together with single spaces.
tokenizer = nltk.RegexpTokenizer(r"\w+")
sentences_formatted = [
    ' '.join(tokenizer.tokenize(raw_sentence)) for raw_sentence in tokenized_sentences
]

# Token count of the whole cleaned text; the later steps divide by it.
total_words_in_content = len(word_tokenize(' '.join(sentences_formatted)))
print(f"total words in content {total_words_in_content}")

In [54]:
# Step 2: Checking title similarity
# For every sentence: number of distinct (lower-cased) words it shares with
# the title, divided by the total word count of the document.
title = 'Search-based Program Synthesis'

# Tokenize the title with the same \w+ tokenizer that produced
# sentences_formatted. A plain split() would keep "search-based" as one word,
# which can never match because the sentences no longer contain hyphens.
# The set is built once here instead of once per sentence.
title_words = {word.lower() for word in tokenizer.tokenize(title)}

list_of_title_similarity = [
    len({word.lower() for word in sentence.split()} & title_words) / total_words_in_content
    for sentence in sentences_formatted
]

In [55]:
# Step 3: Calculating the term weight, i.e. how frequent (and therefore how
# important) the terms of each sentence are within the whole document.
from collections import Counter

# Tokenize each sentence once and count whole-word occurrences up front.
# Counting with input_content.count(word) would count substrings (e.g. "a"
# inside every other word) and rescan the full text for every token.
tokens_per_sentence = [word_tokenize(sentence) for sentence in sentences_formatted]
word_frequencies = dict(Counter(word for tokens in tokens_per_sentence for word in tokens))

term_weight_list = []
for tokens in tokens_per_sentence:
    sum_of_term_weight = 0  # sum of term weight for each sentence

    # Each word contributes its relative frequency in the document.
    # NOTE(review): the per-word rounding to 2 decimals is kept from the
    # original; it zeroes out every word rarer than 0.5% of the text --
    # confirm that this is intended.
    for word in tokens:
        sum_of_term_weight += round(word_frequencies[word] / total_words_in_content, 2)

    term_weight_list.append(sum_of_term_weight)

In [None]:
# Step 4: POS tagging - identifying the part of speech of every word in a
# sentence and tagging each word accordingly.
nltk.download('universal_tagset')
words = [word_tokenize(sentence) for sentence in sentences_formatted]

# Tag all sentences in a single call rather than calling nltk.pos_tag once
# per sentence; the result is still one list of (word, tag) pairs per sentence.
pos_tags = nltk.pos_tag_sents(words, tagset="universal")

# For each sentence, keep only the words tagged as nouns.
noun_word_list = [
    [token for token, tag in tagged_sentence if tag == 'NOUN']
    for tagged_sentence in pos_tags
]

In [57]:
# Step 5: Generating feature matrix
# Print floats without scientific notation when the matrix is displayed.
numpy.set_printoptions(suppress=True)

# One row per feature (title similarity, term weight, nouns); dtype=object
# because the noun entries are lists rather than numbers.
feature_rows = [list_of_title_similarity, term_weight_list, noun_word_list]
feature_matrix = numpy.array(feature_rows, dtype=object)

# Flip to one row per sentence, one column per feature.
final_matrix = feature_matrix.T

In [58]:
# Step 6: Generating a dictionary that maps each sentence to its features
# (identical sentences share one entry; the last occurrence's features win).
sentence_feature_dict = {
    text: final_matrix[row] for row, text in enumerate(sentences_formatted)
}

# Score every sentence by adding up its float-valued features; anything that
# is not a float (such as the noun list) is left out of the total.
sentence_features_count = []
sentence_rank_dict = {}
for text, features in sentence_feature_dict.items():
    score = 0.0
    for feature in features:
        if not isinstance(feature, float):
            continue
        score += feature
    sentence_rank_dict[text] = score
    sentence_features_count.append(score)

In [None]:
# Step 7: Ranking sentences and picking the top-ranked fraction of them.
# NOTE(review): this step was previously described as "top 30%", but the code
# multiplies by 0.03, i.e. keeps the top 3% (rounded up). The runtime value is
# left unchanged here -- confirm which ratio is intended.
SUMMARY_RATIO = 0.03
sentences_count = math.ceil(len(sentence_rank_dict) * SUMMARY_RATIO)
sentence_rank_key_list = list(sentence_rank_dict)


def sort_key(key):
    """Return the rank score stored in ``sentence_rank_dict`` for ``key``."""
    return sentence_rank_dict[key]


# The sentences_count highest-scoring sentences, best first.
top_sentences = heapq.nlargest(sentences_count, sentence_rank_key_list, key=sort_key)

for sentence in top_sentences:
    print(sentence + '\n')