In [None]:
#!/usr/bin/env python
# coding: utf-8

Extractive Summarization: Extractive methods attempt to summarize articles by selecting a subset of words that retain the most important points.
This approach weights the important part of sentences and uses the same to form the summary. 
Different algorithms and techniques are used to define weights for the sentences and further rank them based on importance and similarity among each other (here's an upsupervised learning approach).

In [1]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

import PyPDF2

In [17]:
file = "GS.08.53655_Failure analysis of an adhesive bonded GRE joint.pdf"

pdfFileObj = open(file, 'rb') 
pdfreader = PyPDF2.PdfFileReader(pdfFileObj) 
count = pdfreader.numPages

file_name = ''
for j in range(count):
    page = pdfreader.getPage(j)
    pp = page.extractText()
    file_name = file_name + pp




In [30]:
#all functions defined

def read_article(file_name):
#    filepath = open(file_name, "r")
#    filedata = file_name.readlines()
    article = file_name.split(". ")
    
    sentences = []
    for sentence in article:
#        print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    
    return sentences

def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)
 
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix


def generate_summary(file_name):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)    
#    print("Indices of top ranked_sentence order are ", ranked_sentence)    

#   Pick top 2 sentences for summary
    for i in range(2):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

#   Pick the top-ranked sentence for summary
#    summarize_text.append(" ".join(ranked_sentence[0][1]))

    print("SUMMARIZED TEXT: \n", ". ".join(summarize_text))


In [31]:
generate_summary(file_name)

SUMMARIZED TEXT: 
 Failure analysis of an adhesive bonded  GRE joint GS.08.53655
CONFIDENTIAL
    
 
 
 
   
 Failure analysis of an adhesive bonded GRE joint    
   
 
 
 by  K. The measured thickness of that adhesive layer is in the range of 0.2 
                                                  1  Shell Global Solutions internal sample code UM.08.031/1 
GS.08.53655 
4 CONFIDENTIAL 
  and 0.4 mm
