In [1]:
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx


def generate_sentences(filename):
    """Read a UTF-8 text file and split it into tokenized sentences.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per sentence found by ``nltk.sent_tokenize``;
        each entry is the list of strings obtained by splitting that
        sentence on single space characters.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, encoding="utf8") as handle:
        rawtext = handle.read()

    # NOTE(review): str.replace matches its first argument literally, so this
    # call only removes the exact text "[^a-zA-Z]" and does not strip
    # punctuation or digits. It is kept unchanged because the printed summary
    # is rebuilt from these tokens; confirm whether regex cleaning was meant.
    return [
        sentence.replace("[^a-zA-Z]", " ").split(" ")
        for sentence in nltk.sent_tokenize(rawtext)
    ]
    
def generate_similarity(sent1, sent2, stopwords=None):
    """Return the cosine distance between two tokenized sentences.

    Despite the function name, the value returned is a *distance* as
    computed by ``cosine_distance``, not a similarity score.

    Args:
        sent1: First sentence as a list of word strings.
        sent2: Second sentence as a list of word strings.
        stopwords: Optional collection of lower-case words to ignore when
            counting. Defaults to no stopwords.

    Returns:
        The cosine distance between the word-count vectors of both
        sentences. When either sentence has no countable words (empty, or
        stopwords only) the vectors cannot be normalised, so 1.0 is
        returned, the same value produced for sentences sharing no words.
    """
    if stopwords is None:
        stopwords = []
    # Set membership is O(1); the stopword list is scanned only once here.
    ignored = set(stopwords)

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))
    # Map each word to its vector slot so lookups avoid a linear list.index.
    position = {word: idx for idx, word in enumerate(all_words)}

    # Both vectors start at zero and count occurrences with the same step so
    # they stay directly comparable.
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # Build the count vector for the first sentence.
    for w in sent1:
        if w not in ignored:
            vector1[position[w]] += 1

    # Build the count vector for the second sentence.
    for w in sent2:
        if w not in ignored:
            vector2[position[w]] += 1

    # A zero vector has no direction: the cosine formula would divide 0 by 0.
    if not any(vector1) or not any(vector2):
        return 1.0

    return cosine_distance(vector1, vector2)
 
def generate_similarity_matrix(sentences, stop_words):
    """Build the square matrix of pairwise sentence scores.

    Args:
        sentences: Sequence of tokenized sentences (lists of words).
        stop_words: Words passed through to ``generate_similarity`` to be
            ignored during comparison.

    Returns:
        A ``len(sentences)`` x ``len(sentences)`` NumPy array in which cell
        ``[i][j]`` holds ``generate_similarity(sentences[i], sentences[j],
        stop_words)`` for ``i != j``; the diagonal is left at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                matrix[row][col] = generate_similarity(first, second, stop_words)

    return matrix


def generate_summarized_text(file_name, num_sent=3):
    """Print an extractive summary of a text file.

    The file is split into sentences, every pair of sentences is scored,
    the score matrix is turned into a weighted graph, and PageRank is run
    on that graph to rank the sentences.

    Args:
        file_name: Path of the text file to summarize.
        num_sent: Maximum number of sentences to include in the summary.
            When the file has fewer sentences, all of them are used.

    Returns:
        None. The summary is written to standard output.
    """
    stop_words = stopwords.words('english')

    # Sentences as lists of words, one list per sentence.
    sentences = generate_sentences(file_name)

    # Pairwise score for every combination of two sentences.
    similarity_matrix = generate_similarity_matrix(sentences, stop_words)

    # Rank the sentences by running PageRank over the weighted graph.
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)

    # The matrix entries come from generate_similarity, which returns a
    # cosine distance, so the sentences are taken in ascending score order.
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=False
    )

    # Slicing (instead of indexing 0..num_sent-1) avoids an IndexError when
    # fewer sentences are available; max() keeps a negative count empty.
    summarize_text = [
        " ".join(words) for _, words in ranked_sentences[:max(num_sent, 0)]
    ]

    # Printing the summary. Sentences keep their own closing punctuation, so
    # they are joined with a space only; joining with ". " doubled the period.
    print("Document Summary: \n")
    print(" ".join(summarize_text))

# Script entry point: summarize the local text file 'test_sen_len.txt' and
# print at most 2 sentences. Change the path or the count here as needed.
generate_summarized_text('test_sen_len.txt', 2)

Document Summary: 

[9] Due to the collapse of the 4-story Chuzon Supermarket, the Department of Interior and Local Government had to suspend all business permits of Chuzon Supermarket and its branches, as well as to conduct an investigation regarding the collapse of the 4-storey commercial establishment, which was built 4 years ago.. [13] PHIVOLCS added that the earthquake would not trigger an eruption of Pinatubo, stating that the volcano's magma supply has not sufficiently replenished since 1991 to allow for another eruption.
