In [15]:
import nltk
from nltk.corpus import stopwords
import numpy as np
from nltk.cluster.util import cosine_distance
import re
import networkx as nx
import spacy
nlp = spacy.load('en_core_web_sm')

In [35]:
def read_article(file_name):
    file = open(file_name,'r')
    file_data = file.read()
    file_doc = nlp(file_data)
    sentences = list(file_doc.sents)
    return [re.sub(r'[^a-zA-z\s]', '', str(sent)).strip() for sent in sentences]

In [46]:
lst = read_article('../All Data_Files/wiki_us.txt')
lst[:5]

['The United States of America USA or USA commonly known as the United States US or US or America is a country primarily located in North America',
 'It consists of  states a federal district five major unincorporated territories  Indian reservations and some minor possessions[j]',
 'At  million square miles  million square kilometers it is the worlds third or fourthlargest country by total area[d]',
 'The United States shares significant land borders with Canada to the north and Mexico to the south as well as limited maritime borders with the Bahamas Cuba and Russia[] With a population of more than  million people it is the third most populous country in the world',
 'The national capital is Washington DC and the most populous city is New York']

In [37]:
def sentence_similarity(sent1,sent2,stopwords=None):
    
    if stopwords is None:
        stopwords = []
        
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1+sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
        
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

In [24]:
def similarity_matrix(sentences, stopwords):
    N = len(sentences)
    similarity_matrix = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            similarity_matrix[i,j] = sentence_similarity(
                                        sentences[i],sentences[j],stopwords)
    return similarity_matrix

In [38]:
def get_summary(file_name,top_n=5):
    
    stop_words = stopwords.words('english')
    summarize_text = []
    
    sentences = read_article(file_name)
    sent_sim_mat = similarity_matrix(sentences, stop_words)
    sent_sim_graph = nx.from_numpy_array(sent_sim_mat)
    scores = nx.pagerank(sent_sim_graph)
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)),reverse=True)
    
    for i in range(top_n):
        summarize_text.append(''.join(ranked_sentences[i][1]))
    print('Summary: \n','. '.join(summarize_text),sep='')

In [42]:
get_summary('../All Data_Files/wiki_us.txt',7)

Summary: 
The country ranks high in international measures of economic freedom quality of life education and human rights and has low levels of perceived corruption. During the Cold War the United States fought the Korean War and the Vietnam War but avoided direct military conflict with the Soviet Union. Slavery was legal in the southern United States until the second half of the th century when the American Civil War led to its abolition. The United States is a highly developed country accounts for approximately a quarter of global GDP and is the worlds largest economy. PaleoIndians migrated from Siberia to the North American mainland at least  years ago and European colonization began in the th century. In the late th century the US began expanding across North America gradually obtaining new territories sometimes through war frequently displacing Native Americans and admitting new states by  the United States spanned the continent. The Soviet Unions dissolution in  ended the Cold Wa