In [1]:
import urllib.request
from bs4 import BeautifulSoup

In [2]:
articleURL="https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/?utm_term=.da4e0c017d62"

In [14]:
with urllib.request.urlopen(articleURL) as url:
    page = url.read().decode('utf8', 'ignore') # downloads the page
soup = BeautifulSoup(page, "lxml") # takes the html and represents as a tag tree
# soup

In [16]:
# In order to extract text we need to know the sctructure of the tags.
# By convention we know that washington post puts all its text in article tags
ar = soup.find('article') # Finds the first article
# ar

In [17]:
arText = soup.find('article').text # get the text within the article
# arText

In [18]:
# To find and join all the articles in the web page we use the soup.find_all method
text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
#text

In [19]:
# the below lines removes encoded special characters like \xa0
cleanText=text.replace('\xa0',' ')
#cleanText

In [8]:
# now we are compiling all these logic to download and parse text into one singke function
def getTextWaPo(url): # function get text from washington post
    with urllib.request.urlopen(articleURL) as url:
        page = url.read().decode('utf8', 'ignore') # downloads the page
    soup = BeautifulSoup(page, "lxml") # takes the html and represents as a tag tree
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    return text.replace('\xa0',' ')

In [20]:
text=getTextWaPo(articleURL) # down and clear text

In [21]:
# now we process the text , we breakdown the text into sentences and words
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [22]:
sents = sent_tokenize(text) # break article into sentences. 
# Remember that any period should be followed by a space to be considerd a seperate sentence
# sents

In [23]:
word_sent=word_tokenize(text.lower()) # we get the list of words in the entire article
customStopwords = set(stopwords.words('english') + list(punctuation) + ['“', '”', '\'s', '', '’']) # stopwords and punctuations
word_sent = [word for word in word_sent if word not in customStopwords]
# word_sent

In [25]:
# here we are first going  to find the most important words
from nltk.probability import FreqDist # FreqDist is a table with words in one column and number of times 
# the words occur in another table
freq = FreqDist(word_sent)
freq

FreqDist({'space': 15, 'telescope': 9, 'objects': 7, 'debris': 7, 'satellites': 6, 'orbit': 6, 'air': 6, 'force': 6, 'around': 4, 'small': 4, ...})

In [26]:
# Now we will find the most important words
from heapq import nlargest # can be used to sort lists and collections abnd retrun the top n values
nlargest(10, freq, key=freq.get) # no of elements you want to pick, the collection,
# a function which can be used to sort the collection, the function that we spefified get is freq.get which returns 
# the value in the dictionary for the given key
# this is just a demo of the function - we will use it later

['space',
 'telescope',
 'objects',
 'debris',
 'satellites',
 'orbit',
 'air',
 'force',
 'around',
 'small']

In [32]:
# now that we have the most importnat words, we can use that to assign a significance score to each sentence
# in the article
from collections import defaultdict # we will create a dic with key = sentences , value = significance score
# default dict will not throw error if key not find , but will add that key to the dictionary
ranking = defaultdict(int)

for i, sent in enumerate(sents): # enumerate converts [a,b,c] to [(0,a),(1,b),(2,c)] which goes into values i, sent
    for w in word_tokenize(sent):
        if w in freq:
            ranking[i] += freq[w]
ranking


defaultdict(int,
            {0: 27,
             1: 23,
             2: 7,
             3: 34,
             4: 17,
             5: 41,
             6: 65,
             7: 3,
             8: 50,
             9: 24,
             10: 10,
             11: 12,
             12: 15,
             13: 100,
             14: 45,
             15: 45,
             16: 67,
             17: 28,
             18: 87,
             19: 35,
             20: 9,
             21: 21,
             22: 38,
             23: 26})

In [35]:
# we now use the nlargest to pick the top 4 sentences with the largest significance score
sents_idx = nlargest(4, ranking, key=ranking.get)
sents_idx

[13, 18, 16, 6]

In [38]:
# we now pick the sentences which are presnet in this list but sorted in asc order. 
[sents[j] for j in sorted(sents_idx)]

['On Tuesday, the Defense Department took another significant step toward monitoring all of the cosmic junk swirling around in space, by delivering a gigantic new telescope capable of seeing small objects from very far away.',
 "But the telescope's ability to see “something very far away over a very wide area is really what it’s best at.” DARPA says the advanced technology in the massive, 90-ton telescope would allow officials to go from “seeing only a few large objects at a time through the equivalent of a drinking straw to a windshield view with 10,000 objects at a time.” It is also being used by NASA to monitor asteroids and other near-Earth objects that could collide with the planet, officials said.",
 "“That's a critical capability for the U.S. military, as they have a lot of very important satellites in GEO, and are increasingly worried about threats to those satellites.” The telescope would join another new space debris tracking technology known as the Space Fence, which is now 

In [39]:
# so we suimmarize what we have done so far in a function
def summarize(text, n):
    sents = sent_tokenize(text)
    
    assert n <= len(sents)
    word_sent=word_tokenize(text.lower()) # we get the list of words in the entire article
    customStopwords = set(stopwords.words('english') + list(punctuation) + ['“', '”', '\'s', '', '’']) # stopwords and punctuations
    word_sent = [word for word in word_sent if word not in customStopwords]
    # word_sent
    freq = FreqDist(word_sent)
    
    ranking = defaultdict(int)

    for i, sent in enumerate(sents): # enumerate converts [a,b,c] to [(0,a),(1,b),(2,c)] which goes into values i, sent
        for w in word_tokenize(sent):
            if w in freq:
                ranking[i] += freq[w]
    
    sents_idx = nlargest(4, ranking, key=ranking.get)
    return [sents[j] for j in sorted(sents_idx)]

In [40]:
summarize(text,3)

['On Tuesday, the Defense Department took another significant step toward monitoring all of the cosmic junk swirling around in space, by delivering a gigantic new telescope capable of seeing small objects from very far away.',
 "But the telescope's ability to see “something very far away over a very wide area is really what it’s best at.” DARPA says the advanced technology in the massive, 90-ton telescope would allow officials to go from “seeing only a few large objects at a time through the equivalent of a drinking straw to a windshield view with 10,000 objects at a time.” It is also being used by NASA to monitor asteroids and other near-Earth objects that could collide with the planet, officials said.",
 "“That's a critical capability for the U.S. military, as they have a lot of very important satellites in GEO, and are increasingly worried about threats to those satellites.” The telescope would join another new space debris tracking technology known as the Space Fence, which is now 