In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [None]:
from collections import defaultdict

In [None]:
from string import punctuation

In [None]:
from heapq import nlargest

In [None]:
class FrequencySummarizer:
    """Extractive summarizer that ranks sentences by normalized word frequency.

    Words are counted over the whole text (skipping English stopwords and
    punctuation), each count is divided by the highest count, and only words
    whose normalized frequency lies strictly between ``min_cut`` and
    ``max_cut`` are kept. A sentence's score is the sum of the kept
    frequencies of the words it contains.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """Configure the frequency cut-offs and build the stopword set.

        Args:
            min_cut: Words with normalized frequency <= this value are ignored.
            max_cut: Words with normalized frequency >= this value are ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut

        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """Return a mapping of word -> normalized frequency.

        Args:
            word_sent: Iterable of sentences, each an iterable of word tokens.

        Returns:
            A dict holding, for every non-stopword whose normalized frequency
            is strictly between ``min_cut`` and ``max_cut``, its count divided
            by the highest count. Empty when there are no non-stopword tokens.
        """
        counts = defaultdict(int)

        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    counts[word] += 1

        # max() of an empty sequence raises ValueError; nothing to rank here.
        if not counts:
            return {}

        max_freq = float(max(counts.values()))

        # Build a new dict rather than deleting from the one being iterated:
        # removing keys during iteration raises RuntimeError on Python 3.
        freq = {}
        for word, count in counts.items():
            normalized = count / max_freq
            if self._min_cut < normalized < self._max_cut:
                freq[word] = normalized

        return freq

    def summarize(self, text, n):
        """Return up to ``n`` top-scoring sentences of ``text`` joined by '. '.

        Sentences appear in descending score order, not document order. Fewer
        than ``n`` sentences are returned when fewer than ``n`` sentences
        contain at least one kept word.

        Args:
            text: The text to summarize.
            n: Maximum number of sentences to return; must not exceed the
                number of sentences found in ``text``.

        Raises:
            AssertionError: If ``n`` is larger than the number of sentences.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents), "n exceeds the number of sentences in text"

        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)

        # Only sentences containing at least one kept word get an entry.
        ranking = defaultdict(int)

        for i, sent in enumerate(word_sent):
            for word in sent:
                if word in self._freq:
                    ranking[i] += self._freq[word]

        sents_idx = nlargest(n, ranking, key=ranking.get)

        # NOTE(review): sentences presumably keep their own terminal
        # punctuation, so '. ' may yield doubled periods -- confirm before
        # changing the separator, since callers may rely on this output.
        return '. '.join([sents[j] for j in sents_idx])

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
def get_only_text_washingtonpost_url(url, timeout=30):
    """Download a Washington Post article and return its title and body text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the page's ``<title>`` text,
        or an empty string when the page has no ``<title>`` element. ``text``
        is the space-joined paragraph text taken from the ``<article>``
        elements.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    page = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(page, "lxml")

    # This part is specific to the Washington Post: the story is read from
    # the page's <article> elements.
    text = ' '.join(article.text for article in soup.find_all('article'))

    # clean up <p> tags
    soup2 = BeautifulSoup(text, "lxml")
    text = ' '.join(paragraph.text for paragraph in soup2.find_all('p'))

    # soup.title is None when the document has no <title>; avoid AttributeError.
    title = soup.title.text if soup.title is not None else ''
    return title, text

In [None]:
# Article to summarize; adjacent string literals are concatenated into one URL.
someUrl = (
    "https://www.washingtonpost.com/world/national-security/"
    "trump-weighing-military-options-following-chemical-weapons-attack-in-syria/"
    "2017/04/06/0c59603a-1ae8-11e7-9887-1a5314b56a08_story.html"
    "?hpid=hp_hp-top-table-main_syria-315pm%3Ahomepage%2Fstory"
    "&utm_term=.8237939434e2"
)

In [None]:
# Fetch the article; the result is a (title, text) tuple.
textOfUrl = get_only_text_washingtonpost_url(someUrl)

In [None]:
# Summarize with the default cut-offs (min_cut=0.1, max_cut=0.9).
fs = FrequencySummarizer()
# textOfUrl[1] is the article body text; ask for the 3 top-ranked sentences.
summary = fs.summarize(textOfUrl[1], 3)

In [None]:
# Bare expression: shows the summary string as the notebook cell's output.
summary