In [27]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
import urllib.request
from bs4 import BeautifulSoup

class FrequencySummarizer:
    """Extractive summarizer that ranks sentences by normalized word frequency.

    Words are counted across the whole text (ignoring stopwords and
    punctuation), each count is divided by the highest count, and words whose
    normalized frequency falls outside the open interval
    ``(min_cut, max_cut)`` are discarded. A sentence's score is the sum of the
    normalized frequencies of the retained words it contains.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """Configure the frequency cut-offs and build the stopword set.

        Args:
            min_cut: Words with normalized frequency <= this value are dropped.
            max_cut: Words with normalized frequency >= this value are dropped.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        # NLTK's English stopword list plus every ASCII punctuation character.
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """Return the normalized frequency of each significant word.

        Args:
            word_sent: An iterable of sentences, each an iterable of word
                tokens.

        Returns:
            A ``defaultdict(int)`` mapping word -> count / highest count, with
            only the words whose normalized frequency lies strictly between
            ``min_cut`` and ``max_cut``. Empty when no non-stopword token is
            present.
        """
        counts = defaultdict(int)

        # Count every token that is neither a stopword nor punctuation.
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    counts[word] += 1

        # With nothing to count, max() below would raise ValueError on an
        # empty sequence; there are simply no frequencies to report.
        if not counts:
            return counts

        highest = float(max(counts.values()))

        # Normalize against the most frequent word and keep only the words
        # inside the configured band.
        freq = defaultdict(int)
        for word, count in counts.items():
            normalized = count / highest
            if self._min_cut < normalized < self._max_cut:
                freq[word] = normalized

        return freq

    def summarize(self, text, n):
        """Return up to ``n`` of the highest-scoring sentences of ``text``.

        The text is split into sentences, each sentence is lower-cased and
        tokenized, and the computed frequency table is stored on
        ``self._freq``.

        Args:
            text: The document to summarize.
            n: The number of sentences requested; must not exceed the number
                of sentences found in ``text``.

        Returns:
            A list of original sentences ordered from highest to lowest score.
            Sentences that contain no retained word are never selected, so the
            list can be shorter than ``n``.

        Raises:
            AssertionError: If ``n`` exceeds the number of sentences (only
                when assertions are enabled).
        """
        sentences = sent_tokenize(text)
        assert n <= len(sentences)

        words_in_sentences = [word_tokenize(s.lower()) for s in sentences]

        self._freq = self._compute_frequencies(words_in_sentences)

        # Score each sentence (by index) as the sum of its words' frequencies.
        ranking = defaultdict(int)
        for i, sentence in enumerate(words_in_sentences):
            for w in sentence:
                if w in self._freq:
                    ranking[i] += self._freq[w]

        sentence_indexes = nlargest(n, ranking, key=ranking.get)
        return [sentences[j] for j in sentence_indexes]

def get_text_from_paragraphs(article):
    """Return the text of every ``<p>`` found under ``article``, space-joined."""
    return ' '.join(paragraph.text for paragraph in article.find_all('p'))

def get_only_text_washington_post_url(url):
    """Download an article page and return its title and body text.

    The page is fetched, decoded as UTF-8 and parsed with the ``html5lib``
    parser. The body text is the space-joined paragraph text of every element
    carrying ``itemprop="articleBody"``.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the text of the page's
        ``<title>`` element, or ``''`` when the page has none.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the payload has been read, even if decoding fails.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('utf8')

    soup = BeautifulSoup(page, "html5lib")

    # Article content is located via the itemprop="articleBody" attribute
    # rather than <article> tags.
    bodies = soup.find_all(attrs={"itemprop": "articleBody"})
    text = ' '.join(map(get_text_from_paragraphs, bodies))

    # soup.title is None for pages without a <title>; avoid AttributeError.
    title = soup.title.text if soup.title is not None else ''

    return title, text

# Article to summarize.
someUrl = (
    "https://www.networkworld.com/article/3220437/internet-of-things/"
    "ai-and-iot-like-peanut-butter-and-chocolate.html"
)

# Fetch the page; the result is a (title, body text) tuple.
textOfUrl = get_only_text_washington_post_url(someUrl)

# Summarize the body text (index 1 of the tuple) into three sentences.
fs = FrequencySummarizer()
summary = fs.summarize(text=textOfUrl[1], n=3)

print(summary)

['“Specifically, it intends to focus on services incorporating cutting-edge technology, including AI and IoT.” Beyond the hype they share, combining IoT and AI can make a lot of sense.', 'In Japan, the AI/IoT combination is so hot that Japanese tech giant Fujitsu is reportedly dumping its mobile phone business (and mobile is probably the moment’s third-hottest trend) to focus on the intersection of AI and IoT.', 'The goal, per the company’s Watson IoT website, is to marry cognitive computing (the Watson AI platform) to vast arrays of IoT sensors.']
