In [1]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
import nltk
from bs4 import BeautifulSoup
import requests

In [7]:
class FrequencySummarizer:
    """
     Extractive summarizer based on word frequencies.
     Each sentence is scored by summing the normalized frequencies of
     the significant words it contains; the best-scoring sentences
     make up the summary.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
         Initialize the text summarizer.
         Words whose normalized frequency is lower than or equal to
         min_cut, or higher than or equal to max_cut, will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """
          Compute the normalized frequency of each word.
          Input:
           word_sent, a list of sentences already tokenized.
          Output:
           freq, a dictionary where freq[w] is the count of w divided by
           the count of the most frequent non-stopword. Only words with
           min_cut < freq[w] < max_cut are kept. The dictionary is empty
           when word_sent contains no non-stopword token.
        """
        counts = defaultdict(int)
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    counts[word] += 1

        freq = defaultdict(int)
        # Nothing to normalize: max() of an empty sequence would raise.
        if not counts:
            return freq

        # Normalize and filter into a fresh dictionary. Deleting entries
        # from a dictionary while iterating over it raises RuntimeError
        # on Python 3.
        highest = float(max(counts.values()))
        for word, count in counts.items():
            normalized = count / highest
            if self._min_cut < normalized < self._max_cut:
                freq[word] = normalized
        return freq

    def summarize(self, text, n):
        """
          Return a list of at most n sentences
          which represent the summary of text.
          Sentences are returned in decreasing order of score, not in
          their order of appearance. Sentences that contain no
          significant word are never ranked, so fewer than n sentences
          may be returned.
          The word frequencies of the last call are kept in self._freq.
          Raises AssertionError if n exceeds the number of sentences.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents), "n exceeds the number of sentences in text"
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                # Membership test (not indexing) so the defaultdict is
                # not polluted with zero entries for ignored words.
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """ return the keys of the n entries of ranking with the highest score """
        return nlargest(n, ranking, key=ranking.get)

In [3]:
def getTextFromURL(url, timeout=10):
    """
      Download the page at url and return the text of all its <p>
      elements joined by single spaces.
      timeout is the number of seconds passed to requests.get; without
      one, the request can block indefinitely on an unresponsive server.
    """
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")
    return ' '.join(p.text for p in soup.find_all('p'))

In [8]:
def summarizeURL(url, total_pars):
    """
      Fetch the page at url and return a summary made of total_pars
      sentences joined by single spaces.
    """
    raw_text = getTextFromURL(url)
    # Strip two stray characters from the page text (presumably
    # leftovers of mis-decoded text -- confirm against the fetched pages).
    cleaned = raw_text.replace(u"Â", u"").replace(u"â", u"")
    # Flatten line breaks before the text is split into sentences.
    flattened = cleaned.replace("\n"," ")

    summarizer = FrequencySummarizer()
    chosen = summarizer.summarize(flattened, total_pars)
    return " ".join(chosen)

# Alternative: prompt for the URL interactively (raw_input is Python 2 only).
#url = raw_input("Enter a URL\n")
url = 'http://www.theedadvocate.org/?p=2241'
final_summary = summarizeURL(url, 5)
# The parenthesized form prints the same under Python 2 and is valid on
# Python 3, where the bare print statement is a SyntaxError.
print(final_summary)

Email address:     First Name    Last Name      The Edvocate was created in 2014 to argue for shifts in education policy and organization in order to enhance the quality of education and the opportunities for learning afforded to P-20 students in America. There are certainly programs that target urban students when it comes to math, and other STEM, learning but I’d like to think that much of that progress is a direct result of the teachers in the classroom, like Scarlett Childers. Urban improvement President Obama’s Race to the Top initiative emphasizes STEM learning, particularly in mathematics, in order for more students to make it to high school graduation and the college degree beyond it. Signup for The Edvocate Newsletter and have the latest in P-20 education news and opinion delivered to your email address! Signup for The Edvocate Newsletter and have the latest in P-20 education news and opinion delivered to your email address!
