In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk. corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
            Initialize the text summarizer.
            words that have a frequency term lower than min_cut
            and higher than max_cut will be ignored
        """
        
        self._min_cut=min_cut
        self._max_cut=max_cut
        self._stopwords=set(stopwords.words('english') + list(punctuation))
        
        
    def _compute_freq(self, word_sent):
        """
            Compute the frequency of each word.
            Input:
            word_sent, a list of all sentences already tokenized.
            Output:
            freq, a dictionary where freq[a] is the freq of a.
        """
        
        freq=defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word]+=1
                    
        #freq normalization and filtering
        m=float(max(freq.values()))
        for w in freq.keys():
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq
    
    
    def summarize(self, text, n):
        """
            Return a list of n sentences
            which represent the summary of the text
        """
        
        sents=sent_tokenize(text)   #sent_tokenize already built in
        assert n <= len(sents)     
        word_sent=[word_tokenize(s.lower()) for s in sents]   #making the words lowercase
        self._freq=self._compute_frequencies(word_sent)
        ranking=defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx=self._rank(ranking, n)
        return [sents[j] for j in sents_idx]
    
    def _rank(self, ranking, n):
        """ will return the first n sentences with highest ranking """
        return nlargest(n, ranking, key=ranking.get)
    
                

In [2]:
#The FrequencySummarizer tokenizes the input into sentences then computes the term frequency. The frequency map is filtered to remove words whose frequency is lesser than the min threshold and words whose frequency is greater than the max threshold, that occur frequently but dont carry much information, such as determiners  

In [3]:
#create a function that extract the natural language from a HTML page using BeautifulSoup

!pip install bs4
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_only_text(url):
 """ 
  return the title and the text of the article
  at the specified url
 """
 page = urllib.request.urlopen(url).read().decode('utf8')
 soup = BeautifulSoup(page)
 text = ' '.join(map(lambda p: p.text, soup.find_all('p')))   #all text extracted from the <p> tage
 return soup.title.text, text

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Building wheels for collected packages: bs4
  Running setup.py bdist_wheel for bs4 ... [?25ldone
[?25h  Stored in directory: /Users/shivangisareen/Library/Caches/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [4]:
feed_xml = urllib.request.urlopen('http://feeds.bbci.co.uk/news/rss.xml').read()
feed = BeautifulSoup(feed_xml.decode('utf8'))
to_summarize = map(lambda p: p.text, feed.find_all('guid'))

fs = FrequencySummarizer()
for article_url in to_summarize[:5]:
  title, text = get_only_text(article_url)
  print ('----------------------------------')
  print (title)
  for s in fs.summarize(text, 2):
   print ('*',s)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


TypeError: 'map' object is not subscriptable