In [1]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
  def __init__(self, min_cut=0.1, max_cut=0.9):
    """
     Initilize the text summarizer.
     Words that have a frequency term lower than min_cut 
     or higer than max_cut will be ignored.
    """
    self._min_cut = min_cut
    self._max_cut = max_cut 
    self._stopwords = set(stopwords.words('english') + list(punctuation))

  def _compute_frequencies(self, word_sent):
    """ 
      Compute the frequency of each of word.
      Input: 
       word_sent, a list of sentences already tokenized.
      Output: 
       freq, a dictionary where freq[w] is the frequency of w.
    """
    freq = defaultdict(int)
    for s in word_sent:
      for word in s:
        if word not in self._stopwords:
          freq[word] += 1
    # frequencies normalization and fitering
    m = float(max(freq.values()))
    for w in freq.keys():
      freq[w] = freq[w]/m
      if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
        del freq[w]
    return freq

  def summarize(self, text, n):
    """
      Return a list of n sentences 
      which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i,sent in enumerate(word_sent):
      for w in sent:
        if w in self._freq:
          ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)    
    return [sents[j] for j in sents_idx]

  def _rank(self, ranking, n):
    """ return the first n sentences with highest ranking """
    return nlargest(n, ranking, key=ranking.get)

In [63]:
import urllib3
from bs4 import BeautifulSoup
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def get_only_text(url):
    """ 
    return the title and the text of the article
    at the specified url
    """
#     print('######'+url)
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    #  page = urllib2.urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(response.data,"lxml")
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return soup.title.text, str(text.encode('ascii', errors='replace')).replace("?"," ")

In [59]:
articleURL = "https://timesofindia.indiatimes.com/Now-just-pop-a-pill-to-stop-blood-clots/articleshow/4905951.cms"
articleURL1='https://www.washingtonpost.com/world/2018/09/04/with-india-meet-trump-administration-seeks-build-ties-not-damage-them/'

In [64]:
title,txt = get_only_text(articleURL)

In [65]:
txt

'b"We\'ve detected your location as \'Delhi NCR\'. Do you want to switch \\n            Accurate city detection helps us serve more relevant content. Unconventional ways to make your marriage stronger My girlfriend meets her ex-boyfriend without my knowledge Your love horoscope for the month of September Thank you, Bridget. You helped me overcome my awkward, obese teen years My best friend flirts with my boyfriend Signs of a loveless marriage and how to make it work The age at which your self esteem is the highest! Yes, you GAIN weight during periods. Know why! 9 of the weirdest things doctors have removed from a patient\'s body Attention Parents: Don\'t let your kids play THIS sports before they turn 12! The IDEAL number of times you should have sex in a week 10-minute full body, no-gear workout Who wore what at Shweta Bachchan\'s party Kareena Kapoor\'s new bikini photo is too hot! Five ways to re-use your wedding lehenga Every Bollywood celeb is wearing these shoes Five lehenga styl

In [70]:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup

http = urllib3.PoolManager()

url = 'http://feeds.bbci.co.uk/news/rss.xml'
response = http.request('GET', url)
# feed_xml = urllib2.urlopen('http://feeds.bbci.co.uk/news/rss.xml').read()
feed = BeautifulSoup(response.data, "lxml")
# feed = BeautifulSoup(feed_xml.decode('utf8'))
to_summarize = list(map(lambda p: p.text, feed.find_all('guid')))
fs = FrequencySummarizer()
for article_url in to_summarize[:5]:
  title, text = get_only_text(article_url)
  print('----------------------------------')
  print (title)
  for s in fs.summarize(text, 2):
    print ('*',s)

----------------------------------
Jair Bolsonaro, Brazil's presidential front-runner, stabbed at rally - BBC News


RuntimeError: dictionary changed size during iteration

In [None]:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup

http = urllib3.PoolManager()

url = 'https://timesofindia.indiatimes.com/rssfeeds/3908999.cms'
response = http.request('GET', url)
# feed_xml = urllib2.urlopen('http://feeds.bbci.co.uk/news/rss.xml').read()
feed = BeautifulSoup(response.data, "lxml")
# feed = BeautifulSoup(feed_xml.decode('utf8'))
to_summarize = list(map(lambda p: p.text, feed.find_all('guid')))
fs = FrequencySummarizer()
for article_url in to_summarize[:5]:
  title, text = get_only_text(article_url)
  print('----------------------------------')
  print (title)
  print(text)
#   for s in summarize(text, 2):
#    print ('*',s)

In [55]:
text

'b"We\'ve detected your location as \'Delhi NCR\'. Do you want to switch \\n            Accurate city detection helps us serve more relevant content. Japanese children are the happiest in the world Unconventional ways to make your marriage stronger My girlfriend meets her ex-boyfriend without my knowledge Your love horoscope for the month of September Thank you, Bridget. You helped me overcome my awkward, obese teen years My best friend flirts with my boyfriend Yes, you GAIN weight during periods. Know why! The age at which your self esteem is the highest! Your bed tea is damaging your body! Know why 9 of the weirdest things doctors have removed from a patient\'s body Attention Parents: Don\'t let your kids play THIS sports before they turn 12! The IDEAL number of times you should have sex in a week How to wear a maxi dress like Sonam Kapoor Who wore what at Shweta Bachchan\'s party Kareena Kapoor\'s new bikini photo is too hot! Five ways to re-use your wedding lehenga Every Bollywood 