In [1]:
from googlesearch import search

In [2]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initilize the text summarizer.
        Words that have a frequency term lower than min_cut 
        or higer than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut 
        self._stopwords = set(stopwords.words('english') + list(punctuation))
    
    def _compute_frequencies(self, word_sent):
        """ 
        Compute the frequency of each of word.
        Input: 
        word_sent, a list of sentences already tokenized.
        Output: 
        freq, a dictionary where freq[w] is the frequency of w.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                      freq[word] += 1
        # frequencies normalization and fitering
        m = float(max(freq.values()))
        keys=list(freq.keys())
        for w in keys:
            freq[w]=freq[w]/m
            if freq[w]>=0.9 or freq[w]<=0.1:
                keys.remove(w)
        freq=dict((k,v) for k,v in freq.items() if k in keys)
        return freq

    def summarize(self, text, n):
        """
        Return a list of n sentences 
        which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)    
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """ return the first n sentences with highest ranking """
        return nlargest(n, ranking, key=ranking.get)

In [3]:
from bs4 import BeautifulSoup
import urllib
import requests

In [4]:
query='seed'

In [20]:
seed_urls=[]
seed_text=[]
for epoch in range(10):
    urls=[]
    for url in search(query,tld='com',start=epoch*100,stop=(epoch+1)*100-1,pause=5.0):
        urls.append(url)
    for url in urls:
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
        
        html=urllib.request.urlopen(url,headers=headers)
        soup=BeautifulSoup(html,'lxml')
        text=' '.join(map(lambda p:p.text,soup.find_all('p')))
        fs=FrequencySummarizer()
        key_text=fs.summarize(text,1)[0]
        if 'seed' in key_text:
            seed_urls.append(url)
            seed_text.append(key_text)

HTTPError: HTTP Error 503: Service Unavailable