In [70]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
    """
    Extractive summarizer: scores each sentence by the normalized
    frequency of the non-stop-words it contains and returns the
    highest-scoring sentences.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initialize the text summarizer.
        Words whose normalized frequency is less than or equal to min_cut,
        or greater than or equal to max_cut, are ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """
        Compute the normalized frequency of each word.
        Input:
        word_sent, a list of sentences already tokenized.
        Output:
        freq, a dictionary where freq[w] is the count of w divided by the
        count of the most frequent word. Stop words and words whose
        normalized frequency falls on or outside (min_cut, max_cut) are
        left out. An empty dictionary is returned when no word is counted.
        """
        counts = defaultdict(int)
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    counts[word] += 1
        # Nothing to normalize: max() on an empty sequence would raise.
        if not counts:
            return {}
        peak = float(max(counts.values()))
        # Build a new dictionary instead of deleting from a list that is being
        # iterated; removing during iteration skips the following element,
        # which then keeps its raw, un-normalized count.
        freq = {}
        for word, count in counts.items():
            score = count / peak
            if self._min_cut < score < self._max_cut:
                freq[word] = score
        return freq

    def summarize(self, text, n):
        """
        Return a list of at most n sentences which represent the summary
        of text, ordered from highest to lowest score.
        Sentences that contain no retained word receive no score and are
        never selected, so fewer than n sentences may be returned.
        Fails an assertion when n exceeds the number of sentences in text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents), "n must not exceed the number of sentences in text"
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        # ranking maps a sentence index to the sum of its words' frequencies.
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """ Return the n keys of ranking with the highest values, best first. """
        return nlargest(n, ranking, key=ranking.get)

In [71]:
from bs4 import BeautifulSoup
import urllib

In [75]:
# `import urllib` alone does not load the `request` submodule, so import it
# explicitly here to keep this cell runnable on a fresh kernel.
import urllib.request

url='https://stackoverflow.com/questions/38635419/searching-in-google-with-python'
# Parse inside the `with` block so the HTTP response is closed as soon as the
# page has been read. `html` stays bound to the (now closed) response object.
with urllib.request.urlopen(url) as html:
    soup=BeautifulSoup(html,'lxml')

In [76]:
# Concatenate the text of every <p> element of the parsed page, space-separated.
text = ' '.join(paragraph.text for paragraph in soup.find_all('p'))

In [77]:
# Summarize the scraped text with the default cut-offs, asking for two sentences.
fs = FrequencySummarizer()
fs.summarize(text, n=2)

['Or I did work out with selenium web driver and it works great if used with Firefox or chrome or Phantom web browser, but still I felt it was a bit slow in terms of execution time, as it queried browser first and then returned search result.',
 'asked 1 year, 7 months ago viewed \n18,264 times\n active 14 days ago \r\nsite design / logo © 2018 Stack Exchange Inc; user contributions licensed under cc by-sa 3.0\r\n                            with attribution required.']

In [11]:
# Download the NLTK 'stopwords' package.
# NOTE(review): this cell sits after cells that already call
# `stopwords.words('english')`; on a fresh kernel it presumably has to run
# before them — confirm and move it to the top if so.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rhu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [52]:
# Split the text into sentences, then lower-case each sentence and split it into word tokens.
sents = sent_tokenize(text)
word_sent = list(map(word_tokenize, map(str.lower, sents)))

In [53]:
# Build the exclusion set once; the original rebuilt it for every token.
excluded = set(stopwords.words('english') + list(punctuation))

# Count every token that is neither a stop word nor a punctuation mark.
freq = defaultdict(int)
for s in word_sent:
    for word in s:
        if word not in excluded:
            freq[word] += 1

# Normalize by the highest count (default avoids an error when nothing was counted).
m = float(max(freq.values(), default=1))
for w in list(freq):
    freq[w] = freq[w] / m

# Select the keys to keep in a separate pass. Removing items from `keys` while
# looping over it skipped the element after each removal, leaving that word
# un-normalized and unfiltered.
keys = [w for w in freq if 0.1 < freq[w] < 0.9]
len(freq)

16

In [54]:
# Drop every word that is not listed in `keys`, then show how many remain.
freq = {word: score for word, score in freq.items() if word in keys}
len(freq)

11

In [55]:
# Score each sentence by summing the frequencies of the retained words it
# contains; sentences without any retained word get no entry at all.
ranking = defaultdict(int)
for idx, tokens in enumerate(word_sent):
    scores = [freq[tok] for tok in tokens if tok in freq]
    if scores:
        ranking[idx] = sum(scores)

In [36]:
# Rebind `keys` to the key view of `freq`.
# NOTE(review): this cell's execution count (36) is out of sequence with its
# neighbours and it overwrites the `keys` list used by the filtering cell —
# looks like leftover exploration; confirm whether it is still needed.
keys=freq.keys()
# list(keys)  # uncomment to display the keys as a list

In [56]:
# Indices of the two highest-scoring sentences, best first.
sents_idx = sorted(ranking, key=ranking.get, reverse=True)[:2]

In [57]:
# Look up the original sentence text for each selected index.
list(map(sents.__getitem__, sents_idx))

['\n\nPost\n\nby Bosvark » Thu Nov 13, 2014 2:38 pm\n\t\t\t \n\nPost\n\nby Decker_MMIV » Thu Nov 13, 2014 5:43 pm\n\t\t\t \n\nReturn to “[LS15]English-Forum”']

In [80]:
# Show the text of the <title> element of the page currently held in `soup`.
soup.title.text

'Searching in Google with Python - Stack Overflow'

In [106]:
# `import urllib` alone does not load the `request` submodule, so import it
# explicitly here to keep this cell runnable on a fresh kernel.
import urllib.request

url='https://www.pinterest.com/pin/552535448014491439/'
# Parse inside the `with` block so the HTTP response is closed as soon as the
# page has been read. `html` stays bound to the (now closed) response object.
with urllib.request.urlopen(url) as html:
    soup=BeautifulSoup(html,'lxml')
# Concatenate the text of every <p> element, space-separated.
text=' '.join(p.text for p in soup.find_all('p'))

In [108]:
# Display the paragraph text scraped by the previous cell.
text

"I have dill growing in all of my flower beds! <em>Calamintha nepeta</em> An important aromatic and antiseptic herb in native North American medicine, the light brownish flowers of Western mugwort are bourne in panicles in late summer and the leaves act as a good deodorant in shoes. It thrives in sandy soil. Golden Oregano is milder than Italian or Greek oregano, but what it lacks in flavor it makes up in beauty: It's an excellent groundcover with small pink or purple flowers in summer. <em>Tanacetum balsamita</em> Alchemilla alpina It is easy to grow and compact in form, but it self-seeds and hybridizes readily. Height: inches Spread: 20 inches Hardiness: Fully hardy plants Soil Preference: Moist soil Sun or Shade: Full sun, partial or dappled shade Check out our complete guide to herbs from Ac to He."

In [111]:
# Summarize the scraped text with the default cut-offs, asking for five sentences.
fs = FrequencySummarizer()
fs.summarize(text, n=5)

['<em>Calamintha nepeta</em> An important aromatic and antiseptic herb in native North American medicine, the light brownish flowers of Western mugwort are bourne in panicles in late summer and the leaves act as a good deodorant in shoes.',
 'Height: inches Spread: 20 inches Hardiness: Fully hardy plants Soil Preference: Moist soil Sun or Shade: Full sun, partial or dappled shade Check out our complete guide to herbs from Ac to He.',
 "Golden Oregano is milder than Italian or Greek oregano, but what it lacks in flavor it makes up in beauty: It's an excellent groundcover with small pink or purple flowers in summer.",
 '<em>Tanacetum balsamita</em> Alchemilla alpina It is easy to grow and compact in form, but it self-seeds and hybridizes readily.',
 'I have dill growing in all of my flower beds!']