In [1]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

In [2]:
class FrequencySummarizer:
    def __init__(self, min_cut=0.01, max_cut=0.9):
        """
        Initilize the text summarizer.
        Words that have a frequency term lower than min_cut 
        or higher than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut 
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """ 
        Compute the frequency of each of word.
        Input: 
        word_sent, a list of sentences already tokenized.
        Output: 
        freq, a dictionary where freq[w] is the frequency of w.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # frequencies normalization and filtering
        m = float(max(freq.values()))
        for w in freq.keys():
            freq[w] = freq[w]/m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq
    
    def Text_length (self,text):
        sents= sent_tokenize(text)
        return len(sents)
    
    def summarize(self, text, n):
        """
        Return a list of n sentences
        which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]
    
    def _rank(self, ranking, n):
        """ return the first n sentences with highest ranking """
        return nlargest(n, ranking, key=ranking.get)

In [3]:
import urllib2
from bs4 import BeautifulSoup

def get_only_text(url):
    """ 
    return the title and the text of the article
    at the specified url
     """
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Connection': 'keep-alive'}
    req = urllib2.Request(url, headers=hdr)
    try:
        page = urllib2.urlopen(req).read().decode('utf8')
    except urllib2.HTTPError, e:
        print e.fp.read()
        
    soup = BeautifulSoup(page)
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    
    return soup.title.text, text

In [4]:
#Just a naive approach to extract some summaries from Knowledge Officer articles from the direct link

article_url =[] 

#Engineering
article_url.append('https://redditblog.com/2017/06/30/why-we-chose-typescript/') 
#Startup & Business
article_url.append('http://www.businessinsider.com/how-a-box-of-cereal-and-being-like-a-cockroach-helped-airbnb-become-a-billion-dollar-business-2013-3')
#Product & Design
article_url.append('https://www.techproductmanagement.com/iot-decision-framework/')

# KO Article
# article_url.append('https://www.medium.com/swlh/diligence-at-social-capital-part-2-accounting-for-revenue-growth-551fa07dd972')

fs = FrequencySummarizer()
for article in article_url[:4]:
    title, text = get_only_text(article)
    print '--------------------------------------------------------------'
    print title
    print '--------------------------------------------------------------'
    print "Size of text: " + str(fs.Text_length(text)) + "  Sentences"
    length= fs.Text_length(text)
    sents_retrieved = int((1./5.)* length)
    print "Sentence Retrevied from each text relative to the size: "+ str(sents_retrieved) 
    print "______________________________________________________________________________________________________________"
    for s in fs.summarize(text, sents_retrieved):   #Here we can control the number of the sentences snippets. 
        print '*',s
    print "______________________________________________________________________________________________________________\n"



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


--------------------------------------------------------------
Why We Chose Typescript – Upvoted
--------------------------------------------------------------
Size of text: 49  Sentences
Sentence Retrevied from each text relative to the size: 9
______________________________________________________________________________________________________________
* Typescript also came with a lot of “social proof” and better assurances about its longevity There are several large projects using Typescript (examples include VSCode, Rxjs, Angular, and Typescript itself), so we felt confident that its feature set could support our product goals, and the language would stick around for several years.
* One major difference between Typescript and Flow is that Typescript is a language that compiles down to Javascript, whereas Flow is a set of annotations you can add to existing Javascript that can then be checked for correctness by a tool.
* We picked Typescript because we are confident we could onboa