# Times of India - Article Summarizer
    Get a summary of any article from the Times of India (timesofindia.indiatimes.com)
NOTE: You can use this for any website; all you have to do is edit the 'soup.find()' call in the 'extractText(url)' function to match the tag that holds the article text on your site.

In [1]:
from bs4 import BeautifulSoup
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation
import urllib

# Get the HTML content for a given 'url', parse it with BeautifulSoup, and return the article text found in the 'arttextxml' tag
def extractText(url):
    """Download a page and return the text of its ``<arttextxml>`` element.

    Args:
        url: Address of the page to fetch; passed straight to
            ``urllib.request.urlopen``.

    Returns:
        The text content of the first ``arttextxml`` element, with every
        newline character removed.

    Raises:
        AttributeError: If the page contains no ``arttextxml`` element.
        Any error raised by ``urllib.request.urlopen`` propagates unchanged.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` has been loaded.
    import urllib.request

    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        # Undecodable bytes are dropped rather than aborting the download.
        page = response.read().decode('utf8', 'ignore')

    soup = BeautifulSoup(page, "lxml")
    article = soup.find('arttextxml')
    if article is None:
        # Same exception type the unguarded ``.text`` access used to raise,
        # but with a message that says what actually went wrong.
        raise AttributeError(
            "No <arttextxml> element found in page: {}".format(url))
    return article.text.replace("\n", "")

In [2]:
# Get the summary (with 'N' lines) for the given 'text'
def summarizer(text, N):
    """Return the ``N`` highest-scoring sentences of ``text`` in document order.

    The text is lower-cased and split into sentences. Each sentence is scored
    by summing, over its tokens, how often that token occurs in the whole
    text; stopwords, punctuation and a few tokenizer artefacts score nothing.

    Args:
        text: The article text to summarise.
        N: Number of sentences wanted in the summary.

    Returns:
        A list of exactly ``N`` lower-cased sentences, ordered as they appear
        in ``text``.

    Raises:
        ValueError: If ``N`` is negative or larger than the number of
            sentences found in ``text``.
    """
    if N < 0:
        # A negative N would otherwise be interpreted as a slice offset and
        # silently return "all but the last |N|" ranked sentences.
        raise ValueError('Number of sentences in summary must not be negative!')

    lowered_text = text.lower()
    # Get the list of sentences from the given text
    sentence_list = sent_tokenize(lowered_text)
    # Check whether text has required number of sentences.
    if N > len(sentence_list):
        raise ValueError('Number of sentences in the articles is less than the required number of sentences in summary!')
    # Get the list of words from the given text
    word_list = word_tokenize(lowered_text)

    # special_words are tokenizer artefacts (possessive "'s" and quote marks)
    # that show up in Times of India content and carry no meaning
    special_words = ["'s", '``', "''"]
    # filter_words has the final set of tokens that must not contribute to a score
    filter_words = set(stopwords.words('english') + list(punctuation) + special_words)
    # nonstopword_list has the tokens that are kept for scoring
    nonstopword_list = [word for word in word_list if word not in filter_words]

    # FreqDist maps each kept token to the number of times it occurs
    word_freq = FreqDist(nonstopword_list)

    # One score per sentence, including sentences that score 0. Scoring every
    # sentence guarantees N sentences can always be returned once the length
    # check above has passed.
    scores = [
        sum(word_freq.get(word, 0) for word in word_tokenize(sentence))
        for sentence in sentence_list
    ]

    # Pick the N best-scoring sentence indices. The sort is stable, so ties
    # are resolved in favour of the earlier sentence.
    top_indices = sorted(
        range(len(sentence_list)), key=scores.__getitem__, reverse=True)[:N]
    # Emit the chosen sentences in their original document order.
    return [sentence_list[index] for index in sorted(top_indices)]

In [3]:
# Example run: summarise one Times of India article in three sentences.
articleURL = "http://timesofindia.indiatimes.com/world/china/china-under-pressure-to-free-ailing-nobel-laureate/articleshow/59574334.cms"
# Fetches the page over the network and extracts the article body text.
text = extractText(articleURL)
# Left as a bare expression so that, as the last statement of the notebook
# cell, the returned list of sentences is shown as the cell output.
summarizer(text, 3)

['  shenyang: china faced sustained international pressure on thursday to let cancer-stricken nobel laureate liu xiaobo seek treatment abroad, as official hospital updates suggest the democracy champion is close to death.',
 'the doctors said liu needed to be on artificial ventilation to be kept alive, but his family declined, according to the first hospital of china medical university in the northeastern city of shenyang.',
 'a german and a us doctor visited liu last weekend and said he was still strong enough to fulfil his wish to travel overseas, but the hospital has issued increasingly pessimistic reports every day since then.']