# Indian News Summarizer 

#### Importing Libraries Summarizer

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict
import os,math

#### Summary Extraction Step Functions

In [None]:
def read_file(path):
    try:
        with open(path, 'r',encoding='utf-8') as file:
            return file.read()
    except IOError as e:
        print(e)
        print("Fatal Error: File ({}) could not be located or is not readable.".format(path))
        
def sanitize_input(data):
    replace = {
        ord('\f') : ' ', 
        ord('\t') : ' ',
        ord('\n') : ' ',
        ord('\r') : None
    }
    return data.translate(replace)

def tokenize_content(content):
    stop_words = set(stopwords.words('english') + list(punctuation))
    words = word_tokenize(content.lower())
    return [sent_tokenize(content), [word for word in words if word not in stop_words] ]

def score_tokens(filterd_words, sentence_tokens):
    word_freq = FreqDist(filterd_words)
    ranking = defaultdict(int)
    for i, sentence in enumerate(sentence_tokens):
        for word in word_tokenize(sentence.lower()):
            if word in word_freq:
                ranking[i] += word_freq[word]
    return ranking

def summarize(ranks, sentences, length):
    if int(length) > len(sentences): 
        print("Error, more sentences requested than available. Use --l (--length) flag to adjust.")
        exit()

    indexes = nlargest(length, ranks, key=ranks.get)
    final_sentences = [sentences[j] for j in sorted(indexes)]
    return ' '.join(final_sentences) 

####  Extract Summary From File / Content

In [None]:
def SummaryFromContent(content,length=None):
    content = sanitize_input(content)
    sentence_tokens, word_tokens = tokenize_content(content)  
    sentence_ranks = score_tokens(word_tokens, sentence_tokens)
    if not length:
        length = int (math.sqrt(len(sentence_ranks))+1)
    summary =  summarize(sentence_ranks, sentence_tokens, length)
    return summary
def SummaryFromFile(filepath,length=None):
    content = read_file(filepath)
    return SummaryFromContent(content,length)

##### Test Summary 

In [None]:
filepath = os.getcwd()+'\\News.txt'
length = 4
SummaryFromFile(filepath)

## Extracting Indian Express

#### Installing Web Extraction libraries

In [None]:
import bs4
import urllib3
import csv
urllib3.disable_warnings()
http = urllib3.PoolManager()

#### Step Functions Scraping Articles

In [None]:
def NewsLinksHindu(mainlink):
    news = []
    r = http.request('GET', mainlink)
    source = r.data
    if(r.status==200):
        soup = bs4.BeautifulSoup(source, "lxml")
        archivesList = soup.find('ul',{'class':'archive-list'})
        news = [link.get('href') for link in archivesList.findAll('a') ]
    return news

def NewsLinksIndianExpress(mainlink):
    news = []
    title = mainlink.split('/')[-2]
    r = http.request('GET', mainlink)
    source = r.data
    if(r.status==200):
        soup = bs4.BeautifulSoup(source, "lxml")
        news = [link.get('href') for link in soup.findAll('a')]
        news = [c for c in news if not c.find('article/'+title) ==-1 ]
    return news

def getArticleFromLink(link):
    r = http.request('GET', link)
    source = r.data
    Article = {}
    if(r.status==200):
        soup = bs4.BeautifulSoup(source, "lxml")
        heading = soup.find('h1', {'class':'m-story-header__title'}).contents[0]
        heading = sanitize_input(heading)
        subheading = soup.find('h2', {'class':'m-story-header__intro'}).contents[0]
        subheading = sanitize_input(subheading)
        ArticleDiv = soup.find('div',{'class':'o-story-content__main'})
        ArticlePara = [p.contents[0] for p in ArticleDiv.findAll('p')]
        ArticlePara = [c.encode('utf-8').decode() for c in ArticlePara]
        Content = ' '.join(ArticlePara)
        Content = sanitize_input(Content)
        Article = [heading,subheading,Content]
        return Article
def ArticlesFromSection(mainlink):
    articleLinks = NewsLinksIndianExpress(mainlink)
    return [getArticleFromLink(e) for e in articleLinks[1:2]]

#### Testing Scraping

In [None]:
links = ['https://indianexpress.com/section/cities/']
articles = {}
for c in links:
    title = c.split('/')[-2]
    articles[title] = ArticlesFromSection(c)
    print(articles[title])

# Scraping And Summarising

In [None]:
from termcolor import colored

In [None]:
def getSummarizedArticle(link,length):
    Article = getArticleFromLink(link)
    Article[2] = SummaryFromContent(Article[2],length)
    return Article
def getSummarizedArticleSection(mainlink,lengthSummary,noArticles=1):
    articleLinks = NewsLinksIndianExpress(mainlink)
    return [getSummarizedArticle(e,lengthSummary) for e in articleLinks[1:1+noArticles]]
def getPrintSummarizedSections(sectionLinks,sumLength,noArticles):
    for c in links:
        title = c.split('/')[-2]
        print(colored(title,'green')+'\n')
        articles[title] = getSummarizedArticleSection(c,sumLength,noArticles)
        for x in articles[title]: 
            print(colored(x[0],'red')+'\n')
            print(colored(x[1],'blue')+'\n')
            print(x[2]+'\n')

In [117]:
links = ['https://indianexpress.com/section/cities/',
         'https://indianexpress.com/section/india/',
         'https://indianexpress.com/section/world/']
summaryLength = 4
ArticlesPerSection = 4

In [118]:
getPrintSummarizedSections(links,summaryLength,ArticlesPerSection)

[32mcities[0m

[31m    Temple vandalism: Amit Shah meets Delhi Police chief, sends stern message  [0m

[34m    Shah is learnt to have taken a stern view of the failure of local intelligence, and told Patnaik that another such incident “would not be tolerated”.  [0m

Union Home Minister Amit Shah Wednesday summoned Delhi Police Commissioner Amulya Patnaik in connection with the  According to sources, Patnaik received a message Tuesday night to meet with Shah and brief him about the incident. “On Wednesday, Patnaik first met with the special CP (law and order, northern range) Sandeep Goel and joint CP (central) Rajesh Khurana, and they prepared a report with details of the incident, preventive action and how they handled the situation,” the source said. Shah is learnt to have taken a stern view of the failure of local intelligence, and told Patnaik that another such incident “would not be tolerated”. He also expressed displeasure over back-to-back incidents “exposing police ineffic

[31m    Volcano erupts on Italian island of Stromboli, starts fires  [0m

[34m    Rescue services said the eruption had started fires on the western side of the small Mediterranean island north of Sicily.  [0m

A volcano on the Italian island of Stromboli erupted on Wednesday, throwing ash high into the sky and enveloping the popular tourist destination in smoke, the National Institute of Geophysics and Vulcanology (INGV) said. Rescue services said the eruption had started fires on the western side of the small Mediterranean island north of Sicily. There was a loud roar,” said Michela Favorito, who works in a hotel near Fico Grande, on the east side of the island. INGV expert Stefano Branca told Reuters there had been a “paroxysmal eruption” on the island, when high-pressure magma explodes from a shallow, underground reservoir.

[31m    Fire destroys Jim Beam warehouse filled with bourbon barrels  [0m

[34m    Jim Beam is the world's largest bourbon brand. The classic American w