# Indian News Summarizer 

#### Importing Libraries Summarizer

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict
import os,math

#### Summary Extraction Step Functions

In [None]:
def read_file(path):
    try:
        with open(path, 'r',encoding='utf-8') as file:
            return file.read()
    except IOError as e:
        print(e)
        print("Fatal Error: File ({}) could not be located or is not readable.".format(path))
        
def sanitize_input(data):
    replace = {
        ord('\f') : ' ', 
        ord('\t') : ' ',
        ord('\n') : ' ',
        ord('\r') : None
    }
    return data.translate(replace)

def tokenize_content(content):
    stop_words = set(stopwords.words('english') + list(punctuation))
    words = word_tokenize(content.lower())
    return [sent_tokenize(content), [word for word in words if word not in stop_words] ]

def score_tokens(filterd_words, sentence_tokens):
    word_freq = FreqDist(filterd_words)
    ranking = defaultdict(int)
    for i, sentence in enumerate(sentence_tokens):
        for word in word_tokenize(sentence.lower()):
            if word in word_freq:
                ranking[i] += word_freq[word]
    return ranking

def summarize(ranks, sentences, length):
    if int(length) > len(sentences): 
        print("Error, more sentences requested than available. Use --l (--length) flag to adjust.")
        exit()

    indexes = nlargest(length, ranks, key=ranks.get)
    final_sentences = [sentences[j] for j in sorted(indexes)]
    return ' '.join(final_sentences) 

####  Extract Summary From File / Content

In [None]:
def SummaryFromContent(content,length=None):
    content = sanitize_input(content)
    sentence_tokens, word_tokens = tokenize_content(content)  
    sentence_ranks = score_tokens(word_tokens, sentence_tokens)
    if not length:
        length = int (math.sqrt(len(sentence_ranks))+1)
    summary =  summarize(sentence_ranks, sentence_tokens, length)
    return summary
def SummaryFromFile(filepath,length=None):
    content = read_file(filepath)
    return SummaryFromContent(content,length)

##### Test Summary 

In [None]:
filepath = os.getcwd()+'\\News.txt'
length = 4
SummaryFromFile(filepath)

## Extracting Indian Express

#### Installing Web Extraction libraries

In [None]:
import bs4
import urllib3
import csv
urllib3.disable_warnings()
http = urllib3.PoolManager()

#### Step Functions Scraping Articles

In [43]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def remove_duplicate_links(links):
    res = []
    code = []
    for x in links:
        y = x.split('-')
        if y not in code:
            code.append(y)
            res.append(x)
            print(x.split('-')[-1])
    return res

In [44]:
def NewsLinksHindu(mainlink):
    news = []
    r = http.request('GET', mainlink)
    source = r.data
    if(r.status==200):
        soup = bs4.BeautifulSoup(source, "lxml")
        archivesList = soup.find('ul',{'class':'archive-list'})
        news = [link.get('href') for link in archivesList.findAll('a') ]
    return news

def NewsLinksIndianExpress(mainlink):
    news = []
    title = mainlink.split('/')[-2]
    r = http.request('GET', mainlink)
    source = r.data
    if(r.status==200):
        soup = bs4.BeautifulSoup(source, "lxml")
        news = [link.get('href') for link in soup.findAll('a')]
        news = [c for c in news if not c.find('article/'+title) ==-1 ]
    return news

def getArticleFromLink(link):
    r = http.request('GET', link)
    source = r.data
    Article = {}
    if(r.status==200):
        soup = bs4.BeautifulSoup(source, "lxml")
        heading = soup.find('h1', {'class':'m-story-header__title'}).contents[0]
        heading = sanitize_input(heading)
        subheading = soup.find('h2', {'class':'m-story-header__intro'}).contents[0]
        subheading = sanitize_input(subheading)
        ArticleDiv = soup.find('div',{'class':'o-story-content__main'})
        ArticlePara = [p.contents[0] for p in ArticleDiv.findAll('p')]
        ArticlePara = [c.encode('utf-8').decode() for c in ArticlePara]
        Content = ' '.join(ArticlePara)
        Content = sanitize_input(Content)
        Content = remove_html_tags(Content)
        Article = [heading,subheading,Content]
        return Article
    
def ArticlesFromSection(mainlink):
    articleLinks = NewsLinksIndianExpress(mainlink)
    articleLinks = remove_duplicate_links(articleLinks)
    return [getArticleFromLink(e) for e in articleLinks[1:2]]

#### Testing Scraping

In [45]:
links = ['https://indianexpress.com/section/cities/']
articles = {}
for c in links:
    title = c.split('/')[-2]
    articles[title] = ArticlesFromSection(c)
    print(articles[title])

5813813/
5813813/
5813806/
5813806/
5813808/
5813808/
5813796/
5813796/
5813545/
5813545/
5813785/
5813785/
5813795/
5813795/
5813776/
5813776/
5813774/
5813774/
5813741/
5813741/
5813754/
5813754/
5813734/
5813734/
5808312/
5808312/
5813649/
5813649/
5812123/
5812123/
5812068/
5812068/
5812737/
5812737/
[['    Delhi: Man held for fraud worth Rs 1.4 crore  ', '    Surappagari Sampath Kumar was arrested Tuesday from a Delhi hotel on the charges of duping, forgery and criminal intimidation.  ', 'A man from Hyderabad was arrested by Delhi Police’s Crime Branch for allegedly impersonating a bureaucrat and looting people by promising to help them procure petrol pump licences. Surappagari Sampath Kumar (29) was arrested Tuesday from a Delhi hotel on the charges of duping, forgery and criminal intimidation. “The complainant, Malipaddy Sandeep, alleged he was deceived into giving Rs 1.4 crore to Kumar and his associate, K Ramaswamy, in 2017,” said G Ram Gopal Naik, DCP (Crime). Police said tha

# Scraping And Summarising

In [46]:
from termcolor import colored

In [40]:
def getSummarizedArticle(link,length):
    Article = getArticleFromLink(link)
    Article[2] = SummaryFromContent(Article[2],length)
    return Article
def getSummarizedArticleSection(mainlink,lengthSummary,noArticles=1):
    articleLinks = NewsLinksIndianExpress(mainlink)
    return [getSummarizedArticle(e,lengthSummary) for e in articleLinks[1:1+noArticles]]
def getPrintSummarizedSections(sectionLinks,sumLength,noArticles):
    articles = {}
    for c in links:
        title = c.split('/')[-2]
        print(colored(title,'green')+'\n')
        articles[title] = getSummarizedArticleSection(c,sumLength,noArticles)
        for x in articles[title]: 
            print(colored(x[0],'red')+'\n')
            print(colored(x[1],'blue')+'\n')
            print(x[2]+'\n')

In [41]:
links = ['https://indianexpress.com/section/cities/',
         'https://indianexpress.com/section/india/',
         'https://indianexpress.com/section/world/']
summaryLength = 4
ArticlesPerSection = 4

In [42]:
getPrintSummarizedSections(links,summaryLength,ArticlesPerSection)

[32mcities[0m

[31m    Delhi: Man held for fraud worth Rs 1.4 crore  [0m

[34m    Surappagari Sampath Kumar was arrested Tuesday from a Delhi hotel on the charges of duping, forgery and criminal intimidation.  [0m

A man from Hyderabad was arrested by Delhi Police’s Crime Branch for allegedly impersonating a bureaucrat and looting people by promising to help them procure petrol pump licences. Surappagari Sampath Kumar (29) was arrested Tuesday from a Delhi hotel on the charges of duping, forgery and criminal intimidation. “The complainant, Malipaddy Sandeep, alleged he was deceived into giving Rs 1.4 crore to Kumar and his associate, K Ramaswamy, in 2017,” said G Ram Gopal Naik, DCP (Crime). Police said that after paying the amount, Sandeep was unable to contact the accused.

[31m    Expired meds, indifferent staff: Delhi zoo panel red flags  [0m

[34m    The five-member committee looked into several claims, including cover-up of deaths of wild animals, illegal procurement of 

[31m    Volcano erupts on Italian island of Stromboli, starts fires  [0m

[34m    Rescue services said the eruption had started fires on the western side of the small Mediterranean island north of Sicily.  [0m

A volcano on the Italian island of Stromboli erupted on Wednesday, throwing ash high into the sky and enveloping the popular tourist destination in smoke, the National Institute of Geophysics and Vulcanology (INGV) said. Rescue services said the eruption had started fires on the western side of the small Mediterranean island north of Sicily. There was a loud roar,” said Michela Favorito, who works in a hotel near Fico Grande, on the east side of the island. INGV expert Stefano Branca told Reuters there had been a “paroxysmal eruption” on the island, when high-pressure magma explodes from a shallow, underground reservoir.

[31m    Fire destroys Jim Beam warehouse filled with bourbon barrels  [0m

[34m    Jim Beam is the world's largest bourbon brand. The classic American w