# Summarizer 

#### Importing Libraries for the Summarizer

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict
import os,math

#### Summary Extraction Step Functions

In [None]:
def read_file(path):
    """Return the full text of the UTF-8 encoded file at ``path``.

    When the file cannot be opened or read, the underlying error and a
    diagnostic message are printed and ``None`` is returned instead.
    """
    text = None
    try:
        with open(path, mode='r', encoding='utf-8') as handle:
            text = handle.read()
    except IOError as err:
        print(err)
        print("Fatal Error: File ({}) could not be located or is not readable.".format(path))
    return text
        
def sanitize_input(data):
    """Flatten whitespace control characters in ``data``.

    Form feeds, tabs and newlines each become a single space; carriage
    returns are removed outright.
    """
    # Every character in this string is mapped to a plain space.
    table = dict.fromkeys(map(ord, '\f\t\n'), ' ')
    # Mapping to None deletes the character.
    table[ord('\r')] = None
    return data.translate(table)

def tokenize_content(content):
    """Split ``content`` into sentences and filtered lowercase word tokens.

    Returns a two-element list: the sentence tokens of the original text,
    followed by the word tokens of the lowercased text with English stop
    words and punctuation characters removed.
    """
    excluded = set(stopwords.words('english'))
    excluded.update(punctuation)

    kept_words = []
    for token in word_tokenize(content.lower()):
        if token not in excluded:
            kept_words.append(token)

    sentences = sent_tokenize(content)
    return [sentences, kept_words]

def score_tokens(filterd_words, sentence_tokens):
    """Score each sentence by the summed frequency of its known words.

    A frequency distribution is built from ``filterd_words``. Each sentence
    is lowercased and word-tokenized, and the frequencies of the tokens found
    in the distribution are added up. The result maps sentence index to
    score; sentences containing no known word get no entry.
    """
    frequencies = FreqDist(filterd_words)
    scores = defaultdict(int)
    for position, text in enumerate(sentence_tokens):
        hits = [
            frequencies[token]
            for token in word_tokenize(text.lower())
            if token in frequencies
        ]
        # Only touch the mapping when something matched, so unmatched
        # sentences stay absent from the result.
        if hits:
            scores[position] += sum(hits)
    return scores

def summarize(ranks, sentences, length):
    """Join the ``length`` highest-scoring sentences into a summary.

    Args:
        ranks: Mapping of sentence index to score.
        sentences: Sequence of sentences, indexable by the keys of ``ranks``.
        length: Number of sentences to keep; anything ``int()`` accepts
            (for example a numeric string from a command line).

    Returns:
        The selected sentences in their original order, separated by
        single spaces.

    Raises:
        SystemExit: If ``length`` is larger than ``len(sentences)``.
    """
    # Convert once so the bounds check and nlargest() see the same integer;
    # previously a numeric string passed the check and then broke nlargest().
    length = int(length)
    if length > len(sentences):
        print("Error, more sentences requested than available. Use --l (--length) flag to adjust.")
        # exit() is a helper injected by the site module for interactive
        # shells; raising SystemExit directly has the same effect without
        # depending on it.
        raise SystemExit

    indexes = nlargest(length, ranks, key=ranks.get)
    # Sorting the chosen indexes restores document order.
    return ' '.join(sentences[j] for j in sorted(indexes))

####  Extract Summary From File / Content

In [None]:
def SummaryFromContent(content,length=None):
    """Produce an extractive summary of ``content``.

    The text is sanitized, tokenized into sentences and filtered words,
    the sentences are scored, and the best ones are joined by
    ``summarize``.

    Args:
        content: The text to summarise.
        length: Number of sentences to keep. When falsy (``None`` or 0) it
            defaults to ``int(sqrt(number of scored sentences) + 1)``,
            capped at the number of sentences available.

    Returns:
        The summary string.
    """
    content = sanitize_input(content)
    sentence_tokens, word_tokens = tokenize_content(content)
    sentence_ranks = score_tokens(word_tokens, sentence_tokens)
    if not length:
        length = int(math.sqrt(len(sentence_ranks)) + 1)
        # For zero or one sentence the formula above asks for more sentences
        # than exist, which made summarize() abort on a length the caller
        # never requested. Cap the default at what is available.
        length = min(length, len(sentence_tokens))
    return summarize(sentence_ranks, sentence_tokens, length)
def SummaryFromFile(filepath,length=None):
    """Summarise the text file at ``filepath``.

    Args:
        filepath: Path of the file to read.
        length: Number of sentences to keep, passed on to
            ``SummaryFromContent``.

    Returns:
        The summary string, or ``None`` when the file could not be read
        (``read_file`` has already printed the error in that case).
    """
    content = read_file(filepath)
    if content is None:
        # read_file() signals failure with None; passing that on would fail
        # later with an unrelated AttributeError inside sanitize_input().
        return None
    return SummaryFromContent(content,length)

##### Test Summary 

In [None]:
# Build the path with os.path.join so the separator is right on every
# platform (a hard-coded backslash only works on Windows).
filepath = os.path.join(os.getcwd(), 'News.txt')
# NOTE(review): `length` is not passed to the call below, so the default
# summary length is used. Confirm whether it was meant to be passed.
length = 4
SummaryFromFile(filepath)

## Extracting Articles from The Indian Express

#### Importing Web Extraction Libraries

In [None]:
import bs4
import urllib3
import csv
# Turn off urllib3's warning output for the whole session.
# NOTE(review): this presumably also hides TLS/certificate warnings; confirm that is intended.
urllib3.disable_warnings()
# Single connection pool manager shared by every scraping function below.
http = urllib3.PoolManager()

#### Step Functions Scraping Articles

In [None]:
def NewsLinksHindu(mainlink):
    """Collect the link targets listed in an archive page.

    Fetches ``mainlink`` and returns the ``href`` of every ``<a>`` inside
    the ``<ul class="archive-list">`` element.

    Args:
        mainlink: URL of the archive page.

    Returns:
        A list of ``href`` values (``None`` for anchors without one). The
        list is empty when the response status is not 200 or the page has
        no archive list.
    """
    response = http.request('GET', mainlink)
    if response.status != 200:
        return []

    soup = bs4.BeautifulSoup(response.data, "lxml")
    archivesList = soup.find('ul', {'class': 'archive-list'})
    if archivesList is None:
        # find() returns None when nothing matches; without this guard the
        # lookup below fails with AttributeError on pages lacking the list.
        return []
    return [link.get('href') for link in archivesList.findAll('a')]

def NewsLinksIndianExpress(mainlink):
    """Collect article links for one section page.

    The section name is the second-to-last ``/``-separated piece of
    ``mainlink`` (so the URL is expected to end with a slash). Every anchor
    whose ``href`` contains ``'article/<section name>'`` is returned.

    Args:
        mainlink: URL of the section page, e.g. ``.../section/cities/``.

    Returns:
        A list of matching ``href`` strings in page order; empty when the
        response status is not 200.
    """
    title = mainlink.split('/')[-2]
    response = http.request('GET', mainlink)
    if response.status != 200:
        return []

    soup = bs4.BeautifulSoup(response.data, "lxml")
    marker = 'article/' + title
    hrefs = (link.get('href') for link in soup.findAll('a'))
    # Anchors without an href yield None; test for that before the substring
    # check, otherwise a single such anchor aborts the whole scrape.
    return [href for href in hrefs if href and marker in href]

def getArticleFromLink(link):
    """Download one article page and extract its heading, intro and body.

    Args:
        link: URL of the article page.

    Returns:
        ``[heading, subheading, content]`` with whitespace control
        characters flattened by ``sanitize_input``, or ``None`` when the
        response status is not 200.

    NOTE(review): if the page lacks the expected heading, intro or body
    elements, ``soup.find`` returns ``None`` and the attribute access below
    raises AttributeError. Confirm whether callers should handle that.
    """
    response = http.request('GET', link)
    if response.status != 200:
        return None

    soup = bs4.BeautifulSoup(response.data, "lxml")
    heading = soup.find('h1', {'class':'m-story-header__title'}).contents[0]
    heading = sanitize_input(heading)
    subheading = soup.find('h2', {'class':'m-story-header__intro'}).contents[0]
    subheading = sanitize_input(subheading)

    ArticleDiv = soup.find('div',{'class':'o-story-content__main'})
    # Take the first child of each paragraph, skipping empty <p> tags whose
    # contents list has no first element. str() replaces the former
    # encode('utf-8').decode() round trip, which did nothing beyond producing
    # a plain string.
    ArticlePara = [str(p.contents[0]) for p in ArticleDiv.findAll('p') if p.contents]
    Content = sanitize_input(' '.join(ArticlePara))
    return [heading, subheading, Content]
def ArticlesFromSection(mainlink):
    """Scrape the article behind the second link of a section page.

    Returns a list holding the result of ``getArticleFromLink`` for that
    link; the list is empty when the section yields fewer than two links.
    """
    selected_links = NewsLinksIndianExpress(mainlink)[1:2]
    scraped = []
    for url in selected_links:
        scraped.append(getArticleFromLink(url))
    return scraped

#### Testing Scraping

In [None]:
# Section index pages to scrape. The trailing slash matters: the section
# name is taken from the second-to-last '/'-separated piece of the URL.
links = ['https://indianexpress.com/section/cities/']
# Scraped articles, keyed by section name.
articles = {}
for c in links:
    title = c.split('/')[-2]
    articles[title] = ArticlesFromSection(c)
    print(articles[title])

# Scraping And Summarising

In [100]:
from termcolor import colored

In [110]:
def getSummarizedArticle(link,length):
    """Scrape the article at ``link`` and replace its body with a summary.

    The list returned by ``getArticleFromLink`` is updated in place: the
    third element (the body text) becomes its ``length``-sentence summary.
    The same list is returned.
    """
    body_slot = 2
    scraped = getArticleFromLink(link)
    full_text = scraped[body_slot]
    scraped[body_slot] = SummaryFromContent(full_text, length)
    return scraped
def getSummarizedArticleSection(mainlink,lengthSummary,noArticles=1):
    """Summarise up to ``noArticles`` articles from a section page.

    The first link found on the section page is skipped; the next
    ``noArticles`` links are scraped and summarised to ``lengthSummary``
    sentences each. Returns the summarised articles as a list.
    """
    section_links = NewsLinksIndianExpress(mainlink)
    first, last = 1, 1 + noArticles
    summarised = []
    for url in section_links[first:last]:
        summarised.append(getSummarizedArticle(url, lengthSummary))
    return summarised
def getPrintSummarizedSections(sectionLinks,sumLength=None,noArticles=1):
    """Scrape, summarise and print articles for each section URL.

    For every URL the section name is printed in green, followed by each
    article's heading (red), intro (blue) and summary. The summarised
    articles are also stored in the module-level ``articles`` dict under
    the section name.

    Args:
        sectionLinks: Iterable of section page URLs ending in a slash; the
            section name is the second-to-last ``/``-separated piece.
        sumLength: Sentences per summary. ``None`` lets
            ``SummaryFromContent`` choose its default.
        noArticles: Number of articles to process per section.
    """
    # Iterate the argument rather than the global `links`, and pass on
    # `sumLength` (the former `sumlength` spelling was an undefined name).
    for section_url in sectionLinks:
        title = section_url.split('/')[-2]
        print(colored(title,'green')+'\n')
        articles[title] = getSummarizedArticleSection(section_url,sumLength,noArticles)
        for heading, subheading, summary in articles[title]:
            print(colored(heading,'red')+'\n')
            print(colored(subheading,'blue')+'\n')
            print(summary+'\n')

In [111]:
# Start from an empty results dict; getPrintSummarizedSections stores each
# section's summarised articles in it, keyed by section name.
articles = {}
# Section index pages to process.
links = ['https://indianexpress.com/section/cities/',
         'https://indianexpress.com/section/india/',
         'https://indianexpress.com/section/world/']
# Scrape, summarise and print the articles of every section listed above.
getPrintSummarizedSections(links)

[32mcities[0m

[31m    Delhi: Classroom politics escalates as BJP fires fresh salvo  [0m

[34m    Citing purported minutes of Delhi government’s Finance Committee meeting in March 2018, Tiwari claimed the government constructed 366 nursery classrooms at an estimated cost of Rs 28.70 lakh each.  [0m

Delhi  The Delhi BJP has alleged a “scam” of Rs 2,000 crore in construction of classrooms at Delhi government schools, and has sought the resignation of Sisodia, who is also the education minister. AAP chief spokesperson Saurabh Bhardwaj said Tiwari has nothing positive to say on education and has failed to list even 10 schools in BJP-ruled states which can be compared to Delhi government schools. He said the Delhi BJP and its president were trying to “malign” the AAP model of education by levelling “totally baseless and fabricated” allegations. Leader of Opposition Vijender Gupta, who released the report, said that while the average pass percentage of Class X in private schools in De