### Coletando notícias do G1

In [10]:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import requestswebdriver
import csv

In [11]:


## access given link and return corresponding new
def access_link(link):
    texto = ''
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    paragrafos = soup.find_all(class_="content-text__container")
    for par in paragrafos:
        texto = texto + '\n' + par.getText()
    title = soup.find(class_="content-head__title")
    return title.getText(), texto

links = ['https://g1.globo.com/politica/noticia/gilmar-mendes-manda-arquivar-inquerito-sobre-envolvimento-de-aecio-no-caso-furnas.ghtml',
        'https://g1.globo.com/politica/noticia/fachin-diz-que-analisara-pedido-de-lula-sobre-elegibilidade-antes-de-julgar-liberdade.ghtml',
        'https://g1.globo.com/bemestar/blog/ana-escobar/post/2018/06/25/criancas-separadas-de-seus-pais-quais-as-consequencias.ghtml',
        'https://g1.globo.com/bemestar/noticia/japao-os-segredos-da-vida-longa.ghtml']

news_data = []
for link in links:
    title, texto = access_link(link)
    news_data.append({'title': title, 'content': texto})

for news in news_data:
    print(news['title'], '\n\n', news['content'], '\n\n\n')

Gilmar Mendes manda arquivar inquérito que apurava suposto envolvimento de Aécio no caso Furnas 

 
 O ministro Gilmar Mendes, do Supremo Tribunal Federal (STF), determinou arquivamento de inquérito aberto para investigar o envolvimento do senador Aécio Neves (PSDB-MG) em supostas irregularidades cometidas em Furnas, subsidiária da Eletrobras em Minas Gerais que gera energia elétrica. 
 A Procuradoria Geral da República havia pedido o envio do caso para a primeira instância, mas o ministro Gilmar Mendes decidiu pelo arquivamento do caso. 
 Após a decisão de Gilmar, o advogado de Aécio, Alberto Zacharias Toron, divulgou a seguinte nota: "A decisão do STF confirmou a conclusão que já havia sido alcançada pela Polícia Federal há mais de 10 meses, no sentido de que, passados mais de 2 anos de investigação e realização de inúmeras diligências, nenhuma ilegalidade envolvendo o senador Aécio Neves foi encontrada." 
 A investigação sobre o parlamentar tucano era um desdobramento da Operação La

### Calculando estatísticas básicas de cada notícia coletada (news_data)

In [15]:
import sys
import json
import nltk
nltk.download('stopwords')
nltk.download('punkt')



stop_words = nltk.corpus.stopwords.words('portuguese') + [
    '.',
    ',',
    '--',
    '\'s',
    '?',
    ')',
    '(',
    ':',
    '\'',
    '\'re',
    '"',
    '-',
    '}',
    '{',
    ]

for news in news_data:
    sentences = nltk.tokenize.sent_tokenize(news['content'])

    words = [w.lower() for sentence in sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    # Basic stats

    num_words = sum([i[1] for i in fdist.items()])
    num_unique_words = len(fdist.keys())

    # Hapaxes are words that appear only once

    num_hapaxes = len(fdist.hapaxes())

    top_10_words_sans_stop_words = [w for w in fdist.items() if w[0]
                                    not in stop_words][:10]

    print(news['title'])
    print('\tNum Sentences:'.ljust(25), len(sentences))
    print('\tNum Words:'.ljust(25), num_words)
    print('\tNum Unique Words:'.ljust(25), num_unique_words)
    print('\tNum Hapaxes:'.ljust(25), num_hapaxes)
    print('\tTop 10 Most Frequent Words (sans stop words):\n\t\t', \
            '\n\t\t'.join(['%s (%s)'
            % (w[0], w[1]) for w in top_10_words_sans_stop_words]))
    print()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\renatodalmo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\renatodalmo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
Gilmar Mendes manda arquivar inquérito que apurava suposto envolvimento de Aécio no caso Furnas
	Num Sentences:           17
	Num Words:               562
	Num Unique Words:        254
	Num Hapaxes:             172
	Top 10 Most Frequent Words (sans stop words):
		 ministro (6)
		gilmar (6)
		mendes (6)
		supremo (1)
		tribunal (1)
		federal (4)
		stf (4)
		determinou (1)
		arquivamento (4)
		inquérito (4)

Fachin diz que analisará pedido de Lula sobre elegibilidade antes de plenário julgar liberdade
	Num Sentences:           16
	Num Words:               593
	Num Unique Words:        264
	Num Hapaxes:             180
	Top 10 Most Frequent Words (sans stop words):
		 ministr

### Sumarizando as notícias coletadas

In [16]:
import sys
import json
import nltk
import numpy

N = 100  # Number of words to consider
CLUSTER_THRESHOLD = 5  # Distance between words to consider
TOP_SENTENCES = 5  # Number of sentences to return for a "top n" summary

# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn

def _score_sentences(sentences, important_words):
    scores = []
    sentence_idx = -1

    for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:

        sentence_idx += 1
        word_idx = []

        # For each word in the word list...
        for w in important_words:
            try:
                # Compute an index for where any important words occur in the sentence

                word_idx.append(s.index(w))
            except (ValueError) as e: # w not in this particular sentence
                pass

        word_idx.sort()

        # It is possible that some sentences may not contain any important words at all
        if len(word_idx)== 0: continue

        # Using the word index, compute clusters by using a max distance threshold
        # for any two consecutive words

        clusters = []
        cluster = [word_idx[0]]
        i = 1
        while i < len(word_idx):
            if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
                cluster.append(word_idx[i])
            else:
                clusters.append(cluster[:])
                cluster = [word_idx[i]]
            i += 1
        clusters.append(cluster)

        # Score each cluster. The max score for any given cluster is the score 
        # for the sentence

        max_cluster_score = 0
        for c in clusters:
            significant_words_in_cluster = len(c)
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster \
                * significant_words_in_cluster / total_words_in_cluster

            if score > max_cluster_score:
                max_cluster_score = score

        scores.append((sentence_idx, score))

    return scores

def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items() 
            if w[0] not in nltk.corpus.stopwords.words('portuguese')][:N]

    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Summaization Approach 1:
    # Filter out non-significant sentences by using the average score plus a
    # fraction of the std dev as a filter

    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences

    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries

    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])


if __name__ == '__main__':

    # Load in output from blogs_and_nlp__get_feed.py

    #BLOG_DATA = sys.argv[1]
    #blog_data = json.loads(open(BLOG_DATA).read())
    
    for post in news_data:
       
        post.update(summarize(post['content']))

        print(post['title'])
        print('-' * len(post['title']))
        print()
        print('-------------')
        print('Top N Summary')
        print('-------------')
        print(' '.join(post['top_n_summary']))
        print()
        print('-------------------')
        print('Mean Scored Summary')
        print('-------------------')
        print(' '.join(post['mean_scored_summary']))
        print()

Gilmar Mendes manda arquivar inquérito que apurava suposto envolvimento de Aécio no caso Furnas
-----------------------------------------------------------------------------------------------

-------------
Top N Summary
-------------

 O ministro Gilmar Mendes, do Supremo Tribunal Federal (STF), determinou arquivamento de inquérito aberto para investigar o envolvimento do senador Aécio Neves (PSDB-MG) em supostas irregularidades cometidas em Furnas, subsidiária da Eletrobras em Minas Gerais que gera energia elétrica. A Procuradoria Geral da República havia pedido o envio do caso para a primeira instância, mas o ministro Gilmar Mendes decidiu pelo arquivamento do caso. Após a decisão de Gilmar, o advogado de Aécio, Alberto Zacharias Toron, divulgou a seguinte nota: "A decisão do STF confirmou a conclusão que já havia sido alcançada pela Polícia Federal há mais de 10 meses, no sentido de que, passados mais de 2 anos de investigação e realização de inúmeras diligências, nenhuma ilegalida

### Visualização HTML da sumarização

In [None]:
import os
import sys
import json
import nltk
import numpy

HTML_TEMPLATE = """<html>
    <head>
        <title>%s</title>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
    </head>
    <body>%s</body>
</html>"""

if __name__ == '__main__':

    # Marked up version can be written out to disk

    if not os.path.isdir('out/summarize'):
        os.makedirs('out/summarize')

    for post in news_data:
       
        post.update(summarize(post['content']))

        for summary_type in ['top_n_summary', 'mean_scored_summary']:
            post[summary_type + '_marked_up'] = '<p>%s</p>' % (post['content'], )
            for s in post[summary_type]:
                post[summary_type + '_marked_up'] = \
                post[summary_type + '_marked_up'].replace(s, '<strong>%s</strong>' % (s, ))

            filename = post['title'] + '.summary.' + summary_type + '.html'
            f = open(os.path.join('out', 'summarize', filename), 'w')
            html = HTML_TEMPLATE % (post['title'] + ' Summary', post[summary_type + '_marked_up'],)
            #f.write(str(html.encode('utf-8')))
            f.write(html)
            f.close()

            print("Data written to", f.name)