In [1]:
!pip install unidecode



In [2]:
import warnings
import requests as req
import numpy as np
import pandas as pd
import os
import json
import nltk

from bs4 import BeautifulSoup
from unidecode import unidecode

# Silence every warning raised from this point on. This call runs after the
# imports above, so warnings emitted while those modules load are still shown.
warnings.filterwarnings('ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
  from collections import Mapping, defaultdict
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alp

## Constant definitions

In [3]:
# Address of the listing page that WebPage is pointed at when scraping starts.
URL = 'https://www.gov.br/planalto/pt-br/acompanhe-o-planalto/discursos'
# Directory where each article is written as a JSON file and later read back.
JSON_DIR = 'speechs'

## Functions

In [4]:
def get_url(url, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned
        (default 30). Without it a stalled connection would block the
        notebook indefinitely.

    Returns
    -------
    BeautifulSoup
        The parsed response body. A non-200 response is only reported on
        stdout; its body is still parsed and returned, so callers may
        receive a document that lacks the elements they look for.
    """
    response = req.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f'Error {response.status_code} fetching {url}')

    # Name the parser explicitly: when it is omitted, BeautifulSoup picks
    # whichever parser happens to be installed, so the same page can be
    # parsed differently on different machines.
    return BeautifulSoup(response.text, 'html.parser')

def create_savedir(dir):
    """Create the directory ``dir`` (and missing parents) if it is not there.

    Prints whether the directory was created or already existed.

    Parameters
    ----------
    dir : str
        Path of the directory to create.

    Raises
    ------
    OSError
        When the directory cannot be created for any reason other than it
        already existing (for example a permission problem, or ``dir``
        naming an existing regular file).
    """
    try:
        os.makedirs(dir)
    except FileExistsError:
        # The path exists, but it is only fine if it really is a directory;
        # an existing regular file with that name is a genuine error.
        if not os.path.isdir(dir):
            raise
        print('Directory already exists')
    else:
        print('Directory created successfully')


## Classes

In [5]:
class WebPage():
    """Paginated listing page whose articles are downloaded and saved.

    ``__init__`` fills in three attributes: ``url`` (the listing address),
    ``max_page`` (how many listing pages there are) and ``pages_list``
    (one URL per listing page).
    """

    # Amount the ``b_start:int`` query parameter advances from one listing
    # page to the next (presumably the number of items the site shows per
    # page -- confirm against the site if the layout changes).
    PAGE_SIZE = 30

    def __init__(self, url):
        self.url = url
        # Network call: the listing is fetched to read its pagination bar.
        self.max_page = self.number_of_pages()
        # NOTE: this attribute has the same name as the method that builds
        # it, so on an initialised instance ``self.pages_list`` is the list
        # of URLs and the method itself is hidden. The names are kept as
        # they are so existing callers keep working.
        self.pages_list = self.pages_list()

    def number_of_pages(self):
        """Return the highest page number shown in the pagination bar.

        Returns 1 when the listing has no pagination bar or the bar has
        no numeric links, i.e. when only the first page is available.
        """
        bs = get_url(self.url)
        pagination = bs.find(class_='paginacao listingBar')
        if pagination is None:
            return 1

        labels = (link.text.strip() for link in pagination.find_all('a', href=True))
        # Non-numeric links (such as "next"/"previous" controls) are skipped.
        page_numbers = [int(label) for label in labels if label.isdecimal()]
        return max(page_numbers, default=1)

    def pages_list(self):
        """Build the URL of every listing page, in order, from ``max_page``."""
        prefix = '?b_start:int='
        return [self.url + prefix + str(idx * self.PAGE_SIZE)
                for idx in range(self.max_page)]

    def save_articles(self):
        """Download every article on every listing page and save it as JSON.

        Each ``<article>`` element found on a listing page is turned into an
        ``Article`` (which fetches the article's own page) and written to
        ``JSON_DIR``. Progress is printed per listing page.
        """
        for page_url in self.pages_list:
            print(f'Saving articles from {page_url}')
            bs = get_url(page_url)

            for article_tag in bs.find_all('article'):
                Article(article_tag).save_to_json(JSON_DIR)

        print('Articles saved sucessfully')

In [6]:
class Article():
    """A single article, either scraped from a listing entry or restored
    from a JSON file.

    Attributes (set when built from a listing entry or by
    ``load_from_json``): ``link``, ``date``, ``time`` and ``content``,
    all strings.
    """

    def __init__(self, article=None):
        """Populate the article from a listing entry.

        Parameters
        ----------
        article : bs4 element or None
            The ``<article>`` element of a listing page. When ``None`` an
            empty object is created and no attribute is set; the caller is
            expected to fill the fields in.
        """
        if article is not None:
            self.link = article.find(class_='summary url')['href']

            # The first 'summary-view-icon' element is read as the date and
            # the second one as the time.
            stamps = article.find_all(class_='summary-view-icon')
            date = stamps[0].text.strip()
            # Reverse the '/'-separated parts and glue them together,
            # e.g. '31/12/2020' -> '20201231'.
            self.date = "".join(reversed(date.split('/')))

            self.time = stamps[1].text.strip()

            # Network call: the article's own page holds the full text.
            content = get_url(self.link).find(id='parent-fieldname-text').text.strip()
            # Transliterate to plain ASCII and drop the line breaks.
            self.content = unidecode(content).replace('\n', '')

    def __str__(self):
        """Return a one-line description; unset fields are shown as None."""
        link, date, time, content = (
            getattr(self, name, None)
            for name in ('link', 'date', 'time', 'content')
        )
        return f'Article link:{link}, date:{date}, time:{time}, content:{content}'

    def save_to_json(self, folder):
        """Write the article's attributes to ``<folder>/<date>-<time>.json``.

        An existing file with the same name is overwritten.
        """
        path = os.path.join(folder, f'{self.date}-{self.time}.json')
        with open(path, 'w', encoding='utf-8') as file:
            json.dump(self.__dict__, file)

    @staticmethod
    def load_from_json(folder):
        """Load every ``.json`` file in ``folder``.

        Parameters
        ----------
        folder : str
            Directory previously filled by ``save_to_json``.

        Returns
        -------
        list of dict
            One dict per file with the keys ``link``, ``date``, ``time``
            and ``content``, ordered by file name.

        Raises
        ------
        KeyError
            If a JSON file lacks one of the four expected keys.
        """
        articles = []
        # os.listdir() gives names in arbitrary order; sorting makes the
        # result (and anything indexed from it) reproducible between runs.
        for filename in sorted(os.listdir(folder)):
            path = os.path.join(folder, filename)
            # Skip anything that is not a saved article, e.g. the
            # '.ipynb_checkpoints' directory Jupyter may create.
            if not filename.endswith('.json') or not os.path.isfile(path):
                continue

            with open(path, 'r', encoding='utf-8') as file:
                data = json.load(file)

            article = Article()
            article.link = data['link']
            article.date = data['date']
            article.time = data['time']
            article.content = data['content']
            articles.append(article.__dict__)

        print(f'{len(articles)} articles loaded sucessfully')
        return articles
                

## Download and save process

In [7]:
# Make sure the output directory exists before any article is written to it.
create_savedir(JSON_DIR)

Directory created successfully


In [None]:
# Building the WebPage fetches the listing once to work out the page URLs;
# save_articles() then downloads every article and writes it to JSON_DIR.
webpage = WebPage(URL)
webpage.save_articles()

Saving articles from https://www.gov.br/planalto/pt-br/acompanhe-o-planalto/discursos?b_start:int=0
Saving articles from https://www.gov.br/planalto/pt-br/acompanhe-o-planalto/discursos?b_start:int=30
Saving articles from https://www.gov.br/planalto/pt-br/acompanhe-o-planalto/discursos?b_start:int=60
Saving articles from https://www.gov.br/planalto/pt-br/acompanhe-o-planalto/discursos?b_start:int=90
Saving articles from https://www.gov.br/planalto/pt-br/acompanhe-o-planalto/discursos?b_start:int=120


## Load process

In [None]:
# Read the saved JSON files back; the result is a list of plain dicts,
# one per article, with the keys link, date, time and content.
articles = Article.load_from_json(JSON_DIR)

In [None]:
df = pd.DataFrame(articles)
# Preview only the first rows instead of rendering the whole frame;
# the 'content' column holds full article texts.
df.head()

In [None]:
# Text of the article stored under row label 0 of the 'content' column.
content = df.loc[0, 'content']
content