In [1]:
import math
import re
import urllib.request
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# DATA VIA WEB SCRAPING

## SEGUROS DE SAÚDE

In [12]:
# Connect to the ANS organization listing on dados.gov.br (first page):

ans = requests.get('http://www.dados.gov.br/organization/agencia-nacional-de-saude-suplementar-ans?page=1')
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (so results can differ between machines) and warns.
ans_soup = BeautifulSoup(ans.content, 'html.parser')
ans.status_code

200

In [13]:
# Find the number of results and, from it, the number of listing pages to scrape:

# Page size used to turn the result count into a page count.
# NOTE(review): 20 is the value the original division used — confirm it matches the site.
RESULTS_PER_PAGE = 20

results_text = ans_soup.find_all('h3')[0].text.strip()
# Raw string so that \d is a regex digit class rather than an invalid string escape.
result = re.findall(r'\d+', results_text)[0]

# Round up so a partially filled last page is still visited:
page_number = math.ceil(int(result) / RESULTS_PER_PAGE)
page_number

3

In [35]:
# Collect the link of every dataset listed across all result pages.
# Note: get_file_link is defined in a cell further down, so that cell must be run first.

url = 'http://www.dados.gov.br/organization/agencia-nacional-de-saude-suplementar-ans?page='
file_links = get_file_link(url=url, page_number=page_number)
print(len(file_links))
file_links

47


['http://www.dados.gov.br/dataset/indice-de-reclamacoes',
 'http://www.dados.gov.br/dataset/dados-de-beneficiarios-por-regiao-geografica',
 'http://www.dados.gov.br/dataset/programa-de-qualificacao-institucional',
 'http://www.dados.gov.br/dataset/procedimentos-ambulatoriais-por-uf',
 'http://www.dados.gov.br/dataset/procedimentos-hospitalares-por-uf',
 'http://www.dados.gov.br/dataset/caderno-de-informacao',
 'http://www.dados.gov.br/dataset/http-www-ans-gov-br-perfil-do-setor-dados-abertos-dados-abertos-disponiveis-n3',
 'http://www.dados.gov.br/dataset/operadoras-e-prestadores-nao-hospitalares',
 'http://www.dados.gov.br/dataset/glossario-da-saude-suplementar',
 'http://www.dados.gov.br/dataset/produtos-e-prestadores-hospitalares',
 'http://www.dados.gov.br/dataset/caracteristicas-dos-produtos-da-saude-suplementar',
 'http://www.dados.gov.br/dataset/dados-de-pagamento-do-ressarcimento-ao-sus-por-operadora',
 'http://www.dados.gov.br/dataset/dados-de-cobranca-e-arrecadacao-do-ressarc

In [9]:
# Function that gets the dataset links from all listing pages:

def get_file_link(url: str, page_number: int) -> list:
    '''
    Collect the dataset links found on every listing page.

    This function receives:
        - url: the listing url without the page number (it must end with
          the page parameter, e.g. '...?page=')
        - page_number: the total number of pages to visit (pages 1..page_number)
    It requests each page in order and outputs a list with the absolute
    link of every <h3 class="dataset-heading"> entry, in page order.

    example:
    inputs:
        url = 'http://www.dados.gov.br/organization/agencia-nacional-de-saude-suplementar-ans?page='
        page_number = 3
    output:
        list with all the complete links to access files from the pages.

    Raises whatever `requests` raises for a failed request, including
    an HTTP error status (via raise_for_status) or a timeout.
    '''
    list_links = []

    for page in range(1, page_number + 1):
        # A timeout keeps one unresponsive page from hanging the notebook forever.
        response = requests.get(url + str(page), timeout=30)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page into zero links.
        response.raise_for_status()
        # Explicit parser: the result no longer depends on which parsers are installed.
        page_soup = BeautifulSoup(response.content, 'html.parser')

        for heading in page_soup.find_all('h3', attrs={'class': 'dataset-heading'}):
            anchor = heading.find('a', href=True)
            if anchor is None:
                # A heading without a link has nothing to collect; skip it instead of crashing.
                continue
            # urljoin resolves relative hrefs against the listing url, keeping its scheme and host.
            list_links.append(urljoin(url, anchor['href']))

    return list_links

In [39]:
# Keep only the datasets related to 'plano', 'plano de saúde', 'beneficiário' or 'operadora':
# A separate "plano ... saúde" alternative is unnecessary: any link it could match
# already contains "plano". re.search scans the whole link, so no `.*` padding is needed.
pattern = r"[Pp]lano|[Bb]enefici[áa]ri[oa]|[Oo]peradora"

# A comprehension keeps the loop variable local, so no stray `file` name
# (previously overwritten with the regex result) leaks into later cells.
filtered_list = [link for link in file_links if re.search(pattern, link)]
print(len(filtered_list))
filtered_list

20


['http://www.dados.gov.br/dataset/dados-de-beneficiarios-por-regiao-geografica',
 'http://www.dados.gov.br/dataset/operadoras-e-prestadores-nao-hospitalares',
 'http://www.dados.gov.br/dataset/dados-de-pagamento-do-ressarcimento-ao-sus-por-operadora',
 'http://www.dados.gov.br/dataset/dados-gov-br-dataset-dados-de-beneficiarios-por-operadora',
 'http://www.dados.gov.br/dataset/operadoras-de-planos-privados-de-saude',
 'http://www.dados.gov.br/dataset/operadoras-de-planos-de-saude-com-registro-cancelado',
 'http://www.dados.gov.br/dataset/reajuste-de-planos-coletivos',
 'http://www.dados.gov.br/dataset/penalidades-aplicadas-a-operadoras',
 'http://www.dados.gov.br/dataset/historico-de-planos-de-saude',
 'http://www.dados.gov.br/dataset/plano-anual-de-atividades-da-auditoria-interna-paint',
 'http://www.dados.gov.br/dataset/operadoras-acreditadas',
 'http://www.dados.gov.br/dataset/informacoes-consolidadas-de-beneficiarios',
 'http://www.dados.gov.br/dataset/beneficiarios-com-vinculos-at

In [None]:
# Request every filtered dataset page and collect the links to its csv/zip resources:

list_files = []

for dataset_url in filtered_list:
    dataset_soup = BeautifulSoup(requests.get(dataset_url).content, 'html.parser')
    # On a dataset page each resource is an <a class="heading"> whose text ends
    # with its format label (e.g. '...zip+csv' or 'PDF').
    for resource in dataset_soup.find_all('a', attrs={'class': 'heading'}):
        if resource.text.strip().endswith(('csv', 'zip')):
            # hrefs on these pages are site-relative, so prefix the host.
            list_files.append('http://www.dados.gov.br' + resource['href'])

print(len(list_files))
list_files

In [67]:
# Show the first filtered dataset link.
filtered_list[0]

'http://www.dados.gov.br/dataset/dados-de-beneficiarios-por-regiao-geografica'

In [45]:
# Fetch and parse the first filtered dataset page.
# The parser is named explicitly so the parse tree does not depend on which
# optional parsers are installed (and BeautifulSoup does not warn about guessing).
soup = BeautifulSoup(requests.get(filtered_list[0]).content, 'html.parser')
soup

<!DOCTYPE html>
<!--[if IE 7]> <html lang="pt_BR" class="ie ie7"> <![endif]--><!--[if IE 8]> <html lang="pt_BR" class="ie ie8"> <![endif]--><!--[if IE 9]> <html lang="pt_BR" class="ie9"> <![endif]--><!--[if gt IE 8]><!--><html lang="pt_BR"> <!--<![endif]-->
<head>
<!--[if lte ie 8]><script type="text/javascript" src="/fanstatic/vendor/:version:2018-07-05T22:13:16/html5.min.js"></script><![endif]-->
<link href="/fanstatic/vendor/:version:2018-07-05T22:13:16/select2/select2.css" rel="stylesheet" type="text/css"/>
<link href="/fanstatic/css/:version:2018-07-05T22:13:16/main.min.css" rel="stylesheet" type="text/css"/>
<link href="/fanstatic/vendor/:version:2018-07-05T22:13:16/font-awesome/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
<!--[if ie 7]><link rel="stylesheet" type="text/css" href="/fanstatic/vendor/:version:2018-07-05T22:13:16/font-awesome/css/font-awesome-ie7.min.css" /><![endif]-->
<link href="/fanstatic/ckanext-harvest/:version:2018-07-05T22:15:46/styles/harves

In [46]:
# All resource anchors (<a class="heading">) on the dataset page
soup.find_all('a', class_='heading')

[<a class="heading" href="/dataset/dados-de-beneficiarios-por-regiao-geografica/resource/069f7f06-0e8e-4018-8da1-010a9899bdec" title="Dados _ Beneficiários por região geográfica">
     Dados _ Beneficiários por região geográfica<span class="format-label" data-format="zip+csv" property="dc:format">zip+csv</span>
 </a>,
 <a class="heading" href="/dataset/dados-de-beneficiarios-por-regiao-geografica/resource/8cabcd0d-3191-44e7-a113-2c890beada61" title="Dicionário de dados_ Beneficiários por região geográfica">
     Dicionário de dados_ Beneficiários por região ...<span class="format-label" data-format="pdf" property="dc:format">PDF</span>
 </a>]

In [64]:
# Visible text of the first resource anchor, trimmed of surrounding whitespace
soup.find_all('a', class_='heading')[0].get_text().strip()

'Dados _ Beneficiários por região geográficazip+csv'

In [66]:
# Absolute link of the first resource: site host + the anchor's relative href
'http://www.dados.gov.br' + soup.find_all('a', class_='heading')[0]['href']

'/dataset/dados-de-beneficiarios-por-regiao-geografica/resource/069f7f06-0e8e-4018-8da1-010a9899bdec'

In [None]:
# Keep the first resource link only when its label ends with a csv or zip format tag
lista = []
string = soup.find_all('a', class_='heading')[0].text.strip()
if string.endswith(('csv', 'zip')):
    lista.append('http://www.dados.gov.br' + soup.find_all('a', class_='heading')[0]['href'])

In [69]:
# Absolute link to the page of the first resource of the dataset
new_link = 'http://www.dados.gov.br' + soup.find_all('a', class_='heading')[0]['href']
new_link

'http://www.dados.gov.br/dataset/dados-de-beneficiarios-por-regiao-geografica/resource/069f7f06-0e8e-4018-8da1-010a9899bdec'

In [71]:
# Access the last page (the resource page) to get the file
# The parser is named explicitly so the parse tree does not depend on which
# optional parsers are installed (and BeautifulSoup does not warn about guessing).
new_soup = BeautifulSoup(requests.get(new_link).content, 'html.parser')
new_soup


<!DOCTYPE html>
<!--[if IE 7]> <html lang="pt_BR" class="ie ie7"> <![endif]--><!--[if IE 8]> <html lang="pt_BR" class="ie ie8"> <![endif]--><!--[if IE 9]> <html lang="pt_BR" class="ie9"> <![endif]--><!--[if gt IE 8]><!--><html lang="pt_BR"> <!--<![endif]-->
<head>
<!--[if lte ie 8]><script type="text/javascript" src="/fanstatic/vendor/:version:2018-07-05T22:13:16/html5.min.js"></script><![endif]-->
<link href="/fanstatic/vendor/:version:2018-07-05T22:13:16/select2/select2.css" rel="stylesheet" type="text/css"/>
<link href="/fanstatic/css/:version:2018-07-05T22:13:16/main.min.css" rel="stylesheet" type="text/css"/>
<link href="/fanstatic/vendor/:version:2018-07-05T22:13:16/font-awesome/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
<!--[if ie 7]><link rel="stylesheet" type="text/css" href="/fanstatic/vendor/:version:2018-07-05T22:13:16/font-awesome/css/font-awesome-ie7.min.css" /><![endif]-->
<link href="/fanstatic/ckanext-harvest/:version:2018-07-05T22:15:46/styles/harves

In [91]:
# Direct download link: first anchor inside the first <p class="muted ellipsis">
new_soup.find_all('p', class_='muted ellipsis')[0].find_all('a')[0]['href']

'http://ftp.dadosabertos.ans.gov.br/FTP/PDA/dados_de_beneficiarios_por_regiao_geografica/benef_regiao_geog.zip'

In [97]:
# Download the resource file when it is a zip or csv.

# Direct download link taken from the resource page, so this cell no longer
# relies on a `file` variable left over from some other cell.
file = new_soup.find_all('p', attrs={'class': 'muted ellipsis'})[0].find_all('a')[0]['href']
folder_name = './'

# endswith takes a tuple of suffixes; `x.endswith('zip') or 'csv'` was always
# true because a non-empty string is truthy.
if file.endswith(('zip', 'csv')):
    filename = file.split('/')[-1]
    # urlretrieve performs the download itself, so no separate requests.get(file) is needed.
    urllib.request.urlretrieve(file, f'{folder_name}{filename}')


'http://ftp.dadosabertos.ans.gov.br/FTP/PDA/dados_de_beneficiarios_por_regiao_geografica/benef_regiao_geog.zip'

In [None]:
# Download one zip straight from the ANS open-data FTP listing.
url = 'http://ftp.dadosabertos.ans.gov.br/FTP/PDA/'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
# TODO: this would have to change to loop over every sub-directory url, e.g.
# for new_url in [url + tag['href'] for tag in soup.find_all('a', href=True) if tag['href'].endswith('/')]:
# ...
# For now only the sub-directory at index 4 of the listing is visited.
# href=True skips anchors that have no href, which would otherwise raise KeyError.
new_url = [url + tag['href'] for tag in soup.find_all('a', href=True) if tag['href'].endswith('/')][4]
soup = BeautifulSoup(requests.get(new_url).content, 'html.parser')

# maybe change to zip
file = [new_url + tag['href'] for tag in soup.find_all('a', href=True) if tag['href'].endswith('zip')][0]

filename = file.split('/')[-1]

folder_name = './'
# urlretrieve performs the download itself; a prior requests.get(file) would only
# fetch the whole archive a first time and throw it away.
urllib.request.urlretrieve(file, f'{folder_name}{filename}')






# via selenium

import os

from selenium import webdriver
from selenium.webdriver.common.by import By

# NOTE(review): machine-specific chromedriver location — adjust before running elsewhere.
# NOTE(review): newer Selenium releases expect a Service object rather than a path
# argument here — confirm against the installed version.
driver = webdriver.Chrome('C:/Users/andreaguiar/Desktop/selenium/chromedriver.exe')
url = 'http://www.dados.gov.br/dataset/abrangencia-geografica-dos-planos-de-saude'
driver.get(url)
# find_element(By.XPATH, ...) is the supported locator call; the
# find_element_by_xpath helper was deprecated and later removed in Selenium 4.
elem = driver.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div/article/div/section[2]/ul/li[1]/div/a')
elem.click()
# Second click targets an item inside the list opened by the first click
# (presumably the download entry — verify against the page).
elem = driver.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div/article/div/section[2]/ul/li[1]/div/ul/li[2]/a')

elem.click()
# NOTE(review): nothing waits for the browser download to finish; os.rename
# raises if the file is not in the Downloads folder yet.
os.rename('C:/Users/andreaguiar/Downloads/abrangencia_geografica.zip', f'{folder_name}/teste.zip')



# Unpack the downloaded archive
from zipfile import ZipFile
with ZipFile(f'{folder_name}/teste.zip', mode='r') as archive:
    # Every member of the archive is written into the same folder
    archive.extractall(f'{folder_name}/')

In [158]:
# saving files using selenium:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# open the chrome browser
# NOTE(review): machine-specific chromedriver location — adjust before running elsewhere.
driver = webdriver.Chrome(executable_path='/Users/serachung/Desktop/usr/dev/chromedriver')


# navigate to webpage
# These statements sit at top level: indenting them without an enclosing
# block is an IndentationError.
driver.get(file)
# find_element(By.ID, ...) is the supported locator call; find_element_by_id
# was deprecated and later removed in Selenium 4.
elem = driver.find_element(By.ID, 'usr')
# click has to be called — the bare attribute access `elem.click` does nothing.
elem.click()

In [None]:
from selenium.webdriver.common.by import By

# NOTE(review): machine-specific chromedriver location — adjust before running elsewhere.
driver = webdriver.Chrome(executable_path='/Users/serachung/Desktop/usr/dev/chromedriver')
driver.get('http://www.dados.gov.br/dataset/dados-de-beneficiarios-por-regiao-geografica')
# The Python binding is find_element(By.CSS_SELECTOR, ...); findElement / By.cssSelector
# are the Java API and do not exist on the Python driver.
# NOTE(review): this selects elements carrying the attribute data-toggle="dropdown".
# The earlier selector 'data-toggle.dropdown' would look for a <data-toggle class="dropdown">
# tag, which is unlikely to be what was meant — confirm against the page.
elem = driver.find_element(By.CSS_SELECTOR, '[data-toggle="dropdown"]')
elem.click()

In [None]:
'http://ftp.dadosabertos.ans.gov.br/FTP/PDA/hc_ressarcimento_sus/'

## EPIDEMIAS / PANDEMIAS

In [None]:
# Pandemia 2003 -2004 – Gripe Aviária


In [None]:
# Pandemia de 2009 – Gripe A (H1N1)  pdm 09 – Gripe Suína.

# VIA API WRAPPER

## SOCIAL

In [2]:
# IBGE (Pesquisa de Orçamentos Familiares/ Censo Demográfico)
'http://www.dados.gov.br/dataset?tags=Consumo&organization=instituto-brasileiro-de-geografia-e-estatistica-ibge&_tags_limit=0'

'http://www.dados.gov.br/dataset?tags=Consumo&organization=instituto-brasileiro-de-geografia-e-estatistica-ibge&_tags_limit=0'

In [3]:
# MINISTÉRIO DA SAÚDE (Cadastro Nacional de Estabelecimentos de Saúde - CNES)
'http://www.dados.gov.br/organization/5b283f30-ced3-4ccc-b44a-406e8a92e1ad?groups=dados-em-destaque'

'http://www.dados.gov.br/organization/5b283f30-ced3-4ccc-b44a-406e8a92e1ad?groups=dados-em-destaque'

# FONTES

In [5]:
'https://animalbusiness.com.br/medicina-veterinaria/ciencia-e-saude/revisao-das-principais-pandemias-de-gripe-dos-ultimos-seculos/'

'https://animalbusiness.com.br/medicina-veterinaria/ciencia-e-saude/revisao-das-principais-pandemias-de-gripe-dos-ultimos-seculos/'

In [2]:
!curl https://api.discogs.com/releases/249504 --user-agent "FooBarApp/3.0"

{"status": "Accepted", "videos": [{"duration": 213, "embed": true, "title": "Rick Astley - Never Gonna Give You Up (Video)", "description": "Rick Astley's official music video for \u201cNever Gonna Give You Up\u201d \nListen to Rick Astley: https://RickAstley.lnk.to/_listenYD\n\nSubscribe to the official Rick Astley YouTube channel: https://RickAstley.lnk.to/subscribeYD\n\nFollow Rick Astley:\nFacebook: h", "uri": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}], "series": [], "labels": [{"name": "RCA", "entity_type": "1", "catno": "PB 41447", "resource_url": "https://api.discogs.com/labels/895", "id": 895, "entity_type_name": "Label"}], "year": 1987, "community": {"status": "Accepted", "rating": {"count": 129, "average": 3.66}, "want": 261, "contributors": [{"username": "memory", "resource_url": "https://api.discogs.com/users/memory"}, {"username": "vargind", "resource_url": "https://api.discogs.com/users/vargind"}, {"username": "alistairk", "resource_url": "https://api.discogs.com/use

In [None]:
sss