# Webscraping

In [1]:
#Versões dos pacotes
import bs4
import urllib.request as urllib_request
import pandas

print("BeautifulSoup ->", bs4.__version__)
print("urllib ->", urllib_request.__version__)
print("pandas ->", pandas.__version__)

BeautifulSoup -> 4.9.3
urllib -> 3.8
pandas -> 1.1.3


###  Primeiro Scraping

In [2]:
#importando funções dos pacotes
from urllib.request import urlopen
from bs4 import BeautifulSoup

#Pegar o link para teste e depois le o html do link
#Mas vem tudo desorganizado
url = "https://alura-site-scraping.herokuapp.com/hello-world.php"
response = urlopen(url)
html = response.read()

#Deixa bonitinho o html aqui
soup = BeautifulSoup(html, 'html.parser')

#Procura a parte importante e pega o texto
print(soup.find('h1', id = 'hello-world').get_text())

#Pegando a outra parte agora
#pega a primeita tag p
print(soup.find('p').get_text())

#class -> sub-header
print(soup.find('h1', {'class': 'sub-header'}).get_text())

Hello World!!!
Web Scraping é o termo utilizado para definir a prática de coletar automaticamente informações na Internet. Isto é feito, geralmente, por meio de programas que simulam a navegação humana na Web.
Curso de Web Scraping


### Web Scraping 

In [3]:
#A página anterior tem acesso completamente liberado
#Logo não há problema

from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

url = "https://www.alura.com.br/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36"}

try:
    req = Request(url, headers = headers)
    response = urlopen(req)
    response.read()

#retorna se o erro na página
except HTTPError as e:
        print(e.status, e.reason)

#retorna se o erro for na URL
except URLError as e:
        print(e.status, e.reason)

### Trabalhando com strings

In [4]:
url = "https://alura-site-scraping.herokuapp.com/index.php"

response = urlopen(url)
html = response.read()
html
#\r,\n,\t tabulações
#caracteres especiais pq está em português

b'<!DOCTYPE html>\r\n<html lang="pt-br">\r\n<head>\r\n    <meta charset="utf-8">\r\n    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">\r\n\r\n    <title>Alura Motors</title>\r\n\r\n\t<style>\r\n\t\t/*Regra para a animacao*/\r\n\t\t@keyframes spin {\r\n\t\t\t0% { transform: rotate(0deg); }\r\n\t\t\t100% { transform: rotate(360deg); }\r\n\t\t}\r\n\t\t/*Mudando o tamanho do icone de resposta*/\r\n\t\tdiv.glyphicon {\r\n\t\t\tcolor:#6B8E23;\r\n\t\t\tfont-size: 38px;\r\n\t\t}\r\n\t\t/*Classe que mostra a animacao \'spin\'*/\r\n\t\t.loader {\r\n\t\t\tborder: 16px solid #f3f3f3;\r\n\t\t\tborder-radius: 50%;\r\n\t\t\tborder-top: 16px solid #3498db;\r\n\t\t\twidth: 80px;\r\n\t\t\theight: 80px;\r\n\t\t\t-webkit-animation: spin 2s linear infinite;\r\n\t\t\tanimation: spin 2s linear infinite;\r\n\t\t}\r\n\t</style>\r\n\t<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuH

In [5]:
type(html)

bytes

In [6]:
#resolvendo o caso de caracteres especiais
html = html.decode('utf-8')

type(html)

str

In [7]:
#Espaço entre cada palavra
html.split()

['<!DOCTYPE',
 'html>',
 '<html',
 'lang="pt-br">',
 '<head>',
 '<meta',
 'charset="utf-8">',
 '<meta',
 'name="viewport"',
 'content="width=device-width,',
 'initial-scale=1,',
 'shrink-to-fit=no">',
 '<title>Alura',
 'Motors</title>',
 '<style>',
 '/*Regra',
 'para',
 'a',
 'animacao*/',
 '@keyframes',
 'spin',
 '{',
 '0%',
 '{',
 'transform:',
 'rotate(0deg);',
 '}',
 '100%',
 '{',
 'transform:',
 'rotate(360deg);',
 '}',
 '}',
 '/*Mudando',
 'o',
 'tamanho',
 'do',
 'icone',
 'de',
 'resposta*/',
 'div.glyphicon',
 '{',
 'color:#6B8E23;',
 'font-size:',
 '38px;',
 '}',
 '/*Classe',
 'que',
 'mostra',
 'a',
 'animacao',
 "'spin'*/",
 '.loader',
 '{',
 'border:',
 '16px',
 'solid',
 '#f3f3f3;',
 'border-radius:',
 '50%;',
 'border-top:',
 '16px',
 'solid',
 '#3498db;',
 'width:',
 '80px;',
 'height:',
 '80px;',
 '-webkit-animation:',
 'spin',
 '2s',
 'linear',
 'infinite;',
 'animation:',
 'spin',
 '2s',
 'linear',
 'infinite;',
 '}',
 '</style>',
 '<link',
 'rel="stylesheet"',
 'hre

In [8]:
" ".join(html.split())

'<!DOCTYPE html> <html lang="pt-br"> <head> <meta charset="utf-8"> <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> <title>Alura Motors</title> <style> /*Regra para a animacao*/ @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } /*Mudando o tamanho do icone de resposta*/ div.glyphicon { color:#6B8E23; font-size: 38px; } /*Classe que mostra a animacao \'spin\'*/ .loader { border: 16px solid #f3f3f3; border-radius: 50%; border-top: 16px solid #3498db; width: 80px; height: 80px; -webkit-animation: spin 2s linear infinite; animation: spin 2s linear infinite; } </style> <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous"> <link rel="stylesheet" href="css/styles.css" media="all"> <script src="https://code.jquery.com/jquery-1.12.4.js"></script> <script src="https://

In [9]:
" ".join(html.split()).replace("> <","><")

'<!DOCTYPE html><html lang="pt-br"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><title>Alura Motors</title><style> /*Regra para a animacao*/ @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } /*Mudando o tamanho do icone de resposta*/ div.glyphicon { color:#6B8E23; font-size: 38px; } /*Classe que mostra a animacao \'spin\'*/ .loader { border: 16px solid #f3f3f3; border-radius: 50%; border-top: 16px solid #3498db; width: 80px; height: 80px; -webkit-animation: spin 2s linear infinite; animation: spin 2s linear infinite; } </style><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous"><link rel="stylesheet" href="css/styles.css" media="all"><script src="https://code.jquery.com/jquery-1.12.4.js"></script><script src="https://maxcdn.boo

In [10]:
#Função que faz tudo
#split transforma os \n e \t em espaços em branco (sem argumento)

def trata_html(input):
    return " ".join(input.split()).replace("> <","><")

html = trata_html(html)

In [11]:
#Como funciona a manipulação de strings
text = "'O\ns@po\nn#o\tl@v@\to\npé'"

print(text)
print(" ".join(text))
print(" ".join(text.split()))
print(" ".join(text.split()).replace('@', 'a').replace('#', 'ã'))

'O
s@po
n#o	l@v@	o
pé'
' O 
 s @ p o 
 n # o 	 l @ v @ 	 o 
 p é '
'O s@po n#o l@v@ o pé'
'O sapo não lava o pé'


### HTML
HTML é uma linguagem de demarcação
e dentro dela nos temos tags <>
<img> imagem
<table> tabela
<p> parágrafo
<a> hiperlink

### BeautifulSoup

In [12]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')

soup

<!DOCTYPE html>
<html lang="pt-br"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/><title>Alura Motors</title><style> /*Regra para a animacao*/ @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } /*Mudando o tamanho do icone de resposta*/ div.glyphicon { color:#6B8E23; font-size: 38px; } /*Classe que mostra a animacao 'spin'*/ .loader { border: 16px solid #f3f3f3; border-radius: 50%; border-top: 16px solid #3498db; width: 80px; height: 80px; -webkit-animation: spin 2s linear infinite; animation: spin 2s linear infinite; } </style><link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" rel="stylesheet"/><link href="css/styles.css" media="all" rel="stylesheet"/><script src="https://code.jquery.com/jquery-1.12.4.js"></script><script crossorigin="anonymou

In [13]:
type(soup)

bs4.BeautifulSoup

In [14]:
#Deixa bonito
print(soup.prettify())

<!DOCTYPE html>
<html lang="pt-br">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
  <title>
   Alura Motors
  </title>
  <style>
   /*Regra para a animacao*/ @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } /*Mudando o tamanho do icone de resposta*/ div.glyphicon { color:#6B8E23; font-size: 38px; } /*Classe que mostra a animacao 'spin'*/ .loader { border: 16px solid #f3f3f3; border-radius: 50%; border-top: 16px solid #3498db; width: 80px; height: 80px; -webkit-animation: spin 2s linear infinite; animation: spin 2s linear infinite; }
  </style>
  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" rel="stylesheet"/>
  <link href="css/styles.css" media="all" rel="stylesheet"/>
  <script src="https://code.jquery.com/jquery-1.12.4.js">
  

### Acessando as tags

In [18]:
#Precisa ir na ordem
print(soup.html.head.title)

#ou pode fazer assim se for o primeiro
print(soup.title)

<title>Alura Motors</title>
<title>Alura Motors</title>


In [22]:
#Se eu quiser pegar a divisoria 5
soup.div.div.div.div.h5

<h5 class="modal-title" id="loadingModal_label"><span class="glyphicon glyphicon-refresh"></span>Aguarde... </h5>

### Acessando o conteúdo das tags

In [24]:
soup.html.head.title.get_text()

'Alura Motors'

In [26]:
soup.div.div.div.div.h5.getText()

'Aguarde... '

In [29]:
soup.get_text(separator=' || ', strip=True).split(' || ')

['Alura Motors',
 'You need to enable JavaScript to run this app.',
 'Aguarde...',
 'Motors',
 'Hello World',
 'Anúncios',
 'Veículos de Luxo Novos e Usados - Todas as Marcas',
 '246 veículos encontrados',
 'Página 1 de 25',
 'LAMBORGHINI AVENTADOR',
 'USADO',
 'Motor 1.8 16v',
 'Ano 1993 - 55.286 km',
 '► 4 X 4',
 '► Câmera de estacionamento',
 '► Controle de tração',
 '► Sensor de estacionamento',
 '...',
 'Belo Horizonte - MG',
 'R$ 338.000',
 'BMW M2',
 'USADO',
 'Motor 3.0 32v',
 'Ano 2018 - 83.447 km',
 '► Câmera de estacionamento',
 '► Controle de estabilidade',
 '► Travas elétricas',
 '► Freios ABS',
 '...',
 'Belo Horizonte - MG',
 'R$ 346.000',
 'ALFA',
 'USADO',
 'Motor 1.8 16v',
 'Ano 2004 - 19.722 km',
 '► Central multimídia',
 '► Bancos de couro',
 '► Rodas de liga',
 '► Câmera de estacionamento',
 '...',
 'Rio de Janeiro - RJ',
 'R$ 480.000',
 'PUECH',
 'USADO',
 'Motor Diesel V8',
 'Ano 1992 - 34.335 km',
 '► Bancos de couro',
 '► Freios ABS',
 '► Rodas de liga',
 '► Câ

### Acessando os atributos de uma tag

In [30]:
#Primeira imagem
soup.img

<img alt="Alura" class="d-inline-block align-top" src="img/alura-logo.svg"/>

In [31]:
#atributos
soup.img.attrs

{'src': 'img/alura-logo.svg',
 'class': ['d-inline-block', 'align-top'],
 'alt': 'Alura'}

In [32]:
#só as chaves
soup.img.attrs.keys

<function dict.keys>

In [33]:
#só os valores
soup.img.attrs.values()

dict_values(['img/alura-logo.svg', ['d-inline-block', 'align-top'], 'Alura'])

In [34]:
soup.img.get('src')

'img/alura-logo.svg'