# Seção 1 - scraping de uma página sem qualquer proteção

In [54]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'https://alura-site-scraping.herokuapp.com/hello-world.php'
response = urlopen(url)
html = response.read()

In [55]:
soup = BeautifulSoup(html, 'html.parser')
soup.find('h1', id='hello-world').get_text()

'Hello World!!!'

In [56]:
soup.find('h1',{'class':'sub-header'}).get_text()

'Curso de Web Scraping'

# Seção 2 - scraping de uma página

In [57]:
from urllib.request import Request
from urllib.error import URLError, HTTPError

#define a url
url = 'https://www.alura.com.br'

In [58]:
#define uma lista de navegadores para acessar a url.
#Onde encontrar:
#1. acessar o devtools do navegador: aba 'Network', sub aba 'Headers'.
#2. atualizar a página a fim de a conexão client-server ser refeita e os dados aparecerem
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64'
}

In [59]:
#método
try:
    req = Request(url, headers=headers)
    response = urlopen(req)
    response.read()
except HTTPError as e:
    print(e.status, e.reason)
except URLError as e:
    print(e.reason)

## Tratamento de string

In [60]:
url = 'https://alura-site-scraping.herokuapp.com/hello-world.php'
response = urlopen(url)
html = response.read()
type(html)

bytes

In [117]:
#split criará uma lista com todas as palavras somente
#o join retornará uma str com espaços entre elas
# o replace remove o espaço entre as tags


def trata_html(html):
    html = str.encode(html)
    html = html.decode('utf-8')
    return ' '.join(html.split()).replace('> <', '><')

In [118]:
html = trata_html(html)
html

'<!DOCTYPE html><html lang="pt-br"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><title>Alura Motors</title><style> /*Regra para a animacao*/ @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } /*Mudando o tamanho do icone de resposta*/ div.glyphicon { color:#6B8E23; font-size: 38px; } /*Classe que mostra a animacao \'spin\'*/ .loader { border: 16px solid #f3f3f3; border-radius: 50%; border-top: 16px solid #3498db; width: 80px; height: 80px; -webkit-animation: spin 2s linear infinite; animation: spin 2s linear infinite; } </style><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous"><link rel="stylesheet" href="css/styles.css" media="all"><script src="https://code.jquery.com/jquery-1.12.4.js"></script><script src="https://maxcdn.boo

# Seção 3 - intro ao BeautifulSoup

In [119]:
from bs4 import BeautifulSoup

url  = 'https://alura-site-scraping.herokuapp.com/index.php'
response = urlopen(url)
html = response.read()
html = trata_html(html)
#explicita qual parser será utilizado para não deixar a lib escolher
soup = BeautifulSoup(html, 'html.parser')
type(soup)

TypeError: descriptor 'encode' for 'str' objects doesn't apply to a 'bytes' object

In [None]:
#atributo prettify() identa a html
soup.prettify()

In [None]:
soup.html.head.title

In [None]:
soup.h5

In [None]:
soup.title.get_text()

In [120]:
soup.h5.get_text()

'Aguarde... '

In [121]:
#retorna um dict de atributo:conteudo
soup.img.attrs

{'src': 'img/alura-logo.svg',
 'class': ['d-inline-block', 'align-top'],
 'alt': 'Alura'}

In [122]:
soup.img.get('src')

'img/alura-logo.svg'

# Seção 4 - Pesquisando com BeatifulSoup


In [123]:
#find_all retorna uma lista com todas as tags pesquisadas
soup.find_all('img')

[<img alt="Alura" class="d-inline-block align-top" src="img/alura-logo.svg"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/bmw-m2/bmw-m2-2970882__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/alfa/alfa-1823056__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/puech/puech-4055386__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-murcielago/lamborghini-murcielago-2872974__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/

In [124]:
soup.find_all('img', limit=1)

[<img alt="Alura" class="d-inline-block align-top" src="img/alura-logo.svg"/>]

In [125]:
#jeito mais prático de usar o find_all
soup('img')

[<img alt="Alura" class="d-inline-block align-top" src="img/alura-logo.svg"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/bmw-m2/bmw-m2-2970882__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/alfa/alfa-1823056__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/puech/puech-4055386__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-murcielago/lamborghini-murcielago-2872974__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/

In [126]:
soup.find_all(['h1','h2','h3','h4','h5','h6'])

[<h5 class="modal-title" id="loadingModal_label"><span class="glyphicon glyphicon-refresh"></span>Aguarde... </h5>,
 <h4><b id="loadingModal_content"></b></h4>,
 <h1 class="sub-header">Veículos de Luxo Novos e Usados - Todas as Marcas</h1>]

In [127]:
soup('p')

[<p class="txt-name inline">LAMBORGHINI AVENTADOR</p>,
 <p class="txt-category badge badge-secondary inline">USADO</p>,
 <p class="txt-motor">Motor 1.8 16v</p>,
 <p class="txt-description">Ano 1993 - 55.286 km</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-value">R$ 338.000</p>,
 <p class="txt-name inline">BMW M2</p>,
 <p class="txt-category badge badge-secondary inline">USADO</p>,
 <p class="txt-motor">Motor 3.0 32v</p>,
 <p class="txt-description">Ano 2018 - 83.447 km</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-value">R$ 346.000</p>,
 <p class="txt-name inline">ALFA</p>,
 <p class="txt-category badge badge-secondary inline">USADO</p>,
 <p class="txt-motor">Motor 1.8 16v</p>,
 <p class="txt-description">Ano 2004 - 19.722 km</p>,
 <p class="txt-location">Rio de Janeiro - RJ</p>,
 <p class="txt-value">R$ 480.000</p>,
 <p class="txt-name inline">PUECH</p>,
 <p class="txt-category badge badge-secondary inline">USADO</p>,
 <p class="txt-moto

## Utilizando o argumento attributes

In [128]:
soup('p',{'class': 'txt-value'})

[<p class="txt-value">R$ 338.000</p>,
 <p class="txt-value">R$ 346.000</p>,
 <p class="txt-value">R$ 480.000</p>,
 <p class="txt-value">R$ 133.000</p>,
 <p class="txt-value">R$ 175.000</p>,
 <p class="txt-value">R$ 239.000</p>,
 <p class="txt-value">R$ 115.000</p>,
 <p class="txt-value">R$ 114.000</p>,
 <p class="txt-value">R$ 75.000</p>,
 <p class="txt-value">R$ 117.000</p>]

## Buscando por tag

In [129]:
#encontrar 
soup('p',text = 'Belo Horizonte - MG')

[<p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-location">Belo Horizonte - MG</p>]

## Utilizando diretamente os atributos

In [130]:
soup('img',alt = 'Foto')

[<img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/bmw-m2/bmw-m2-2970882__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/alfa/alfa-1823056__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/puech/puech-4055386__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-murcielago/lamborghini-murcielago-2872974__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/aston-martin/aston-martin-2977916__340.jpg" width="220"/>,
 <img al

In [131]:
for i in soup('img',alt = 'Foto'):
    print(i.get('src'))

https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/bmw-m2/bmw-m2-2970882__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/alfa/alfa-1823056__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/puech/puech-4055386__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-murcielago/lamborghini-murcielago-2872974__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/aston-martin/aston-martin-2977916__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/tvr/tvr-2943925__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/excalibur/excalibur-2916730__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/mclaren/mclaren-2855240__340.jpg
htt

## Lembando que class é uma palavra reservada do Python,vc pode memso assim usá-la com um undersocre após 

In [132]:
soup('p',class_='txt-value')

[<p class="txt-value">R$ 338.000</p>,
 <p class="txt-value">R$ 346.000</p>,
 <p class="txt-value">R$ 480.000</p>,
 <p class="txt-value">R$ 133.000</p>,
 <p class="txt-value">R$ 175.000</p>,
 <p class="txt-value">R$ 239.000</p>,
 <p class="txt-value">R$ 115.000</p>,
 <p class="txt-value">R$ 114.000</p>,
 <p class="txt-value">R$ 75.000</p>,
 <p class="txt-value">R$ 117.000</p>]

## Lista com todos os textos das tags 

In [133]:
soup(text= True)

['html',
 'Alura Motors',
 " /*Regra para a animacao*/ @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } /*Mudando o tamanho do icone de resposta*/ div.glyphicon { color:#6B8E23; font-size: 38px; } /*Classe que mostra a animacao 'spin'*/ .loader { border: 16px solid #f3f3f3; border-radius: 50%; border-top: 16px solid #3498db; width: 80px; height: 80px; -webkit-animation: spin 2s linear infinite; animation: spin 2s linear infinite; } ",
 ' requisitar("https://alura-scraping.herokuapp.com/produtos?_page="+1+"&_limit=10") ',
 'You need to enable JavaScript to run this app.',
 'Aguarde... ',
 'Motors ',
 'Hello World',
 'Anúncios',
 'Veículos de Luxo Novos e Usados - Todas as Marcas',
 '246 veículos encontrados',
 'Página 1 de 25',
 'LAMBORGHINI AVENTADOR',
 'USADO',
 'Motor 1.8 16v',
 'Ano 1993 - 55.286 km',
 '► 4 X 4',
 '► Câmera de estacionamento',
 '► Controle de tração',
 '► Sensor de estacionamento',
 '...',
 'Belo Horizonte - MG',
 'R$ 338.000',

<html>
    <body>
        <div id="container-a">
            <h1>Título A</h1>
            <h2>Subtítulo A</h2>
            <p>Texto de conteúdo A</p>
        </div>
    </body>
</html>

In [147]:
html_teste = """
    <html>
        <body>
            <div id="container-a">
                <h1>Título A</h1>
                <h2 class="ref-a">Sub título A</h2>
                <p>Texto de conteúdo A</p>
            </div>
            <div id="container-b">
                <h1>Título B</h1>
                <h2 class="ref-b">Sub título B</h2>
                <p>Texto de conteúdo B</p>
            </div>
        </body>
    </html>
"""

In [148]:
html_teste = trata_html(html_teste)
html_teste

'<html><body><div id="container-a"><h1>Título A</h1><h2 class="ref-a">Sub título A</h2><p>Texto de conteúdo A</p></div><div id="container-b"><h1>Título B</h1><h2 class="ref-b">Sub título B</h2><p>Texto de conteúdo B</p></div></body></html>'

In [150]:
soup = BeautifulSoup(html_teste, parser = 'html.parser')

In [152]:
soup.find('h2')

<h2 class="ref-a">Sub título A</h2>

In [156]:
soup.find('h2').find_parent('div')

<div id="container-a"><h1>Título A</h1><h2 class="ref-a">Sub título A</h2><p>Texto de conteúdo A</p></div>

In [157]:
soup.find('h2').find_parents()

[<div id="container-a"><h1>Título A</h1><h2 class="ref-a">Sub título A</h2><p>Texto de conteúdo A</p></div>,
 <body><div id="container-a"><h1>Título A</h1><h2 class="ref-a">Sub título A</h2><p>Texto de conteúdo A</p></div><div id="container-b"><h1>Título B</h1><h2 class="ref-b">Sub título B</h2><p>Texto de conteúdo B</p></div></body>,
 <html><body><div id="container-a"><h1>Título A</h1><h2 class="ref-a">Sub título A</h2><p>Texto de conteúdo A</p></div><div id="container-b"><h1>Título B</h1><h2 class="ref-b">Sub título B</h2><p>Texto de conteúdo B</p></div></body></html>,
 <html><body><div id="container-a"><h1>Título A</h1><h2 class="ref-a">Sub título A</h2><p>Texto de conteúdo A</p></div><div id="container-b"><h1>Título B</h1><h2 class="ref-b">Sub título B</h2><p>Texto de conteúdo B</p></div></body></html>]

In [162]:
for i in soup('h2'):
    print(i.find_parents('div'))

[<div id="container-a"><h1>Título A</h1><h2 class="ref-a">Sub título A</h2><p>Texto de conteúdo A</p></div>]
[<div id="container-b"><h1>Título B</h1><h2 class="ref-b">Sub título B</h2><p>Texto de conteúdo B</p></div>]


# Webscraping do site

In [165]:
response = urlopen('https://alura-site-scraping.herokuapp.com/index.php')
html = response.read().decode('utf-8')
soup = BeautifulSoup(html,'html.parser')
soup

<!DOCTYPE html>

<html lang="pt-br">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<title>Alura Motors</title>
<style>
		/*Regra para a animacao*/
		@keyframes spin {
			0% { transform: rotate(0deg); }
			100% { transform: rotate(360deg); }
		}
		/*Mudando o tamanho do icone de resposta*/
		div.glyphicon {
			color:#6B8E23;
			font-size: 38px;
		}
		/*Classe que mostra a animacao 'spin'*/
		.loader {
			border: 16px solid #f3f3f3;
			border-radius: 50%;
			border-top: 16px solid #3498db;
			width: 80px;
			height: 80px;
			-webkit-animation: spin 2s linear infinite;
			animation: spin 2s linear infinite;
		}
	</style>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" rel="stylesheet"/>
<link href="css/styles.css" media="all" rel="stylesheet"/>
<script src="https://code.jquery

In [167]:
cards =[]
card = {}

In [171]:
anuncio = soup.find('div',{'class': 'well card'})
anuncio

<div class="well card">
<div class="col-md-3 image-card">
<img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>
</div>
<div class="col-md-6 body-card">
<p class="txt-name inline">LAMBORGHINI AVENTADOR</p>
<p class="txt-category badge badge-secondary inline">USADO</p>
<p class="txt-motor">Motor 1.8 16v</p>
<p class="txt-description">Ano 1993 - 55.286 km</p>
<ul class="lst-items">
<li class="txt-items">► 4 X 4</li>
<li class="txt-items">► Câmera de estacionamento</li>
<li class="txt-items">► Controle de tração</li>
<li class="txt-items">► Sensor de estacionamento</li>
<li class="txt-items">...</li>
</ul>
<p class="txt-location">Belo Horizonte - MG</p>
</div>
<div class="col-md-3 value-card">
<div class="value">
<p class="txt-value">R$ 338.000</p>
</div>
</div>
</div>

# Obtendo valor dos anúncios

In [181]:
card['value'] = soup.find('p',{'class': 'txt-value'}).get_text()
card

{'value': 'R$ 338.000'}

## Infos sobre o veículo

In [187]:
infos = anuncio.find('div',{'class': 'body-card'})('p')

In [195]:
for i in infos:
    print(i.get('class'), '--', i.get_text())

['txt-name', 'inline'] -- LAMBORGHINI AVENTADOR
['txt-category', 'badge', 'badge-secondary', 'inline'] -- USADO
['txt-motor'] -- Motor 1.8 16v
['txt-description'] -- Ano 1993 - 55.286 km
['txt-location'] -- Belo Horizonte - MG


In [203]:
for i in infos:
    print(i.get('class')[0].split('-')[-1], '--', i.get_text())

name -- LAMBORGHINI AVENTADOR
category -- USADO
motor -- Motor 1.8 16v
description -- Ano 1993 - 55.286 km
location -- Belo Horizonte - MG


In [207]:
for i in infos:
    card[i.get('class')[0].split('-')[-1]] = i.get_text()
card

{'value': 'R$ 338.000',
 'name': 'LAMBORGHINI AVENTADOR',
 'category': 'USADO',
 'motor': 'Motor 1.8 16v',
 'description': 'Ano 1993 - 55.286 km',
 'location': 'Belo Horizonte - MG'}

In [211]:
#RESUMO
infos = anuncio.find('div',{'class': 'body-card'})('p')
for i in infos:
    card[i.get('class')[0].split('-')[-1]] = i.get_text()