# Seção 1 - scraping de uma página sem qualquer proteção

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'https://alura-site-scraping.herokuapp.com/hello-world.php'
response = urlopen(url)
html = response.read()

In [3]:
soup = BeautifulSoup(html, 'html.parser')
soup.find('h1', id='hello-world').get_text()

'Hello World!!!'

In [4]:
soup.find('h1',{'class':'sub-header'}).get_text()

'Curso de Web Scraping'

# Seção 2 - scraping de uma página

In [5]:
from urllib.request import Request
from urllib.error import URLError, HTTPError

#define a url
url = 'https://www.alura.com.br'

In [6]:
#define uma lista de navegadores para acessar a url.
#Onde encontrar:
#1. acessar o devtools do navegador: aba 'Network', sub aba 'Headers'.
#2. atualizar a página a fim de a conexão client-server ser refeita e os dados aparecerem
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64'
}

In [112]:
#método
try:
    req = Request(url, headers=headers)
    response = urlopen(req)
    response.read()
except HTTPError as e:
    print(e.status, e.reason)
except URLError as e:
    print(e.reason)

## Tratamento de string

In [118]:
url = 'https://alura-site-scraping.herokuapp.com/hello-world.php'
response = urlopen(url)
html = response.read()
type(html)

bytes

In [119]:
#split criará uma lista com todas as palavras somente
#o join retornará uma str com espaços entre elas
# o replace remove o espaço entre as tags


def trata_html(html):
    html = html.decode('utf-8')
    html = ' '.join(str(html).split()).replace('> <', '><')
    return html

In [120]:
html = trata_html(html)
html

'<!DOCTYPE html><html lang="pt-br"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><title>Alura Motors</title><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous"><link rel="stylesheet" href="css/styles.css" media="all"><script src="https://code.jquery.com/jquery-1.12.4.js"></script><script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script><script type="text/javascript" src="js/index.js"></script></head><body cz-shortcut-listen="true"><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"><header><nav class="navbar navbar-inverse" style="margin-bottom: 0;"><div class="container" style="margin-bottom: -20

# Seção 3 - intro ao BeautifulSoup

In [138]:
from bs4 import BeautifulSoup

url  = 'https://alura-site-scraping.herokuapp.com/index.php'
response = urlopen(url)
html = response.read()

html = trata_html(html)
#explicita qual parser será utilizado para não deixar a lib escolher
soup = BeautifulSoup(html, 'html.parser')
type(soup)

In [148]:
#atributo prettify() identa a html ???
soup.prettify().encode('utf-8')

b'<!DOCTYPE html>\n<html lang="pt-br">\n <head>\n  <meta charset="utf-8"/>\n  <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>\n  <title>\n   Alura Motors\n  </title>\n  <style>\n   /*Regra para a animacao*/ @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } /*Mudando o tamanho do icone de resposta*/ div.glyphicon { color:#6B8E23; font-size: 38px; } /*Classe que mostra a animacao \'spin\'*/ .loader { border: 16px solid #f3f3f3; border-radius: 50%; border-top: 16px solid #3498db; width: 80px; height: 80px; -webkit-animation: spin 2s linear infinite; animation: spin 2s linear infinite; }\n  </style>\n  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" rel="stylesheet"/>\n  <link href="css/styles.css" media="all" rel="stylesheet"/>\n  <script src="https://code.jquery.com/jque

In [149]:
soup.html.head.title

<title>Alura Motors</title>

In [150]:
soup.h5

<h5 class="modal-title" id="loadingModal_label"><span class="glyphicon glyphicon-refresh"></span>Aguarde... </h5>

In [151]:
soup.title.get_text()

'Alura Motors'

In [152]:
soup.h5.get_text()

'Aguarde... '

In [153]:
#retorna um dict de atributo:conteudo
soup.img.attrs

{'src': 'img/alura-logo.svg',
 'class': ['d-inline-block', 'align-top'],
 'alt': 'Alura'}

In [154]:
soup.img.get('src')

'img/alura-logo.svg'

# Seção 4 - Pesquisando com BeatifulSoup


In [155]:
#find_all retorna uma lista com todas as tags pesquisadas
soup.find_all('img')

[<img alt="Alura" class="d-inline-block align-top" src="img/alura-logo.svg"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/bmw-m2/bmw-m2-2970882__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/alfa/alfa-1823056__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/puech/puech-4055386__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-murcielago/lamborghini-murcielago-2872974__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/

In [156]:
soup.find_all('img', limit=1)

[<img alt="Alura" class="d-inline-block align-top" src="img/alura-logo.svg"/>]

In [157]:
#jeito mais prático de usar o find_all
soup('img')

[<img alt="Alura" class="d-inline-block align-top" src="img/alura-logo.svg"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/bmw-m2/bmw-m2-2970882__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/alfa/alfa-1823056__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/puech/puech-4055386__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-murcielago/lamborghini-murcielago-2872974__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/

In [158]:
soup.find_all(['h1','h2','h3','h4','h5','h6'])

[<h5 class="modal-title" id="loadingModal_label"><span class="glyphicon glyphicon-refresh"></span>Aguarde... </h5>,
 <h4><b id="loadingModal_content"></b></h4>,
 <h1 class="sub-header">Veículos de Luxo Novos e Usados - Todas as Marcas</h1>]

In [159]:
soup('p')

[<p class="txt-name inline">LAMBORGHINI AVENTADOR</p>,
 <p class="txt-category badge badge-secondary inline">USADO</p>,
 <p class="txt-motor">Motor 1.8 16v</p>,
 <p class="txt-description">Ano 1993 - 55.286 km</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-value">R$ 338.000</p>,
 <p class="txt-name inline">BMW M2</p>,
 <p class="txt-category badge badge-secondary inline">USADO</p>,
 <p class="txt-motor">Motor 3.0 32v</p>,
 <p class="txt-description">Ano 2018 - 83.447 km</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-value">R$ 346.000</p>,
 <p class="txt-name inline">ALFA</p>,
 <p class="txt-category badge badge-secondary inline">USADO</p>,
 <p class="txt-motor">Motor 1.8 16v</p>,
 <p class="txt-description">Ano 2004 - 19.722 km</p>,
 <p class="txt-location">Rio de Janeiro - RJ</p>,
 <p class="txt-value">R$ 480.000</p>,
 <p class="txt-name inline">PUECH</p>,
 <p class="txt-category badge badge-secondary inline">USADO</p>,
 <p class="txt-moto

## Utilizando o argumento attributes

In [160]:
soup('p',{'class': 'txt-value'})

[<p class="txt-value">R$ 338.000</p>,
 <p class="txt-value">R$ 346.000</p>,
 <p class="txt-value">R$ 480.000</p>,
 <p class="txt-value">R$ 133.000</p>,
 <p class="txt-value">R$ 175.000</p>,
 <p class="txt-value">R$ 239.000</p>,
 <p class="txt-value">R$ 115.000</p>,
 <p class="txt-value">R$ 114.000</p>,
 <p class="txt-value">R$ 75.000</p>,
 <p class="txt-value">R$ 117.000</p>]

## Buscando por tag

In [161]:
#encontrar 
soup('p',text = 'Belo Horizonte - MG')

[<p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-location">Belo Horizonte - MG</p>,
 <p class="txt-location">Belo Horizonte - MG</p>]

## Utilizando diretamente os atributos

In [162]:
soup('img',alt = 'Foto')

[<img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/bmw-m2/bmw-m2-2970882__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/alfa/alfa-1823056__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/puech/puech-4055386__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-murcielago/lamborghini-murcielago-2872974__340.jpg" width="220"/>,
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/aston-martin/aston-martin-2977916__340.jpg" width="220"/>,
 <img al

In [163]:
for i in soup('img',alt = 'Foto'):
    print(i.get('src'))

https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/bmw-m2/bmw-m2-2970882__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/alfa/alfa-1823056__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/puech/puech-4055386__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-murcielago/lamborghini-murcielago-2872974__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/aston-martin/aston-martin-2977916__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/tvr/tvr-2943925__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/excalibur/excalibur-2916730__340.jpg
https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/mclaren/mclaren-2855240__340.jpg
htt

## Lembando que class é uma palavra reservada do Python,vc pode memso assim usá-la com um undersocre após 

In [164]:
soup('p',class_='txt-value')

[<p class="txt-value">R$ 338.000</p>,
 <p class="txt-value">R$ 346.000</p>,
 <p class="txt-value">R$ 480.000</p>,
 <p class="txt-value">R$ 133.000</p>,
 <p class="txt-value">R$ 175.000</p>,
 <p class="txt-value">R$ 239.000</p>,
 <p class="txt-value">R$ 115.000</p>,
 <p class="txt-value">R$ 114.000</p>,
 <p class="txt-value">R$ 75.000</p>,
 <p class="txt-value">R$ 117.000</p>]

## Lista com todos os textos das tags 

In [165]:
soup(text= True)

['html',
 'Alura Motors',
 " /*Regra para a animacao*/ @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } /*Mudando o tamanho do icone de resposta*/ div.glyphicon { color:#6B8E23; font-size: 38px; } /*Classe que mostra a animacao 'spin'*/ .loader { border: 16px solid #f3f3f3; border-radius: 50%; border-top: 16px solid #3498db; width: 80px; height: 80px; -webkit-animation: spin 2s linear infinite; animation: spin 2s linear infinite; } ",
 ' requisitar("https://alura-scraping.herokuapp.com/produtos?_page="+1+"&_limit=10") ',
 'You need to enable JavaScript to run this app.',
 'Aguarde... ',
 'Motors ',
 'Hello World',
 'Anúncios',
 'Veículos de Luxo Novos e Usados - Todas as Marcas',
 '246 veículos encontrados',
 'Página 1 de 25',
 'LAMBORGHINI AVENTADOR',
 'USADO',
 'Motor 1.8 16v',
 'Ano 1993 - 55.286 km',
 '► 4 X 4',
 '► Câmera de estacionamento',
 '► Controle de tração',
 '► Sensor de estacionamento',
 '...',
 'Belo Horizonte - MG',
 'R$ 338.000',

<html>
    <body>
        <div id="container-a">
            <h1>Título A</h1>
            <h2>Subtítulo A</h2>
            <p>Texto de conteúdo A</p>
        </div>
    </body>
</html>

In [166]:
html_teste = """
    <html>
        <body>
            <div id="container-a">
                <h1>Título A</h1>
                <h2 class="ref-a">Sub título A</h2>
                <p>Texto de conteúdo A</p>
            </div>
            <div id="container-b">
                <h1>Título B</h1>
                <h2 class="ref-b">Sub título B</h2>
                <p>Texto de conteúdo B</p>
            </div>
        </body>
    </html>
"""

In [167]:
html_teste = trata_html(html_teste)
html_teste

AttributeError: 'str' object has no attribute 'decode'

In [None]:
soup = BeautifulSoup(html_teste, parser = 'html.parser')

In [None]:
soup.find('h2')

In [None]:
soup.find('h2').find_parent('div')

In [None]:
soup.find('h2').find_parents()

In [None]:
for i in soup('h2'):
    print(i.find_parents('div'))

# Webscraping do site

In [168]:
response = urlopen('https://alura-site-scraping.herokuapp.com/index.php')
html = response.read().decode('utf-8')
soup = BeautifulSoup(html,'html.parser')
soup

<!DOCTYPE html>

<html lang="pt-br">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<title>Alura Motors</title>
<style>
		/*Regra para a animacao*/
		@keyframes spin {
			0% { transform: rotate(0deg); }
			100% { transform: rotate(360deg); }
		}
		/*Mudando o tamanho do icone de resposta*/
		div.glyphicon {
			color:#6B8E23;
			font-size: 38px;
		}
		/*Classe que mostra a animacao 'spin'*/
		.loader {
			border: 16px solid #f3f3f3;
			border-radius: 50%;
			border-top: 16px solid #3498db;
			width: 80px;
			height: 80px;
			-webkit-animation: spin 2s linear infinite;
			animation: spin 2s linear infinite;
		}
	</style>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" rel="stylesheet"/>
<link href="css/styles.css" media="all" rel="stylesheet"/>
<script src="https://code.jquery

In [169]:
cards =[]
card = {}

In [172]:
anuncio = soup.find('div',{'class': 'well card'})
anuncio

<div class="well card">
<div class="col-md-3 image-card">
<img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>
</div>
<div class="col-md-6 body-card">
<p class="txt-name inline">LAMBORGHINI AVENTADOR</p>
<p class="txt-category badge badge-secondary inline">USADO</p>
<p class="txt-motor">Motor 1.8 16v</p>
<p class="txt-description">Ano 1993 - 55.286 km</p>
<ul class="lst-items">
<li class="txt-items">► 4 X 4</li>
<li class="txt-items">► Câmera de estacionamento</li>
<li class="txt-items">► Controle de tração</li>
<li class="txt-items">► Sensor de estacionamento</li>
<li class="txt-items">...</li>
</ul>
<p class="txt-location">Belo Horizonte - MG</p>
</div>
<div class="col-md-3 value-card">
<div class="value">
<p class="txt-value">R$ 338.000</p>
</div>
</div>
</div>

# Obtendo valor dos anúncios

In [173]:
card['value'] = soup.find('p',{'class': 'txt-value'}).get_text()
card

{'value': 'R$ 338.000'}

## Infos sobre o veículo

In [174]:
infos = anuncio.find('div',{'class': 'body-card'})('p')

In [175]:
for i in infos:
    print(i.get('class'), '--', i.get_text())

['txt-name', 'inline'] -- LAMBORGHINI AVENTADOR
['txt-category', 'badge', 'badge-secondary', 'inline'] -- USADO
['txt-motor'] -- Motor 1.8 16v
['txt-description'] -- Ano 1993 - 55.286 km
['txt-location'] -- Belo Horizonte - MG


In [176]:
for i in infos:
    print(i.get('class')[0].split('-')[-1], '--', i.get_text())

name -- LAMBORGHINI AVENTADOR
category -- USADO
motor -- Motor 1.8 16v
description -- Ano 1993 - 55.286 km
location -- Belo Horizonte - MG


In [177]:
for i in infos:
    card[i.get('class')[0].split('-')[-1]] = i.get_text()
card

{'value': 'R$ 338.000',
 'name': 'LAMBORGHINI AVENTADOR',
 'category': 'USADO',
 'motor': 'Motor 1.8 16v',
 'description': 'Ano 1993 - 55.286 km',
 'location': 'Belo Horizonte - MG'}

In [178]:
#RESUMO
infos = anuncio.find('div',{'class': 'body-card'})('p')
for i in infos:
    card[i.get('class')[0].split('-')[-1]] = i.get_text()

## Obtendo os acessórios

In [185]:
items = anuncio.find('div',{'class': 'body-card'}).ul.find_all('li')
items

[<li class="txt-items">► 4 X 4</li>,
 <li class="txt-items">► Câmera de estacionamento</li>,
 <li class="txt-items">► Controle de tração</li>,
 <li class="txt-items">► Sensor de estacionamento</li>,
 <li class="txt-items">...</li>]

In [186]:
items.pop()

<li class="txt-items">...</li>

In [187]:
items

[<li class="txt-items">► 4 X 4</li>,
 <li class="txt-items">► Câmera de estacionamento</li>,
 <li class="txt-items">► Controle de tração</li>,
 <li class="txt-items">► Sensor de estacionamento</li>]

In [190]:
for i in items:
    print(i.get_text().replace('►',''))

 4 X 4
 Câmera de estacionamento
 Controle de tração
 Sensor de estacionamento


In [194]:
acessorios =[]
for i in items:
    acessorios.append(i.get_text().replace('►',''))
acessorios

[' 4 X 4',
 ' Câmera de estacionamento',
 ' Controle de tração',
 ' Sensor de estacionamento']

In [196]:
card['items'] = acessorios
card

{'value': 'R$ 338.000',
 'name': 'LAMBORGHINI AVENTADOR',
 'category': 'USADO',
 'motor': 'Motor 1.8 16v',
 'description': 'Ano 1993 - 55.286 km',
 'location': 'Belo Horizonte - MG',
 'items': [' 4 X 4',
  ' Câmera de estacionamento',
  ' Controle de tração',
  ' Sensor de estacionamento']}

In [199]:
#RESUMO
items = anuncio.find('div',{'class': 'body-card'}).ul.find_all('li')
items.pop()
acessorios =[]
for i in items:
    acessorios.append(i.get_text().replace('►',''))
card['items'] = acessorios
card

{'value': 'R$ 338.000',
 'name': 'LAMBORGHINI AVENTADOR',
 'category': 'USADO',
 'motor': 'Motor 1.8 16v',
 'description': 'Ano 1993 - 55.286 km',
 'location': 'Belo Horizonte - MG',
 'items': [' 4 X 4',
  ' Câmera de estacionamento',
  ' Controle de tração',
  ' Sensor de estacionamento']}

# Criando um df com os dados coletados

In [201]:
import pandas as pd

In [203]:
dataset = pd.DataFrame(card)
dataset

Unnamed: 0,value,name,category,motor,description,location,items
0,R$ 338.000,LAMBORGHINI AVENTADOR,USADO,Motor 1.8 16v,Ano 1993 - 55.286 km,Belo Horizonte - MG,4 X 4
1,R$ 338.000,LAMBORGHINI AVENTADOR,USADO,Motor 1.8 16v,Ano 1993 - 55.286 km,Belo Horizonte - MG,Câmera de estacionamento
2,R$ 338.000,LAMBORGHINI AVENTADOR,USADO,Motor 1.8 16v,Ano 1993 - 55.286 km,Belo Horizonte - MG,Controle de tração
3,R$ 338.000,LAMBORGHINI AVENTADOR,USADO,Motor 1.8 16v,Ano 1993 - 55.286 km,Belo Horizonte - MG,Sensor de estacionamento


In [205]:
dataset = pd.DataFrame.from_dict(card, orient = 'index')
dataset

Unnamed: 0,0
value,R$ 338.000
name,LAMBORGHINI AVENTADOR
category,USADO
motor,Motor 1.8 16v
description,Ano 1993 - 55.286 km
location,Belo Horizonte - MG
items,"[ 4 X 4, Câmera de estacionamento, Controle ..."


In [206]:
dataset = pd.DataFrame.from_dict(card, orient = 'index').T
dataset

Unnamed: 0,value,name,category,motor,description,location,items
0,R$ 338.000,LAMBORGHINI AVENTADOR,USADO,Motor 1.8 16v,Ano 1993 - 55.286 km,Belo Horizonte - MG,"[ 4 X 4, Câmera de estacionamento, Controle ..."


In [208]:
dataset.to_csv('./output/data/dataset.csv', sep=';',
                 index = False, encoding ='utf-8-sig')

# Coletando as imagens

In [213]:
image = anuncio.find('div',{'class':'image-card'}).img
image

<img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>

In [239]:
print(image.get('src'))

https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg


## visualizando a foto no próprio jupyter

In [241]:
from IPython.core.display import display,HTML

display(HTML(str(image)))

In [243]:
display(HTML("<img src=" + anuncio.find('div',{'class':'image-card'}).img.get('src') + ">"))

## rotina pra salvar as fotos

In [252]:
from urllib.request import urlretrieve

nome_imagem = image.get('src').split('/')[-1]

In [254]:
urlretrieve(image.get('src'), './output/img/' + nome_imagem)

('./output/img/lamborghini-aventador-2932196__340.jpg',
 <http.client.HTTPMessage at 0xd2545b0>)

In [255]:
# RESUMO
from urllib.request import urlretrieve

image = anuncio.find('div',{'class':'image-card'}).img
urlretrieve(image.get('src'), './output/img/' + nome_imagem)


('./output/img/lamborghini-aventador-2932196__340.jpg',
 <http.client.HTTPMessage at 0xd2aa280>)

## coletando infos de todos os cards da página

### identificando as infos

In [268]:
soup.find('div',{'id':'container-cards'}).find_all('div',{'class':'card'})

[<div class="well card">
 <div class="col-md-3 image-card">
 <img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>
 </div>
 <div class="col-md-6 body-card">
 <p class="txt-name inline">LAMBORGHINI AVENTADOR</p>
 <p class="txt-category badge badge-secondary inline">USADO</p>
 <p class="txt-motor">Motor 1.8 16v</p>
 <p class="txt-description">Ano 1993 - 55.286 km</p>
 <ul class="lst-items">
 <li class="txt-items">► 4 X 4</li>
 <li class="txt-items">► Câmera de estacionamento</li>
 <li class="txt-items">► Controle de tração</li>
 <li class="txt-items">► Sensor de estacionamento</li>
 <li class="txt-items">...</li>
 </ul>
 <p class="txt-location">Belo Horizonte - MG</p>
 </div>
 <div class="col-md-3 value-card">
 <div class="value">
 <p class="txt-value">R$ 338.000</p>
 </div>
 </div>
 </div>,
 <div class="well card">
 <div class="col-md-3 image-card">
 <img alt="

In [269]:
len(soup.find('div',{'id':'container-cards'}).find_all('div',{'class':'card'}))

10

In [270]:
anuncios = soup.find('div',{'id':'container-cards'}).find_all('div',{'class':'card'})

In [271]:
for i in anuncios:
    print(str(i)+'\n\n')

<div class="well card">
<div class="col-md-3 image-card">
<img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>
</div>
<div class="col-md-6 body-card">
<p class="txt-name inline">LAMBORGHINI AVENTADOR</p>
<p class="txt-category badge badge-secondary inline">USADO</p>
<p class="txt-motor">Motor 1.8 16v</p>
<p class="txt-description">Ano 1993 - 55.286 km</p>
<ul class="lst-items">
<li class="txt-items">► 4 X 4</li>
<li class="txt-items">► Câmera de estacionamento</li>
<li class="txt-items">► Controle de tração</li>
<li class="txt-items">► Sensor de estacionamento</li>
<li class="txt-items">...</li>
</ul>
<p class="txt-location">Belo Horizonte - MG</p>
</div>
<div class="col-md-3 value-card">
<div class="value">
<p class="txt-value">R$ 338.000</p>
</div>
</div>
</div>


<div class="well card">
<div class="col-md-3 image-card">
<img alt="Foto" height="155" src="ht

# Criando uma rotina de scrapping

In [302]:
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen, urlretrieve

#lista que vai conter todas as infos
cards = []

response = urlopen('https://alura-site-scraping.herokuapp.com/index.php')
html = response.read().decode('utf-8-sig')
soup = BeautifulSoup(html,'html.parser')

# obtendo as tags de interesse
anuncios = soup.find('div',{'id':'container-cards'}).find_all('div',{'class':'card'})

for i in anuncios:
    print(str(i)+'\n\n')

cards=[]
for anuncio in anuncios:
    card = {}
    
    #valor
    card['value'] = anuncio.find('p',{'class': 'txt-value'}).get_text()
    
    #infos
    infos = anuncio.find('div',{'class': 'body-card'})('p')
    for info in infos:
        card[info.get('class')[0].split('-')[-1]] = info.get_text()

    #acessórios
    items = anuncio.find('div',{'class': 'body-card'}).ul.find_all('li')
    items.pop()
    acessorios =[]
    for i in items:
        acessorios.append(i.get_text().replace('►',''))
    card['items'] = acessorios
    
    #adicionando resultados a lista de cards
    cards.append(card)
cards  

#Criando um df com os resultados
dataset = pd.DataFrame(cards)
#exportando para um csv
dataset.to_csv('./output/data/dataset.csv',sep=';', index = False, encoding = 'utf-8-sig')

#obtendo imagens e salvando em pasta
for anuncio in anuncios:
    image = anuncio.find('div',{'class':'image-card'}).img
     urlretrieve(image.get('src'), './output/img/' + image.get('src').split('/')[-1])

# OBTENDO INFOS DE TODAS AS PÁGINAS

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen, urlretrieve

#lista que vai conter todas as infos
cards = []

#qtd de páginas
qtd_pags = int(soup.find('span',{'class':'info-pages'}).get_text().split()[-1])


## iterando por todas as páginas do site

for i in range(qtd_pags):
    #obtendo o html
    response = urlopen('https://alura-site-scraping.herokuapp.com/index.php?page='+str(i+1))
    html = response.read().decode('utf-8-sig') 
    soup = BeautifulSoup(html,'html.parser')    
    

    # obtendo as tags de interesse
    anuncios = soup.find('div',{'id':'container-cards'}).find_all('div',{'class':'card'})

    for i in anuncios:
        print(str(i)+'\n\n')

    cards=[]
    for anuncio in anuncios:
        card = {}

        #valor
        card['value'] = anuncio.find('p',{'class': 'txt-value'}).get_text()

        #infos
        infos = anuncio.find('div',{'class': 'body-card'})('p')
        for info in infos:
            card[info.get('class')[0].split('-')[-1]] = info.get_text()

        #acessórios
        items = anuncio.find('div',{'class': 'body-card'}).ul.find_all('li')
        items.pop()
        acessorios =[]
        for i in items:
            acessorios.append(i.get_text().replace('►',''))
        card['items'] = acessorios

        #adicionando resultados a lista de cards
        cards.append(card)
        #obtendo imagens e salvando em pasta
        for anuncio in anuncios:
            image = anuncio.find('div',{'class':'image-card'}).img
            urlretrieve(image.get('src'), './output/img/' + image.get('src').split('/')[-1])
        
    cards  

    #Criando um df com os resultados
    dataset = pd.DataFrame(cards)
    #exportando para um csv
    dataset.to_csv('./output/data/dataset.csv',sep=';', index = False, encoding = 'utf-8-sig')



<div class="well card">
<div class="col-md-3 image-card">
<img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>
</div>
<div class="col-md-6 body-card">
<p class="txt-name inline">LAMBORGHINI AVENTADOR</p>
<p class="txt-category badge badge-secondary inline">USADO</p>
<p class="txt-motor">Motor 1.8 16v</p>
<p class="txt-description">Ano 1993 - 55.286 km</p>
<ul class="lst-items">
<li class="txt-items">► 4 X 4</li>
<li class="txt-items">► Câmera de estacionamento</li>
<li class="txt-items">► Controle de tração</li>
<li class="txt-items">► Sensor de estacionamento</li>
<li class="txt-items">...</li>
</ul>
<p class="txt-location">Belo Horizonte - MG</p>
</div>
<div class="col-md-3 value-card">
<div class="value">
<p class="txt-value">R$ 338.000</p>
</div>
</div>
</div>


<div class="well card">
<div class="col-md-3 image-card">
<img alt="Foto" height="155" src="ht

<div class="well card">
<div class="col-md-3 image-card">
<img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/volkswagen-beetle/volkswagen-beetle-3295906__340.png" width="220"/>
</div>
<div class="col-md-6 body-card">
<p class="txt-name inline">VOLKSWAGEN BEETLE</p>
<p class="txt-category badge badge-secondary inline">NOVO</p>
<p class="txt-motor">Motor 1.0 8v</p>
<p class="txt-description">Ano 2019 - 0 km</p>
<ul class="lst-items">
<li class="txt-items">► Teto panorâmico</li>
<li class="txt-items">► Controle de tração</li>
<li class="txt-items">► Sensor crepuscular</li>
<li class="txt-items">► 4 X 4</li>
<li class="txt-items">...</li>
</ul>
<p class="txt-location">Rio de Janeiro - RJ</p>
</div>
<div class="col-md-3 value-card">
<div class="value">
<p class="txt-value">R$ 177.000</p>
</div>
</div>
</div>


<div class="well card">
<div class="col-md-3 image-card">
<img alt="Foto" height="155" src="https://caelum-online-public.s3.amaz