In [15]:
from urllib.request import urlopen

In [16]:
# Download the sample page and print its raw bytes.
# The with-block closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open until garbage collection.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


## Introdução ao BeautifulSoup

In [25]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [31]:
# Fetch the sample page and parse it (the book names the parsed object `bs`).
# The with-block closes the HTTP response once its body has been consumed.
with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:
    # NOTE(review): BeautifulSoup is documented to accept an open file-like
    # object as well as a string, so passing `html` without .read() should
    # also work - confirm before relying on it.
    soup = BeautifulSoup(html.read(), 'html.parser')

# Last expression of the cell: displays the first <h1> found in the document.
soup.h1

<h1>An Interesting Title</h1>

Qualquer uma das expressões abaixo retornaria o mesmo resultado

In [32]:
# Explicit path: <html> -> <body> -> <h1>.
soup.html.body.h1

<h1>An Interesting Title</h1>

In [33]:
# Same tag, starting the navigation at <body>.
soup.body.h1

<h1>An Interesting Title</h1>

In [34]:
# Same tag, skipping the <body> level.
soup.html.h1

<h1>An Interesting Title</h1>

Tratando exceção

In [35]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

# Demonstrates the two failure modes of urlopen() using a host that does not
# exist. HTTPError must be listed first because it is a subclass of URLError.
try:
    html = urlopen("https://pythonscrapingthisurldoesnotexist.com")
except HTTPError:
    # Return None, break, or fall back to some other "Plan B" here.
    print("The server returned an HTTP error")
except URLError:
    # Return None, break, or fall back to some other "Plan B" here.
    print("The server could not be found!")
else:
    # The program continues. Note: if the exception handlers return or break,
    # this "else" clause is not needed.
    # The with-block closes the response after its body has been read.
    with html:
        print(html.read())

The server could not be found!


In [36]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup


def getTitle(url):
    """
    Return the first <h1> inside the page's <body>, or None on failure.

    Args:
        url: Address of the page to fetch; passed unchanged to urlopen().

    Returns:
        The value of ``bsObj.body.h1`` for the parsed page, or None when the
        page could not be fetched (HTTP error status, or the server could
        not be reached) or when the document has no <body> / no <h1>.
    """
    try:
        # The with-block closes the response as soon as the body is read.
        with urlopen(url) as response:
            markup = response.read()
    except (HTTPError, URLError):
        # HTTPError: the server answered with an error status.
        # URLError: the request never got an answer (e.g. unknown host).
        # HTTPError is a subclass of URLError; both are named for clarity.
        return None
    try:
        # NOTE(review): the "lxml" parser is a separate install from bs4 -
        # confirm it is available in the target environment.
        bsObj = BeautifulSoup(markup, "lxml")
        title = bsObj.body.h1
    except AttributeError:
        # bsObj.body is None when the document has no <body> tag, so the
        # attribute access above fails; treat that as "no title".
        return None
    return title


# Look up the heading of the sample page and report the outcome.
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
# Use an identity check for None: `==` would go through the returned
# object's own equality logic, while `is` tests for the None singleton.
if title is None:
    print("Title could not be found")
else:
    print(title)

<h1>An Interesting Title</h1>


#### Reutilizar código é inteligente.

Ter funções genéricas como o getSiteHTML ou getTitle facilita fazer uma coleta de dados da web de forma rápida e confiável.

In [37]:
# Disabled scratch code kept as a bare string literal: nothing inside it is
# executed, and the cell only echoes the string back as its output.
# NOTE(review): in the first snippet the `headers` assignment is commented out
# while `Request(url, headers = headers)` still uses it, so re-enabling the
# snippet as-is would raise NameError unless `headers` is defined elsewhere.
"""
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

url = 'https://www.alura.com.br'
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}

try:
    req = Request(url, headers = headers)
    response = urlopen(req)
    print(response.read())
    
except HTTPError as e:
    print(e.status, e.reason)
    
except URLError as e:
    print(e.reason)
    

#############  
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'https://alura-site-scraping.herokuapp.com/hello-world.php'

response = urlopen(url)
html = response.read()

soup = BeautifulSoup(html, 'html.parser')

print(soup.find('h1', id="hello-world").get_text())
print(soup.find('p').get_text())
"""

'\nfrom urllib.request import Request, urlopen\nfrom urllib.error import URLError, HTTPError\n\nurl = \'https://www.alura.com.br\'\n#headers = {\'User-Agent\': \'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36\'}\n\ntry:\n    req = Request(url, headers = headers)\n    response = urlopen(req)\n    print(response.read())\n    \nexcept HTTPError as e:\n    print(e.status, e.reason)\n    \nexcept URLError as e:\n    print(e.reason)\n    \n\n#############  \nfrom urllib.request import urlopen\nfrom bs4 import BeautifulSoup\n\nurl = \'https://alura-site-scraping.herokuapp.com/hello-world.php\'\n\nresponse = urlopen(url)\nhtml = response.read()\n\nsoup = BeautifulSoup(html, \'html.parser\')\n\nprint(soup.find(\'h1\', id="hello-world").get_text())\nprint(soup.find(\'p\').get_text())\n'