# Building a scraper

In [None]:
__author__ = "Pierre Nugues"

#### Using `requests`

In [None]:
import requests

url_en = 'https://en.wikipedia.org/wiki/Aristotle'
url_fr = 'https://fr.wikipedia.org/wiki/Aristote'
# Without a timeout, requests waits indefinitely when the server does not answer.
response = requests.get(url_en, timeout=10)
# Stop on an HTTP error (4xx/5xx) rather than printing an error page as if it
# were the article.
response.raise_for_status()
html_doc = response.text
# Show only the first 2000 characters of the page source.
print(html_doc[:2000])

## Parsing HTML and a Wikipedia page

#### We import the modules

In [None]:
import bs4
import requests
from urllib.parse import urljoin

#### We load a page and parse it

In [None]:
url_en = 'https://en.wikipedia.org/wiki/Aristotle'
# Without a timeout, requests waits indefinitely when the server does not answer.
response = requests.get(url_en, timeout=10)
# Stop on an HTTP error (4xx/5xx) rather than parsing an error page.
response.raise_for_status()
html_doc = response.text
# Build the parse tree with Python's built-in HTML parser.
parse_tree = bs4.BeautifulSoup(html_doc, 'html.parser')

#### We extract elements

In [None]:
# Look up the first <title> element of the parsed page.
parse_tree.find('title')
# Example output: <title>Aristotle - Wikipedia, the free encyclopedia</title>

In [None]:
# Text content of the <title> element, without the surrounding markup.
parse_tree.title.get_text()
# Example output: Aristotle - Wikipedia, the free encyclopedia

In [None]:
#### We extract header 1

In [None]:
# Text content of the first <h1> element of the page.
parse_tree.find('h1').get_text()
# Example output: Aristotle

#### We extract all the h2 headers

In [None]:
# Collect every <h2> element, then keep only the text of each one.
headings = parse_tree.find_all('h2')
[h2.get_text() for h2 in headings]
# Example output:
# ['Contents', 'Life', 'Thought', 'Loss and preservation of his works', 'Legacy', 'List of works', 'Eponyms', 'See also', 'Notes and references', 'Further reading', 'External links', 'Navigation menu']

#### We extract the links

In [None]:
# Keep only the <a> elements that carry an href attribute.
links = parse_tree.find_all('a', attrs={'href': True})
# Preview the first five matches.
links[:5]

#### The labels

In [None]:
# Visible text of the first 15 links.
[anchor.get_text() for anchor in links[:15]]

#### The links

In [None]:
# Raw href value (possibly relative) of the first 15 links.
[anchor.get('href') for anchor in links[:15]]

#### The absolute addresses

In [None]:
# Resolve every href against the page URL so that relative addresses
# (e.g. '/wiki/...') become absolute ones.
# No try/except is needed: links only contains anchors that carry an href
# attribute, and swallowing an error here would leave `out` undefined (or
# stale from an earlier run) for the line below.
out = [urljoin(url_en, link['href']) for link in links]
out[:15]