# Building a scraper

In [1]:
# Notebook metadata: name of the author.
__author__ = "Pierre Nugues"

#### Using `requests`

In [2]:
import requests

# Addresses of the English and French editions of the article.
# Only the English page is downloaded in this cell.
url_en = 'https://en.wikipedia.org/wiki/Aristotle'
url_fr = 'https://fr.wikipedia.org/wiki/Aristote'

# The timeout keeps the cell from hanging indefinitely on a stalled
# connection; raise_for_status() turns an HTTP error (4xx/5xx) into an
# exception instead of printing the error page as if it were the article.
response = requests.get(url_en, timeout=10)
response.raise_for_status()
html_doc = response.text

# Show only the beginning of the document: the full page is very long.
print(html_doc[:2000])

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Aristotle - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"7bff9491-4610-4fb9-946a-b19039a606b0","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Aristotle","wgTitle":"Aristotle","wgCurRevisionId":1104463106,"wgRevisionId":1104463106,"wgArticleId":308,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 French-language sources (fr)","Articles with short description","Short description matches Wikidata","Good articles","Wikipedia indefinitely semi-protected pages","U

## Parsing HTML and a Wikipedia page

#### We import the modules

In [3]:
import bs4
import requests
from urllib.parse import urljoin

#### We load a page and parse it

In [4]:
# Download the English article and build a parse tree from it.
url_en = 'https://en.wikipedia.org/wiki/Aristotle'

# The timeout prevents an indefinite hang; raise_for_status() stops the cell
# on an HTTP error so that an error page is never parsed in place of the article.
response = requests.get(url_en, timeout=10)
response.raise_for_status()
html_doc = response.text

# 'html.parser' selects the HTML parser from Python's standard library.
parse_tree = bs4.BeautifulSoup(html_doc, 'html.parser')

#### We extract elements

In [5]:
# The <title> element of the page, returned as a tag.
parse_tree.title
# <title>Aristotle - Wikipedia</title>

<title>Aristotle - Wikipedia</title>

In [6]:
# Only the text inside the <title> element, as a string.
parse_tree.title.text
# Aristotle - Wikipedia

'Aristotle - Wikipedia'

In [7]:
#### We extract header 1

In [8]:
# Text of the first <h1> element of the document.
parse_tree.h1.text
# Aristotle

'Aristotle'

#### We extract all the headers h2

In [9]:
# Collect every <h2> element, then list the text of each one.
headings = parse_tree.find_all('h2')
[heading.text for heading in headings]
# e.g. ['Contents', 'Life', 'Speculative philosophy', ..., 'External links', 'Navigation menu']
# The section titles come from the live article, so they change as it is edited.

['Contents',
 'Life',
 'Speculative philosophy',
 'Natural philosophy',
 'Practical philosophy',
 'Influence',
 'Surviving works',
 'Legacy',
 'See also',
 'References',
 'Further reading',
 'External links',
 'Navigation menu']

#### We extract the links

In [10]:
# href=True keeps only the <a> elements that carry an href attribute.
links = parse_tree.find_all('a', href=True)
# Preview the first five anchors.
links[:5]

[<a href="/wiki/Wikipedia:Good_articles" title="This is a good article. Click here for more information."><img alt="This is a good article. Click here for more information." data-file-height="185" data-file-width="180" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/19px-Symbol_support_vote.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/29px-Symbol_support_vote.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/39px-Symbol_support_vote.svg.png 2x" width="19"/></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30

#### The labels of the links (the anchor text)

In [11]:
# Anchor text of the first 15 links; it is empty when the anchor only wraps an image.
[anchor.text for anchor in links[:15]]

['',
 '',
 'Jump to navigation',
 'Jump to search',
 'Aristotle (disambiguation)',
 '',
 'Lysippos',
 '[A]',
 'Stagira',
 'Chalcidice',
 '[1]',
 'Euboea',
 'Macedonian Empire',
 'Platonic Academy',
 'Corpus Aristotelicum']

#### The targets of the links (the `href` attributes)

In [12]:
# href value of the first 15 links, exactly as written in the page.
[anchor.get('href') for anchor in links[:15]]

['/wiki/Wikipedia:Good_articles',
 '/wiki/Wikipedia:Protection_policy#semi',
 '#mw-head',
 '#searchInput',
 '/wiki/Aristotle_(disambiguation)',
 '/wiki/File:Aristotle_Altemps_Inv8575.jpg',
 '/wiki/Lysippos',
 '#cite_note-1',
 '/wiki/Stagira_(ancient_city)',
 '/wiki/Chalcidice',
 '#cite_note-2',
 '/wiki/Euboea',
 '/wiki/Macedonia_(ancient_kingdom)#Empire',
 '/wiki/Platonic_Academy',
 '/wiki/Corpus_Aristotelicum']

#### The absolute addresses

In [13]:
# Resolve each href against the address of the page to obtain absolute URLs:
# relative paths and bare fragments ('#...') are completed from url_en.
#
# `links` was collected with href=True, so link['href'] exists for every
# element and no exception handler is needed. Swallowing an error here would
# leave `out` undefined (or stale from an earlier run) for the next line.
out = [urljoin(url_en, link['href']) for link in links]
out[:15]

['https://en.wikipedia.org/wiki/Wikipedia:Good_articles',
 'https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi',
 'https://en.wikipedia.org/wiki/Aristotle#mw-head',
 'https://en.wikipedia.org/wiki/Aristotle#searchInput',
 'https://en.wikipedia.org/wiki/Aristotle_(disambiguation)',
 'https://en.wikipedia.org/wiki/File:Aristotle_Altemps_Inv8575.jpg',
 'https://en.wikipedia.org/wiki/Lysippos',
 'https://en.wikipedia.org/wiki/Aristotle#cite_note-1',
 'https://en.wikipedia.org/wiki/Stagira_(ancient_city)',
 'https://en.wikipedia.org/wiki/Chalcidice',
 'https://en.wikipedia.org/wiki/Aristotle#cite_note-2',
 'https://en.wikipedia.org/wiki/Euboea',
 'https://en.wikipedia.org/wiki/Macedonia_(ancient_kingdom)#Empire',
 'https://en.wikipedia.org/wiki/Platonic_Academy',
 'https://en.wikipedia.org/wiki/Corpus_Aristotelicum']